<a href="https://colab.research.google.com/github/sumedhekaru/flower_species_classification_random_forest/blob/master/flower_species_classification_random_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Iris dataset loader bundled with scikit-learn
from sklearn.datasets import load_iris

# scikit-learn's random forest classifier
from sklearn.ensemble import RandomForestClassifier

# pandas, used for the DataFrame that holds the features and labels
import pandas as pd

# numpy, used for the random draws behind the train/test split
import numpy as np

# Seed numpy's global random number generator so np.random draws are repeatable
np.random.seed(0)

# Confusion matrix helper from scikit-learn's metrics module
from sklearn.metrics import confusion_matrix

In [2]:
# Exploring the data

# Load the iris dataset object
iris = load_iris()
# Build a data frame from the feature matrix, one column per feature name
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [3]:
# Add a categorical column holding the species name of every row, built from
# the integer targets and their matching names.
# NOTE: the column name 'specieis' is misspelled, but other cells refer to it
# by exactly this name, so it is left unchanged here.
df['specieis'] = pd.Categorical.from_codes(
    codes=iris.target,
    categories=iris.target_names,
)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),specieis
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
# Encode each species as an integer id.
# The categorical's own codes are used instead of pd.factorize: factorize
# numbers values by order of first appearance, while the codes are positions
# in the category list (iris.target_names). A later cell indexes
# iris.target_names with the predicted ids, so the ids must be those positions
# regardless of row order. For this dataset both give the same values.
# The cast keeps the integer dtype that pd.factorize produced.
df['species_id'] = df['specieis'].cat.codes.astype(np.intp)
df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),specieis,species_id
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0


In [6]:
# Create Test and Train Data
# Each row gets a uniform draw in [0, 1); rows whose draw is <= 0.75 are
# flagged as training data (roughly three quarters of the frame).
# The draws come from a dedicated generator seeded with 0 rather than from
# numpy's global stream, so re-running this cell always reproduces the same
# split. On a fresh kernel the draws are the same ones that
# np.random.seed(0) followed by np.random.uniform would produce.
split_rng = np.random.RandomState(0)
df['is_train'] = split_rng.uniform(0, 1, len(df)) <= 0.75
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),specieis,species_id,is_train
0,5.1,3.5,1.4,0.2,setosa,0,True
1,4.9,3.0,1.4,0.2,setosa,0,True
2,4.7,3.2,1.3,0.2,setosa,0,True
3,4.6,3.1,1.5,0.2,setosa,0,True
4,5.0,3.6,1.4,0.2,setosa,0,True


In [9]:
# Held-out rows: everything not flagged for training
test = df.loc[~df['is_train']]
print('Number of test data = %i' % len(test))

# Training rows: everything flagged by is_train
train = df.loc[df['is_train']]
print('Number of train data = %i' % len(train))



Number of test data = 32
Number of train data = 118


In [10]:
# The feature columns are the leading columns of df, one per iris feature name
features = df.columns[:len(iris.feature_names)]
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [15]:
# Create random forest classifier
# n_estimators is pinned to 10, the value shown in the fitted-model repr
# recorded with this cell. scikit-learn's default changed from 10 to 100 in
# version 0.22, so leaving it implicit makes the predictions depend on the
# installed library version.
# random_state=0 fixes the forest's own randomness; n_jobs=2 trains in parallel.
clf = RandomForestClassifier(n_estimators=10, n_jobs=2, random_state=0)

# Train the classifier on the feature columns of the training rows,
# using the integer species id as the target
clf.fit(train[features], train['species_id'])



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [17]:
# Predict a species id for every row of the test set
y_predict = clf.predict(test.loc[:, features])
y_predict

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [19]:
# Predicted class probabilities, keeping only test rows 10 through 19
y_predict_probs = clf.predict_proba(test.loc[:, features])[10:20]
y_predict_probs

array([[1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0.5, 0.5],
       [0. , 1. , 0. ],
       [0. , 0.9, 0.1],
       [0. , 0.2, 0.8],
       [0. , 0.3, 0.7],
       [0. , 1. , 0. ],
       [0. , 0.8, 0.2]])

In [21]:
# Translate each predicted class id into its species name by indexing the
# array of target names with the predicted ids
preds = iris.target_names[clf.predict(test.loc[:, features])]
preds[:25]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'versicolor', 'versicolor', 'versicolor', 'virginica',
       'virginica', 'versicolor', 'versicolor', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica'], dtype='<U10')

In [22]:
# Confusion matrix: actual species (rows) against predicted species (columns)
pd.crosstab(
    index=test['specieis'],
    columns=preds,
    rownames=['Actual Species'],
    colnames=['Predicted Species'],
)

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,5,2
virginica,0,0,12


array([[13,  0,  0],
       [ 0,  5,  2],
       [ 0,  0, 12]])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 2 1 1 1 1 1 2 2 2 2 2 2 2 2 1 1
 2 2 2]
