<a href="https://colab.research.google.com/github/tiwaripari/ML/blob/main/RandomForestIris.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# Seed NumPy's legacy global random state so that np.random.* draws repeat
# when the notebook is run top to bottom on a fresh kernel.
np.random.seed(0)

In [3]:
# Load the Iris dataset and wrap its measurement matrix in a DataFrame,
# labelling the columns with the feature names the dataset provides.
iris = load_iris()
df = pd.DataFrame(
    data=iris.data,
    columns=iris.feature_names,
)
# Preview the first five rows
df.head(n=5)



Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [4]:
# Add the species name of every row as a categorical column: the integer
# targets are the codes, and iris.target_names supplies the categories.
df['Species'] = pd.Categorical.from_codes(
    codes=iris.target,
    categories=iris.target_names,
)
# Preview the first five rows, now including the label column
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
# Create the train/test flag.
# Every row gets an independent uniform draw in [0, 1); rows whose draw is
# <= 0.75 are used for training, so the split is roughly (not exactly) 75/25.
#
# A dedicated RandomState seeded with 0 is used instead of the global np.random
# state, so re-running this cell always yields the same mask. It produces the
# same stream the legacy global generator gives right after np.random.seed(0).
split_rng = np.random.RandomState(0)
df['is_train'] = split_rng.uniform(0, 1, len(df)) <= .75
# View the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [6]:
# Split the frame by the boolean is_train flag: flagged rows become the
# training set, the remaining rows become the test set.
train = df.loc[df['is_train']]
test = df.loc[~df['is_train']]

# Report how many observations landed in each set
print("training data ", len(train))
print("testing data ", len(test))


training data  118
testing data  32


In [7]:
# Names of the model inputs: the first four columns of the frame
features = df.iloc[:, :4].columns

# Display the selected feature names
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [8]:
# Encode the training labels as integers for the classifier.
# The codes are taken from the categorical's own category order, so code i
# always refers to the i-th category of 'Species'. pd.factorize numbers labels
# by order of first appearance instead, which only matches the category order
# while the rows happen to be sorted by species.
# np.intp keeps the plain integer dtype rather than the compact int8 codes.
y = train['Species'].cat.codes.to_numpy(dtype=np.intp)
# View the encoded target
y


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2])

In [9]:
# Encode the true test labels as integer codes taken from the categorical's
# category order. Factorizing the test rows on their own would number species
# by first appearance within the test set, which could disagree with the
# encoding the classifier was trained on (e.g. if the first test row were not
# the first category).
x = test['Species'].cat.codes.to_numpy(dtype=np.intp)
# View the encoded test labels
x

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [10]:
# Create the random forest classifier.
# n_jobs=2 allows two jobs to run in parallel; random_state=0 fixes the
# estimator's internal randomness so repeated runs build the same forest.
clf = RandomForestClassifier(random_state=0, n_jobs=2)
# Train the classifier on the feature columns of the training rows
clf.fit(X=train[features], y=y)


In [11]:
# Apply the trained classifier to the feature columns of the test rows;
# the result is one predicted class code per test observation.
clf.predict(test.loc[:, features])


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [12]:
# View the predicted class probabilities for test observations 10 through 19
# (positions in the test set, not the first ten rows)
clf.predict_proba(test.loc[:, features])[10:20]

array([[1.  , 0.  , 0.  ],
       [0.99, 0.01, 0.  ],
       [1.  , 0.  , 0.  ],
       [0.  , 0.67, 0.33],
       [0.  , 1.  , 0.  ],
       [0.  , 0.82, 0.18],
       [0.  , 0.03, 0.97],
       [0.  , 0.42, 0.58],
       [0.  , 0.99, 0.01],
       [0.  , 0.96, 0.04]])

In [13]:
# Map every predicted class code to its species name by using the codes as
# positions into iris.target_names
preds = np.take(iris.target_names, clf.predict(test[features]))

# View the predicted species for test observations 10 through 19
preds[10:20]

array(['setosa', 'setosa', 'setosa', 'versicolor', 'versicolor',
       'versicolor', 'virginica', 'virginica', 'versicolor', 'versicolor'],
      dtype='<U10')

In [14]:
# View the actual species of the first five test observations
test.loc[:, 'Species'].head(n=5)

7     setosa
8     setosa
10    setosa
13    setosa
17    setosa
Name: Species, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

In [15]:
# Build a confusion matrix.
# pd.crosstab cross-tabulates two sequences: here the actual species (rows)
# against the predicted species (columns), counting each combination.
pd.crosstab(
    index=test['Species'],
    columns=preds,
    rownames=['Actual Species'],
    colnames=['Predicted Species'],
)

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,5,2
virginica,0,0,12


In [20]:
# accuracy = correct predictions / all predictions
# The score is kept as a fraction (multiply by 100 for a percentage).
accuracy = accuracy_score(
    y_true=x,
    y_pred=clf.predict(test[features]),
)

In [21]:
# Display the test-set accuracy computed in the previous cell
accuracy

0.9375

In [24]:
# 5-fold cross-validation of the classifier over all rows of the dataset,
# using the feature columns as inputs and the species names as labels
scores = cross_val_score(
    estimator=clf,
    X=df[features],
    y=df['Species'],
    cv=5,
)
# Per-fold scores, followed by their mean expressed as a percentage
print(scores)
print(scores.mean() * 100)

[0.96666667 0.96666667 0.93333333 0.96666667 1.        ]
96.66666666666669
