In [5]:
# Loading the Library with the iris dataset
from sklearn.datasets import load_iris
# Loading scikit-learn's random forest classifier library
from sklearn.ensemble import RandomForestClassifier
# Loading pandas 
import pandas as pd
# loading numpy
import numpy as np
# Fix NumPy's global random seed so the np.random.uniform draw used for the
# train/test split is repeatable from run to run
np.random.seed(seed=0)

In [6]:
# Load the iris dataset object (provides .data, .feature_names, .target and .target_names)
iris = load_iris()
# Build a dataframe holding the four feature variables, one named column per feature
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# Preview the first five rows
df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [7]:
# Attach the species name of every row: the integer targets become the codes of a
# categorical whose categories are the target names
df['species'] = pd.Categorical.from_codes(codes=iris.target, categories=iris.target_names)
# Preview the first five rows
df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [8]:
# Randomly flag rows for training: draw one uniform number in [0, 1) per row and
# mark the row True when its draw is <= 0.75 (roughly three quarters of the rows)
split_draws = np.random.uniform(low=0, high=1, size=len(df))
df['is_train'] = split_draws <= .75
# Preview the first five rows
df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [12]:
# Split the rows into training and test dataframes using the boolean is_train flag
is_train_mask = df['is_train']
train = df[is_train_mask]
test = df[~is_train_mask]
# Report how many observations ended up in each dataframe
print('Number of obeservations in the training data:', len(train))
print('Number of obeservations in the test data:', len(test))

Number of obeservations in the training data: 118
Number of obeservations in the test data: 32


In [13]:
# Collect the names of the feature columns: they are the leading columns of df,
# one per entry in iris.feature_names (the dataframe was built from them)
n_feature_columns = len(iris.feature_names)
features = df.columns[:n_feature_columns]
# Display the feature names
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [14]:
# Encode each training row's species as an integer class label.
# The categorical codes are used rather than pd.factorize: factorize numbers the
# classes by order of first appearance, which only matches iris.target_names while
# the rows stay sorted by species. The 'species' column was created from
# iris.target / iris.target_names, so its codes always index iris.target_names
# correctly when predictions are mapped back to names.
# int64 keeps the same dtype the factorize-based encoding produced.
y = train['species'].cat.codes.to_numpy(dtype=np.int64)
# Display the encoded target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [18]:
# Create a random forest classifier (two parallel jobs; fixed random_state so the
# fitted forest is repeatable)
clf = RandomForestClassifier(random_state=0, n_jobs=2)
# Train the classifier on the training feature columns and the integer species labels
clf.fit(X=train[features], y=y)

RandomForestClassifier(n_jobs=2, random_state=0)

In [22]:
# Run the trained classifier on the test feature columns to get one predicted
# class code per test observation
clf.predict(X=test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [26]:
# Predicted class probabilities for the test set; display the slice covering
# test observations 10 through 19 (one row per observation, one column per class)
test_proba = clf.predict_proba(test[features])
test_proba[10:20]

array([[1.  , 0.  , 0.  ],
       [0.99, 0.01, 0.  ],
       [1.  , 0.  , 0.  ],
       [0.  , 0.67, 0.33],
       [0.  , 1.  , 0.  ],
       [0.  , 0.82, 0.18],
       [0.  , 0.03, 0.97],
       [0.  , 0.42, 0.58],
       [0.  , 0.99, 0.01],
       [0.  , 0.96, 0.04]])

In [28]:
# Translate every predicted class code for the test set into its species name
predicted_codes = clf.predict(test[features])
preds = iris.target_names[predicted_codes]
# Display the PREDICTED species for the first 25 test observations
preds[:25]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'versicolor', 'versicolor', 'versicolor', 'virginica',
       'virginica', 'versicolor', 'versicolor', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica'], dtype='<U10')

In [29]:
# Display the ACTUAL species for the first five test observations
test['species'].head(n=5)

7     setosa
8     setosa
10    setosa
13    setosa
17    setosa
Name: species, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

In [31]:
# Build the confusion matrix: actual species down the rows, predicted species
# across the columns
pd.crosstab(
    index=test['species'],
    columns=preds,
    rownames=['Actual Species'],
    colnames=['Predicted Species'],
)

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,5,2
virginica,0,0,12


In [32]:
# Classify two hand-entered flowers. Values are given in the order of the
# training columns: sepal length, sepal width, petal length, petal width (cm).
# They are wrapped in a DataFrame carrying the same column names the classifier
# was fitted on, rather than passed as bare nested lists, so the input matches
# the fitted feature names.
# NOTE(review): both rows are identical, and the petal width of 2.0 may be a typo
# for 0.2 (dataset row 4 is 5.0, 3.6, 1.4, 0.2) — confirm the intended values.
new_samples = pd.DataFrame(
    [[5.0, 3.6, 1.4, 2.0],
     [5.0, 3.6, 1.4, 2.0]],
    columns=features,
)
# Stored under its own name so the test-set `preds` used by the confusion matrix
# is not overwritten and earlier cells can still be re-run.
sample_preds = iris.target_names[clf.predict(new_samples)]
sample_preds

array(['setosa', 'setosa'], dtype='<U10')