In [10]:
#Importing the iris dataset loader from scikit-learn
from sklearn.datasets import load_iris

#Importing the random forest classifier (RFC) from scikit-learn
from sklearn.ensemble import RandomForestClassifier

#Loading pandas and numpy
import pandas as pd
import numpy as np

#Setting a random seed for NumPy's global generator so that the random
#train/test assignment drawn later with np.random.uniform is repeatable
np.random.seed(0)



In [16]:
#Loading the iris dataset and wrapping its measurements in a DataFrame,
#one column per feature name
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
print(df.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


In [17]:
#Adding the species name of each row as a categorical column; the integer
#targets are the codes and iris.target_names are the categories
df['species'] = pd.Categorical.from_codes(
    codes=iris.target,
    categories=iris.target_names,
)
print(df.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  


In [20]:
#Flagging rows for training: each row gets a uniform draw in [0, 1) and is
#marked as training data when the draw is <= 0.75 (roughly 75% of rows)
uniform_draws = np.random.uniform(low=0, high=1, size=len(df))
df['is_train'] = uniform_draws <= 0.75
print(df.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  is_train  
0  setosa      True  
1  setosa     False  
2  setosa      True  
3  setosa      True  
4  setosa      True  


In [22]:
#Splitting the rows into train and test frames using the boolean is_train
#column directly as a mask (and its negation for the test rows)
train, test = df[df['is_train']], df[~df['is_train']]
#Every row must land in exactly one of the two splits
assert len(train) + len(test) == len(df)
print("Number of training data = ",  len(train))
print("Number of testing data = ", len(test))


Number of training data =  112
Number of testing data =  38


In [24]:
#Create an index of the feature column names, taken from the dataset itself
#(the same names df was built with) rather than from column positions in df
features = pd.Index(iris.feature_names)
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [27]:
#Assigning an integer value to each species using the categorical codes.
#The species column was built from iris.target / iris.target_names, so code i
#always corresponds to iris.target_names[i], which is how predictions are
#decoded. pd.factorize would number species by order of first appearance in
#train, which matches only when the rows happen to be ordered that way.
y = train['species'].cat.codes.values.astype(np.int64)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2], dtype=int64)

In [28]:
 #Building the random forest classifier with a fixed random_state and two
 #parallel jobs
 clf = RandomForestClassifier(random_state=0, n_jobs=2)

 #Fitting it on the training feature columns against the integer labels;
 #fit returns the estimator, so the cell displays its parameters
 clf.fit(X=train[features], y=y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [29]:
#Predicting the integer class code for every row of the test split
clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [31]:
#Previewing the feature values of the first test rows
test[features].head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
1,4.9,3.0,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
13,4.3,3.0,1.1,0.1
14,5.8,4.0,1.2,0.2


In [33]:
#Viewing the predicted class probabilities for test rows at positions 10-19
#(the slice [10:20], not the first ten rows); one column per class code
clf.predict_proba(test[features])[10:20]

array([[0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 0.9, 0.1],
       [0. , 1. , 0. ],
       [0. , 0.2, 0.8]])

In [35]:
#Translating the predicted integer class codes back into species names by
#indexing iris.target_names, then showing the first five
predicted_codes = clf.predict(test[features])
preds = iris.target_names[predicted_codes]
preds[:5]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')

In [38]:
#Printing the actual species of the first test rows, for comparison with preds
test['species'].head()

1     setosa
5     setosa
6     setosa
13    setosa
14    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [39]:
#Building the confusion matrix: actual species down the rows, predicted
#species across the columns
pd.crosstab(
    index=test['species'],
    columns=preds,
    rownames=['Actual Species'],
    colnames=['Predicted Species'],
)

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,10,0,0
versicolor,0,9,0
virginica,0,1,18


In [41]:
#Predicting flowers directly from hand-entered measurements. The rows are
#wrapped in a DataFrame carrying the training column names so each value stays
#matched to the feature the classifier was fitted on (the model was trained
#on a DataFrame, not on bare lists).
samples = pd.DataFrame(
    [[5.0, 3.6, 1.4, 2.0], [5.0, 3.6, 1.4, 2.0]],
    columns=features,
)
preds_num = iris.target_names[clf.predict(samples)]
preds_num

array(['virginica', 'virginica'], dtype='<U10')