In [1]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier 
import pandas as pd
import numpy as np



In [2]:
iris = load_iris()

df = pd.DataFrame(iris.data, columns=iris.feature_names)

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [3]:
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [18]:
df['is_train'] = np.random.uniform(0,1, len(df)) <= .75

df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,False
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [19]:
train, test = df[df['is_train'] == True], df[df['is_train'] == False]

print("numbers of training data = " , len(train))

print("numbers of test data = " , len(test))



numbers of training data =  110
numbers of test data =  40


In [20]:
features = df.columns[:4]

features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [21]:
y = pd.factorize(train['species'])[0]

print(y)


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


In [22]:
clf = RandomForestClassifier(n_jobs=2, random_state=0)

clf.fit(train[features], y )

print(test[features])



     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                  5.1               3.5                1.4               0.2
5                  5.4               3.9                1.7               0.4
8                  4.4               2.9                1.4               0.2
17                 5.1               3.5                1.4               0.3
26                 5.0               3.4                1.6               0.4
32                 5.2               4.1                1.5               0.1
34                 4.9               3.1                1.5               0.2
35                 5.0               3.2                1.2               0.2
45                 4.8               3.0                1.4               0.3
46                 5.1               3.8                1.6               0.2
50                 7.0               3.2                4.7               1.4
57                 4.9               2.4                3.3     

In [23]:
# prediction

clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2], dtype=int64)

In [25]:
preds = iris.target_names[clf.predict(test[features])]

print(preds)

['setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'virginica' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'virginica' 'virginica' 'versicolor'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'versicolor' 'virginica' 'virginica' 'virginica']


In [26]:
test['species'].head()

0     setosa
5     setosa
8     setosa
17    setosa
26    setosa
Name: species, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

In [29]:
pd.crosstab(test['species'], preds, rownames=['Actual Species'], colnames = ['Predicted Species '])

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,10,0,0
versicolor,0,15,1
virginica,0,2,12
