In [5]:
# Random forest
# Load the library with the iris dataset
from sklearn.datasets import load_iris
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier
import numpy as np # Load numpy
import pandas as pd
# Seed NumPy's global random number generator so random draws are repeatable
np.random.seed(seed=0)

In [6]:
# Load the iris dataset into an object called iris
iris = load_iris()
# Put the measurements into a dataframe, one column per entry of iris.feature_names
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# Preview the first five rows
df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [7]:
# List the fields that the loaded dataset object exposes
print(iris.keys())

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [None]:
# Summarise the dataset instead of printing the whole object: printing `iris`
# dumps the raw data and target arrays together with the full description text,
# which floods the notebook output without telling the reader much.
print('data shape:', iris.data.shape)
print('target shape:', iris.target.shape)
print('target names:', list(iris.target_names))
print('feature names:', list(iris.feature_names))

In [12]:
# Attach the prediction target: a categorical 'species' column built from the
# integer codes in iris.target, with iris.target_names as the categories
df['species'] = pd.Categorical.from_codes(codes=iris.target, categories=iris.target_names)
# Preview the first five rows, now including the species
df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [None]:
# Re-seed right before drawing so this cell produces the same split every time it
# is executed. Without this, the mask depends on how many random numbers the kernel
# has already drawn, so re-running the cell (or running cells out of order)
# silently changes which rows end up in the training set.
np.random.seed(0)
# Draw one uniform random number in [0, 1) per row; rows whose draw is <= .75 are
# flagged True (training rows), the rest False (test rows).
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
# View the top 5 rows
df.head()

In [17]:
# Split the rows into two dataframes using the boolean 'is_train' column directly
# as the mask: flagged rows form the training set, the negated mask selects the
# test set. (Comparing a boolean column with == True / == False is redundant.)
train = df[df['is_train']]
test = df[~df['is_train']]

In [18]:
# Report how many rows landed in each half of the split
n_train, n_test = len(train), len(test)
print('Number of observations in the training data:', n_train)
print('Number of observations in the test data:', n_test)

Number of observations in the training data: 113
Number of observations in the test data: 37


In [19]:
# Index of the feature column names. Take them from the dataset itself rather than
# slicing df.columns by position, so the selection does not depend on a hard-coded
# column count or on the order in which columns were added to df.
features = pd.Index(iris.feature_names)

In [None]:
# Display the selected feature column names
features

In [21]:
# train['species'] contains the actual species names. Before we can use it,
# we need to convert each species into a digit: there are 3 species, coded 0, 1 or 2.
# Use the categorical's own codes instead of pd.factorize. factorize numbers the
# labels in order of first appearance in `train`, which matches the category order
# only when the rows happen to be sorted by species and every species is present.
# The category codes always index into the categories the column was built with
# (iris.target_names), so predictions can be mapped back to names reliably with
# iris.target_names[code].
y = np.asarray(train['species'].cat.codes, dtype=np.intp)  # intp: the dtype factorize returns
# View target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2], dtype=int64)

In [None]:
# Instantiate the random forest classifier (fixed random_state, two parallel jobs)
clf = RandomForestClassifier(random_state=0, n_jobs=2)
# Fit it on the training rows so it learns how the feature columns relate
# to the training y (the encoded species)
X_train = train[features]
clf.fit(X_train, y)

In [24]:
# Predict a class code for every row of the test data with the trained classifier
X_test = test[features]
clf.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 1, 1, 1, 2,
       2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2], dtype=int64)

In [28]:
# Per-class predicted probabilities for the test data; show the first 10 observations
test_probabilities = clf.predict_proba(test[features])
test_probabilities[:10]

array([[1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0.9, 0.1, 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0.9, 0.1, 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ]])

In [30]:
# Translate each predicted class code into its English species name by indexing
# the array of names with the codes (code i is looked up as iris.target_names[i])
predicted_codes = clf.predict(test[features])
preds = iris.target_names[predicted_codes]
# View the predicted species for the first five observations
preds[:5]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')

In [31]:
# Show the true species of the first five test observations for comparison
actual_species = test['species']
actual_species.head(5)

1     setosa
4     setosa
9     setosa
18    setosa
21    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [32]:
# Confusion matrix: rows are the actual species, columns are the predicted species
pd.crosstab(
    index=test['species'],
    columns=preds,
    rownames=['Actual Species'],
    colnames=['Predicted Species'],
)

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,7,1
virginica,0,2,14


In [33]:
# Regression