In [None]:
# Random forest classification on the iris dataset: imports and RNG setup
# load_iris gives access to the iris dataset through scikit-learn
from sklearn.datasets import load_iris 
# RandomForestClassifier is scikit-learn's random forest classifier
from sklearn.ensemble import RandomForestClassifier 
import pandas as pd # Load pandas
import numpy as np # Load numpy
# Seed NumPy's global random generator so the np.random draw used for the
# train/test assignment further down is repeatable on a fresh run
np.random.seed(0) # Set random seed

In [None]:
# Fetch the iris dataset bundle and keep it under the name `iris`
iris = load_iris()
# Build a dataframe from the measurement matrix, one named column per feature
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# Preview the first five rows
df.head(n=5)

In [None]:
# Inspect the dataset bundle without dumping every value it holds:
# list the available fields, then show the array shapes and the label names.
print(iris.keys())
print("data shape:", iris.data.shape)
print("target shape:", iris.target.shape)
print("target names:", iris.target_names)

In [None]:
# Attach the species name of every row, decoded from the integer targets;
# this column is what the classifier will learn to predict
df['species'] = pd.Categorical.from_codes(codes=iris.target, categories=iris.target_names)
# Preview the first five rows
df.head(n=5)

In [6]:
# Flag each row as training (True) or test (False): draw one uniform number in
# [0, 1) per row and mark the row as training when the draw is <= TRAIN_FRACTION,
# so roughly 75% of the rows end up in the training set.
# The draws come from a dedicated generator seeded with 0 rather than the global
# np.random state, so re-running this cell always reproduces the same split
# instead of consuming further numbers from the shared stream. On a fresh run
# this is the same sequence the seeded global generator would produce first.
TRAIN_FRACTION = .75
split_rng = np.random.RandomState(0)
df['is_train'] = split_rng.uniform(0, 1, len(df)) <= TRAIN_FRACTION
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,False
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [9]:
# Split the rows into two dataframes using the boolean is_train flag:
# flagged rows form the training set, the remaining rows form the test set
train = df[df['is_train']]
test = df[~df['is_train']]

In [11]:
# Report how many rows landed in each split
n_train, n_test = len(train), len(test)
print("Number of observations in the training data:", n_train)
print("Number of observations in the test data:", n_test)

Number of observations in the training data: 112
Number of observations in the test data: 38


In [12]:
# Names of the feature columns: the leading columns of df, one per entry in
# iris.feature_names, so the number of features is not hard-coded
features = df.columns[:len(iris.feature_names)]
# View features
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [13]:
# train['species'] holds the species names; the classifier is trained on integer
# labels instead. Take the categorical codes rather than pd.factorize: factorize
# numbers labels by order of first appearance in the training rows, whereas the
# category codes always match the positions in iris.target_names
# (0 = setosa, 1 = versicolor, 2 = virginica). Decoding predictions later with
# iris.target_names[...] relies on exactly that correspondence.
y = np.asarray(train['species'].cat.codes, dtype=np.int64)
# View target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2], dtype=int64)

In [15]:
# Configure the random forest: a fixed random_state and two parallel jobs.
# NOTE(review): n_estimators is left at the library default (the recorded
# output shows 10); pass it explicitly if the forest size must be pinned.
forest_params = dict(n_jobs=2, random_state=0)
clf = RandomForestClassifier(**forest_params)
# Fit on the training rows: feature columns as inputs, integer species codes
# as targets
clf.fit(train[features], y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [19]:
# Predict the class code of every test row (rows that were held out of fitting)
test_inputs = test[features]
clf.predict(test_inputs)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [29]:
# Per-class predicted probabilities, shown for the first 10 test observations
test_proba = clf.predict_proba(test[features])
test_proba[:10]

array([[1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0.9, 0.1, 0. ],
       [0.9, 0.1, 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ]])

In [30]:
# Translate each predicted class code into the plant's English species name
predicted_codes = clf.predict(test[features])
preds = iris.target_names[predicted_codes]

In [31]:
# View the predicted species for the first five observations
preds[:5]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')

In [35]:
# Actual species of the first five test observations, for comparison with preds
test['species'].head(n=5)

1     setosa
5     setosa
6     setosa
13    setosa
14    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [None]:
# Confusion matrix: actual species (rows) against predicted species (columns).
# The axis labels belong in rownames/colnames. `columns` is crosstab's second
# positional parameter and is already filled by preds, so also passing it by
# keyword raised a TypeError.
pd.crosstab(test['species'], preds, rownames=['Actual Species'], colnames=['Predicted Species'])

In [None]:
# Regression