In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
import pydotplus
import numpy as np
import pandas as pd

In [2]:
iris = pd.read_excel("./Data/Iris.xls")
iris.sample(5)

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,iris
147,6.5,3.0,5.2,2.0,Iris-virginica
64,5.6,2.9,3.6,1.3,Iris-versicolor
14,5.8,4.0,1.2,0.2,Iris-setosa
113,5.7,2.5,5.0,2.0,Iris-virginica
92,5.8,2.6,4.0,1.2,Iris-versicolor


In [3]:
X = iris[['sepallength', 'sepalwidth', 'petallength', 'petalwidth']]
y = iris['iris']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [8]:
model = RandomForestClassifier(n_estimators=58)
model.fit(X_train, y_train)

In [14]:
model.estimators_[:3]

[DecisionTreeClassifier(max_features='sqrt', random_state=1471780350),
 DecisionTreeClassifier(max_features='sqrt', random_state=270971456),
 DecisionTreeClassifier(max_features='sqrt', random_state=56733930)]

In [15]:
# Evaluate the model on the test dataset
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9555555555555556


#### Find Important Features In Sklearn

In [32]:
model = RandomForestClassifier(n_estimators=58)
model.fit(X_train, y_train)
imp_features = pd.Series(model.feature_importances_,
                         index=X.columns).sort_values(ascending=False)

In [33]:
model.feature_importances_

array([0.54479961, 0.45520039])

In [34]:
imp_features

petallength    0.5448
petalwidth     0.4552
dtype: float64

In [35]:
X = iris[['petallength', 'petalwidth']]
y = iris['iris']

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [37]:
model_imp = RandomForestClassifier(n_estimators=58)
model_imp.fit(X_train, y_train)

In [38]:
model_imp.estimators_[:3]

[DecisionTreeClassifier(max_features='sqrt', random_state=74752251),
 DecisionTreeClassifier(max_features='sqrt', random_state=1921285421),
 DecisionTreeClassifier(max_features='sqrt', random_state=1119055923)]

In [39]:
# Evaluate the model on the test dataset
y_pred = model_imp.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [40]:
# Make new prediction
X_new = np.array([[1.3, 0.2],
                  [4.4, 1.4],
                  [5.1, 1.8]])

In [41]:
yhat_new = model_imp.predict(X_new)
yhat_new



array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [43]:
# Avg max depth 
max_depth = list()
for tree in model_imp.estimators_:
    max_depth.append(tree.tree_.max_depth)
print("avg max depth %0.1f" % (sum(max_depth) / len(max_depth)))

avg max depth 5.3
