In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os

## Decisiontree Classifier multiple targets

In [2]:
from sklearn import tree
from sklearn.datasets import load_iris

In [3]:
# Load the Iris Dataset
iris = load_iris()
#print(iris.DESCR)

In [4]:
#iris.data

In [5]:
# if you want to split into train and test groups
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=42)

In [6]:
y_train

array([0, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 2, 1, 2, 1, 0, 2, 1, 0, 0, 0, 1,
       2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 0, 2, 2, 1, 1, 2, 1, 0, 1, 2, 0,
       0, 1, 1, 0, 2, 0, 0, 1, 1, 2, 1, 2, 2, 1, 0, 0, 2, 2, 0, 0, 0, 1,
       2, 0, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2, 1, 2, 1, 1, 1, 0, 1, 1, 0, 1,
       2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2, 1, 1, 2, 2, 0, 1, 2, 0,
       1, 2])

In [7]:
#notice here we are not used the spliting train and test groups
# Create and score a decision tree classifier
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

1.0

In [14]:
# Decisiontree in sklearn will automatically calculate feature importance
decimportances = clf.feature_importances_
decimportances

array([0.01787567, 0.01787567, 0.39794324, 0.56630542])

In [8]:
sorted(zip(clf.feature_importances_, iris.feature_names), reverse=True)

[(0.8997460415815836, 'petal length (cm)'),
 (0.08237829007590591, 'petal width (cm)'),
 (0.017875668342510573, 'sepal length (cm)'),
 (0.0, 'sepal width (cm)')]

In [9]:
#deploy the train test groups, name the new classifier as clf2
clf2 = clf.fit(X_train, y_train)
clf2.score(X_test, y_test)

1.0

In [10]:
sorted(zip(clf2.feature_importances_, iris.feature_names), reverse=True)

[(0.5663054235122565, 'petal width (cm)'),
 (0.39794323980272234, 'petal length (cm)'),
 (0.01787566834251057, 'sepal width (cm)'),
 (0.01787566834251057, 'sepal length (cm)')]

In [11]:
#iris.data
iris.data.shape

(150, 4)

In [12]:
iris.target.shape

(150,)

In [13]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [20]:
# notice 3 targets 
#iris.target

## Random Forest Classifier deal with multi targets

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

In [22]:
# Load the Iris Dataset
iris = load_iris()
#print(iris.DESCR)

In [17]:
#iris.data
iris.data.shape

(150, 4)

In [18]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [23]:
# notice 3 targets 
#iris.target

In [24]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [25]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(iris.data, iris.target)
rf.score(iris.data, iris.target)

1.0

In [26]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
importances

array([0.08229387, 0.0224637 , 0.40705154, 0.48819089])

In [27]:
# We can sort the features by their importance
sorted(zip(rf.feature_importances_, iris.feature_names), reverse=True)

[(0.48819088973467756, 'petal width (cm)'),
 (0.4070515435570247, 'petal length (cm)'),
 (0.08229386547587784, 'sepal length (cm)'),
 (0.022463701232419853, 'sepal width (cm)')]

## Diabeties sample to use both decisiontree classifier and randomforest classifier

In [28]:
#df = pd.read_csv(os.path.join("..", "Resources", "diabetes.csv"))

df = pd.read_csv("data/diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [29]:
target = df["Outcome"]
#target_names = ["negative", "positive"]

In [30]:
#use data.columns to get all the features names
data = df.drop("Outcome", axis=1)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [31]:
feature_names = data.columns
feature_names

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

### decision tree classifier

In [33]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.6979166666666666

### random forest classifier

In [34]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.734375

In [35]:
#notice must be zipped so that the pair will not be damaged
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.25886717532890574, 'Glucose'),
 (0.1637494342987226, 'BMI'),
 (0.14049954881718638, 'Age'),
 (0.11831834036519061, 'DiabetesPedigreeFunction'),
 (0.08681688382179256, 'BloodPressure'),
 (0.08219504359738158, 'Pregnancies'),
 (0.07498351256283925, 'Insulin'),
 (0.07457006120798149, 'SkinThickness')]

## Save and reload the models

In [36]:
import pickle

In [37]:
# save the model to disk
filename = 'rf_model.sav'
pickle.dump(rf, open(filename, 'wb'))

In [38]:
#reload in the above model
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.734375
