## Introduction to Scikit-Learn (sklearn)

### Topics:
0. End-to-end sklearn workflow
1. Getting the data ready
2. Choosing the right algorithm
3. Fitting the model for prediction
4. Evaluate model
5. Improve model
6. Save and load trained model
7. Putting it all together

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

In [2]:
#loading data
heart_disease = pd.read_csv('heart-disease.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Create X (features matrix)
X = heart_disease.drop('target', axis = 1)   #every column except the target column

#create y (labels)
y = heart_disease['target']

In [4]:
# 2. choose the right model and hyparameters
clf = RandomForestClassifier(n_estimators = 100)
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [5]:
# 3. Fit the model to the training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

#make prediction
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
280,42,1,0,136,315,0,1,125,1,1.8,1,0,1
228,59,1,3,170,288,0,0,159,0,0.2,1,0,3
84,42,0,0,102,265,0,0,122,0,0.6,1,0,2
292,58,0,0,170,225,1,0,146,1,2.8,1,2,1
130,54,0,2,160,201,0,1,163,0,0.0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,38,1,3,120,231,0,1,182,1,3.8,1,0,3
123,54,0,2,108,267,0,0,167,0,0.0,2,0,2
125,34,0,1,118,210,0,1,192,0,0.7,2,0,2
256,58,1,0,128,259,0,0,130,1,3.0,1,2,3


In [6]:
clf.fit(X_train, y_train);

In [7]:
# Make a prediction
y_preds = clf.predict(X_test)
y_preds

array([1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0], dtype=int64)

In [8]:
y_test

200    0
298    0
143    1
185    0
282    0
      ..
264    0
116    1
150    1
201    0
181    0
Name: target, Length: 61, dtype: int64

In [9]:
# 4. Evaluate the model
clf.score(X_train, y_train)

1.0

In [10]:
clf.score(X_test, y_test)

0.7868852459016393

In [11]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [12]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.90      0.72      0.80        36
           1       0.69      0.88      0.77        25

    accuracy                           0.79        61
   macro avg       0.79      0.80      0.79        61
weighted avg       0.81      0.79      0.79        61



In [13]:
print(confusion_matrix(y_test, y_preds))

[[26 10]
 [ 3 22]]


In [14]:
print(accuracy_score(y_test, y_preds))

0.7868852459016393


In [15]:
# 5. Improve the model
#trying different n_estimator
np.random.seed(42)
for i in range(10,100,10):
    print(f"trying model with {i} estimators ..")
    clf = RandomForestClassifier(n_estimators = i).fit(X_train, y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, y_test)*100:.2f}%")
    print("  ")      

trying model with 10 estimators ..
Model accuracy on test set: 78.69%
  
trying model with 20 estimators ..
Model accuracy on test set: 77.05%
  
trying model with 30 estimators ..
Model accuracy on test set: 78.69%
  
trying model with 40 estimators ..
Model accuracy on test set: 75.41%
  
trying model with 50 estimators ..
Model accuracy on test set: 77.05%
  
trying model with 60 estimators ..
Model accuracy on test set: 77.05%
  
trying model with 70 estimators ..
Model accuracy on test set: 77.05%
  
trying model with 80 estimators ..
Model accuracy on test set: 80.33%
  
trying model with 90 estimators ..
Model accuracy on test set: 80.33%
  


In [21]:
#Save a model and load it
import pickle
pickle.dump(clf, open("random_forest_model_1.pkl", "wb"))

In [22]:
loaded_model = pickle.load(open("random_forest_model_1.pkl", "rb"))
loaded_model.score(X_test, y_test)

0.8032786885245902