# Introduction to Scikit-learn
This notebook will demonstrate some of the most useful functions of the wonderful scikit-learn library.

what is going to be covered:
0. An end to end scikit-learn workflow
1. get the data ready
2. choose the right model and hyperparameters
3. fit the model with training data
4. evaluation model with training data and test data
5. improve the model
6. save the model

In [115]:
#lightgbm
#xgboost

In [96]:
# 1. Get the data ready

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [97]:
import warnings
warnings.filterwarnings("ignore")

In [98]:
import sklearn
#sklearn.show_versions()

In [99]:
heart_disease = pd.read_csv("data/heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [100]:
# Create X (features matrix)
X = heart_disease.drop("target", axis=1)

# Create y (Labels)
y = heart_disease["target"]

In [101]:
# 2. choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 50)

# keep the default hyperparameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 50,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [102]:
# 3. fit the model with training data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [103]:
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
252,62,0,0,138,294,1,1,106,0,1.9,1,3,2
106,69,1,3,160,234,1,0,131,0,0.1,1,1,2
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1
179,57,1,0,150,276,0,0,112,1,0.6,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
276,58,1,0,146,218,0,1,105,0,2.0,1,1,3
177,64,1,2,140,335,0,1,158,0,0.0,2,0,2
23,61,1,2,150,243,1,1,137,1,1.0,1,0,2
264,54,1,0,110,206,0,0,108,1,0.0,1,1,2


In [104]:
clf.fit(X_train, y_train);

In [105]:
y_preds = clf.predict(X_test)

In [106]:
y_preds

array([0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1], dtype=int64)

In [107]:
# 4. Evaluate the model on the training data and test data
clf.score(X_train, y_train)

1.0

In [108]:
clf.score(X_test, y_test)

0.819672131147541

In [109]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.88      0.72      0.79        29
           1       0.78      0.91      0.84        32

    accuracy                           0.82        61
   macro avg       0.83      0.82      0.82        61
weighted avg       0.83      0.82      0.82        61



In [110]:
print(confusion_matrix(y_test, y_preds))

[[21  8]
 [ 3 29]]


In [111]:
print(accuracy_score(y_test, y_preds))

0.819672131147541


In [112]:
# 5. improve the model
# try different amount of n_estimators
def ModelTest(model_features, model_label, test_features, test_label):
    np.random.seed(56)
    for i in range(10, 100, 10):
        print(f"Trying model with {i} estimators")
        model_clf = RandomForestClassifier(n_estimators=i)
        model_clf.fit(model_features, model_label)
        print(f"Model accuracy on test set: {model_clf.score(test_features, test_label)*100:.2f}%")
        
ModelTest(X_train, y_train, X_test, y_test)

Trying model with 10 estimators
Model accuracy on test set: 83.61%
Trying model with 20 estimators
Model accuracy on test set: 83.61%
Trying model with 30 estimators
Model accuracy on test set: 83.61%
Trying model with 40 estimators
Model accuracy on test set: 83.61%
Trying model with 50 estimators
Model accuracy on test set: 85.25%
Trying model with 60 estimators
Model accuracy on test set: 81.97%
Trying model with 70 estimators
Model accuracy on test set: 85.25%
Trying model with 80 estimators
Model accuracy on test set: 85.25%
Trying model with 90 estimators
Model accuracy on test set: 85.25%


In [113]:
# 6. Save a model
import pickle

pickle.dump(clf, open("Random_forest_pickle_1.pkl","wb"))


In [114]:
# load model
loaded_model = pickle.load(open("Random_forest_pickle_1.pkl", "rb"))
loaded_model.score(X_test, y_test)

0.819672131147541

## Getting data ready for machine learning
    1. Split data into features and label (`X` & `y`)
    2. Filling (also called imputing) or disregarding missing values
    3. Converting non-numerical values to numerical values (also called feature encoding)

In [116]:
heart_disease.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
