# Introduction to Scikit-Learn (sklearn)

Some of most Useful ML functions of SKLearn in this notebook

What is covered : 

0. An end-to-end Scikit-Learn workflow
1. Getting the data ready
2. Choose the right estimator/algorithm for our problems
3. Fit the model/algorithm and use it to make predictions on our data
4. Evaluating a model
5. Improve a model
6. Save and load a trained model
7. Put it all together!


## 0. An end-to-end Scikit-Learn WorkFlow

In [12]:
#1. Get the Data Ready
import pandas as pd
import numpy as np
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [7]:
#Create X (Features matrix or variable)
X = heart_disease.drop("target",axis=1)

#Create Y for Target (Labels)
y = heart_disease["target"]

In [8]:
# 2. Choose right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier() #add n_estimators=100 if warning seen

#we'll keep default hyper parameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [9]:
#3. Fit the model to training data
from sklearn.model_selection import train_test_split

X_Train, X_Test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [11]:
clf.fit(X_Train,y_train);

In [14]:
#make a prediction
y_preds = clf.predict(X_Test)
y_preds

array([0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1], dtype=int64)

In [15]:
y_test

214    0
72     1
220    0
244    0
232    0
      ..
35     1
278    0
134    1
184    0
286    0
Name: target, Length: 61, dtype: int64

In [16]:
#4. Evaluate the model on the training data and test data
clf.score(X_Train,y_train)

1.0

In [17]:
clf.score(X_Test,y_test)

0.819672131147541

In [18]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.77      0.80      0.78        25
           1       0.86      0.83      0.85        36

    accuracy                           0.82        61
   macro avg       0.81      0.82      0.81        61
weighted avg       0.82      0.82      0.82        61



In [19]:
confusion_matrix(y_test,y_preds)

array([[20,  5],
       [ 6, 30]], dtype=int64)

In [20]:
accuracy_score(y_test,y_preds)

0.819672131147541

In [22]:
#5. Improve a model
#Try a different amount of n_estimators
np.random.seed(42)
for i in range(10,100,10):
    print(f"Trying model with {i} estimators...") 
    clf = RandomForestClassifier(n_estimators=i).fit(X_Train,y_train)
    print(f"Model accuracy on test set: {clf.score(X_Test,y_test)*100:.2f}")

Trying model with 10 estimators...
Model accuracy on test set: 81.97
Trying model with 20 estimators...
Model accuracy on test set: 78.69
Trying model with 30 estimators...
Model accuracy on test set: 85.25
Trying model with 40 estimators...
Model accuracy on test set: 80.33
Trying model with 50 estimators...
Model accuracy on test set: 80.33
Trying model with 60 estimators...
Model accuracy on test set: 81.97
Trying model with 70 estimators...
Model accuracy on test set: 83.61
Trying model with 80 estimators...
Model accuracy on test set: 86.89
Trying model with 90 estimators...
Model accuracy on test set: 85.25


In [23]:
#6. Save a model and load it
import pickle

pickle.dump(clf, open("random_forest_model_1.pkl","wb"))

In [24]:
loaded_model = pickle.load(open("random_forest_model_1.pkl","rb"))
loaded_model.score(X_Test,y_test)

0.8524590163934426

In [None]:
# # Working with warnings
# import warnings
# warnings.filterwarnings('ignore') #/or ('default) - for going back to default

#### Upgrade something in conda environment

Use GUI in Navigator
commands in CLI. - conda update 'packagename=v.er.si.on'

conda search packagename - lists available versions

In [1]:
#7. Put it all together

import pandas as pd
import numpy as np
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease

#Create X (Features matrix or variable)
X = heart_disease.drop("target",axis=1)

#Create Y for Target (Labels)
y = heart_disease["target"]

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier() #add n_estimators=100 if warning seen

#we'll keep default hyper parameters
clf.get_params()

from sklearn.model_selection import train_test_split

X_Train, X_Test, y_train, y_test = train_test_split(X,y, test_size=0.2)

clf.fit(X_Train,y_train);

#make a prediction
y_preds = clf.predict(X_Test)
y_preds

clf.score(X_Train,y_train)
clf.score(X_Test,y_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test,y_preds))

confusion_matrix(y_test,y_preds)
accuracy_score(y_test,y_preds)

#Try a different amount of n_estimators
np.random.seed(42)
for i in range(10,100,10):
    print(f"Trying model with {i} estimators...") 
    clf = RandomForestClassifier(n_estimators=i).fit(X_Train,y_train)
    print(f"Model accuracy on test set: {clf.score(X_Test,y_test)*100:.2f}")

import pickle

pickle.dump(clf, open("random_forest_model_1.pkl","wb"))

loaded_model = pickle.load(open("random_forest_model_1.pkl","rb"))
loaded_model.score(X_Test,y_test)

              precision    recall  f1-score   support

           0       0.93      0.79      0.85        33
           1       0.79      0.93      0.85        28

    accuracy                           0.85        61
   macro avg       0.86      0.86      0.85        61
weighted avg       0.86      0.85      0.85        61

Trying model with 10 estimators...
Model accuracy on test set: 78.69
Trying model with 20 estimators...
Model accuracy on test set: 88.52
Trying model with 30 estimators...
Model accuracy on test set: 85.25
Trying model with 40 estimators...
Model accuracy on test set: 81.97
Trying model with 50 estimators...
Model accuracy on test set: 85.25
Trying model with 60 estimators...
Model accuracy on test set: 86.89
Trying model with 70 estimators...
Model accuracy on test set: 85.25
Trying model with 80 estimators...
Model accuracy on test set: 85.25
Trying model with 90 estimators...
Model accuracy on test set: 85.25


0.8524590163934426

## 1. Getting our data ready to be used with Machine Learning

Three main things to do:

    1. split the data into features and labels (usually 'X' & 'Y')

    2. Filling (also called imputing) or disregarding missing values

    3. converting non-numericals values to numerical values (also called feature encoding)

In [2]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [7]:
X = heart_disease.drop("target",axis=1)
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [6]:
y = heart_disease["target"]
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [8]:
# Split the data into training and test splits

from sklearn.model_selection import train_test_split
X_Train,X_Test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [9]:
X_Train.shape,X_Test.shape,y_train.shape,y_test.shape

((242, 13), (61, 13), (242,), (61,))

### 1.1 make sure it's numerical

In [12]:
car_sales = pd.read_csv("car-sales-extended.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [13]:
len(car_sales)

1000

In [14]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [22]:
# Split inti X/y
X = car_sales.drop("Price",axis=1)
y = car_sales["Price"]

#Split into trainign and testing set
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)


In [16]:
#Building machine learning model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train,y_train)
model.score(X_test,y_test)

ValueError: could not convert string to float: 'Toyota'

In [35]:
#convert to numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make","Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",one_hot,categorical_features)],
                                remainder="passthrough")

transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [36]:
pd.DataFrame(transformed_X)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [37]:
dummies = pd.get_dummies(car_sales[["Make","Colour","Doors"]])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,False,True,False,False,False,False,False,False,True
1,5,True,False,False,False,False,True,False,False,False
2,4,False,True,False,False,False,False,False,False,True
3,4,False,False,False,True,False,False,False,False,True
4,3,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...
995,4,False,False,False,True,True,False,False,False,False
996,3,False,False,True,False,False,False,False,False,True
997,4,False,False,True,False,False,True,False,False,False
998,4,False,True,False,False,False,False,False,False,True


In [38]:
#Refit Model

np.random.seed(42)

X_train,X_test,y_train,y_test = train_test_split(transformed_X,y,test_size=0.2)

model.fit(X_train,y_train)

In [39]:
model.score(X_test,y_test)

0.3235867221569877

### 1.2 What If missing values?

1. Fill them with some value (imputation)
2. Remove the samples with missing data altogether

In [42]:
import sklearn
print(sklearn.__version__)

1.2.2


In [87]:
#Import Car sales missing data
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")


car_sales_missing.dtypes, car_sales_missing.head(25)

(Make              object
 Colour            object
 Odometer (KM)    float64
 Doors            float64
 Price            float64
 dtype: object,
       Make Colour  Odometer (KM)  Doors    Price
 0    Honda  White        35431.0    4.0  15323.0
 1      BMW   Blue       192714.0    5.0  19943.0
 2    Honda  White        84714.0    4.0  28343.0
 3   Toyota  White       154365.0    4.0  13434.0
 4   Nissan   Blue       181577.0    3.0  14043.0
 5    Honda    Red        42652.0    4.0  23883.0
 6   Toyota   Blue       163453.0    4.0   8473.0
 7    Honda  White            NaN    4.0  20306.0
 8      NaN  White       130538.0    4.0   9374.0
 9    Honda   Blue        51029.0    4.0  26683.0
 10  Nissan  White       167421.0    4.0  16259.0
 11  Nissan  Green        17119.0    4.0   6160.0
 12  Nissan  White       102303.0    4.0  16909.0
 13     NaN  White       134181.0    4.0  11121.0
 14   Honda   Blue       199833.0    4.0  18946.0
 15  Toyota   Blue       205592.0    4.0  16290.0
 16 

In [88]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [95]:
# Fill missing data

car_sales_missing["Make"].fillna("missing",inplace=True)
car_sales_missing["Colour"].fillna("missing",inplace=True)
car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean(),inplace=True)
car_sales_missing["Doors"].fillna(4,inplace=True)

car_sales_missing.dtypes, len(car_sales_missing), car_sales_missing.tail(25)

(Make              object
 Colour            object
 Odometer (KM)    float64
 Doors            float64
 Price            float64
 dtype: object,
 950,
         Make   Colour  Odometer (KM)  Doors    Price
 975    Honda  missing   22409.000000    4.0  10429.0
 976   Toyota     Blue   95317.000000    4.0   7435.0
 977   Toyota     Blue  128016.000000    4.0  16835.0
 978      BMW    White   85739.000000    5.0  48419.0
 979   Toyota    Black   17975.000000    4.0  17940.0
 980   Toyota     Blue  230314.000000    4.0   6720.0
 981   Toyota    White  129454.000000    4.0   6446.0
 982    Honda    White  238172.000000    4.0  13273.0
 983   Toyota      Red  131253.237895    4.0  14671.0
 984   Nissan     Blue  157235.000000    4.0   4196.0
 985  missing     Blue  216250.000000    4.0   9691.0
 986    Honda    White   71934.000000    4.0  26882.0
 987    Honda    White  215235.000000    4.0   3825.0
 988   Nissan    Black  248736.000000    4.0   8358.0
 989   Toyota      Red   41735.000000 

In [91]:
#chweck for na values
car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [92]:
#Remove pricing columns from data
car_sales_missing.dropna(inplace=True)
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [93]:
len(car_sales_missing)

950

In [94]:
X = car_sales_missing.drop("Price",axis=1)
y = car_sales_missing["Price"]

In [96]:
#One Hot Encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make","Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",one_hot,categorical_features)],
                                remainder="passthrough")

transformed_X2 = transformer.fit_transform(car_sales_missing)
transformed_X2

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

### Missing values with scikit learn


In [97]:
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")

car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [99]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [100]:
#Drop label wit no labels
car_sales_missing.dropna(subset=["Price"], inplace=True)
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [101]:
# Split into X and Y
X = car_sales_missing.drop("Price",axis=1)
y = car_sales_missing["Price"]

In [103]:
# Filling missing values with scikit learn

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with missing & numerical values with mean()
cat_imputer = SimpleImputer(strategy="constant",fill_value="missing")
door_imputer = SimpleImputer(strategy="constant",fill_value=4)
num_imputer = SimpleImputer(strategy="constant")

#Define columns
cat_features = ["Make","Colour"]
door_features = ["Doors"]
num_features = ["Odometer (KM)"]

#Create an imputer (something that fills missing data)
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer",door_imputer,door_features),
    ("num_imputer",num_imputer,num_features)
])

#Transform Data
filled_X = imputer.fit_transform(X)
filled_X

array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [104]:
car_sales_filled = pd.DataFrame(filled_X,columns=["Make","Colour","Doors","Odometer (KM)"])
car_sales_filled.head()

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4.0,35431.0
1,BMW,Blue,5.0,192714.0
2,Honda,White,4.0,84714.0
3,Toyota,White,4.0,154365.0
4,Nissan,Blue,3.0,181577.0


In [106]:
car_sales_filled.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [107]:
#One Hot Encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make","Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",one_hot,categorical_features)],
                                remainder="passthrough")

transformed_X2 = transformer.fit_transform(car_sales_filled)
transformed_X2

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3752 stored elements in Compressed Sparse Row format>

In [109]:
# Data is Numbers and filled missing values
# Fit a model

np.random.seed(42)

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_Train,X_Test, y_train, y_test = train_test_split(transformed_X2,y,test_size=0.2)

model = RandomForestRegressor()
model.fit(X_Train, y_train)
model.score(X_Test, y_test)


0.19329020020134935