![](SciKitLearnStructure.png)

###

### Intro to SciKit learn

In [50]:
from IPython.terminal.shortcuts.filters import pass_through

what_were_covering = [
    "0. And end-to-end Scikit-Learn Workflow",
    "1. Getting the Data Ready",
    "2. Choose the right estimator/algorithm for our problems",
    "3. Fit the model/algorithms and use it to make predictions on our data",
    "4. Evaluating the Model",
    "5. Improve the Model",
    "6. Save and load a trained model",
    "7. Putting it all together"
]

In [51]:
# from Xlib.rdb import value_escape_re

#from mysample import car_sales
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import numpy as np

### 0. An end-to-end Scikit Learn Workflow

### 1. Get the Data ready

In [52]:
heart_disease = pd.read_csv("data/heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [53]:
# Create X (Features Matrix)
X = heart_disease.drop("target", axis=1)

# Create y (Labels)
y = heart_disease["target"]

### 2.  Choose the right model and hyperparameters

In [54]:
# This is a 'classification' problem because we want to determine whether or not a person has heart disease

# This is a classification machine-learning model (RandomForest Classifier)
from sklearn.ensemble import RandomForestClassifier
# instantiate the class
clf = RandomForestClassifier()
# keep the default hyper parameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### 3. Fit the model to the training data

In [55]:
from sklearn.model_selection import train_test_split
# test size is 20%, while 80% is used for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [56]:
clf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [57]:
# Now we can make a prediction (still in step 3)
##### y_label = clf.predict(np.array([0, 2, 3, 4]))

In [58]:
# The y preditions needs to work on the X_test data set
y_preds = clf.predict(X_test)
y_preds

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1])

In [59]:
y_test

47     1
101    1
51     1
63     1
146    1
      ..
208    0
174    0
161    1
227    0
53     1
Name: target, Length: 61, dtype: int64

### 4. Evaluate the model on the training data and the test data

In [60]:
clf.score(X_train, y_train)


1.0

In [61]:
clf.score(X_test, y_test)

0.819672131147541

In [62]:
# Show some classification metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.90      0.78      0.84        36
           1       0.73      0.88      0.80        25

    accuracy                           0.82        61
   macro avg       0.82      0.83      0.82        61
weighted avg       0.83      0.82      0.82        61



In [63]:
confusion_matrix(y_test, y_preds)

array([[28,  8],
       [ 3, 22]])

In [64]:
accuracy_score(y_test, y_preds)

0.819672131147541

### 5. Improve a model

In [65]:
# Try different amount of n_estimators
np.random.seed(42)
for i in range(20, 100, 20):
    print(f"Trying model with {i} estimators ... ")
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}%")
    print("")

Trying model with 20 estimators ... 
Model accuracy on test set: 77.05%

Trying model with 40 estimators ... 
Model accuracy on test set: 83.61%

Trying model with 60 estimators ... 
Model accuracy on test set: 83.61%

Trying model with 80 estimators ... 
Model accuracy on test set: 83.61%



### 6. Save a model and load it

In [66]:
import pickle
pickle.dump(clf, open('random_forest_model.pkl', 'wb'))

In [67]:
loaded_model = pickle.load(open('random_forest_model.pkl', 'rb'))

In [68]:
print(loaded_model.score(X_test, y_test))

0.8360655737704918


### ***** STARTING NEW SECTION *****

### Standard Imports

In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
### import seaborn as sns

### 1. Getting our data ready to be used with machine learning
Three things we have to do:
   1. Split the data into feature and labels (usually 'X' & 'y') Feature,X ... Labels,y
   2. Filling (also called imputing) or disregarding missing values
   3. Converting non-numerical values to numerical values (also called feature encoding)

## Split the data into training and test data

In [70]:
#Import the heart_disease data set
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [71]:
# We want to use the feature columns to predict 'y'
# the feature columns are the age, sex, cp, trestbps, etc.
# the y is the target (whether patient has heart disease or not
X = heart_disease.drop("target", axis=1) # Axis 1 = the column headers so saying Axis 1-> target, will drop() the "target" item from the Axis column headers
# X is going to be every single column except for "target"
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [72]:
# y will now equal the target column
y = heart_disease["target"]
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [73]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
# When we call this it will return 4 values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [74]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

In [75]:
# in the above the split took place - there are 303 total samples, and now the training data is split into 80/20
# since X = heart_disease with the target column dropped, it should contain the entire number of rows still...
print(X.shape)
# 303
print(X.shape[0] * 0.8)

(303, 13)
242.4


In [76]:
len(heart_disease)
# 303

303

### Let's make sure all the data is numerical

In [77]:
what_were_covering

['0. And end-to-end Scikit-Learn Workflow',
 '1. Getting the Data Ready',
 '2. Choose the right estimator/algorithm for our problems',
 '3. Fit the model/algorithms and use it to make predictions on our data',
 '4. Evaluating the Model',
 '5. Improve the Model',
 '6. Save and load a trained model',
 '7. Putting it all together']

In [78]:
# Import the Extended Car Sales Data for detailed analysis
car_sales = pd.read_csv("data/car-sales-extended.csv")
car_sales.head()


Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [79]:
len(car_sales)

1000

In [80]:
car_sales.dtypes
#car_sales.describe()

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [81]:
# On the outset we are going to use the 4 columns: Make, Color, Odometer, Doors... to predict the car's price.
# In terms of the objects like the car's color, will have to make those into numeric values ... will see how to make the strings into numbers.


In [82]:
# Split into X/y
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

# We need to use the X values to predict the y values - doors, odometer, etc. to predict Price.

### Split into training and test set first

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [84]:
# Learn from X-train and y_train and then predict on the test data X_test, y_test
# Now, build a machine learning model:
from sklearn.ensemble import RandomForestRegressor
# in this case the 'regressor' will try to predict a number
# CREATE A MODEL
model = RandomForestRegressor()


In [85]:
# TRAIN IT
########## model.fit(X_train, y_train)
# Learning the patterns between the X variable and the Price


In [86]:
# And then we want to score it on the test data
# RUN IT
########## model.score(X_test, y_test)

### Now running this in the current state will fail because some of the data re not numbers, they are strings and strings are Objects... they need to be Integers.

## Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Note that 'Doors' is numerical, then why does it need to be transformed?  because it can ALSO be considered categorical -i.e. all these cars have 4 doors (SEDAN), 3 doors (coupe), 5 doors (SUV)
categorical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                   one_hot,
                                   categorical_features)],
                                   remainder = "passthrough")

transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]], shape=(1000, 13))

### Now let's look at the data in a pd DataFrame

### Now let's look at the data in a pd DataFrame

In [87]:
pd.DataFrame(transformed_X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0


In [88]:
X.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [89]:
y

0      15323
1      19943
2      28343
3      13434
4      14043
       ...  
995    32042
996     5716
997    31570
998     4001
999    12732
Name: Price, Length: 1000, dtype: int64

# Let's see how the data was encoded

In [90]:
dummies = pd.get_dummies(X)
dummies

Unnamed: 0,Odometer (KM),Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,35431,4,False,True,False,False,False,False,False,False,True
1,192714,5,True,False,False,False,False,True,False,False,False
2,84714,4,False,True,False,False,False,False,False,False,True
3,154365,4,False,False,False,True,False,False,False,False,True
4,181577,3,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
995,35820,4,False,False,False,True,True,False,False,False,False
996,155144,3,False,False,True,False,False,False,False,False,True
997,66604,4,False,False,True,False,False,True,False,False,False
998,215883,4,False,True,False,False,False,False,False,False,True


In [91]:
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,False,True,False,False,False,False,False,False,True
1,5,True,False,False,False,False,True,False,False,False
2,4,False,True,False,False,False,False,False,False,True
3,4,False,False,False,True,False,False,False,False,True
4,3,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...
995,4,False,False,False,True,True,False,False,False,False
996,3,False,False,True,False,False,False,False,False,True
997,4,False,False,True,False,False,True,False,False,False
998,4,False,True,False,False,False,False,False,False,True


## Refit the model using hte transformed data

In [94]:
np.random.seed(42)
# Notice the use of: 'transformed_X'
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [97]:
model.score(X_test, y_test)

0.3235867221569877

In [99]:
import sklearn
print(sklearn.__version__)

1.7.1
