# Machine Learning Model

In [273]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import os
import numpy as np

### 1. Clean & Prepare Dataset

In [274]:
# Read csv into dataframe
df = pd.read_csv(os.path.join("Resources", "customer_data.csv"))
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [275]:
# Check for null values
for x in df:
    print(f' Column {x}: Unique Value(s): {pd.isna(df[x]).unique()}') 

 Column id: Unique Value(s): [False]
 Column Gender: Unique Value(s): [False]
 Column Age: Unique Value(s): [False]
 Column Driving_License: Unique Value(s): [False]
 Column Region_Code: Unique Value(s): [False]
 Column Previously_Insured: Unique Value(s): [False]
 Column Vehicle_Age: Unique Value(s): [False]
 Column Vehicle_Damage: Unique Value(s): [False]
 Column Annual_Premium: Unique Value(s): [False]
 Column Policy_Sales_Channel: Unique Value(s): [False]
 Column Vintage: Unique Value(s): [False]
 Column Response: Unique Value(s): [False]


In [276]:
# Drop id column because it is not a customer's attribute
df = df.drop(columns=['id'])
df.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [277]:
# Check data types
df.dtypes

Gender                   object
Age                       int64
Driving_License           int64
Region_Code             float64
Previously_Insured        int64
Vehicle_Age              object
Vehicle_Damage           object
Annual_Premium          float64
Policy_Sales_Channel    float64
Vintage                   int64
Response                  int64
dtype: object

In [278]:
# Convert objects in Gender column into numeric values
df['Gender'] = df['Gender'].replace('Male', 1)
df['Gender'] = df['Gender'].replace('Female', 0)

In [279]:
# Convert objects in Vehicle Age column into numeric values
df['Vehicle_Age'] = df['Vehicle_Age'].replace('< 1 Year', 1)
df['Vehicle_Age'] = df['Vehicle_Age'].replace('1-2 Year', 2)
df['Vehicle_Age'] = df['Vehicle_Age'].replace('> 2 Years', 3)

In [280]:
# Convert objects in Vehicle Damage column into numeric values
df['Vehicle_Damage'] = df['Vehicle_Damage'].replace('Yes', 1)
df['Vehicle_Damage'] = df['Vehicle_Damage'].replace('No', 0)

In [281]:
# Check data types again to ensure all objects are converted
df.dtypes

Gender                    int64
Age                       int64
Driving_License           int64
Region_Code             float64
Previously_Insured        int64
Vehicle_Age               int64
Vehicle_Damage            int64
Annual_Premium          float64
Policy_Sales_Channel    float64
Vintage                   int64
Response                  int64
dtype: object

In [282]:
# View df
df.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,44,1,28.0,0,3,1,40454.0,26.0,217,1
1,1,76,1,3.0,0,2,0,33536.0,26.0,183,0
2,1,47,1,28.0,0,3,1,38294.0,26.0,27,1
3,1,21,1,11.0,1,1,0,28619.0,152.0,203,0
4,0,29,1,41.0,1,1,0,27496.0,152.0,39,0


### 2. Run K Nearest Neighborbors Algorithm

In [283]:
y = df['Response']

In [284]:
X = df.drop('Response', axis=1)
X.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,1,44,1,28.0,0,3,1,40454.0,26.0,217
1,1,76,1,3.0,0,2,0,33536.0,26.0,183
2,1,47,1,28.0,0,3,1,38294.0,26.0,27
3,1,21,1,11.0,1,1,0,28619.0,152.0,203
4,0,29,1,41.0,1,1,0,27496.0,152.0,39


In [285]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Loop through different k values to see which has the highest accuracy - UNSCALED DATA
# Note: We only use odd numbers because we don't want any ties
train_scores = []
test_scores = []
for k in range(1, 50, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_score = knn.score(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

k: 1, Train/Test Score: 1.000/0.798


In [None]:
plt.plot(range(1, 50, 2), train_scores, marker='o')
plt.plot(range(1, 50, 2), test_scores, marker="x")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.show()
plt.savefig("model_shape.png")

In [None]:
# k = 17
knn = KNeighborsClassifier(n_neighbors=17)
knn.fit(X_train, y_train)
print('k=17 Test Acc: %.3f' % knn.score(X_test, y_test))

Cross check accurancy using the square root method

In [None]:
# # K using square root method i.e. 381110^(1/2)
# knn = KNeighborsClassifier(n_neighbors=617)
# knn.fit(X_train_scaled, y_train)
# print('k=617 Test Acc: %.3f' % knn.score(X_test_scaled, y_test))

Conclusion: Accuracy level is almost the same whether using 17 or 617 as k. Therefore, 17 is selected so that the model can be run more efficiently, while maintaining an acceptable level of accuracy.

In [None]:
# Use KNN model to predict responses
predictions = knn.predict(X_test)
print(f'First 10 predictions: {predictions[:10]}')
print(f'First 10 actual responses: {y_test[:10].tolist()}')

In [None]:
# Create dataframe to compare predictions & actual responses
predict_df = pd.DataFrame({'Predictions': predictions, 'Actual': y_test}).reset_index(drop=True)
predict_df.count()

In [None]:
# Check if we have predicted any 'positive' response
check = predict_df[predict_df['Predictions'] == 1]
check.count()

In [None]:
# Check how many times we have correctly predicted the responses
correct_predict = predict_df[predict_df['Predictions'] == predict_df['Actual']]
correct_predict.count()

In [None]:
print(f'Percentage of correct predictions: {correct_predict.count()/predict_df.count()}')

### 3. Save the Trained Model

In [None]:
import joblib
  
# Save the model as a pickle in a file 
joblib.dump(knn, 'recommender_model.pkl') 

In [None]:
# Load the model from the file 
knn_from_joblib = joblib.load('recommender_model.pkl')  
  
# Use the loaded model to make predictions 
knn_from_joblib.predict(X_test) 

In [None]:
!pip install plotly

In [286]:
import plotly.graph_objects as go
import numpy as np
from sklearn.datasets import make_moons

mesh_size = .02
margin = 0.25

In [287]:
# Load and split data
# X2, y2 = make_moons(noise=0.3, random_state=0)
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y.astype(str), test_size=0.25, random_state=0)

In [288]:
type(X)

pandas.core.frame.DataFrame

In [289]:
# X[:, 0].min()

In [290]:
margin

0.25

In [291]:
X.iloc[:, 0].max()

1

In [292]:
# Create a mesh grid on which we will run our model
x_min, x_max = X.iloc[:, 0].min() - margin, X.iloc[:, 0].max() + margin
y_min, y_max = X.iloc[:, 1].min() - margin, X.iloc[:, 1].max() + margin
xrange = np.arange(x_min, x_max, mesh_size)
yrange = np.arange(y_min, y_max, mesh_size)
xx, yy = np.meshgrid(xrange, yrange)

In [293]:
# Create classifier, run predictions on grid
clf = KNeighborsClassifier(15, weights='uniform')
clf.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')

In [294]:
# clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
# print(np.c_[xx.ravel()])
# print(len(xx.ravel()))
print(type([xx.ravel(), yy.ravel()]))
print(type(np.c_[xx.ravel(), yy.ravel()]))
print(type((np.c_[xx.ravel(), yy.ravel()])[:, 1]))
print(type(np.c_[xx.ravel(), yy.ravel()]))
print(
#     type(
        ((np.c_[xx.ravel(), yy.ravel()])
        [:, 1])
)
# )
# print(
#     type(
#         ((np.c_[xx2.ravel(), yy2.ravel()])
#         [:, 1])
# ))

<class 'list'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[19.75 19.75 19.75 ... 85.23 85.23 85.23]


In [295]:
((np.c_[xx.ravel(), yy.ravel()])
        [:, 1]).shape
# X.shape
# X[0][0] #2-d because of rows and cols
type(X)
X.iloc[[0]]

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,1,44,1,28.0,0,3,1,40454.0,26.0,217


In [314]:
Z = clf.predict_proba(X)

In [297]:
# Z = Z.reshape(X.iloc[[0]].shape)
# Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
# Z = Z.reshape(xx.shape)

In [298]:
# Plot the figure
fig = go.Figure(data=[
    go.Contour(
        x=xrange,
        y=yrange,
        z=Z,
        colorscale='RdBu'
    )
])
fig.show()

In [299]:
# import numpy as np
# from sklearn.datasets import make_moons
# from sklearn.model_selection import train_test_split
# from sklearn.neighbors import KNeighborsClassifier

mesh_size = .02
margin = 0.25

# Load and split data
X2, y2 = make_moons(noise=0.3, random_state=0)
X_train2, X_test2, y_train2, y_tes2t = train_test_split(
    X2, y2.astype(str), test_size=0.25, random_state=0)

# Create a mesh grid on which we will run our model
x_min2, x_max2 = X2[:, 0].min() - margin, X2[:, 0].max() + margin
y_min2, y_max2 = X2[:, 1].min() - margin, X2[:, 1].max() + margin
xrange2 = np.arange(x_min2, x_max2, mesh_size)
yrange2 = np.arange(y_min2, y_max2, mesh_size)
xx2, yy2 = np.meshgrid(xrange2, yrange2)

In [300]:
# Create classifier, run predictions on grid
clf2 = KNeighborsClassifier(15, weights='uniform')
clf2.fit(X2, y2)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')

In [301]:
Z2 = clf2.predict_proba(np.c_[xx2.ravel(), yy2.ravel()])[:, 1]
print(type([xx2.ravel(), yy2.ravel()]))
print(type(np.c_[xx2.ravel(), yy2.ravel()]))
print(type((np.c_[xx2.ravel(), yy2.ravel()])[:, 1]))
# print(type(np.c_[xx2.ravel(), yy2.ravel()]))
# print(Z2)
# print(type(Z2))
# Z = Z.reshape(xx.shape)

<class 'list'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [302]:
# Plot the figure
fig = go.Figure(data=[
    go.Contour(
        x=xrange2,
        y=yrange2,
        z=Z2,
        colorscale='RdBu'
    )
])
fig.show()
plt.savefig("model_shape.png")


<Figure size 432x288 with 0 Axes>

In [306]:
print(type(xrange))
print(type(yrange))
print(type(Z))

In [308]:
print(xrange.shape)
print(yrange.shape)
print(Z.shape)

(75,)
(3275,)
(1, 2)


In [307]:
print(type(xrange2))
print(type(yrange2))
print(type(Z2))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [309]:
print(xrange2.shape)
print(yrange2.shape)
print(Z2.shape)

(200,)
(156,)
(31200,)
