# Classification demo

### predict whether customer will purchase a product

In [1]:
# pre-requisites
import warnings
warnings.filterwarnings(action="ignore")

In [2]:
# import required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# load the data
df = pd.read_csv('./Social_Network_Ads.csv')
df.head(5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


### EDA

In [4]:
# get the general info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [5]:
# get the statistical info about df
df.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [6]:
# remove the User ID as it will not add any value to the model
df.drop('User ID', axis=1,inplace=True)


### data cleansing

In [7]:
# convert the gender to numeric values

from sklearn.preprocessing import LabelEncoder
df['Gender']=LabelEncoder().fit_transform(df['Gender'])

In [8]:
# find the correlation
df.corr()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
Gender,1.0,-0.073741,-0.060435,-0.042469
Age,-0.073741,1.0,0.155238,0.622454
EstimatedSalary,-0.060435,0.155238,1.0,0.362083
Purchased,-0.042469,0.622454,0.362083,1.0


### ### split the data


In [9]:
# create x
x=df.drop(['Gender','Purchased'],axis=1)

# create y
y=df['Purchased']

In [10]:
from sklearn.model_selection import train_test_split

#split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=123456)

### ### model building


In [11]:
from sklearn.svm import SVC

# create the model
model_svc=SVC(C=2,kernel='rbf')

# train the model
model_svc.fit(x_train,y_train)

0,1,2
,C,2
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


### find the best values for the hyper-parameters

In [None]:
from sklearn.model_selection import GridSearchCV

#  create the list of hyper-parameters along with the possible values
parameters = {
   'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
   'C': [1, 2]
}

# create the grid search
grid_search_svm = GridSearchCV(model_svc, parameters)

# fit the values
grid_search_svm.fit(x_train,y_train)

In [None]:
# find the best combination of hyper-parameters
# grid_search_svm.best_params_

### Evaluate the SVM model

In [14]:
y_pred = model_svc.predict(x_test)

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"accuracy = {accuracy:.2f}, precision = {precision:.2f}, recall = {recall:.2f}, f1 = {f1:.2f}")

accuracy = 0.85, precision = 0.93, recall = 0.54, f1 = 0.68


In [16]:
from sklearn.neighbors import KNeighborsClassifier

# create the model
model_knn = KNeighborsClassifier(n_neighbors=7)

# train the model
model_knn.fit(x_train,y_train)

0,1,2
,n_neighbors,7
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


### find the best values for hyperparameters

In [20]:
from sklearn.model_selection import GridSearchCV

#create the list of hyper-parameters along with the possible values
parameters = {
   'n_neighbors': range(3, 11),
   'weights': ['uniform', 'distance']
}

#create the grid search
grid_search_knn = GridSearchCV(model_knn, parameters)

#fit the values
grid_search_knn.fit(x_train,y_train)

0,1,2
,estimator,KNeighborsCla...n_neighbors=7)
,param_grid,"{'n_neighbors': range(3, 11), 'weights': ['uniform', 'distance']}"
,scoring,
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_neighbors,5
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [21]:
# find the best combination of hyper-parameters
grid_search_knn.best_params_

{'n_neighbors': 5, 'weights': 'distance'}

### KNN model evaluation

In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = model_knn.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"accuracy = {accuracy:.2f}, precision = {precision:.2f}, recall = {recall:.2f}, f1 = {f1:.2f}")

accuracy = 0.85, precision = 0.75, recall = 0.75, f1 = 0.75


In [None]:
# visualize the output