In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
ds = pd.read_csv('organics_stats.csv', index_col=0)

In [3]:
ds.head()

Unnamed: 0,Gender,Geographic Region,Loyalty Status,Neighborhood Cluster-7 Level,Affluence Grade,Age,Loyalty Card Tenure,Organics Purchase Count,Organics Purchase Indicator,Total Spend
0,0,0,2,3,5,70,8,1,1,0.02
1,1,0,3,5,10,65,7,1,1,0.01
2,0,0,3,0,11,68,8,0,0,0.01
3,0,0,3,0,11,74,8,0,0,0.01
4,1,1,3,3,13,62,5,0,0,0.01


## Classification Models: predict Organics Purchase Indicator

#### Scalling Data

In [4]:
X = ds.drop(columns=['Organics Purchase Count', 'Organics Purchase Indicator', 'Total Spend'])
y = ds['Organics Purchase Indicator']

In [5]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

#### Splitting Data

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=29)

#### Support Vector Machine

In [7]:
degrees = [1,5,10,20] # the default was 3

for d in degrees:
    svm = SVC(degree=d)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print('degree =', d)
    print('Accuracy:', svm.score(X_test, y_test)*100),
    print('Precision:', tp/(tp+fp)*100),
    print('Recall:', tp/(tp+fn)*100)
    print('F1:', 2*((tp/(tp+fp))*(tp/(tp+fn)))/((tp/(tp+fp))+(tp/(tp+fn)))*100)



degree = 1
Accuracy: 78.7248322147651
Precision: 80.0
Recall: 36.61327231121281
F1: 50.23547880690737




degree = 5
Accuracy: 78.7248322147651
Precision: 80.0
Recall: 36.61327231121281
F1: 50.23547880690737




degree = 10
Accuracy: 78.7248322147651
Precision: 80.0
Recall: 36.61327231121281
F1: 50.23547880690737




degree = 20
Accuracy: 78.7248322147651
Precision: 80.0
Recall: 36.61327231121281
F1: 50.23547880690737


#### K-Nearest Neighbour

In [8]:
neighbors = [1,2,3,4,5,6,7,8,9,10,15,20] # the default was 5

for n in neighbors:
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print('n_neighbors =',n)
    print('Accuracy:', svm.score(X_test, y_test)*100),
    print('Precision:', tp/(tp+fp)*100),
    print('Recall:', tp/(tp+fn)*100)
    print('F1:', 2*((tp/(tp+fp))*(tp/(tp+fn)))/((tp/(tp+fp))+(tp/(tp+fn)))*100)
    print('\n')

n_neighbors = 1
Accuracy: 78.7248322147651
Precision: 49.187935034802784
Recall: 48.51258581235698
F1: 48.8479262672811


n_neighbors = 2
Accuracy: 78.7248322147651
Precision: 63.23529411764706
Recall: 29.51945080091533
F1: 40.24960998439937


n_neighbors = 3
Accuracy: 78.7248322147651
Precision: 58.92857142857143
Recall: 45.30892448512586
F1: 51.228978007761974


n_neighbors = 4
Accuracy: 78.7248322147651
Precision: 68.14159292035397
Recall: 35.24027459954233
F1: 46.45550527903469


n_neighbors = 5
Accuracy: 78.7248322147651
Precision: 63.793103448275865
Recall: 42.33409610983982
F1: 50.894085281980736


n_neighbors = 6
Accuracy: 78.7248322147651
Precision: 69.68325791855203
Recall: 35.24027459954233
F1: 46.80851063829787


n_neighbors = 7
Accuracy: 78.7248322147651
Precision: 65.48042704626334
Recall: 42.10526315789473
F1: 51.25348189415041


n_neighbors = 8
Accuracy: 78.7248322147651
Precision: 68.14159292035397
Recall: 35.24027459954233
F1: 46.45550527903469


n_neighbors = 9
Accur