In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
ds = pd.read_csv('organics_stats.csv', index_col=0)

In [3]:
ds.head()

Unnamed: 0,Gender,Geographic Region,Loyalty Status,Neighborhood Cluster-7 Level,Affluence Grade,Age,Loyalty Card Tenure,Organics Purchase Count,Organics Purchase Indicator,Total Spend
0,0,0,2,3,5,70,8,1,1,0.02
1,1,0,3,5,10,65,7,1,1,0.01
2,0,0,3,0,11,68,8,0,0,0.01
3,0,0,3,0,11,74,8,0,0,0.01
4,1,1,3,3,13,62,5,0,0,0.01


## Classification Models: predict Organics Purchase Indicator

#### Scalling Data

In [4]:
X = ds.drop(columns=['Organics Purchase Count', 'Organics Purchase Indicator', 'Total Spend'])
y = ds['Organics Purchase Indicator']

In [5]:
scaler = RobustScaler()
X = scaler.fit_transform(X)

#### Splitting Data

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=29)

#### Logistic Regression

In [7]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)



In [8]:
# Accuracy:
lr.score(X_test,y_test) * 100

78.11346089291709

In [9]:
# Confusion matrix:
confusion_matrix(y_test, y_pred)

array([[1969,  146],
       [ 506,  358]], dtype=int64)

In [10]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tn, fp, fn, tp

(1969, 146, 506, 358)

In [11]:
# Precision:
tp/(tp+fp)*100

71.03174603174604

In [12]:
# Recall:
tp/(tp+fn)*100

41.43518518518518

In [13]:
# F1:
2*((tp/(tp+fp))*(tp/(tp+fn)))/((tp/(tp+fp))+(tp/(tp+fn)))*100

52.33918128654971

#### Decision Tree

In [14]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

In [15]:
# Accuracy:
dtc.score(X_test, y_test)*100

68.9828801611279

In [16]:
# Confusion matrix:
confusion_matrix(y_test, y_pred)

array([[1636,  479],
       [ 445,  419]], dtype=int64)

In [17]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tn, fp, fn, tp

(1636, 479, 445, 419)

In [18]:
# Precision:
tp/(tp+fp)*100

46.659242761692646

In [19]:
# Recall:
tp/(tp+fn)*100

48.495370370370374

In [20]:
# F1:
2*((tp/(tp+fp))*(tp/(tp+fn)))/((tp/(tp+fp))+(tp/(tp+fn)))*100

47.559591373439275

#### Support Vector Machine

In [21]:
svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)



In [22]:
# Accuracy:
svm.score(X_test,y_test)*100

78.21416582745888

In [23]:
# Confusion matrix:
confusion_matrix(y_test, y_pred)

array([[2023,   92],
       [ 557,  307]], dtype=int64)

In [24]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tn, fp, fn, tp

(2023, 92, 557, 307)

In [25]:
# Precision:
tp/(tp+fp)*100

76.94235588972431

In [26]:
# Recall:
tp/(tp+fn)*100

35.532407407407405

In [27]:
# F1:
2*((tp/(tp+fp))*(tp/(tp+fn)))/((tp/(tp+fp))+(tp/(tp+fn)))*100

48.61441013460015

#### K-Nearest Neighbour

In [28]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [29]:
# Accuracy:
knn.score(X_test,y_test)*100

75.73011077542799

In [30]:
# Confusion matrix:
confusion_matrix(y_test, y_pred)

array([[1891,  224],
       [ 499,  365]], dtype=int64)

In [31]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tn, fp, fn, tp

(1891, 224, 499, 365)

In [32]:
# Precision:
tp/(tp+fp)*100

61.96943972835314

In [33]:
# Recall:
tp/(tp+fn)*100

42.245370370370374

In [34]:
# F1:
2*((tp/(tp+fp))*(tp/(tp+fn)))/((tp/(tp+fp))+(tp/(tp+fn)))*100

50.24088093599449