In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE

In [2]:
ds = pd.read_csv('organics_stats.csv', index_col=0)

In [3]:
ds.head()

Unnamed: 0,Gender,Geographic Region,Loyalty Status,Neighborhood Cluster-7 Level,Affluence Grade,Age,Loyalty Card Tenure,Organics Purchase Count,Organics Purchase Indicator,Total Spend
0,0,0,2,3,5,70,8,1,1,0.02
1,1,0,3,5,10,65,7,1,1,0.01
2,0,0,3,0,11,68,8,0,0,0.01
3,0,0,3,0,11,74,8,0,0,0.01
4,1,1,3,3,13,62,5,0,0,0.01


## Classification Models: predict Organics Purchase Indicator

#### Scalling Data

In [4]:
X = ds.drop(columns=['Organics Purchase Count', 'Organics Purchase Indicator'])
y = ds['Organics Purchase Indicator']

In [5]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

#### Splitting Data

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=29)

#### Logistic Regression

In [7]:
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [8]:
# Accuracy:
lr.score(X_test,y_test) * 100

78.65771812080537

In [9]:
# Confusion matrix:
confusion_matrix(y_test, y_pred)

array([[1001,   52],
       [ 266,  171]], dtype=int64)

In [10]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tn, fp, fn, tp

(1001, 52, 266, 171)

In [11]:
# Precision:
tp/(tp+fp)*100

76.68161434977578

In [12]:
# Recall:
tp/(tp+fn)*100

39.130434782608695

In [13]:
# F1:
2*((tp/(tp+fp))*(tp/(tp+fn)))/((tp/(tp+fp))+(tp/(tp+fn)))*100

51.81818181818182

#### Decision Tree

In [14]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

In [15]:
# Accuracy:
dtc.score(X_test, y_test)*100

71.74496644295301

In [16]:
# Confusion matrix:
confusion_matrix(y_test, y_pred)

array([[842, 211],
       [210, 227]], dtype=int64)

In [17]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tn, fp, fn, tp

(842, 211, 210, 227)

In [18]:
# Precision:
tp/(tp+fp)*100

51.82648401826484

In [19]:
# Recall:
tp/(tp+fn)*100

51.94508009153318

In [20]:
# F1:
2*((tp/(tp+fp))*(tp/(tp+fn)))/((tp/(tp+fp))+(tp/(tp+fn)))*100

51.88571428571429

#### Support Vector Machine

In [21]:
svm = SVC(gamma='scale')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

In [22]:
# Accuracy:
svm.score(X_test,y_test)*100

78.79194630872483

In [23]:
# Confusion matrix:
confusion_matrix(y_test, y_pred)

array([[1015,   38],
       [ 278,  159]], dtype=int64)

In [24]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tn, fp, fn, tp

(1015, 38, 278, 159)

In [25]:
# Precision:
tp/(tp+fp)*100

80.71065989847716

In [26]:
# Recall:
tp/(tp+fn)*100

36.38443935926773

In [27]:
# F1:
2*((tp/(tp+fp))*(tp/(tp+fn)))/((tp/(tp+fp))+(tp/(tp+fn)))*100

50.1577287066246

#### K-Nearest Neighbour

In [52]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [53]:
# Accuracy:
knn.score(X_test,y_test)*100

75.83892617449665

In [54]:
# Confusion matrix:
confusion_matrix(y_test, y_pred)

array([[947, 106],
       [254, 183]], dtype=int64)

In [55]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tn, fp, fn, tp

(947, 106, 254, 183)

In [56]:
# Precision:
tp/(tp+fp)*100

63.32179930795848

In [57]:
# Recall:
tp/(tp+fn)*100

41.87643020594965

In [58]:
# F1:
2*((tp/(tp+fp))*(tp/(tp+fn)))/((tp/(tp+fp))+(tp/(tp+fn)))*100

50.413223140495866

### Feature reduction

In [35]:
estimator = lr
selector = RFE(estimator)
selector = selector.fit(X, y)
selector.ranking_

array([1, 5, 1, 4, 1, 1, 3, 2])

In [36]:
"""
1 Gender / Loyalty Status / Affluence Grade / Age
2 Total Spend
3 Loyalty Card Tenure
4 Neighborhood Cluster-7 Level 
5 Geographic Region
"""

'\n1 Gender / Loyalty Status / Affluence Grade / Age\n2 Total Spend\n3 Loyalty Card Tenure\n4 Neighborhood Cluster-7 Level \n5 Geographic Region\n'

In [37]:
estimator = dtc
selector = RFE(estimator)
selector = selector.fit(X, y)
selector.ranking_

array([4, 3, 5, 2, 1, 1, 1, 1])

In [38]:
"""
1 Affluence Grade / Age / Loyalty Card Tenure / Total Spend
2 Neighborhood Cluster-7 Level
3 Geographic Region
4 Gender
5 Loyalty Status
"""

'\n1 Affluence Grade / Age / Loyalty Card Tenure / Total Spend\n2 Neighborhood Cluster-7 Level\n3 Geographic Region\n4 Gender\n5 Loyalty Status\n'

In [39]:
"""
estimator = svm
selector = RFE(estimator)
selector = selector.fit(X, y)
selector.ranking_
"""

'\nestimator = svm\nselector = RFE(estimator)\nselector = selector.fit(X, y)\nselector.ranking_\n'

In [40]:
"""
estimator = knn
selector = RFE(estimator)
selector = selector.fit(X, y)
selector.ranking_
"""

'\nestimator = knn\nselector = RFE(estimator)\nselector = selector.fit(X, y)\nselector.ranking_\n'

#### Removing 'Geographic Region' in Logistic Regression

In [43]:
X = ds.drop(columns=['Organics Purchase Count', 'Organics Purchase Indicator', 'Geographic Region'])
y = ds['Organics Purchase Indicator']

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=29)

lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print('Accuracy:', lr.score(X_test, y_test)*100),
print('Precision:', tp/(tp+fp)*100),
print('Recall:', tp/(tp+fn)*100)
print('F1:', 2*((tp/(tp+fp))*(tp/(tp+fn)))/((tp/(tp+fp))+(tp/(tp+fn)))*100)

Accuracy: 78.59060402684564
Precision: 76.57657657657657
Recall: 38.901601830663616
F1: 51.593323216995444


#### Removing 'Loyalty Status' in Decision Tree

In [44]:
X = ds.drop(columns=['Organics Purchase Count', 'Organics Purchase Indicator', 'Loyalty Status'])
y = ds['Organics Purchase Indicator']

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=29)

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print('Accuracy:', dtc.score(X_test, y_test)*100),
print('Precision:', tp/(tp+fp)*100),
print('Recall:', tp/(tp+fn)*100)
print('F1:', 2*((tp/(tp+fp))*(tp/(tp+fn)))/((tp/(tp+fp))+(tp/(tp+fn)))*100)

Accuracy: 71.20805369127517
Precision: 50.92592592592593
Recall: 50.34324942791763
F1: 50.63291139240506
