In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

In [4]:
# Load the Excel file used to fit and evaluate the classifier in the cells below.
# The path is relative to the notebook's working directory.
dataset=pd.read_excel("a1_Dataset_10Percent.xlsx")

In [5]:
dataset.shape

(22223, 13)

In [6]:
dataset.head()

Unnamed: 0,ID,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,TargetBuy,SpendDollar,Buy
0,17147654,5.0,,,,,,Tin,0.00012,5.0,0,1e-06,0
1,8415498,15.0,,,M,,,Gold,8000.0,5.0,1,96.0,1
2,12107603,,,,M,Midlands,East,Tin,0.01,,1,0.00012,1
3,14400995,8.0,28.0,,F,,,Tin,0.01,,1,0.00012,1
4,28724674,14.0,67.0,,,,,Tin,0.01,7.0,0,0.00012,0


In [7]:
# Remove the columns that are not used as model inputs or as the target.
# NOTE(review): in the preview rows TargetBuy equals Buy, so it presumably has to go
# to avoid leaking the target — confirm against the data dictionary.
dataset = dataset.drop(columns=['ID', 'LoyalSpend', 'TargetBuy'])

dataset.head()

Unnamed: 0,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalTime,SpendDollar,Buy
0,5.0,,,,,,Tin,5.0,1e-06,0
1,15.0,,,M,,,Gold,5.0,96.0,1
2,,,,M,Midlands,East,Tin,,0.00012,1
3,8.0,28.0,,F,,,Tin,,0.00012,1
4,14.0,67.0,,,,,Tin,7.0,0.00012,0


In [8]:
dataset.isna().sum()

DemAffl            1085
DemAge             1508
DemClusterGroup     674
DemGender          2512
DemReg              465
DemTVReg            465
LoyalClass            0
LoyalTime           281
SpendDollar           0
Buy                   0
dtype: int64

In [9]:
# Impute missing values: every column listed below receives its most frequent
# value (mode); LoyalTime alone is filled with its mean.
for column in ['DemAffl', 'DemAge', 'DemClusterGroup', 'DemGender', 'DemReg', 'DemTVReg']:
    most_frequent = dataset[column].mode()[0]
    dataset[column] = dataset[column].fillna(most_frequent)

dataset['LoyalTime'] = dataset['LoyalTime'].fillna(dataset['LoyalTime'].mean())

In [10]:
dataset.isna().sum()

DemAffl            0
DemAge             0
DemClusterGroup    0
DemGender          0
DemReg             0
DemTVReg           0
LoyalClass         0
LoyalTime          0
SpendDollar        0
Buy                0
dtype: int64

In [11]:
dataset.head()

Unnamed: 0,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalTime,SpendDollar,Buy
0,5.0,51.0,C,F,South East,London,Tin,5.0,1e-06,0
1,15.0,51.0,C,M,South East,London,Gold,5.0,96.0,1
2,8.0,51.0,C,M,Midlands,East,Tin,6.56467,0.00012,1
3,8.0,28.0,C,F,South East,London,Tin,6.56467,0.00012,1
4,14.0,67.0,C,F,South East,London,Tin,7.0,0.00012,0


In [12]:
# converting to numeric

from sklearn.preprocessing import LabelEncoder

# Each categorical column gets its own fitted encoder, kept in `label_encoders`.
# Re-fitting one shared encoder for every column (as before) throws away the fitted
# classes of all but the last column, so codes could not be inverted or re-applied.
label_encoders = {}

for column in ['DemClusterGroup', 'DemGender', 'DemReg', 'DemTVReg', 'LoyalClass']:
    number = LabelEncoder()
    # Values are cast to str before fitting, exactly as in the per-column version.
    dataset[column] = number.fit_transform(dataset[column].astype('str'))
    label_encoders[column] = number

    # Show which integer code each label received (codes follow classes_ order).
    integer_mapping = {l: i for i, l in enumerate(number.classes_)}
    print(integer_mapping)

{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'U': 6}
{'F': 0, 'M': 1, 'U': 2}
{'Midlands': 0, 'North': 1, 'Scottish': 2, 'South East': 3, 'South West': 4}
{'Border': 0, 'C Scotland': 1, 'East': 2, 'London': 3, 'Midlands': 4, 'N East': 5, 'N Scot': 6, 'N West': 7, 'S & S East': 8, 'S West': 9, 'Ulster': 10, 'Wales & West': 11, 'Yorkshire': 12}
{'Gold': 0, 'Platinum': 1, 'Silver': 2, 'Tin': 3}


In [13]:
dataset.head()

Unnamed: 0,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalTime,SpendDollar,Buy
0,5.0,51.0,2,0,3,3,3,5.0,1e-06,0
1,15.0,51.0,2,1,3,3,0,5.0,96.0,1
2,8.0,51.0,2,1,0,2,3,6.56467,0.00012,1
3,8.0,28.0,2,0,3,3,3,6.56467,0.00012,1
4,14.0,67.0,2,0,3,3,3,7.0,0.00012,0


In [14]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(z):
    """Tabulate the variance inflation factor of every column in ``z``.

    Parameters
    ----------
    z : pandas.DataFrame
        Frame whose columns are the explanatory variables. Its ``.values``
        array is handed to statsmodels' ``variance_inflation_factor`` once
        per column position.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``variables`` (the column labels of ``z``) and ``VIF``
        (the factor computed for the column at the same position).
    """
    design_matrix = z.values
    scores = [
        variance_inflation_factor(design_matrix, position)
        for position in range(z.shape[1])
    ]
    return pd.DataFrame({"variables": z.columns, "VIF": scores})

In [15]:
# Take the first nine columns (everything except the last one, Buy) and show
# their variance inflation factors as a multicollinearity check.
# NOTE(review): no intercept column is appended to z before the VIFs are computed —
# confirm that VIFs on the raw design matrix are the intended diagnostic.
z = dataset.iloc[:,0:9]
calc_vif(z)

Unnamed: 0,variables,VIF
0,DemAffl,6.27863
1,DemAge,10.734656
2,DemClusterGroup,3.659632
3,DemGender,1.435472
4,DemReg,2.474645
5,DemTVReg,3.752279
6,LoyalClass,3.851766
7,LoyalTime,3.153032
8,SpendDollar,1.863196


In [16]:
# Pick the target and the predictors by name instead of by position, so the
# selection stays correct if a column is ever added, dropped or reordered upstream.
# The list reproduces the order of the first nine columns of `dataset`.
feature_columns = [
    'DemAffl', 'DemAge', 'DemClusterGroup', 'DemGender', 'DemReg',
    'DemTVReg', 'LoyalClass', 'LoyalTime', 'SpendDollar',
]
y = dataset['Buy'].values
X = dataset[feature_columns].values

In [17]:
# Hold out 20% of the rows for evaluation; random_state=0 fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [18]:
# Fit a logistic regression on the training split (max_iter raised to 500), then
# predict class labels for the held-out rows.
classifier = LogisticRegression(max_iter=500).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [19]:
# Persist the fitted logistic-regression classifier to disk with joblib; the
# predictor section further down reloads it from the same relative path.
import joblib
joblib.dump(classifier, './c2_Classifier_LoyalCustomers')

['./c2_Classifier_LoyalCustomers']

In [20]:
# Confusion matrix on the hold-out split: rows are the actual classes,
# columns the predicted classes.
print(confusion_matrix(y_test,y_pred))

[[3186  181]
 [ 691  387]]


In [21]:
# Fraction of hold-out rows whose predicted label equals the actual label.
print(accuracy_score(y_test, y_pred))

0.8038245219347582


In [22]:
# Class-membership probabilities for the hold-out rows: one row per sample and
# one column per class, in the order of classifier.classes_.
predictions = classifier.predict_proba(X_test)
predictions

array([[0.86590971, 0.13409029],
       [0.65505099, 0.34494901],
       [0.4927416 , 0.5072584 ],
       ...,
       [0.89253954, 0.10746046],
       [0.91192437, 0.08807563],
       [0.8681919 , 0.1318081 ]])

In [23]:
# writing model output file

# Three pieces, each wrapped in its own frame: the hold-out feature matrix
# (columns stay numbered 0..8), the actual label, and the two class probabilities.
df_x_test = pd.DataFrame(X_test)
df_test_dataset = pd.DataFrame(y_test, columns=['Actual Outcome'])
df_prediction_prob = pd.DataFrame(predictions, columns=['prob_0', 'prob_1'])

# Place the pieces side by side and save the result as an Excel workbook.
output_parts = [df_x_test, df_test_dataset, df_prediction_prob]
dfx = pd.concat(output_parts, axis=1)
dfx.to_excel("c1_ModelOutput_10Percent.xlsx")

dfx.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,Actual Outcome,prob_0,prob_1
0,10.0,58.0,3.0,1.0,3.0,8.0,0.0,4.0,151.59192,0,0.86591,0.13409
1,10.0,51.0,2.0,0.0,3.0,8.0,0.0,6.0,72.0,0,0.655051,0.344949
2,16.0,65.0,1.0,0.0,2.0,6.0,0.0,7.0,72.6402,0,0.492742,0.507258
3,5.0,60.0,3.0,0.0,3.0,3.0,2.0,1.0,60.0,0,0.909709,0.090291
4,9.0,52.0,3.0,0.0,0.0,4.0,2.0,6.0,42.0,0,0.710597,0.289403


Now we will start the predictor code

In [24]:
# Load the file to be scored. This rebinds `dataset`, so the training frame
# loaded earlier is no longer reachable under that name from here on.
dataset=pd.read_excel("a2_Dataset_90Percent.xlsx")

In [25]:
dataset.head()

Unnamed: 0,ID,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,SpendDollar
0,140,10.0,76.0,C,U,Midlands,Wales & West,Gold,16000.0,4.0,192.0
1,620,4.0,49.0,D,U,Midlands,Wales & West,Gold,6000.0,5.0,72.0
2,868,5.0,70.0,D,F,Midlands,Wales & West,Silver,0.02,8.0,0.00024
3,1120,10.0,65.0,F,M,Midlands,Midlands,Tin,0.01,7.0,0.00012
4,2313,11.0,68.0,A,F,Midlands,Midlands,Tin,0.01,8.0,0.00012


In [26]:
# LoyalSpend was not a training feature, so remove it; ID is kept for the output file.
dataset = dataset.drop(columns=['LoyalSpend'])

In [27]:
dataset.head()

Unnamed: 0,ID,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalTime,SpendDollar
0,140,10.0,76.0,C,U,Midlands,Wales & West,Gold,4.0,192.0
1,620,4.0,49.0,D,U,Midlands,Wales & West,Gold,5.0,72.0
2,868,5.0,70.0,D,F,Midlands,Wales & West,Silver,8.0,0.00024
3,1120,10.0,65.0,F,M,Midlands,Midlands,Tin,7.0,0.00012
4,2313,11.0,68.0,A,F,Midlands,Midlands,Tin,8.0,0.00012


In [28]:
dataset.isna().sum()

ID                   0
DemAffl             48
DemAge              67
DemClusterGroup     28
DemGender          114
DemReg              18
DemTVReg            18
LoyalClass           0
LoyalTime           15
SpendDollar          0
dtype: int64

In [29]:
# Impute missing values: mode for every column listed below, mean for LoyalTime.
# NOTE(review): the fill values are recomputed from this scoring file, whereas the
# model was fitted on data imputed with the training file's own statistics —
# confirm that this difference is intended.
for column in ['DemAffl', 'DemAge', 'DemClusterGroup', 'DemGender', 'DemReg', 'DemTVReg']:
    most_frequent = dataset[column].mode()[0]
    dataset[column] = dataset[column].fillna(most_frequent)

dataset['LoyalTime'] = dataset['LoyalTime'].fillna(dataset['LoyalTime'].mean())

In [30]:
# explore missing values post missing value fix
dataset.isna().sum()

ID                 0
DemAffl            0
DemAge             0
DemClusterGroup    0
DemGender          0
DemReg             0
DemTVReg           0
LoyalClass         0
LoyalTime          0
SpendDollar        0
dtype: int64

In [31]:
# converting to numeric
#
# The saved classifier was fitted on integer codes derived from the training file.
# Re-fitting encoders on this file would derive the codes from whichever labels occur
# here, so a missing or extra category would silently shift the codes away from the
# ones the model learned. The label order below mirrors the mappings printed by the
# training section of this notebook; update it if the model is retrained on data
# with a different set of categories.
TRAINING_CATEGORY_LABELS = {
    'DemClusterGroup': ['A', 'B', 'C', 'D', 'E', 'F', 'U'],
    'DemGender': ['F', 'M', 'U'],
    'DemReg': ['Midlands', 'North', 'Scottish', 'South East', 'South West'],
    'DemTVReg': ['Border', 'C Scotland', 'East', 'London', 'Midlands', 'N East',
                 'N Scot', 'N West', 'S & S East', 'S West', 'Ulster',
                 'Wales & West', 'Yorkshire'],
    'LoyalClass': ['Gold', 'Platinum', 'Silver', 'Tin'],
}

for column, labels in TRAINING_CATEGORY_LABELS.items():
    # Code = position of the label in the training order.
    integer_mapping = {label: code for code, label in enumerate(labels)}

    values = dataset[column].astype('str')
    # Fail loudly on labels the model never saw (this also catches an accidental
    # second run of the cell, when the column already holds integer codes).
    unseen = sorted(set(values) - set(integer_mapping))
    if unseen:
        raise ValueError(
            f"Column {column!r} contains labels not seen during training: {unseen}"
        )

    dataset[column] = values.map(integer_mapping)
    print(integer_mapping)

{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'U': 6}
{'F': 0, 'M': 1, 'U': 2}
{'Midlands': 0, 'North': 1, 'Scottish': 2, 'South East': 3, 'South West': 4}
{'Border': 0, 'C Scotland': 1, 'East': 2, 'London': 3, 'Midlands': 4, 'N East': 5, 'N Scot': 6, 'N West': 7, 'S & S East': 8, 'S West': 9, 'Ulster': 10, 'Wales & West': 11, 'Yorkshire': 12}
{'Gold': 0, 'Platinum': 1, 'Silver': 2, 'Tin': 3}


In [32]:
dataset.head()

Unnamed: 0,ID,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalTime,SpendDollar
0,140,10.0,76.0,2,2,0,11,0,4.0,192.0
1,620,4.0,49.0,3,2,0,11,0,5.0,72.0
2,868,5.0,70.0,3,0,0,11,2,8.0,0.00024
3,1120,10.0,65.0,5,1,0,4,3,7.0,0.00012
4,2313,11.0,68.0,0,0,0,4,3,8.0,0.00012


Predictions

In [33]:
# Build the scoring matrix by column name, in the same order as the nine columns
# the classifier was fitted on, so a reordered or extended input file cannot
# silently feed the wrong columns to the model. ID is excluded.
feature_columns = [
    'DemAffl', 'DemAge', 'DemClusterGroup', 'DemGender', 'DemReg',
    'DemTVReg', 'LoyalClass', 'LoyalTime', 'SpendDollar',
]
X_fresh = dataset[feature_columns].values

In [34]:
# Reload the classifier that the training section saved with joblib.dump.
# joblib.load unpickles the file, so only load model files from a trusted source.
import joblib
classifier = joblib.load('c2_Classifier_LoyalCustomers')

In [35]:
# Predicted class label for every row of the scoring matrix.
# This rebinds y_pred, which previously held the hold-out predictions.
y_pred = classifier.predict(X_fresh)
print(y_pred)

[0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0
 1 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0
 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 1 0 0 0 0 0 

In [36]:
# Class-membership probabilities for the scoring rows: one column per class,
# in the order of classifier.classes_.
predictions = classifier.predict_proba(X_fresh)
predictions

array([[0.97661269, 0.02338731],
       [0.97406313, 0.02593687],
       [0.94489562, 0.05510438],
       ...,
       [0.96802869, 0.03197131],
       [0.76497381, 0.23502619],
       [0.50823712, 0.49176288]])

In [37]:
# writing model output file
# Attach the two class-probability columns to the scored rows (aligned on the row
# index) and save the combined table as an Excel workbook.
df_prediction_prob = pd.DataFrame(predictions, columns=['prob_0', 'prob_1'])
scored_parts = [dataset, df_prediction_prob]
dfx = pd.concat(scored_parts, axis=1)
dfx.to_excel("d2_BuyProb_90Percent.xlsx")
dfx.head()

Unnamed: 0,ID,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalTime,SpendDollar,prob_0,prob_1
0,140,10.0,76.0,2,2,0,11,0,4.0,192.0,0.976613,0.023387
1,620,4.0,49.0,3,2,0,11,0,5.0,72.0,0.974063,0.025937
2,868,5.0,70.0,3,0,0,11,2,8.0,0.00024,0.944896,0.055104
3,1120,10.0,65.0,5,1,0,4,3,7.0,0.00012,0.892251,0.107749
4,2313,11.0,68.0,0,0,0,4,3,8.0,0.00012,0.789273,0.210727
