# Setting up the project




In [None]:
# Data is saved as Excel Worksheet file in Google Drive
# Connect Colab to the google drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/Colab Notebooks/4-Analytics-Enabled-Marketing
# !pwd

/content/drive/MyDrive/Colab Notebooks/4-Analytics-Enabled-Marketing


# Import Python Libraries



In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

# Read in and Inspect the Data

In [None]:
# Read in the data

dataset = pd.read_excel('a1_Dataset_10Percent.xlsx')

In [None]:
# Show count or rows and columns

dataset.shape

(22223, 11)

In [None]:
# Show first few rows of the dataset

dataset.head()

Unnamed: 0,ID,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,TargetBuy
0,17147654,5.0,,,,,,Tin,0.01,5.0,0.0
1,8415498,15.0,,,M,,,Gold,8000.0,5.0,1.0
2,12107603,,,,M,Midlands,East,Tin,0.01,,1.0
3,14400995,8.0,28.0,,F,,,Tin,0.01,,1.0
4,28724674,14.0,67.0,,,,,Tin,0.01,7.0,0.0


# Data Preparation

In [None]:
# Dropping Customer ID column from the dataset

dataset = dataset.drop(['ID'], axis=1)

dataset.head()

Unnamed: 0,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,TargetBuy
0,5.0,,,,,,Tin,0.01,5.0,0.0
1,15.0,,,M,,,Gold,8000.0,5.0,1.0
2,,,,M,Midlands,East,Tin,0.01,,1.0
3,8.0,28.0,,F,,,Tin,0.01,,1.0
4,14.0,67.0,,,,,Tin,0.01,7.0,0.0


In [None]:
# Explore Missing values

dataset.isna().sum()

DemAffl            1085
DemAge             1508
DemClusterGroup     674
DemGender          2512
DemReg              465
DemTVReg            465
LoyalClass            0
LoyalSpend            0
LoyalTime           281
TargetBuy             0
dtype: int64

In [None]:
# Fill missing values with mean/mode

dataset['DemAffl']=dataset['DemAffl'].fillna(dataset['DemAffl'].mode()[0])
dataset['DemAge']=dataset['DemAge'].fillna(dataset['DemAge'].mode()[0])
dataset['DemClusterGroup']=dataset['DemClusterGroup'].fillna(dataset['DemClusterGroup'].mode()[0])
dataset['DemGender']=dataset['DemGender'].fillna(dataset['DemGender'].mode()[0])
dataset['DemReg']=dataset['DemReg'].fillna(dataset['DemReg'].mode()[0])
dataset['DemTVReg']=dataset['DemTVReg'].fillna(dataset['DemTVReg'].mode()[0])
dataset['LoyalTime']=dataset['LoyalTime'].fillna(dataset['LoyalTime'].mean())

In [None]:
# Explore miaaing values post-fix

dataset.isna().sum()

DemAffl            0
DemAge             0
DemClusterGroup    0
DemGender          0
DemReg             0
DemTVReg           0
LoyalClass         0
LoyalSpend         0
LoyalTime          0
TargetBuy          0
dtype: int64

In [None]:
dataset.head()

Unnamed: 0,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,TargetBuy
0,5.0,51.0,C,F,South East,London,Tin,0.01,5.0,0.0
1,15.0,51.0,C,M,South East,London,Gold,8000.0,5.0,1.0
2,8.0,51.0,C,M,Midlands,East,Tin,0.01,6.56467,1.0
3,8.0,28.0,C,F,South East,London,Tin,0.01,6.56467,1.0
4,14.0,67.0,C,F,South East,London,Tin,0.01,7.0,0.0


# Numerizing Categorical Variables

In [None]:
# Covert Categorical labels to Numeric data

from sklearn.preprocessing import LabelEncoder
number = LabelEncoder()

dataset['DemClusterGroup'] = number.fit_transform(dataset['DemClusterGroup'].astype('str'))
integer_mapping = {l: i for i, l in enumerate(number.classes_)}
print(integer_mapping)

dataset['DemGender'] = number.fit_transform(dataset['DemGender'].astype('str'))
integer_mapping = {l: i for i, l in enumerate(number.classes_)}
print(integer_mapping)

dataset['DemReg'] = number.fit_transform(dataset['DemReg'].astype('str'))
integer_mapping = {l: i for i, l in enumerate(number.classes_)}
print(integer_mapping)

dataset['DemTVReg'] = number.fit_transform(dataset['DemTVReg'].astype('str'))
integer_mapping = {l: i for i, l in enumerate(number.classes_)}
print(integer_mapping)

dataset['LoyalClass'] = number.fit_transform(dataset['LoyalClass'].astype('str'))
integer_mapping = {l: i for i, l in enumerate(number.classes_)}
print(integer_mapping)

{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'U': 6}
{'F': 0, 'M': 1, 'U': 2}
{'Midlands': 0, 'North': 1, 'Scottish': 2, 'South East': 3, 'South West': 4}
{'Border': 0, 'C Scotland': 1, 'East': 2, 'London': 3, 'Midlands': 4, 'N East': 5, 'N Scot': 6, 'N West': 7, 'S & S East': 8, 'S West': 9, 'Ulster': 10, 'Wales & West': 11, 'Yorkshire': 12}
{'Gold': 0, 'Platinum': 1, 'Silver': 2, 'Tin': 3}


In [None]:
dataset.head()

Unnamed: 0,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,TargetBuy
0,5.0,51.0,2,0,3,3,3,0.01,5.0,0.0
1,15.0,51.0,2,1,3,3,0,8000.0,5.0,1.0
2,8.0,51.0,2,1,0,2,3,0.01,6.56467,1.0
3,8.0,28.0,2,0,3,3,3,0.01,6.56467,1.0
4,14.0,67.0,2,0,3,3,3,0.01,7.0,0.0


# Multicollinearity Check

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(z):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = z.columns
    vif["VIF"] = [variance_inflation_factor(z.values, i) for i in range(z.shape[1])]

    return(vif)

In [None]:
z = dataset.iloc[:,0:9]
calc_vif(z)

Unnamed: 0,variables,VIF
0,DemAffl,6.27863
1,DemAge,10.734656
2,DemClusterGroup,3.659632
3,DemGender,1.435472
4,DemReg,2.474645
5,DemTVReg,3.752279
6,LoyalClass,3.851766
7,LoyalSpend,1.863196
8,LoyalTime,3.153032


# Variable Selection

In [None]:
y = dataset.iloc[:, 9].values                   #The Column in index 9 (TargetBuy) is the dependent variable
X = dataset.iloc[:, 0:9].values                 #All columns from index 0 to index 8 are the input variables

In [None]:
# Splitting dataset into training and test (Ratio - 80:20)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Modelling

In [None]:
classifier =  LogisticRegression(max_iter=200)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# Exporting Logistic Regression Classifier to later use in prediction

import joblib
joblib.dump(classifier, './c2_Classifier_LoyalCustomers')

['./c2_Classifier_LoyalCustomers']

In [None]:
# Check model performance by plotting the confusion matrix

print(confusion_matrix(y_test,y_pred))

[[3186  181]
 [ 688  390]]


In [None]:
# Compute model accuracy score

print(accuracy_score(y_test, y_pred))

0.8044994375703037


In [None]:
# Compute the probability of buying or not buying for the test set

predictions = classifier.predict_proba(X_test)
predictions

array([[0.86796329, 0.13203671],
       [0.63947925, 0.36052075],
       [0.4835099 , 0.5164901 ],
       ...,
       [0.89134643, 0.10865357],
       [0.90963079, 0.09036921],
       [0.87023801, 0.12976199]])

In [None]:
# Write model Output file

df_prediction_prob = pd.DataFrame(predictions, columns = ['prob_0', 'prob_1'])
df_test_dataset = pd.DataFrame(y_test,columns= ['Actual Outcome'])
df_x_test = pd.DataFrame(X_test)

dfx=pd.concat([df_x_test,df_test_dataset, df_prediction_prob], axis=1)

dfx.to_excel("ModelOutput_10Percent.xlsx")

dfx.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,Actual Outcome,prob_0,prob_1
0,10.0,58.0,3.0,1.0,3.0,8.0,0.0,12632.66,4.0,0.0,0.867963,0.132037
1,10.0,51.0,2.0,0.0,3.0,8.0,0.0,6000.0,6.0,0.0,0.639479,0.360521
2,16.0,65.0,1.0,0.0,2.0,6.0,0.0,6053.35,7.0,0.0,0.48351,0.51649
3,5.0,60.0,3.0,0.0,3.0,3.0,2.0,5000.0,1.0,0.0,0.912112,0.087888
4,9.0,52.0,3.0,0.0,0.0,4.0,2.0,3500.0,6.0,0.0,0.706179,0.293821
