# Libraries Needed

In [None]:
import tarfile
import pandas as pd
import numpy as np
import seaborn as sn
import tensorflow as tf
import itertools
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
import re
import keras
from collections import Counter
import plotly.express as px
seed = 50

from numpy import mean
from sklearn import decomposition
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectFromModel
import xgboost as xgb
from dython.nominal import associations
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.utils import class_weight
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
import time

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score


from sklearn.datasets import make_classification

from sklearn.ensemble import RandomForestClassifier

from sklearn.compose import make_column_transformer

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.cluster import KMeans
from kmodes.kprototypes import KPrototypes
from plotnine import *
from sklearn.metrics import silhouette_score

import tensorflow as tf
from keras.layers import Dense
from keras.models import Sequential
from sklearn.utils import class_weight
from imblearn.over_sampling import RandomOverSampler

pd.set_option('display.max_columns', None)

In [None]:
# Load the raw dataset; the path is relative, so "data.csv" must be in the working directory.
data = pd.read_csv("data.csv")

# Feature Selection

## Correlation Matrix

In [None]:
# Compute pairwise associations between the columns of X_mi_mob4; the heatmap
# cell below reads the resulting matrix from r["corr"].
# NOTE(review): neither `X_mi_mob4` nor `ax` is defined anywhere in this
# notebook - confirm where they come from (e.g. a removed
# `fig, ax = plt.subplots()` cell) before running.
r = associations(X_mi_mob4, ax = ax, cmap = "Blues")

In [None]:
# Lower-triangle heatmap of the association matrix computed in the previous cell.
corr = r["corr"].round(2)
plt.figure(figsize=(16, 15))
# Mask the upper triangle (diagonal included) so each pair is drawn only once.
mask = np.triu(np.ones_like(corr, dtype=bool))
# seaborn is imported as `sn` in this notebook; the name `sns` does not exist.
heatmap = sn.heatmap(corr, mask=mask, vmin=-1, vmax=1, annot=False, cmap='BrBG')
heatmap.set_title('Triangle Correlation Heatmap', fontdict={'fontsize': 18}, pad=16)
fig = heatmap.get_figure()

## XGBoost

In [None]:
# Cast the categorical columns to pandas 'category' dtype; the DMatrix objects
# below are built with enable_categorical=True.
for col in categorical_columns:
    df[col] = df[col].astype('category')

X = df.drop('is_purchase', axis=1)  # features: everything except the target
y = df['is_purchase']  # target labels

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Convert the data into XGBoost's DMatrix format.
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

# Shallow trees, modest learning rate, classification error as the monitored metric.
params = {
    'max_depth': 3,
    'eta': 0.1,
    'objective': 'binary:logistic',
    'eval_metric': 'error',
}

# Train with early stopping: stop once the validation error has not improved
# for 50 consecutive rounds.
# NOTE(review): the held-out test set doubles as the early-stopping validation
# set, so accuracy later reported on it is optimistic; consider a separate
# validation split.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtest, 'validation')],
    early_stopping_rounds=50,
)

# best_iteration is a 0-based index, so the matching number of boosting rounds
# is one more. Using the index directly trains one round too few (and zero
# rounds when the very first iteration was the best).
optimal_num_rounds = model.best_iteration + 1


In [None]:
# Sweep num_boost_round over 1..10 and record the test-set accuracy of each model.
num = list(range(1, 11))
acc = []
for i in num:
    xgb_model = xgb.train(params, dtrain, num_boost_round=i)

    # Turn the predicted scores into hard 0/1 labels with a 0.5 cut-off.
    y_pred = [int(p > 0.5) for p in xgb_model.predict(dtest)]

    temp = accuracy_score(y_test, y_pred)
    acc.append(temp)

plt.plot(num, acc, 'bx-')
plt.xlabel('Values of num_boost_round')
plt.ylabel('Model Accuracy')
plt.title('Model Accuracy For Optimal num_boost_round')
plt.show()

In [None]:
# Report the early-stopping round count and the best accuracy from the sweep.
print("Optimal num rounds:", optimal_num_rounds)
# max(acc) comes from the 1-10 round sweep, not from a model trained with
# optimal_num_rounds, so state which sweep setting produced it.
best_idx = int(np.argmax(acc))
print("XGBoost accuracy:", acc[best_idx], f"(best of sweep, num_boost_round={num[best_idx]})")

In [None]:
# Retrain on the training set with the round count chosen by early stopping.
xgb_model = xgb.train(params, dtrain, num_boost_round=optimal_num_rounds)

# Plot feature importance, limited to at most 11 features.
xgb.plot_importance(xgb_model, max_num_features=11)
plt.show()

## Mutual Information Score

In [None]:
# Set Matplotlib defaults
# The bundled seaborn style sheets were renamed with a "seaborn-v0_8-" prefix in
# Matplotlib 3.6 and the old names were removed later, so try the new name
# first and fall back to the old one on older installations.
try:
    plt.style.use("seaborn-v0_8-whitegrid")
except OSError:
    plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)
# Load data

# Utility functions
def make_mi_scores(X, y):
    """Return mutual-information scores of each column of ``X`` against ``y``.

    Object and category columns are replaced by integer codes on a copy of
    ``X``; every integer-typed column is then passed to the estimator as a
    discrete feature.

    Args:
        X: DataFrame of features (not modified).
        y: target values aligned with the rows of ``X``.

    Returns:
        A Series named "MI Scores", indexed by column name and sorted from
        highest to lowest score.
    """
    # NOTE(review): this uses mutual_info_regression; the notebook applies it
    # to a purchase flag, where mutual_info_classif may be the better fit -
    # confirm.
    encoded = X.copy()
    for name in encoded.select_dtypes(["object", "category"]).columns:
        codes, _ = encoded[name].factorize()
        encoded[name] = codes
    # After factorizing, integer dtype identifies the discrete features.
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in encoded.dtypes]
    raw_scores = mutual_info_regression(
        encoded, y, discrete_features=is_discrete, random_state=0
    )
    scores = pd.Series(raw_scores, name="MI Scores", index=encoded.columns)
    return scores.sort_values(ascending=False)

def plot_mi_scores(scores):
    """Draw a horizontal bar chart of ``scores`` on the current Matplotlib axes.

    Args:
        scores: Series of scores indexed by feature name. Bars are drawn in
            ascending score order, so the largest score ends up at the top.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

In [None]:
# Mutual information between the selected feature columns and the purchase target.
# NOTE(review): `columns` is first assigned in the PCA loadings cell further
# down - confirm it is defined before this cell runs.

# The target is selected by its column label; the bare name `is_purchase` is
# not a variable in this notebook.
mi_scores = make_mi_scores(data[columns], data["is_purchase"])
# A bare expression in the middle of a cell shows nothing, so print the top rows.
print(mi_scores.head())

plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores.head(20))

## PCA

In [None]:
# Project the features onto the first three principal components for 3-D plotting.
# The target is kept out of the fit and attached afterwards as the colour label;
# previously a fourth principal component was computed and stored under the
# column name "label".
labels = data["is_purchase"].to_numpy()
pca = decomposition.PCA(n_components=3)
pca_data = pca.fit_transform(data.drop(columns="is_purchase"))

# Place the label next to the three component scores.
pca_data = np.column_stack((pca_data, labels))
pca_df = pd.DataFrame(
    data=pca_data,
    columns=["1st_principal", "2nd_principal", "3rd_principal", "label"],
)

# Share of total variance captured by each retained component.
explained_variance = pca.explained_variance_ratio_
explained_variance

In [None]:
# 3-D scatter of the first three principal components, coloured by pca_df["label"].
# Figure.gca(projection=...) was deprecated in Matplotlib 3.4 and later removed;
# add_subplot is the supported way to create 3-D axes.
ax3d = plt.figure(figsize=(16, 10)).add_subplot(projection='3d')
s = ax3d.scatter(
    xs=pca_df["1st_principal"],
    ys=pca_df["2nd_principal"],
    zs=pca_df["3rd_principal"],
    c=pca_df["label"],
    cmap='tab10',
)
# Take both handles and labels from the scatter itself so they always agree,
# instead of pairing the handles with a hard-coded 0-9 list.
ax3d.legend(*s.legend_elements(), loc='best')
ax3d.set_xlabel('1st_principal')
ax3d.set_ylabel('2nd_principal')
ax3d.set_zlabel('3rd_principal')
plt.show()

In [None]:
# Fit a PCA with no component limit and plot the running total of explained variance.
pca = PCA().fit(data)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_variance, lw=3, color='#087E8B')
plt.title('Cumulative explained variance by number of principal components', size=20)
plt.show()

In [None]:
# Loadings table: each component vector scaled by the square root of its
# explained variance, one row per input attribute and one column per component.
# NOTE(review): `numeric` is not defined in this notebook; it presumably lists
# the columns `pca` was fitted on, in the same order - confirm, otherwise the
# row labels will not match the loadings.
columns = numeric
loadings = pd.DataFrame(
    data=pca.components_.T.dot(np.diag(np.sqrt(pca.explained_variance_))), 
    columns=[f'PC{i}' for i in range(1, pca.n_components_ + 1)],
    index=columns
)
# Display up to the first 33 rows.
loadings.head(33)

In [None]:
# Rank the attributes by their loading on the first principal component.
pc1_loadings = (
    loadings[['PC1']]
    .sort_values(by='PC1', ascending=False)
    .reset_index()
)
pc1_loadings.columns = ['Attribute', 'CorrelationWithPC1']

# Bar chart with one bar per attribute, highest loading first.
plt.bar(
    x=pc1_loadings['Attribute'],
    height=pc1_loadings['CorrelationWithPC1'],
    color='#087E8B',
)
plt.title('PCA loading scores (first principal component) - 4 clicks', size=20)
plt.xticks(rotation='vertical')
plt.show()

# Classification

## Logistic Regression

In [None]:
# Neither name is imported at the top of the notebook.
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Features: every column of `data` except the target and the categorical columns.
df = data.drop("is_purchase", axis = 1)
X = df.loc[:, ~df.columns.isin(categorical_columns)]
# Take the target from the same frame as the features so rows stay aligned;
# `clicks4` is not defined anywhere in this notebook.
y = data["is_purchase"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Fit with balanced class weights.
model = LogisticRegression(class_weight = 'balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# ROC AUC is a ranking metric: score it on the positive-class probabilities.
# On hard 0/1 predictions it collapses to the balanced accuracy.
y_prob = model.predict_proba(X_test)[:, 1]

print("Accuracy score:", metrics.accuracy_score(y_test, y_pred))
print("Balanced accuracy score:", metrics.balanced_accuracy_score(y_test, y_pred))
print("ROC AUC score:", metrics.roc_auc_score(y_test, y_prob))
print("Precision score:", metrics.precision_score(y_test, y_pred))
print("Sensitivity score:", metrics.recall_score(y_test, y_pred))
print("Specificity score:", metrics.recall_score(y_test, y_pred, pos_label = 0))
print("F-score:", metrics.f1_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

## Random Forest

In [None]:
def evaluate_model(y_pred, probs,train_predictions, train_probs):
    """Print recall, precision and ROC AUC next to a baseline, then plot ROC curves.

    The true labels are read from the module-level ``y_test`` and ``y_train``.
    The baseline predicts class 1 for every test row and is assigned a ROC AUC
    of 0.5.

    Args:
        y_pred: predicted labels for the test set.
        probs: scores for the test set, passed to ``roc_auc_score`` and ``roc_curve``.
        train_predictions: predicted labels for the training set.
        train_probs: scores for the training set, passed to ``roc_auc_score``.

    Side effects:
        Prints one line per metric, sets ``plt.rcParams['font.size']`` to 16
        and shows a new figure.
    """
    always_positive = [1] * len(y_test)

    baseline = {
        'recall': recall_score(y_test, always_positive),
        'precision': precision_score(y_test, always_positive),
        'roc': 0.5,
    }
    results = {
        'recall': recall_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'roc': roc_auc_score(y_test, probs),
    }
    train_results = {
        'recall': recall_score(y_train, train_predictions),
        'precision': precision_score(y_train, train_predictions),
        'roc': roc_auc_score(y_train, train_probs),
    }

    for metric in ('recall', 'precision', 'roc'):
        print(f'{metric.capitalize()} Baseline: {round(baseline[metric], 2)} Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}')

    # False/true positive rates for the baseline and for the model.
    base_fpr, base_tpr, _ = roc_curve(y_test, always_positive)
    model_fpr, model_tpr, _ = roc_curve(y_test, probs)

    plt.figure(figsize=(8, 6))
    plt.rcParams['font.size'] = 16
    plt.plot(base_fpr, base_tpr, 'b', label='baseline')
    plt.plot(model_fpr, model_tpr, 'r', label='model')
    plt.legend()
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.show()

def plot_confusion_matrix(cm, classes, normalize = False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens): # can change color 
    """Draw a confusion matrix as an annotated image on a new 10x10 figure.

    Args:
        cm: 2-D array; rows are labelled as the true class and columns as the
            predicted class.
        classes: tick labels, one per class, in the row/column order of ``cm``.
        normalize: if True, each row is scaled to sum to 1 and cells are
            printed with two decimals; otherwise cells are printed as integers.
        title: figure title.
        cmap: Matplotlib colormap used for the image.
    """
    cm = np.asarray(cm)
    if normalize:
        # The flag used to switch only the number format; actually scale the
        # rows, leaving all-zero rows at zero instead of dividing by zero.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.figure(figsize = (10, 10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size = 24)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size = 14)
    plt.yticks(tick_marks, classes, size = 14)
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    # Label the plot
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 fontsize = 20,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    # grid(None) toggles the current state, which switches the grid ON under
    # styles that start without one; False always turns it off.
    plt.grid(False)
    plt.tight_layout()
    plt.ylabel('True label', size = 18)
    plt.xlabel('Predicted label', size = 18)
def encode_and_bind(original_dataframe, features_to_encode):
    """Return a copy of the frame with ``features_to_encode`` replaced by dummy columns.

    The dummy columns produced by ``pd.get_dummies`` are placed first,
    followed by the remaining original columns. The input frame is not
    modified.

    Args:
        original_dataframe: source DataFrame.
        features_to_encode: column label or list of labels to encode.

    Returns:
        A new DataFrame without the ``features_to_encode`` columns.
    """
    encoded = pd.get_dummies(original_dataframe[features_to_encode])
    combined = pd.concat([encoded, original_dataframe], axis=1)
    return combined.drop(columns=features_to_encode)

In [None]:
# Random Forest With Class Weighting
## Creation of dataset
features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)

# One-hot encode the object columns; every other column passes through unchanged.
col_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown = 'ignore'), features_to_encode),
    remainder = "passthrough",
)

# Model definition
rf_classifier = RandomForestClassifier(
    min_samples_leaf=50,
    n_estimators=150,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=seed,
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for
    # classifiers it meant 'sqrt', so this keeps the same behaviour.
    max_features='sqrt',
    # "balanced" uses inverse class-frequency weights from the training data.
    class_weight="balanced",
)

# Encode -> select features with a forest -> classify with a forest.
pipe = Pipeline(steps=[('preprocessor', col_trans),
                       ('selector', SelectFromModel(rf_classifier)),
                       ('classifier', rf_classifier)])
pipe.fit(X_train, y_train)

In [None]:
## Evaluating classifier - ACCURACY
y_pred = pipe.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Format the percentage directly: round(x, 3) * 100 can print float artefacts
# such as 85.39999999999999.
print(f"The accuracy of the model is {accuracy * 100:.1f} %")

# Positive-class probabilities and hard predictions for both splits.
train_probs = pipe.predict_proba(X_train)[:, 1]
probs = pipe.predict_proba(X_test)[:, 1]
train_predictions = pipe.predict(X_train)
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}')
print(f'Test ROC AUC  Score: {roc_auc_score(y_test, probs)}')

## ROC CURVE
evaluate_model(y_pred, probs, train_predictions, train_probs)

## CONFUSION MATRIX
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['0 - Non-Purchaser', '1 - Purchaser'],
                      title = 'Purchaser Confusion Matrix')

In [None]:
# `metrics` is otherwise first imported in a later cell of this notebook.
from sklearn import metrics

val_Y = y_test
test_predictions = y_pred
print("Accuracy Score:", metrics.accuracy_score(val_Y, test_predictions))
print("Balanced Accuracy Score:", metrics.balanced_accuracy_score(val_Y, test_predictions)) # Accuracy for imbalanced data
# ROC AUC is scored on the positive-class probabilities from the evaluation
# cell above; on hard 0/1 predictions it collapses to the balanced accuracy.
print("ROC Score:", metrics.roc_auc_score(val_Y, probs))
print("Precision", metrics.precision_score(val_Y, test_predictions)) # Of the positives predicted, what percentage is truly positive
print("Sensitivity", metrics.recall_score(val_Y, test_predictions)) # Of all the positive cases, what percentage are predicted positive
print("Specificity", metrics.recall_score(val_Y, test_predictions, pos_label = 0)) # How well the model predicts negative results
print("F-Score", metrics.f1_score(val_Y, test_predictions)) # The "harmonic mean" of precision and sensitivity. Good for imbalanced datasets

## Neural Network (MLP)

In [None]:
def plot_confusion_matrix(cm, classes, normalize = False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens): # can change color 
    """Draw a confusion matrix as an annotated image on a new 10x10 figure.

    Args:
        cm: 2-D array; rows are labelled as the true class and columns as the
            predicted class.
        classes: tick labels, one per class, in the row/column order of ``cm``.
        normalize: if True, each row is scaled to sum to 1 and cells are
            printed with two decimals; otherwise cells are printed as integers.
        title: figure title.
        cmap: Matplotlib colormap used for the image.
    """
    cm = np.asarray(cm)
    if normalize:
        # The flag used to switch only the number format; actually scale the
        # rows, leaving all-zero rows at zero instead of dividing by zero.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.figure(figsize = (10, 10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size = 24)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size = 14)
    plt.yticks(tick_marks, classes, size = 14)
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    # Label the plot
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 fontsize = 20,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    # grid(None) toggles the current state, which switches the grid ON under
    # styles that start without one; False always turns it off.
    plt.grid(False)
    plt.tight_layout()
    plt.ylabel('True label', size = 18)
    plt.xlabel('Predicted label', size = 18)

In [None]:
# prepare input data
def prepare_inputs(X_train, X_test):
    """One-hot encode both feature sets with an encoder fitted on the training set.

    Categories that appear only in ``X_test`` are ignored by the encoder
    (``handle_unknown='ignore'``).

    Returns:
        The encoded training features and the encoded test features, in that order.
    """
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoded_train = encoder.fit_transform(X_train)
    encoded_test = encoder.transform(X_test)
    return encoded_train, encoded_test
# prepare target
def prepare_targets(y_train, y_test):
    """Label-encode both target vectors with an encoder fitted on the training targets.

    Returns:
        The encoded training targets and the encoded test targets, in that order.
    """
    encoder = LabelEncoder()
    encoded_train = encoder.fit_transform(y_train)
    encoded_test = encoder.transform(y_test)
    return encoded_train, encoded_test


In [None]:
# Preprocess the data.
# Split first, then oversample only the training portion: resampling before the
# split puts copies of the same minority rows in both train and test, which
# leaks information and inflates every test metric.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.20, random_state=42)
ros = RandomOverSampler(random_state=seed)  # seeded so the run is reproducible
Xtrain, Ytrain = ros.fit_resample(Xtrain, Ytrain)
# prepare input data
train_X, val_X = prepare_inputs(Xtrain, Xtest)
# prepare output data
train_Y, val_Y = prepare_targets(Ytrain, Ytest)
# Column vectors match the single-unit output of the final Dense layer.
train_Y = train_Y.reshape(-1, 1)
val_Y = val_Y.reshape(-1, 1)
# define the model: one hidden layer of 6 ReLU units and a sigmoid output
model = Sequential()
model.add(Dense(6, input_dim=train_X.shape[1], activation='relu', kernel_initializer='he_normal'))
model.add(Dense(1, activation='sigmoid'))
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset
model.fit(train_X, train_Y, epochs=10, batch_size=16, verbose=2)
# evaluate the keras model
_, accuracy = model.evaluate(val_X, val_Y, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

In [None]:
# Threshold the model outputs at 0.5 and flatten to one label per test row.
test_predictions = model.predict(val_X)
test_predictions = (test_predictions >= 0.5).astype(int).reshape(Ytest.shape[0],)

cm = confusion_matrix(np.asarray(Ytest), test_predictions)
plot_confusion_matrix(
    cm,
    classes=['0 - Non-Purchaser', '1 - Purchaser'],
    title='Purchaser Confusion Matrix',
)

In [None]:
from sklearn import metrics
val_Y = np.array(Ytest)
test_predictions = np.array(test_predictions)
# ROC AUC needs continuous scores: test_predictions has already been thresholded
# to 0/1, so take the sigmoid outputs from the model again.
test_scores = np.asarray(model.predict(val_X)).ravel()
print("Accuracy Score:", metrics.accuracy_score(val_Y, test_predictions))
print("Balanced Accuracy Score:", metrics.balanced_accuracy_score(val_Y, test_predictions)) # Accuracy for imbalanced data
print("ROC Score:", metrics.roc_auc_score(val_Y, test_scores))
print("Precision", metrics.precision_score(val_Y, test_predictions)) # Of the positives predicted, what percentage is truly positive
print("Sensitivity", metrics.recall_score(val_Y, test_predictions)) # Of all the positive cases, what percentage are predicted positive
print("Specificity", metrics.recall_score(val_Y, test_predictions, pos_label = 0)) # How well the model predicts negative results
print("F-Score", metrics.f1_score(val_Y, test_predictions)) # The "harmonic mean" of precision and sensitivity. Good for imbalanced datasets

## KNN

In [None]:
# `resample` is not imported at the top of the notebook.
from sklearn.utils import resample

# Separate majority and minority classes
# NOTE(review): other cells call the target column "is_purchase"; confirm that
# `data` really has an "is_purchasor" column.
df_majority = data[data.is_purchasor == 0]
df_minority = data[data.is_purchasor == 1]

# Upsample minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,  # sample with replacement
                                 n_samples=len(df_majority),  # match the majority class rather than a hard-coded row count
                                 random_state=12345)  # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
df_upsampled.is_purchasor.value_counts()

In [None]:
# Neither name is imported at the top of the notebook.
import pickle
from sklearn.neighbors import KNeighborsClassifier

# Fit a 1-nearest-neighbour classifier and persist both the model and its
# test-set predictions.
knn1 = KNeighborsClassifier(n_neighbors = 1)
knn1.fit(X_train, y_train)
# `with` closes each file even if pickling fails.
with open('knn1.pkl', 'wb') as model_file:
    pickle.dump(knn1, model_file)
predicted1 = knn1.predict(X_test)
with open('predicted1clicks.pkl', 'wb') as predictions_file:
    pickle.dump(predicted1, predictions_file)
# `error_rate` is not created anywhere else in this notebook, so start the
# list here with the misclassification rate for k = 1.
error_rate = [np.mean(predicted1 != y_test)]

In [None]:
# Use a distinct variable name: assigning to `confusion_matrix` would shadow the
# sklearn function imported at the top of the notebook and break every later
# call to it.
knn_cm = metrics.confusion_matrix(y_test, predicted1)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = knn_cm, display_labels = [False, True])
cm_display.plot()
plt.show()

In [None]:
# Score the predictions produced by the KNN cell above. `predicted9clicks` is
# not defined anywhere in this notebook; `predicted1` is what that cell creates.
print("Accuracy score:", metrics.accuracy_score(y_test, predicted1))
print("Balanced accuracy score:", metrics.balanced_accuracy_score(y_test, predicted1))
print("ROC AUC score:", metrics.roc_auc_score(y_test, predicted1))
print("Precision score:", metrics.precision_score(y_test, predicted1))
print("Sensitivity score:", metrics.recall_score(y_test, predicted1))
print("Specificity score:", metrics.recall_score(y_test, predicted1, pos_label = 0))
print("F-score:", metrics.f1_score(y_test, predicted1))

# Clustering

## K Prototype

In [None]:
from plotnine import *

# Choose optimal K using Elbow method: fit K-Prototypes for k = 1..9 and record
# the clustering cost of each fit.
cost = []
for cluster in range(1, 10):
    try:
        kprototype = KPrototypes(n_jobs = -1, n_clusters = cluster, init = 'Huang', random_state = 100)
        kprototype.fit_predict(dfMatrix, categorical = catColumnsPos)
    except (Exception, KeyboardInterrupt) as exc:
        # Still best-effort (stop and plot what was collected, including after
        # a manual interrupt), but report why instead of failing silently.
        print('Stopping at {} clusters: {!r}'.format(cluster, exc))
        break
    cost.append(kprototype.cost_)
    print('Cluster initiation: {}'.format(cluster))
# Converting the results into a dataframe and plotting them
df_cost = pd.DataFrame({'Cluster':range(1, len(cost)+1), 'Cost':cost})
# Data viz
#plotnine.options.figure_size = (8, 4.8)
(
    ggplot(data = df_cost)+
    geom_line(aes(x = 'Cluster',
                  y = 'Cost'))+
    geom_point(aes(x = 'Cluster',
                   y = 'Cost'))+
    geom_label(aes(x = 'Cluster',
                   y = 'Cost',
                   label = 'Cluster'),
               size = 10,
               nudge_y = 1000) +
    labs(title = 'Optimal number of cluster with Elbow Method')+
    xlab('Number of Clusters k')+
    ylab('Cost')+
    theme_minimal()
)

In [None]:
# Initializing the model with 3 clusters
# NOTE(review): the elbow sweep above uses random_state = 100 while this final
# fit uses 0 - confirm that is intended.
kprototype = KPrototypes(n_jobs = -1, n_clusters = 3, init = 'Huang', random_state = 0)

# Fit on dfMatrix and keep the cluster assignment returned for each row.
# `catColumnsPos` presumably holds the positions of the categorical columns - verify.
clusters = kprototype.fit_predict(dfMatrix, categorical = catColumnsPos)

In [None]:
# Print the centroids of the fitted K-Prototypes model
print("Cluster centroid:", kprototype.cluster_centroids_)

In [None]:
# Attach each row's cluster id and a readable segment name to the dataframe.
segment_names = {0:'First', 1:'Second', 2:'Third'}
df['Cluster Labels'] = kprototype.labels_
df['Segment'] = df['Cluster Labels'].map(segment_names)
# Store Segment as a categorical with a fixed First/Second/Third order.
df['Segment'] = (
    df['Segment']
    .astype('category')
    .cat.reorder_categories(list(segment_names.values()))
)

In [None]:
# Mean silhouette coefficient of the K-Prototypes assignment (`clusters`),
# computed with Euclidean distance on `data` without its 'Segment' column.
# NOTE(review): in this notebook 'Segment' is added to `df`, not to `data`, so
# this drop may raise KeyError, and the clustering itself was fitted on
# `dfMatrix`. Confirm which frame should be scored, and whether a Euclidean
# metric is meaningful if that frame contains categorical columns.
silhouette_avg = silhouette_score(data.drop(columns=['Segment']), clusters, metric='euclidean')
print("The average silhouette score is :", silhouette_avg)

## K Means

In [None]:
# Silhouette analysis: fit KMeans for each candidate k and record the mean
# silhouette score of the resulting assignment.
range_n_clusters = [2, 3, 4, 5]
silhouette_avg = []
for num_clusters in range_n_clusters:
    kmeans = KMeans(init="random", n_clusters=num_clusters, n_init=10, random_state=100)
    cluster_labels = kmeans.fit_predict(df)
    score = silhouette_score(df, cluster_labels, metric='euclidean')
    silhouette_avg.append(score)
    print("Score:", score)

plt.plot(range_n_clusters, silhouette_avg, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Silhouette score')
plt.title('Silhouette analysis For Optimal k')
plt.show()

In [None]:
# Display the silhouette scores collected for each candidate k.
silhouette_avg

In [None]:
# Running the k means model with our optimal K clusters 
k = 3  # number of clusters used for the final model
kmeans = KMeans(init="random", n_clusters=k,n_init=10,random_state=100)
# Fit on df and store each row's cluster id in a new 'cluster' column.
y_pred = kmeans.fit_predict(df)
df['cluster'] = y_pred

# output with the cluster attached
df.head()