In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.impute import KNNImputer
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, PrecisionRecallDisplay, RocCurveDisplay
from sklearn.metrics import f1_score, roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay

In [42]:
import pandas as pd

# URLs for the datasets
customers_url = 'https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/customers_final.csv'
engagements_url = 'https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/engagements_final.csv'
marketing_url = 'https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/marketing_final.csv'
transactions_url = 'https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/transactions_final.csv'

# Reading the datasets
customers = pd.read_csv(customers_url)
engagements = pd.read_csv(engagements_url)
marketing = pd.read_csv(marketing_url)
transactions = pd.read_csv(transactions_url)

# Convert date columns to datetime type
customers['join_date'] = pd.to_datetime(customers['join_date'])
customers['last_purchase_date'] = pd.to_datetime(customers['last_purchase_date'])
transactions['transaction_date'] = pd.to_datetime(transactions['transaction_date'])
marketing['campaign_date'] = pd.to_datetime(marketing['campaign_date'])

# Mark missing values in 'age' as 'NA'
customers['age'].fillna('NA', inplace=True)

# Fill missing values in 'gender' with the mode
customers['gender'].fillna(customers['gender'].mode()[0], inplace=True)

# Display the first few rows of each dataset
print("Customers Data:")
print(customers.head())

print("\nEngagements Data:")
print(engagements.head())

print("\nMarketing Data:")
print(marketing.head())

print("\nTransactions Data:")
print(transactions.head())


Customers Data:
   customer_id  join_date last_purchase_date   age  gender           location
0            1 2023-11-20         2024-03-17  56.0  Female  North Shannonbury
1            2 2021-09-08         2023-10-25    NA    Male          Hillville
2            3 2021-06-01         2022-11-27    NA    Male   North Latoyatown
3            4 2022-01-01         2022-09-01  29.0    Male          Grossstad
4            5 2022-01-24         2023-06-02    NA    Male   East Matthewfort

Engagements Data:
   customer_id  number_of_site_visits  number_of_emails_opened  \
0            1                     10                       15   
1            2                    285                       49   
2            3                    192                       73   
3            4                    110                       30   
4            5                    161                        2   

   number_of_clicks  
0                 1  
1                51  
2                25  
3           

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers['age'].fillna('NA', inplace=True)
  customers['age'].fillna('NA', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers['gender'].fillna(customers['gender'].mode()[0], inplace=True)


In [43]:
# Check for missing values
print("Missing values in customers data:\n", customers.isnull().sum())
print("Missing values in engagements data:\n", engagements.isnull().sum())
print("Missing values in marketing data:\n", marketing.isnull().sum())
print("Missing values in transactions data:\n", transactions.isnull().sum())

Missing values in customers data:
 customer_id           0
join_date             0
last_purchase_date    0
age                   0
gender                0
location              0
dtype: int64
Missing values in engagements data:
 customer_id                0
number_of_site_visits      0
number_of_emails_opened    0
number_of_clicks           0
dtype: int64
Missing values in marketing data:
 campaign_id       0
customer_id       0
response          0
promotion_type    0
campaign_date     0
dtype: int64
Missing values in transactions data:
 transaction_id        0
customer_id           0
transaction_date      0
transaction_amount    0
product_category      0
dtype: int64


In [45]:
# Feature Engineering
customer_spending = transactions.groupby('customer_id')['transaction_amount'].sum().reset_index()
customer_spending.columns = ['customer_id', 'total_spending']

customer_transactions = transactions.groupby('customer_id')['transaction_id'].count().reset_index()
customer_transactions.columns = ['customer_id', 'total_transactions']

customer_last_purchase = transactions.groupby('customer_id')['transaction_date'].max().reset_index()
customer_last_purchase.columns = ['customer_id', 'last_transaction_date']

customer_first_purchase = transactions.groupby('customer_id')['transaction_date'].min().reset_index()
customer_first_purchase.columns = ['customer_id', 'first_transaction_date']

customers = customers.merge(customer_spending, on='customer_id', how='left')
customers = customers.merge(customer_transactions, on='customer_id', how='left')
customers = customers.merge(customer_last_purchase, on='customer_id', how='left')
customers = customers.merge(customer_first_purchase, on='customer_id', how='left')

# Display the updated customers dataframe
print("Updated Customers Data after Feature Engineering:")
print(customers.head())


Updated Customers Data after Feature Engineering:
   customer_id  join_date last_purchase_date   age  gender           location  \
0            1 2023-11-20         2024-03-17  56.0  Female  North Shannonbury   
1            2 2021-09-08         2023-10-25    NA    Male          Hillville   
2            3 2021-06-01         2022-11-27    NA    Male   North Latoyatown   
3            4 2022-01-01         2022-09-01  29.0    Male          Grossstad   
4            5 2022-01-24         2023-06-02    NA    Male   East Matthewfort   

   total_spending  total_transactions last_transaction_date  \
0         3509.48                   6            2024-03-12   
1         6081.32                   9            2023-02-26   
2         1454.87                   6            2022-11-18   
3         7874.68                  20            2022-08-31   
4        15524.55                  24            2023-06-01   

  first_transaction_date  
0             2024-01-09  
1             2021-09-29  
2  

In [13]:
customers.head()

Unnamed: 0,customer_id,join_date,last_purchase_date,age,gender,location,total_spending,total_transactions,last_transaction_date
0,1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,3509.48,6,2024-03-12
1,2,2021-09-08,2023-10-25,,Male,Hillville,6081.32,9,2023-02-26
2,3,2021-06-01,2022-11-27,,Male,North Latoyatown,1454.87,6,2022-11-18
3,4,2022-01-01,2022-09-01,29.0,Male,Grossstad,7874.68,20,2022-08-31
4,5,2022-01-24,2023-06-02,,Male,East Matthewfort,15524.55,24,2023-06-01


In [46]:
# Merge engagements data into customers_df
customers = customers.merge(engagements, on='customer_id', how='left')

# Perform one-hot encoding on the promotion_type column
promotion_one_hot = pd.get_dummies(marketing['promotion_type'], prefix='promo')
marketing = pd.concat([marketing, promotion_one_hot], axis=1)

# Check the columns after one-hot encoding
print("Columns after one-hot encoding:", marketing.columns)

# Aggregate the number of each promotion type received by each customer
promotion_summary = marketing.groupby('customer_id').agg({
    'promo_Discount': 'sum',
    'promo_Buy One Get One': 'sum',
    'promo_Free Shipping': 'sum',
    'response': lambda x: (x == 'Yes').sum()
}).reset_index()

promotion_summary.columns = ['customer_id', 'promo_Discount', 'promo_BuyOneGetOne', 'promo_FreeShipping', 'total_responses']

# Merge promotion summary into customers_df
customers = customers.merge(promotion_summary, on='customer_id', how='left')

# Display the updated customers dataframe
print("Updated Customers Data after merging marketing data:")
print(customers.head())



Columns after one-hot encoding: Index(['campaign_id', 'customer_id', 'response', 'promotion_type',
       'campaign_date', 'promo_Buy One Get One', 'promo_Discount',
       'promo_Free Shipping'],
      dtype='object')
Updated Customers Data after merging marketing data:
   customer_id  join_date last_purchase_date   age  gender           location  \
0            1 2023-11-20         2024-03-17  56.0  Female  North Shannonbury   
1            2 2021-09-08         2023-10-25    NA    Male          Hillville   
2            3 2021-06-01         2022-11-27    NA    Male   North Latoyatown   
3            4 2022-01-01         2022-09-01  29.0    Male          Grossstad   
4            5 2022-01-24         2023-06-02    NA    Male   East Matthewfort   

   total_spending  total_transactions last_transaction_date  \
0         3509.48                   6            2024-03-12   
1         6081.32                   9            2023-02-26   
2         1454.87                   6            202

In [39]:
# Perform one-hot encoding on the promotion_type column
promotion_one_hot = pd.get_dummies(marketing['promotion_type'], prefix='promo')
marketing = pd.concat([marketing, promotion_one_hot], axis=1)

# Check the columns after one-hot encoding
print("Columns after one-hot encoding:", marketing.columns)

# Aggregate the number of each promotion type received by each customer
promotion_summary = marketing.groupby('customer_id').agg({
    'promo_Discount': 'sum',
    'promo_Buy One Get One': 'sum',
    'promo_Free Shipping': 'sum',
    'response': lambda x: (x == 'Yes').sum()
}).reset_index()

promotion_summary.columns = ['customer_id', 'promo_Discount', 'promo_BuyOneGetOne', 'promo_FreeShipping', 'total_responses']

# Display the promotion summary dataframe
print("Promotion Summary Data:")
print(promotion_summary.head())


Columns after one-hot encoding: Index(['campaign_id', 'customer_id', 'response', 'promotion_type',
       'campaign_date', 'promo_Buy One Get One', 'promo_Discount',
       'promo_Free Shipping'],
      dtype='object')
Promotion Summary Data:
   customer_id  promo_Discount  promo_BuyOneGetOne  promo_FreeShipping  \
0            1               1                   2                   1   
1            2               1                   0                   3   
2            3               0                   2                   0   
3            4               1                   1                   2   
4            5               2                   1                   1   

   total_responses  
0                1  
1                2  
2                1  
3                1  
4                0  


In [48]:
customers.head()

Unnamed: 0,customer_id,join_date,last_purchase_date,age,gender,location,total_spending,total_transactions,last_transaction_date,first_transaction_date,...,number_of_emails_opened,number_of_clicks,promo_Discount,promo_BuyOneGetOne,promo_FreeShipping,total_responses,avg_purchase_value,purchase_frequency,customer_lifespan,CLV
0,1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,3509.48,6,2024-03-12,2024-01-09,...,15,1,1,2,1,1,584.913333,0.0006,63,22.109724
1,2,2021-09-08,2023-10-25,,Male,Hillville,6081.32,9,2023-02-26,2021-09-29,...,49,51,1,0,3,2,675.702222,0.0009,515,313.18798
2,3,2021-06-01,2022-11-27,,Male,North Latoyatown,1454.87,6,2022-11-18,2021-09-28,...,73,25,0,2,0,1,242.478333,0.0006,416,60.522592
3,4,2022-01-01,2022-09-01,29.0,Male,Grossstad,7874.68,20,2022-08-31,2022-01-11,...,30,17,1,1,2,1,393.734,0.002,232,182.692576
4,5,2022-01-24,2023-06-02,,Male,East Matthewfort,15524.55,24,2023-06-01,2022-03-02,...,2,7,2,1,1,0,646.85625,0.0024,456,707.91948
