In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.impute import KNNImputer

# Load the datasets
customers = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/customers_final.csv')
engagements = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/engagements_final.csv')
marketing = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/marketing_final.csv')
transactions = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/transactions_final.csv')


In [2]:
# Display the first few rows of each dataset
print(customers.head())
print(transactions.head())
print(engagements.head())
print(marketing.head())

   customer_id   join_date last_purchase_date   age  gender           location
0            1  2023-11-20         2024-03-17  56.0  Female  North Shannonbury
1            2  2021-09-08         2023-10-25   NaN    Male          Hillville
2            3  2021-06-01         2022-11-27   NaN     NaN   North Latoyatown
3            4  2022-01-01         2022-09-01  29.0    Male          Grossstad
4            5  2022-01-24         2023-06-02   NaN    Male   East Matthewfort
   transaction_id  customer_id transaction_date  transaction_amount  \
0               1            1       2024-02-03              165.56   
1               2            1       2024-03-02              699.01   
2               3            1       2024-03-12              146.86   
3               4            1       2024-01-20              927.46   
4               5            1       2024-02-25             1395.87   

  product_category  
0         Clothing  
1       Home Goods  
2       Home Goods  
3      Electron

In [3]:
# Check for missing values in the customers dataset
print(customers.isnull().sum())

# Fill missing values for gender with "Unknown"
customers['gender'].fillna('Unknown', inplace=True)

# Extract the columns with missing values except gender
columns_with_missing_values = ['age']

# Perform KNN Imputation
imputer = KNNImputer(n_neighbors=5)
customers[columns_with_missing_values] = imputer.fit_transform(customers[columns_with_missing_values])

# Check if there are any remaining missing values
print(customers.isnull().sum())

# Merge datasets on customer_id
data = pd.merge(transactions, customers, on='customer_id')
data = pd.merge(data, engagements, on='customer_id')
data = pd.merge(data, marketing, on='customer_id')

# Convert dates to datetime format
data['transaction_date'] = pd.to_datetime(data['transaction_date'])
data['join_date'] = pd.to_datetime(data['join_date'])
data['last_purchase_date'] = pd.to_datetime(data['last_purchase_date'])

# Calculate customer lifespan in months
data['customer_lifespan'] = ((data['last_purchase_date'] - data['join_date']).dt.days / 30).apply(np.ceil).astype(int)
data['customer_lifespan'] = data['customer_lifespan'].apply(lambda x: max(x, 1))

# Calculate CLV (sum of all transactions per customer)
customer_clv = data.groupby('customer_id')['transaction_amount'].sum().reset_index()
customer_clv.columns = ['customer_id', 'CLV']

# Merge the CLV back to the customers dataframe
customers = pd.merge(customers, customer_clv, on='customer_id')

# Calculate CLV per month
customers = pd.merge(customers, data[['customer_id', 'customer_lifespan']].drop_duplicates(), on='customer_id')
customers['CLV_per_month'] = customers['CLV'] / customers['customer_lifespan']

# Calculate total CLV per month
total_clv_per_month = customers['CLV_per_month'].sum()

# Sort customers by CLV per month in descending order
customers = customers.sort_values(by='CLV_per_month', ascending=False)

# Calculate cumulative sum of CLV per month and identify high-value customers
customers['cumulative_clv_per_month'] = customers['CLV_per_month'].cumsum()
customers['percent_of_total_clv'] = customers['cumulative_clv_per_month'] / total_clv_per_month

# Classify high-value customers (top 80% of total CLV per month)
customers['Is_high_value_customer'] = customers['percent_of_total_clv'] <= 0.8

print(customers.head())


customer_id              0
join_date                0
last_purchase_date       0
age                   1009
gender                 533
location                 0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers['gender'].fillna('Unknown', inplace=True)


customer_id           0
join_date             0
last_purchase_date    0
age                   0
gender                0
location              0
dtype: int64
      customer_id   join_date last_purchase_date        age  gender  \
3822         3823  2022-11-29         2022-12-29  60.000000    Male   
6876         6877  2023-07-14         2023-07-14  43.467467  Female   
2641         2642  2020-06-16         2020-06-26  20.000000  Female   
5825         5826  2024-05-20         2024-05-30  45.000000    Male   
9424         9425  2024-01-25         2024-02-02  43.467467  Female   

          location       CLV  customer_lifespan  CLV_per_month  \
3822   Monicaburgh  82406.76                  1       82406.76   
6876      Garystad  74802.00                  1       74802.00   
2641   Kurtchester  74736.88                  1       74736.88   
5825     Aprilport  69442.72                  1       69442.72   
9424  Kennethhaven  68155.88                  1       68155.88   

      cumulative_cl

In [4]:
# Calculate recency
last_transaction_date = transactions['transaction_date'].max()
customers = pd.merge(customers, transactions.groupby('customer_id')['transaction_date'].max().reset_index(), on='customer_id')
customers['recency'] = (last_transaction_date - customers['transaction_date']).dt.days

# Calculate frequency
frequency = transactions.groupby('customer_id').size().reset_index(name='frequency')
customers = pd.merge(customers, frequency, on='customer_id')

# Calculate usage lifespan
customers['usage_lifespan'] = (last_transaction_date - customers['join_date']).dt.days

# Calculate positive response per transaction
positive_responses = marketing[marketing['response'] == 'Yes'].groupby('customer_id').size().reset_index(name='positive_responses')
customers = pd.merge(customers, positive_responses, on='customer_id', how='left')
customers['positive_responses'].fillna(0, inplace=True)
customers['positive_response_per_transaction'] = customers['positive_responses'] / customers['frequency']

# Calculate site per transaction
customers = pd.merge(customers, engagements[['customer_id', 'number_of_site_visits']], on='customer_id')
customers['site_per_transaction'] = customers['number_of_site_visits'] / customers['frequency']

# Calculate email per transaction
customers = pd.merge(customers, engagements[['customer_id', 'number_of_emails_opened']], on='customer_id')
customers['email_per_transaction'] = customers['number_of_emails_opened'] / customers['frequency']

# Calculate click per transaction
customers = pd.merge(customers, engagements[['customer_id', 'number_of_clicks']], on='customer_id')
customers['click_per_transaction'] = customers['number_of_clicks'] / customers['frequency']

# Ensure no division by zero issues
customers['positive_response_per_transaction'].replace([np.inf, -np.inf], 0, inplace=True)
customers['site_per_transaction'].replace([np.inf, -np.inf], 0, inplace=True)
customers['email_per_transaction'].replace([np.inf, -np.inf], 0, inplace=True)
customers['click_per_transaction'].replace([np.inf, -np.inf], 0, inplace=True)

print(customers.head())


TypeError: unsupported operand type(s) for -: 'str' and 'str'