In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the datasets
customers = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/customers_final.csv')
engagements = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/engagements_final.csv')
marketing = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/marketing_final.csv')
transactions = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/transactions_final.csv')

# Convert date columns to datetime format
customers['join_date'] = pd.to_datetime(customers['join_date'])
customers['last_purchase_date'] = pd.to_datetime(customers['last_purchase_date'])
transactions['transaction_date'] = pd.to_datetime(transactions['transaction_date'])
marketing['campaign_date'] = pd.to_datetime(marketing['campaign_date'])

# Fill missing values for gender with "Unknown"
customers['gender'].fillna('Unknown', inplace=True)

# Fill missing values in the age column with "Unknown"
customers['age'] = customers['age'].fillna('Unknown')

# Convert age into categorical ranges
bins = [-1, 29, 39, 49, 59, np.inf]
labels = ['0-29', '30-39', '40-49', '50-59', '60+']
customers['age'] = pd.cut(customers['age'].replace('Unknown', -1).astype(float), bins=bins, labels=labels)
customers['age'] = customers['age'].cat.add_categories('Unknown').fillna('Unknown')

# Define the date range for the analysis
start_date = '2023-06-01'
end_date = '2024-05-31'
reference_date = pd.to_datetime(end_date)

# Filter transactions within the specified period
filtered_transactions = transactions[(transactions['transaction_date'] >= start_date) & (transactions['transaction_date'] <= end_date)]

# Filter marketing data within the specified period
filtered_marketing = marketing[(marketing['campaign_date'] >= start_date) & (marketing['campaign_date'] <= end_date)]

# Feature Engineering
# Recency: Days since last transaction
last_transaction_date = filtered_transactions.groupby('customer_id')['transaction_date'].max().reset_index()
last_transaction_date.columns = ['customer_id', 'last_transaction_date']
customers = pd.merge(customers, last_transaction_date, on='customer_id', how='left')
customers['recency'] = (reference_date - customers['last_purchase_date']).dt.days
customers['recency'].fillna((reference_date - customers['join_date']).dt.days, inplace=True)  # Fill NaN with join_date if no transaction

# Frequency: Number of transactions within the year
transaction_frequency = filtered_transactions.groupby('customer_id').size().reset_index(name='frequency')
customers = pd.merge(customers, transaction_frequency, on='customer_id', how='left')
customers['frequency'].fillna(0, inplace=True)

# Lifespan: Days from join date to reference date
customers['lifespan'] = (reference_date - customers['join_date']).dt.days

# Positive Response: Number of 'Yes' responses within the year
positive_responses = filtered_marketing[filtered_marketing['response'] == 'Yes'].groupby('customer_id').size().reset_index(name='positive_response')
customers = pd.merge(customers, positive_responses, on='customer_id', how='left')
customers['positive_response'].fillna(0, inplace=True)

# Transaction Diversity: Number of different product categories purchased within the year
transaction_diversity = filtered_transactions.groupby('customer_id')['product_category'].nunique().reset_index(name='transaction_diversity')
customers = pd.merge(customers, transaction_diversity, on='customer_id', how='left')
customers['transaction_diversity'].fillna(0, inplace=True)

# Transaction Standard Deviation: Standard deviation of transaction amounts within the year
transaction_std = filtered_transactions.groupby('customer_id')['transaction_amount'].std().reset_index(name='transaction_standard_deviation')
customers = pd.merge(customers, transaction_std, on='customer_id', how='left')
customers['transaction_standard_deviation'].fillna(0, inplace=True)

# Calculate total transactions per customer for proportion
total_transactions = transactions.groupby('customer_id').size().reset_index(name='total_transactions')
customers = pd.merge(customers, total_transactions, on='customer_id', how='left')
customers['total_transactions'].fillna(0, inplace=True)

# Calculate engagement frequency
customers['engagement_frequency'] = np.where(customers['total_transactions'] == 0, 0, customers['frequency'] / customers['total_transactions'])
customers['engagement_frequency'].replace([np.inf, -np.inf], 0, inplace=True)  # Handle division by zero

# Merge engagement data and calculate engagement metrics
customers = pd.merge(customers, engagements, on='customer_id', how='left')
customers['site_visit'] = customers['number_of_site_visits'] * customers['engagement_frequency']
customers['email_per_transaction'] = customers['number_of_emails_opened'] * customers['engagement_frequency']
customers['click_per_transaction'] = customers['number_of_clicks'] * customers['engagement_frequency']

# Fill NaN values in engagement features
customers[['site_visit', 'email_per_transaction', 'click_per_transaction']] = customers[['site_visit', 'email_per_transaction', 'click_per_transaction']].fillna(0)

# Create age-gender interaction term
customers['age_gender'] = customers['age'].astype(str) + '_' + customers['gender'].astype(str)

# Filter out customers who joined after 2023-06-01
modeling_data = customers[customers['join_date'] < pd.to_datetime(start_date)]

# Calculate CLV for the latest year (2023-06-01 to 2024-05-31)
clv_latest_year = filtered_transactions.groupby('customer_id')['transaction_amount'].sum().reset_index()
clv_latest_year.columns = ['customer_id', 'CLV_latest_year']

# Merge the CLV data with modeling_data
modeling_data = pd.merge(modeling_data, clv_latest_year, on='customer_id', how='left')
modeling_data['CLV_latest_year'].fillna(0, inplace=True)

# Calculate the threshold for the top 25% of customers
threshold = modeling_data['CLV_latest_year'].quantile(0.75)

# Assign labels based on the threshold
modeling_data['highvalue_customer'] = (modeling_data['CLV_latest_year'] >= threshold).astype(int)

# Drop the frequency column
modeling_data = modeling_data.drop(columns=['frequency'])

# Independent variables
independent_features = ['recency', 'lifespan', 'positive_response', 'site_visit', 'email_per_transaction', 'click_per_transaction', 'age_gender']

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), independent_features[:-1]),  # All columns except 'age_gender'
        ('cat', OneHotEncoder(), ['age_gender'])
    ]
)

# Apply the transformations
X = preprocessor.fit_transform(modeling_data[independent_features])

# Convert the result back to a DataFrame for ease of use
num_features = independent_features[:-1]
cat_features = list(preprocessor.named_transformers_['cat'].get_feature_names_out(['age_gender']))
feature_names = num_features + cat_features
X_df = pd.DataFrame(X, columns=feature_names)

# Print the transformed features
print(X_df.head())

# Prepare the target variable
y = modeling_data['highvalue_customer']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.3, random_state=42)

# Initialize the models
logreg = LogisticRegression(max_iter=1000)
nb = GaussianNB()
knn = KNeighborsClassifier()

# Train and evaluate Logistic Regression
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

logreg_metrics = {
    'Accuracy': accuracy_score(y_test, y_pred_logreg),
    'Precision': precision_score(y_test, y_pred_logreg),
    'Recall': recall_score(y_test, y_pred_logreg),
    'F1 Score': f1_score(y_test, y_pred_logreg)
}

print("\nLogistic Regression Metrics:")
for metric, value in logreg_metrics.items():
    print(f"{metric}: {value:.4f}")

print("\nLogistic Regression Coefficients:")
logreg_coefficients = pd.DataFrame(logreg.coef_.flatten(), index=feature_names, columns=['Coefficient'])
print(logreg_coefficients)

# Train and evaluate Naive Bayes
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

nb_metrics = {
    'Accuracy': accuracy_score(y_test, y_pred_nb),
    'Precision': precision_score(y_test, y_pred_nb),
    'Recall': recall_score(y_test, y_pred_nb),
    'F1 Score': f1_score(y_test, y_pred_nb)
}

print("\nNaive Bayes Metrics:")
for metric, value in nb_metrics.items():
    print(f"{metric}: {value:.4f}")

# Train and evaluate KNN
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

knn_metrics = {
    'Accuracy': accuracy_score(y_test, y_pred_knn),
    'Precision': precision_score(y_test, y_pred_knn),
    'Recall': recall_score(y_test, y_pred_knn),
    'F1 Score': f1_score(y_test, y_pred_knn)
}

print("\nK-Nearest Neighbors Metrics:")
for metric, value in knn_metrics.items():
    print(f"{metric}: {value:.4f}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers['gender'].fillna('Unknown', inplace=True)
  customers['age'] = pd.cut(customers['age'].replace('Unknown', -1).astype(float), bins=bins, labels=labels)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers['recency'].fillna((reference_date - customers['join_date']).d

ValueError: Shape of passed values is (7711, 1), indices imply (7711, 24)