In [74]:
import pandas as pd
import numpy as np
import datetime
from sklearn.pipeline import Pipeline
from dateutil.relativedelta import relativedelta
import warnings
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb

In [75]:
# Suppress the SettingWithCopyWarning
pd.set_option('mode.chained_assignment', None)
pd.set_option('future.no_silent_downcasting', True)
warnings.simplefilter(action='ignore', category=FutureWarning)

In [76]:
#Load the data from the csv file
df = pd.read_csv('d:/code/data/data.csv', encoding='ISO-8859-1')
df_snapshot = pd.read_csv('d:/code/data/customer_behavior_ecom_snapshot.csv')

In [77]:
#Convert the date columns to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

In [78]:
df_snapshot

Unnamed: 0,customer_id,total_successful_amount_past_5_month,num_successful_orders_past_5_month,total_successful_amount_past_4_month,num_successful_orders_past_4_month,total_successful_amount_past_3_month,num_successful_orders_past_3_month,total_successful_amount_past_2_month,num_successful_orders_past_2_month,total_successful_amount_past_1_month,num_successful_orders_past_1_month,total_successful_amount_future_1_month,num_successful_orders_future_1_month,total_successful_amount_future_2_month,num_successful_orders_future_2_month,time_snapshot
0,12347.0,,,,,,,,,711.79,31,475.39,29.0,0.0,0.0,2011-01-01
1,12348.0,,,,,,,,,892.80,17,227.44,6.0,0.0,0.0,2011-01-01
2,12370.0,,,,,,,,,1868.02,91,0.00,0.0,0.0,0.0,2011-01-01
3,12377.0,,,,,,,,,1001.52,43,626.60,34.0,0.0,0.0,2011-01-01
4,12383.0,,,,,,,,,600.72,37,639.91,32.0,0.0,0.0,2011-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37431,14349.0,,,,,,,133.50,31.0,0.00,0,,,,,2012-01-01
37432,18058.0,,,,,,,170.16,3.0,0.00,0,,,,,2012-01-01
37433,12953.0,,,,,,,329.85,17.0,0.00,0,,,,,2012-01-01
37434,12966.0,,,,,,,160.18,10.0,0.00,0,,,,,2012-01-01


In [79]:
df_snapshot_truncate = df_snapshot[(df_snapshot.time_snapshot>='2011-03-01') & (df_snapshot.time_snapshot<='2011-12-1')]

In [80]:
df_snapshot_truncate.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30747 entries, 2317 to 33063
Data columns (total 16 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   customer_id                             30747 non-null  float64
 1   total_successful_amount_past_5_month    17705 non-null  float64
 2   num_successful_orders_past_5_month      17705 non-null  float64
 3   total_successful_amount_past_4_month    21065 non-null  float64
 4   num_successful_orders_past_4_month      21065 non-null  float64
 5   total_successful_amount_past_3_month    24723 non-null  float64
 6   num_successful_orders_past_3_month      24723 non-null  float64
 7   total_successful_amount_past_2_month    27785 non-null  float64
 8   num_successful_orders_past_2_month      27785 non-null  float64
 9   total_successful_amount_past_1_month    30747 non-null  float64
 10  num_successful_orders_past_1_month      30747 non-null  int6

In [81]:
df_snapshot_truncate['is_churn_future_1_month'] = df_snapshot_truncate.num_successful_orders_future_1_month==0

In [82]:
df_snapshot_truncate.is_churn_future_1_month.value_counts(normalize=True)

is_churn_future_1_month
True     0.722802
False    0.277198
Name: proportion, dtype: float64

In [83]:
numeric_df = df_snapshot_truncate.select_dtypes(include=['number', 'bool'])
# Choose a target column for correlation
target_column = 'is_churn_future_1_month'

# Compute correlation with other numeric columns
correlations = numeric_df.corr()[target_column].drop(target_column)

# Print result
print(correlations)

customer_id                               0.018265
total_successful_amount_past_5_month     -0.161218
num_successful_orders_past_5_month       -0.198524
total_successful_amount_past_4_month     -0.157037
num_successful_orders_past_4_month       -0.197525
total_successful_amount_past_3_month     -0.147537
num_successful_orders_past_3_month       -0.195385
total_successful_amount_past_2_month     -0.143955
num_successful_orders_past_2_month       -0.185957
total_successful_amount_past_1_month     -0.144822
num_successful_orders_past_1_month       -0.156168
total_successful_amount_future_1_month   -0.275715
num_successful_orders_future_1_month     -0.412650
total_successful_amount_future_2_month   -0.131212
num_successful_orders_future_2_month     -0.142277
Name: is_churn_future_1_month, dtype: float64


In [84]:
time_split = '2011-11-01'
df_train = df_snapshot_truncate[df_snapshot_truncate.time_snapshot<time_split]
df_test = df_snapshot_truncate[df_snapshot_truncate.time_snapshot>=time_split]

In [85]:
df_train.shape, df_test.shape

((22406, 17), (8341, 17))

In [95]:
df_train.is_churn_future_1_month.value_counts(normalize=True), df_test.is_churn_future_1_month.value_counts(normalize=True)

(is_churn_future_1_month
 True     0.710435
 False    0.289565
 Name: proportion, dtype: float64,
 is_churn_future_1_month
 True     0.756024
 False    0.243976
 Name: proportion, dtype: float64)

In [96]:
features = ['customer_id', 
            'total_successful_amount_past_5_month', 'num_successful_orders_past_5_month',
            'total_successful_amount_past_4_month', 'num_successful_orders_past_4_month',
            'total_successful_amount_past_3_month', 'num_successful_orders_past_3_month', 
            'total_successful_amount_past_2_month', 'num_successful_orders_past_2_month',
            'total_successful_amount_past_1_month', 'num_successful_orders_past_1_month']
target = 'is_churn_future_1_month'
X_train = df_train[features].fillna(0)
y_train = df_train[target]
X_test = df_test[features].fillna(0)
y_test = df_test[target]

In [97]:
# Standardize the features (important for Logistic Regression)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train Logistic Regression model
model = model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.3f}")


Model Accuracy: 0.768


In [98]:
# Standardize the features (important for Logistic Regression)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train Logistic Regression model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.3f}")


Model Accuracy: 0.756


In [99]:
# Standardize the features (important for Logistic Regression)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train Logistic Regression model
model = AdaBoostClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.3f}")

Model Accuracy: 0.776
