In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Load Data
# NOTE(review): absolute, machine-specific location -- the notebook only runs where
# this exact file exists. Point DATA_PATH at your local copy of fact_customer.csv.
DATA_PATH = '/Users/usamahameed/Downloads/Ecommerce Churn Project/data/fact_customer.csv'
df = pd.read_csv(DATA_PATH)

# Drop unnecessary columns. errors='ignore' skips any listed column that is not
# present in the CSV instead of raising a KeyError.
df = df.drop(
    columns=['Unnamed: 3', 'invoice_no', 'customer_id', 'email', 'phone_number',
             'first_name', 'last_name', 'dob'],
    errors='ignore',
)

# Handle missing values by forward-filling down each column.
# DataFrame.ffill() is the supported replacement for fillna(method='ffill'), whose
# `method` keyword is deprecated and emits a FutureWarning on recent pandas.
# Cells that are missing in the leading row(s) have no earlier value to copy and
# therefore stay NaN after this step.
df = df.ffill()

# Parse the two date columns into pandas datetimes so date arithmetic works below
for date_col in ('invoice_date', 'last_purchase_date'):
    df[date_col] = pd.to_datetime(df[date_col])

# Feature Engineering
# recency: whole days between the most recent invoice_date in the data set and
# each row's last_purchase_date.
latest_invoice_date = df['invoice_date'].max()
days_since_last = latest_invoice_date - df['last_purchase_date']
df['recency'] = days_since_last.dt.days

# purchase_rate: purchase_frequency per day; the +1 avoids dividing by zero when
# days_since_last_purchase is 0.
rate_denominator = df['days_since_last_purchase'] + 1
df['purchase_rate'] = df['purchase_frequency'] / rate_denominator

# effective_spend: price multiplied by (1 - discount_used).
# NOTE(review): in the displayed X every row with the higher discount_used value has
# the same effective_spend, which suggests discount_used is a 0/1 flag rather than a
# rate -- if so this zeroes the spend for discounted purchases. Confirm the intent.
undiscounted_share = 1 - df['discount_used']
df['effective_spend'] = df['price'] * undiscounted_share

# Drop weak features (category, item, location-based columns).
# No errors='ignore' here: a missing column raises KeyError.
weak_feature_cols = ['category', 'item', 'shopping_mall', 'city', 'province_state', 'country']
df.drop(columns=weak_feature_cols, inplace=True)

# Encode categorical variables
# One LabelEncoder per column, kept in label_encoders (keyed by column name) so the
# integer codes can be mapped back to the original labels later.
categorical_cols = ['payment_method', 'card_type', 'gender']
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for col, le in label_encoders.items():
    # Cast to str first so every value (including any remaining NaN) is encoded
    # as a plain string label.
    as_text = df[col].astype(str)
    df[col] = le.fit_transform(as_text)

# Normalize numerical features
# The listed columns are standardized in place with a single StandardScaler.
# NOTE(review): the scaler is fit on every row; if a train/test split happens later,
# confirm that fitting on the full data set is intended.
numerical_cols = [
    'quantity', 'price', 'days_since_last_purchase', 'tenure', 'discount_used',
    'purchase_frequency', 'avg_purchase_value', 'recency', 'purchase_rate',
    'effective_spend',
]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_cols])
df[numerical_cols] = scaled_values

# Prepare Data
# y is the churn label; X is every remaining column of df.
# NOTE(review): the displayed X still contains 'Unnamed: 0.1', 'Unnamed: 0',
# 'invoice_date' and 'last_purchase_date' (row-index artefacts and datetimes) --
# confirm these are removed before X is passed to an estimator.
target_col = 'is_churned'
y = df[target_col]
X = df.drop(columns=[target_col])

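# --- Disabled experiment: Feature Selection using RFE ---
# Everything below is commented out, so X keeps all columns produced above.
# NOTE(review): before re-enabling, confirm X contains only numeric columns; the
# displayed X still includes invoice_date and last_purchase_date.
# lr_model = LogisticRegression()
# selector = RFE(lr_model, n_features_to_select=10)
# X_selected = selector.fit_transform(X, y)

# # Print selected features
# selected_features = X.columns[selector.support_]
# print("Selected Features:", selected_features)

# # Convert back to DataFrame
# X = pd.DataFrame(X_selected, columns=selected_features)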


  df.fillna(method='ffill', inplace=True)


In [3]:
# Display the feature matrix X built in the previous cell (pandas truncates the rows shown).
X

Unnamed: 0.1,Unnamed: 0,quantity,price,location_id,invoice_date,payment_id,payment_method,card_type,gender,age,days_since_last_purchase,tenure,discount_used,last_purchase_date,purchase_frequency,avg_purchase_value,recency,purchase_rate,effective_spend
0,0,-1.329069,-0.556498,1,2023-03-22,2,4,2,0,25.0,0.276269,0.821407,-1.011870,2021-08-15,0.320442,0.222170,0.618873,-0.129455,0.085835
1,1,1.346191,-0.419132,21,2024-04-06,9,0,2,1,30.0,1.112218,-1.584991,-1.011870,2020-01-19,0.320442,-1.381175,1.703033,-0.139456,0.229709
2,2,1.346191,0.404655,17,2022-05-31,10,8,2,0,22.0,-1.002465,0.133864,0.988269,2021-04-06,1.143267,-0.584486,0.866303,-0.033061,-0.661364
3,3,-0.437316,-0.145627,9,2021-01-01,10,8,2,2,32.0,-0.819244,-0.897449,0.988269,2022-05-17,0.183305,1.253934,0.099458,-0.093867,-0.661364
4,4,-1.329069,-0.598198,55,2020-02-10,7,2,2,2,35.0,0.759135,-1.241220,0.988269,2023-01-28,0.389011,0.505608,-0.384071,-0.135130,-0.661364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,1.346191,-0.031155,12,2020-05-07,6,9,3,0,33.0,1.732500,0.477636,0.988269,2024-03-29,-0.228108,0.032368,-1.188691,-0.149053,-0.661364
9996,9996,1.346191,2.454923,57,2021-09-22,3,3,1,0,56.0,0.188476,0.133864,-1.011870,2024-06-26,-0.090970,1.025246,-1.356793,-0.135040,3.239916
9997,9997,-0.437316,-1.101056,38,2021-02-25,9,0,1,0,65.0,1.022516,1.165178,-1.011870,2023-10-26,-0.639520,-0.272690,-0.895930,-0.150302,-0.484520
9998,9998,-0.437316,2.326551,25,2023-06-24,5,6,2,1,48.0,-0.271487,1.508949,0.988269,2020-09-30,-1.393776,-1.012733,1.221394,-0.155782,-0.661364
