In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, classification_report

In [2]:
pip install pandas openpyxl



In [3]:
# Load the dataset
df = pd.read_excel('marketing_campaign.xlsx')
df

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,5,0,0,0,0,0,0,3,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,1967,Graduation,Married,61223.0,0,1,2013-06-13,46,709,...,5,0,0,0,0,0,0,3,11,0
2236,4001,1946,PhD,Together,64014.0,2,1,2014-06-10,56,406,...,7,0,0,0,1,0,0,3,11,0
2237,7270,1981,Graduation,Divorced,56981.0,0,0,2014-01-25,91,908,...,6,0,1,0,0,0,0,3,11,0
2238,8235,1956,Master,Together,69245.0,0,1,2014-01-24,8,428,...,3,0,0,0,0,0,0,3,11,0


In [4]:
#Identify missing value
print(df.isnull().sum())

ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Z_CostContact           0
Z_Revenue               0
Response                0
dtype: int64


In [5]:
# Impute missing values with the mean value
imp_mean = SimpleImputer(strategy="mean")
df['Income'] = imp_mean.fit_transform(df[['Income']])

In [6]:
#Recheck for missing value
print(df.isnull().sum())

ID                     0
Year_Birth             0
Education              0
Marital_Status         0
Income                 0
Kidhome                0
Teenhome               0
Dt_Customer            0
Recency                0
MntWines               0
MntFruits              0
MntMeatProducts        0
MntFishProducts        0
MntSweetProducts       0
MntGoldProds           0
NumDealsPurchases      0
NumWebPurchases        0
NumCatalogPurchases    0
NumStorePurchases      0
NumWebVisitsMonth      0
AcceptedCmp3           0
AcceptedCmp4           0
AcceptedCmp5           0
AcceptedCmp1           0
AcceptedCmp2           0
Complain               0
Z_CostContact          0
Z_Revenue              0
Response               0
dtype: int64


In [7]:
# Encode categorical variables
df = pd.get_dummies(df, drop_first=True)

# Find duplicate rows
duplicates = df.duplicated()
print("Duplicate rows:\n", df[duplicates])

Duplicate rows:
 Empty DataFrame
Columns: [ID, Year_Birth, Income, Kidhome, Teenhome, Recency, MntWines, MntFruits, MntMeatProducts, MntFishProducts, MntSweetProducts, MntGoldProds, NumDealsPurchases, NumWebPurchases, NumCatalogPurchases, NumStorePurchases, NumWebVisitsMonth, AcceptedCmp3, AcceptedCmp4, AcceptedCmp5, AcceptedCmp1, AcceptedCmp2, Complain, Z_CostContact, Z_Revenue, Response, Education_Basic, Education_Graduation, Education_Master, Education_PhD, Marital_Status_Alone, Marital_Status_Divorced, Marital_Status_Married, Marital_Status_Single, Marital_Status_Together, Marital_Status_Widow, Marital_Status_YOLO, Dt_Customer_2012-07-31, Dt_Customer_2012-08-01, Dt_Customer_2012-08-02, Dt_Customer_2012-08-03, Dt_Customer_2012-08-04, Dt_Customer_2012-08-05, Dt_Customer_2012-08-06, Dt_Customer_2012-08-07, Dt_Customer_2012-08-08, Dt_Customer_2012-08-09, Dt_Customer_2012-08-10, Dt_Customer_2012-08-11, Dt_Customer_2012-08-12, Dt_Customer_2012-08-13, Dt_Customer_2012-08-14, Dt_Customer_2

In [8]:
# Define features and target variable
# Update feature list to match encoded DataFrame
features = [col for col in df.columns if col != 'Response']
target = 'Response'

# Split the data into features and target variable
X = df[features]
y = df[target]

In [9]:
# Convert categorical variables into dummy/indicator variables
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
# Initialize and train the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set using Random Forest
y_pred_rf = rf_model.predict(X_test)

# Evaluate the Random Forest model
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.8660714285714286
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.98      0.93       577
           1       0.58      0.20      0.30        95

    accuracy                           0.87       672
   macro avg       0.73      0.59      0.61       672
weighted avg       0.84      0.87      0.84       672



In [11]:
# Initialize and train the Gradient Boosting Classifier
gbm_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gbm_model.fit(X_train, y_train)

# Predict on the test set using GBM
y_pred_gbm = gbm_model.predict(X_test)

# Evaluate the model
print("GBM Accuracy:", accuracy_score(y_test, y_pred_gbm))
print("GBM Classification Report:\n", classification_report(y_test, y_pred_gbm))

GBM Accuracy: 0.8705357142857143
GBM Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.97      0.93       577
           1       0.60      0.26      0.36        95

    accuracy                           0.87       672
   macro avg       0.74      0.62      0.65       672
weighted avg       0.85      0.87      0.85       672



In [12]:
# Function to calculate evaluation metrics
def evaluate_model(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel()
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    return TN, FP, FN, TP, precision, recall, f1, accuracy

In [13]:
# Evaluate Random Forest model
TN_rf, FP_rf, FN_rf, TP_rf, precision_rf, recall_rf, f1_rf, accuracy_rf = evaluate_model(y_test, y_pred_rf)
print("Random Forest Evaluation:")
print(f"Confusion Matrix: TN={TN_rf}, FP={FP_rf}, FN={FN_rf}, TP={TP_rf}")
print(f"Precision: {precision_rf:.2f}")
print(f"Recall: {recall_rf:.2f}")
print(f"F1 Score: {f1_rf:.2f}")
print(f"Accuracy: {accuracy_rf:.2f}\n")

Random Forest Evaluation:
Confusion Matrix: TN=563, FP=14, FN=76, TP=19
Precision: 0.58
Recall: 0.20
F1 Score: 0.30
Accuracy: 0.87



In [14]:
# Evaluate GBM model
TN_gbm, FP_gbm, FN_gbm, TP_gbm, precision_gbm, recall_gbm, f1_gbm, accuracy_gbm = evaluate_model(y_test, y_pred_gbm)
print("GBM Evaluation:")
print(f"Confusion Matrix: TN={TN_gbm}, FP={FP_gbm}, FN={FN_gbm}, TP={TP_gbm}")
print(f"Precision: {precision_gbm:.2f}")
print(f"Recall: {recall_gbm:.2f}")
print(f"F1 Score: {f1_gbm:.2f}")
print(f"Accuracy: {accuracy_gbm:.2f}")

GBM Evaluation:
Confusion Matrix: TN=560, FP=17, FN=70, TP=25
Precision: 0.60
Recall: 0.26
F1 Score: 0.36
Accuracy: 0.87


In [15]:
from sklearn.feature_selection import RFE

In [16]:
# Apply RFE with Random Forest
rfe_rf = RFE(estimator=rf_model, n_features_to_select=10)
rfe_rf.fit(X_train, y_train)
selected_features_rf = X_train.columns[rfe_rf.support_].tolist()

In [17]:
# Apply RFE with GBM
rfe_gbm = RFE(estimator=gbm_model, n_features_to_select=10)
rfe_gbm.fit(X_train, y_train)
selected_features_gbm = X_train.columns[rfe_gbm.support_].tolist()

In [18]:
# Combine the selected features from both models
top_features = list(set(selected_features_rf).union(set(selected_features_gbm)))

print("Top features selected by Random Forest with RFE:", selected_features_rf)
print("Top features selected by GBM with RFE:", selected_features_gbm)
print("Combined top features:", top_features)

Top features selected by Random Forest with RFE: ['ID', 'Year_Birth', 'Income', 'Recency', 'MntWines', 'MntMeatProducts', 'MntFishProducts', 'MntGoldProds', 'NumStorePurchases', 'AcceptedCmp1']
Top features selected by GBM with RFE: ['ID', 'Income', 'Recency', 'MntWines', 'MntMeatProducts', 'MntGoldProds', 'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp5', 'AcceptedCmp1']
Combined top features: ['MntMeatProducts', 'NumWebVisitsMonth', 'AcceptedCmp3', 'MntFishProducts', 'MntWines', 'Year_Birth', 'AcceptedCmp5', 'Income', 'Recency', 'AcceptedCmp1', 'ID', 'MntGoldProds', 'NumStorePurchases']


In [19]:
# Filter the data to include only the top features
X_train_selected = X_train[top_features]
X_test_selected = X_test[top_features]

# Retrain the models with selected features
rf_model_selected = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_selected.fit(X_train_selected, y_train)

gbm_model_selected = GradientBoostingClassifier(n_estimators=100, random_state=42)
gbm_model_selected.fit(X_train_selected, y_train)

# Predict on the test set using the models trained on selected features
y_pred_rf_selected = rf_model_selected.predict(X_test_selected)
y_pred_gbm_selected = gbm_model_selected.predict(X_test_selected)

In [20]:
# Evaluate the models
print("Random Forest with Selected Features Accuracy:", accuracy_score(y_test, y_pred_rf_selected))
print("Random Forest with Selected Features Classification Report:\n", classification_report(y_test, y_pred_rf_selected))

print("GBM with Selected Features Accuracy:", accuracy_score(y_test, y_pred_gbm_selected))
print("GBM with Selected Features Classification Report:\n", classification_report(y_test, y_pred_gbm_selected))

# Function to calculate evaluation metrics
def evaluate_model(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel()
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    return TN, FP, FN, TP, precision, recall, f1, accuracy

# Evaluate Random Forest model with selected features
TN_rf, FP_rf, FN_rf, TP_rf, precision_rf, recall_rf, f1_rf, accuracy_rf = evaluate_model(y_test, y_pred_rf_selected)
print("Random Forest with Selected Features Evaluation:")
print(f"Confusion Matrix: TN={TN_rf}, FP={FP_rf}, FN={FN_rf}, TP={TP_rf}")
print(f"Precision: {precision_rf:.2f}")
print(f"Recall: {recall_rf:.2f}")
print(f"F1 Score: {f1_rf:.2f}")
print(f"Accuracy: {accuracy_rf:.2f}\n")

# Evaluate GBM model with selected features
TN_gbm, FP_gbm, FN_gbm, TP_gbm, precision_gbm, recall_gbm, f1_gbm, accuracy_gbm = evaluate_model(y_test, y_pred_gbm_selected)
print("GBM with Selected Features Evaluation:")
print(f"Confusion Matrix: TN={TN_gbm}, FP={FP_gbm}, FN={FN_gbm}, TP={TP_gbm}")
print(f"Precision: {precision_gbm:.2f}")
print(f"Recall: {recall_gbm:.2f}")
print(f"F1 Score: {f1_gbm:.2f}")
print(f"Accuracy: {accuracy_gbm:.2f}")

Random Forest with Selected Features Accuracy: 0.875
Random Forest with Selected Features Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.97      0.93       577
           1       0.63      0.27      0.38        95

    accuracy                           0.88       672
   macro avg       0.76      0.62      0.66       672
weighted avg       0.85      0.88      0.85       672

GBM with Selected Features Accuracy: 0.875
GBM with Selected Features Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.97      0.93       577
           1       0.63      0.28      0.39        95

    accuracy                           0.88       672
   macro avg       0.76      0.63      0.66       672
weighted avg       0.85      0.88      0.85       672

Random Forest with Selected Features Evaluation:
Confusion Matrix: TN=562, FP=15, FN=69, TP=26
Precision: 0.63
Recall: 0.27
F1 Score: 0.38