In [13]:
# Import/load
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
# Read the cleaned campaign data; the path is relative to the notebook's working directory.
DATA_PATH = 'Data/cleaned_marketing_campaign.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Age,Recency_Months
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,0,0,0,0,0,3,11,1,69,162
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,0,0,0,0,0,3,11,0,72,144
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,0,0,0,0,0,3,11,0,61,151
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,0,0,0,0,0,3,11,0,42,145
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,0,0,0,0,0,3,11,0,45,146


In [14]:
# Preprocessing
# One-hot encode the two categorical columns. They are cast to 'category' first,
# and drop_first=True omits the first level of each so the dummies are not collinear.
# Note: this cell rebinds `df` and consumes the original columns, so it cannot be
# re-run without reloading the data first.
categorical_cols = ['Education', 'Marital_Status']
df = pd.get_dummies(
    df.astype({col: 'category' for col in categorical_cols}),
    columns=categorical_cols,
    drop_first=True,
)

In [15]:
# Feature Scaling
# Numeric columns to standardize (zero mean, unit variance).
num_cols = ['Income', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
            'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
            'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'Age', 'Recency']

# Fit the scaler on the training rows only, so that test-set means/variances do not
# leak into the features the models are trained on.
# The actual split happens in the next cell. Without stratification, train_test_split
# chooses rows from the row count and random_state alone, so using the same
# test_size / random_state here selects exactly the rows that become X_train.
# Keep these two arguments in sync with the split cell.
train_idx = train_test_split(df.index, test_size=0.2, random_state=42)[0]

scaler = StandardScaler()
scaler.fit(df.loc[train_idx, num_cols])

# Apply the training-set statistics to every row (train and test alike).
df[num_cols] = scaler.transform(df[num_cols])

In [16]:
# Test/Train
# Separate the target from the predictors; the ID and signup-date columns are
# excluded from the feature matrix.
target_col = 'Response'
y = df[target_col]
X = df.drop(columns=[target_col, 'ID', 'Dt_Customer'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

for split_name, split_X in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_name} set size: {len(split_X)} rows")

Training set size: 1789 rows
Testing set size: 448 rows


In [17]:
# Model 1: Logistic Regression (balanced)
# class_weight='balanced' is the only imbalance handling used in this notebook;
# max_iter is raised to 5000 (presumably to ensure convergence — confirm).
logreg = LogisticRegression(
    class_weight='balanced',
    max_iter=5000,
    random_state=42,
).fit(X_train, y_train)

# Score on the held-out test rows.
y_pred = logreg.predict(X_test)
logreg_report = classification_report(y_test, y_pred)
logreg_accuracy = accuracy_score(y_test, y_pred)

print("Logistic Regression (Balanced) Performance:")
print(logreg_report)
print("Accuracy:", logreg_accuracy)

Logistic Regression (Balanced) Performance:
              precision    recall  f1-score   support

           0       0.95      0.83      0.88       376
           1       0.45      0.75      0.57        72

    accuracy                           0.81       448
   macro avg       0.70      0.79      0.72       448
weighted avg       0.87      0.81      0.83       448

Accuracy: 0.8147321428571429


In [18]:
# Model 2: Random Forest (baseline)
# Library defaults apart from the fixed seed; no class weighting is applied here.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score on the held-out test rows.
y_pred_rf = rf.predict(X_test)
rf_report = classification_report(y_test, y_pred_rf)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

print("Random Forest Performance:")
print(rf_report)
print("Accuracy:", rf_accuracy)

Random Forest Performance:
              precision    recall  f1-score   support

           0       0.89      0.98      0.93       376
           1       0.74      0.36      0.49        72

    accuracy                           0.88       448
   macro avg       0.82      0.67      0.71       448
weighted avg       0.87      0.88      0.86       448

Accuracy: 0.8772321428571429


In [19]:
# Model 3: Gradient Boosting
# Library defaults apart from the fixed seed.
gb = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Score on the held-out test rows.
y_pred_gb = gb.predict(X_test)
gb_report = classification_report(y_test, y_pred_gb)
gb_accuracy = accuracy_score(y_test, y_pred_gb)

print("Gradient Boosting Performance:")
print(gb_report)
print("Accuracy:", gb_accuracy)

Gradient Boosting Performance:
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       376
           1       0.74      0.43      0.54        72

    accuracy                           0.88       448
   macro avg       0.82      0.70      0.74       448
weighted avg       0.87      0.88      0.87       448

Accuracy: 0.8839285714285714


In [20]:
# Model Comparison: Accuracy
# Test-set accuracy of the three models fitted so far, side by side.
logreg_acc, rf_acc, gb_acc = (
    accuracy_score(y_test, preds) for preds in (y_pred, y_pred_rf, y_pred_gb)
)

for model_name, acc in (
    ("Logistic Regression", logreg_acc),
    ("Random Forest", rf_acc),
    ("Gradient Boosting", gb_acc),
):
    print(f"{model_name} Accuracy: {acc:.4f}")

Logistic Regression Accuracy: 0.8147
Random Forest Accuracy: 0.8772
Gradient Boosting Accuracy: 0.8839


In [21]:
# Random Forest Hyperparameter Tuning
# Exhaustive 5-fold search over 3 x 3 x 3 = 27 parameter combinations.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the classifier's
# default score (accuracy). Confirm that this is the intended objective given the
# class imbalance and the focus on class 1 recall elsewhere in the notebook.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5, 10],
)

rf_base = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_base, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# best_estimator_ is the winning configuration; evaluate it on the held-out test rows.
best_rf = grid_search.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)
best_rf_report = classification_report(y_test, y_pred_best_rf)
best_rf_accuracy = accuracy_score(y_test, y_pred_best_rf)

print("Random Forest (Tuned) Performance:")
print(best_rf_report)
print("Accuracy:", best_rf_accuracy)
print("Best parameters found:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)

Random Forest (Tuned) Performance:
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       376
           1       0.74      0.32      0.45        72

    accuracy                           0.87       448
   macro avg       0.81      0.65      0.69       448
weighted avg       0.86      0.87      0.85       448

Accuracy: 0.8727678571428571
Best parameters found: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}
Best CV score: 0.884847346759933


In [22]:
# Model Comparison
# Summarize every fitted model on the same held-out test set: overall accuracy plus
# recall and F1 for the positive class (Response == 1).
# The metric functions come from the import cell at the top of the notebook.

# Test-set predictions per model (the individual names are kept for reuse elsewhere).
y_pred_logreg = logreg.predict(X_test)
y_pred_rf = rf.predict(X_test)
y_pred_gb = gb.predict(X_test)
y_pred_best_rf = best_rf.predict(X_test)

# Display name -> predictions; insertion order sets the row order of the table.
predictions_by_model = {
    'Logistic Regression (Balanced)': y_pred_logreg,
    'Random Forest': y_pred_rf,
    'Gradient Boosting': y_pred_gb,
    'Random Forest (Tuned)': y_pred_best_rf,
}

# One row per model, computed in a single pass instead of one hand-written call
# per model and metric.
comparison_df = pd.DataFrame(
    [
        {
            'Model': model_name,
            'Accuracy': accuracy_score(y_test, preds),
            'Class 1 Recall': recall_score(y_test, preds, pos_label=1),
            'Class 1 F1': f1_score(y_test, preds, pos_label=1),
        }
        for model_name, preds in predictions_by_model.items()
    ]
)

comparison_df


Unnamed: 0,Model,Accuracy,Class 1 Recall,Class 1 F1
0,Logistic Regression (Balanced),0.814732,0.75,0.565445
1,Random Forest,0.877232,0.361111,0.485981
2,Gradient Boosting,0.883929,0.430556,0.54386
3,Random Forest (Tuned),0.872768,0.319444,0.446602


In [None]:
### Final Model Selection

I trained four models on an 80% training split and evaluated them on the held-out 20% test set (448 rows):  

1. Logistic Regression (class-weighted, balanced)  
2. Random Forest (baseline)  
3. Gradient Boosting  
4. Random Forest (hyperparameter-tuned)  

Here’s a quick summary of the results:

| Model | Accuracy | Class 1 Recall | Class 1 F1 |
|-------|---------|----------------|------------|
| Logistic Regression (Balanced) | 0.815 | 0.75 | 0.57 |
| Random Forest | 0.877 | 0.36 | 0.49 |
| Gradient Boosting | 0.884 | 0.43 | 0.54 |
| Random Forest (Tuned) | 0.873 | 0.32 | 0.45 |

A few things stand out:  

- Gradient Boosting and Random Forest have the highest overall accuracy, but they miss a lot of positive responders.  
- Logistic Regression with class weighting catches most of the responders (recall 0.75), giving the highest recall and F1 for the positive class, at the cost of lower precision (0.45) and lower overall accuracy.  
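- Even after tuning, Random Forest didn’t improve on finding responders: class 1 recall fell from 0.36 to 0.32. The grid search used its default scoring (accuracy), so it was not optimizing for recall.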

Because only about 15% of people responded, overall accuracy is dominated by the non-responder class and says little about how well responders are found. The main goal is to identify as many responders as possible, so I'm choosing class-weighted Logistic Regression as the final model.  

Note: The best Random Forest parameters were `{'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}`, but even with tuning it still missed too many responders.
