## Feature Engineering

In [7]:
# =============================================================================
# FEATURE ENGINEERING
# =============================================================================

# Standard imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, roc_auc_score, confusion_matrix,
                            classification_report)

# Feature importance & explainability
# %pip install shap  # Uncomment if needed (%pip installs into this kernel's environment; pin a version for reproducibility)
# import shap

# Notebook-wide configuration
# Seed constant; presumably passed as `random_state` to splits/models in later cells — confirm.
RANDOM_STATE = 42

# Never truncate columns when a wide DataFrame is displayed.
pd.options.display.max_columns = None

# Apply the seaborn-like whitegrid look to every matplotlib figure.
plt.style.use('seaborn-v0_8-whitegrid')

print("Environment ready! ✓")

Environment ready! ✓


In [None]:
# Relative path (resolved against the kernel's working directory) of the
# processed CSV that the next cell loads.
processed_dataset = (
    "Cafe_Rewards_Offers/"
    "processed_data_for_classification.csv"
)

In [3]:
# Read the processed CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=processed_dataset)
df.head(n=5)

Unnamed: 0,customer_id,offer_id,received_time,difficulty,duration,offer_type,in_email,in_mobile,in_social,in_web,...,age,income,membership_year,is_demographics_missing,age_group,income_bracket,became_member_date,membership_duration_days,membership_month,tenure_group
0,78afa995795e4d85b5d9ceeca43f5fef,9b98b8c7a33c4b65b9aebfe6a799e6d9,0,5.0,7.0,bogo,1.0,1.0,0.0,1.0,...,75,100000.0,2017,0,61-75,Very High,2017-05-09,443,5,1-2 years
1,a03223e636434f42ac4c3df47e8bac43,0b1e1539f2cc45b7b9fa7c272da2e1d7,0,20.0,10.0,discount,1.0,0.0,0.0,1.0,...,118,0.0,2017,1,76+,Missing,2017-08-04,356,8,6-12 months
2,e2127556f4f64592b11af22de27a7932,2906b810c7d4411798c6938adc9daaa5,0,10.0,7.0,discount,1.0,1.0,0.0,1.0,...,68,70000.0,2018,0,61-75,High,2018-04-26,91,4,0-6 months
3,8ec6ce2a7e7949b1bf142def7d0e0586,fafdcd668e3743c1bb461111dcafc2a4,0,10.0,10.0,discount,1.0,1.0,1.0,1.0,...,118,0.0,2017,1,76+,Missing,2017-09-25,304,9,6-12 months
4,68617ca6246f4fbc85e91a2a49552598,4d5c57ea9a6940dd891ad53e9dbe8da0,0,10.0,5.0,bogo,1.0,1.0,1.0,1.0,...,118,0.0,2017,1,76+,Missing,2017-10-02,297,10,6-12 months


In [6]:
# Show the dtype pandas inferred for every column of the loaded frame.
# NOTE: in the saved output `became_member_date` is `object` (not a datetime)
# and `became_member_on` is `int64`; convert explicitly before doing date
# arithmetic on either column.
df.dtypes

customer_id                  object
offer_id                     object
received_time                 int64
difficulty                  float64
duration                    float64
offer_type                   object
in_email                    float64
in_mobile                   float64
in_social                   float64
in_web                      float64
offer_received                int64
offer_viewed                  int64
offer_completed               int64
target                        int64
completion_time             float64
time_to_action              float64
became_member_on              int64
gender                       object
age                           int64
income                      float64
membership_year               int64
is_demographics_missing       int64
age_group                    object
income_bracket               object
became_member_date           object
membership_duration_days      int64
membership_month              int64
tenure_group                

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# Observations from the saved output, to be confirmed before modelling:
# - `in_email` and `offer_received` have std 0.0 (constant 1.0), so they carry no signal.
# - NOTE(review): `target` and `offer_completed` show identical statistics; if they are
#   the same column, `offer_completed` must not be used as a feature (label leakage).
# - NOTE(review): `age` max is 118 and `income` min is 0.0; in the preview rows these
#   coincide with `is_demographics_missing == 1`, so they look like sentinel values.
# - `completion_time` / `time_to_action` have count 46152 of 86432 rows (missing values),
#   and `time_to_action` has a negative minimum (-576).
df.describe()

Unnamed: 0,received_time,difficulty,duration,in_email,in_mobile,in_social,in_web,offer_received,offer_viewed,offer_completed,target,completion_time,time_to_action,became_member_on,age,income,membership_year,is_demographics_missing,membership_duration_days,membership_month
count,86432.0,86432.0,86432.0,86432.0,86432.0,86432.0,86432.0,86432.0,86432.0,86432.0,86432.0,46152.0,46152.0,86432.0,86432.0,86432.0,86432.0,86432.0,86432.0,86432.0
mean,333.046094,7.887669,6.604105,1.0,0.900141,0.606292,0.809932,1.0,0.807062,0.533969,0.533969,401.080083,64.545502,20166730.0,61.91325,58399.932895,2016.604718,0.11527,528.157476,6.695483
std,196.17301,5.408132,2.172543,0.0,0.299813,0.488574,0.392357,0.0,0.394607,0.498848,0.498848,192.245597,162.494952,11606.43,25.980263,29329.059923,1.170479,0.319349,408.834252,3.492822
min,0.0,0.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-576.0,20130730.0,18.0,0.0,2013.0,0.0,0.0,1.0
25%,168.0,5.0,5.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,240.0,12.0,20160510.0,45.0,42000.0,2016.0,0.0,217.0,4.0
50%,408.0,10.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,432.0,54.0,20170710.0,58.0,60000.0,2017.0,0.0,377.0,7.0
75%,504.0,10.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,564.0,114.0,20171220.0,72.0,78000.0,2017.0,0.0,805.0,10.0
max,576.0,20.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,714.0,714.0,20180730.0,118.0,120000.0,2018.0,1.0,1823.0,12.0


In [None]:
# Python names are case-sensitive: the frame is bound as `df` by the loading
# cell, so the previous `DF` raised NameError on a fresh Restart & Run All.
# Preview the first rows rather than rendering the whole table.
df.head()