In [1]:
# Imports and notebook-wide configuration.
# Option/style calls are interleaved with the imports; their order is left untouched.
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns',100)
import os
import seaborn as sns
# Apply seaborn's default plotting theme.
sns.set()
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
import warnings
# Silence all warnings from here on (this also hides deprecation warnings).
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
# Default figure size as (width, height), then the "fivethirtyeight" style sheet.
plt.rcParams['figure.figsize'] = 8, 5
plt.style.use("fivethirtyeight")
# Print the path of every file found under 'Predicting_Coupon_Redemption'.
# NOTE(review): the CSVs are loaded from '../Data/' in the next cell — confirm this
# directory is the one that should be listed.
for dirname, _, filenames in os.walk('Predicting_Coupon_Redemption'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Rows to learn from, rows to score, and the frame that receives the predictions.
train_df = pd.read_csv('../Data/train.csv')
test_df = pd.read_csv('../Data/test.csv')
sub_df = pd.read_csv('../Data/sample_submission.csv')

# Lookup tables that get joined onto train/test in the cells below.
campaign_data_df = pd.read_csv('../Data/campaign_data.csv')
coupon_item_mapping_df = pd.read_csv('../Data/coupon_item_mapping.csv')
item_data_df = pd.read_csv('../Data/item_data.csv')
customer_demographics_df = pd.read_csv('../Data/customer_demographics.csv')
customer_transaction_data_df = pd.read_csv('../Data/customer_transaction_data.csv')




In [3]:
# Feature Engineering in campaign_data.csv data.
# Parse both campaign dates (day/month/2-digit-year), keep only the campaign
# length in days as `diff_d`, and discard the raw date columns.
campaign_dates = ['start_date', 'end_date']
parsed = {
    col: pd.to_datetime(campaign_data_df[col], format='%d/%m/%y', dayfirst=True)
    for col in campaign_dates
}
campaign_data_df['diff_d'] = (parsed['end_date'] - parsed['start_date']) / np.timedelta64(1, 'D')
campaign_data_df.drop(columns=campaign_dates, inplace=True)

No Feature Engineering required for the train.csv data.

No Feature Engineering required for the item_data.csv data.

No Feature Engineering required for the coupon_item_mapping.csv data.

Feature engineering for the customer_demographics.csv data: label-encode the `age_range` column.

In [4]:
# Replace the age_range labels with integer codes.
# (All demographic columns, this one included, are dropped again after the merge.)
lb = LabelEncoder().fit(customer_demographics_df['age_range'])
customer_demographics_df['age_range'] = lb.transform(customer_demographics_df['age_range'])

In [5]:
# customer_transaction_data.csv
# Break the transaction date into day-of-month, month and ISO week-of-year
# categoricals, then drop the raw date column.
txn_dates = pd.to_datetime(customer_transaction_data_df['date'], format='%Y-%m-%d')
customer_transaction_data_df['date_d'] = txn_dates.dt.day.astype('category')
customer_transaction_data_df['date_m'] = txn_dates.dt.month.astype('category')
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` returns the same ISO week numbers.
customer_transaction_data_df['date_w'] = txn_dates.dt.isocalendar().week.astype('category')
customer_transaction_data_df.drop(['date'], axis=1, inplace=True)

In [6]:
# Per-customer totals of the numeric transaction columns.
# numeric_only=True leaves out the categorical date_d/date_m/date_w columns
# explicitly: summing a category column raises in pandas 2.x, and older versions
# only dropped such columns silently.
# NOTE(review): item_id is numeric and therefore summed as well; a sum of ids is
# unlikely to be a meaningful feature — confirm it is wanted.
tgroup = customer_transaction_data_df.groupby(['customer_id']).sum(numeric_only=True).reset_index()

In [7]:
# Merge Data.
# Left-join the campaign attributes onto every train and test row.
train_campaign_data = train_df.merge(campaign_data_df, how="left", on='campaign_id')
test_campaign_data = test_df.merge(campaign_data_df, how="left", on='campaign_id')

In [8]:
# Coupon Item Mapping.
# One row per (coupon, item) pair, carrying the item's attributes.
coupon_item_mapping_item_data = coupon_item_mapping_df.merge(item_data_df, on='item_id', how="left")
# Collapse to one row per coupon: number of mapped items plus the maximum
# brand_type and category value among those items.
mci_group = coupon_item_mapping_item_data.groupby('coupon_id', as_index=False).agg(
    no_of_items=('item_id', 'count'),
    brand_type=('brand_type', 'max'),
    category=('category', 'max'),
)


In [9]:
# Customer-level table: transaction totals joined with demographics. The outer
# join keeps customers that appear in only one of the two tables.
mdtg = tgroup.merge(customer_demographics_df, how='outer', on='customer_id')


In [10]:
# Merge all.
# Training frame = campaign-enriched train rows + customer features + coupon features.
mergeddata = (
    train_campaign_data
    .merge(mdtg, on=['customer_id'], how='left')
    .merge(mci_group, on=['coupon_id'], how='left')
)

In [11]:
# Test frame, built with the same two left joins as the training frame.
mergeddata2 = (
    test_campaign_data
    .merge(mdtg, on=['customer_id'], how='left')
    .merge(mci_group, on=['coupon_id'], how='left')
)
# Keep the test row ids on the side.
id_df = mergeddata2['id']
# NOTE(review): 'id' is not dropped from mergeddata / mergeddata2, so it is passed
# to the models as a feature — confirm that is intended.



In [12]:
# Checking Missing values.
# Report null counts, drop the demographic columns (the only ones with nulls),
# then report again to confirm nothing is missing.
train_demographic_cols = ['no_of_children','age_range','marital_status','rented','family_size','income_bracket']
print(mergeddata.isna().sum())
mergeddata.drop(columns=train_demographic_cols, inplace=True)
print(mergeddata.isna().sum())

id                       0
campaign_id              0
coupon_id                0
customer_id              0
redemption_status        0
campaign_type            0
diff_d                   0
item_id                  0
quantity                 0
selling_price            0
other_discount           0
coupon_discount          0
age_range            34708
marital_status       52975
rented               34708
family_size          34708
no_of_children       64313
income_bracket       34708
no_of_items              0
brand_type               0
category                 0
dtype: int64
id                   0
campaign_id          0
coupon_id            0
customer_id          0
redemption_status    0
campaign_type        0
diff_d               0
item_id              0
quantity             0
selling_price        0
other_discount       0
coupon_discount      0
no_of_items          0
brand_type           0
category             0
dtype: int64


In [13]:
# Checking Missing values.
# Same treatment for the test frame: report null counts, drop the demographic
# columns, report again.
test_demographic_cols = ['no_of_children','age_range','marital_status','rented','family_size','income_bracket']
print(mergeddata2.isna().sum())
mergeddata2.drop(columns=test_demographic_cols, inplace=True)
print(mergeddata2.isna().sum())

id                     0
campaign_id            0
coupon_id              0
customer_id            0
campaign_type          0
diff_d                 0
item_id                0
quantity               0
selling_price          0
other_discount         0
coupon_discount        0
age_range          19287
marital_status     32144
rented             19287
family_size        19287
no_of_children     40064
income_bracket     19287
no_of_items            0
brand_type             0
category               0
dtype: int64
id                 0
campaign_id        0
coupon_id          0
customer_id        0
campaign_type      0
diff_d             0
item_id            0
quantity           0
selling_price      0
other_discount     0
coupon_discount    0
no_of_items        0
brand_type         0
category           0
dtype: int64


In [14]:
# Feature Encoding.
cols = ['campaign_type','brand_type','category']
lb = LabelEncoder()
for i in cols:
    mergeddata[i] = lb.fit_transform(mergeddata[i])

In [15]:
for i in cols:
    mergeddata2[i] = lb.fit_transform(mergeddata2[i])


In [16]:
# Separate the label from the feature matrix.
target_col = 'redemption_status'
Y = mergeddata[target_col]
X = mergeddata.drop(columns=[target_col])

In [17]:
# Handling class imbalance.
# Technique - 1.
# Stratified 10-fold cross-validation of a default LightGBM model; every fold
# keeps the class ratio of Y, and the mean validation ROC AUC is reported.
fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=2020)
auc = []

for train_index, test_index in fold.split(X, Y):
    x_train, x_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = Y.iloc[train_index], Y.iloc[test_index]
    # Fit the scaler on the training part of the fold only, then apply it to the
    # validation part.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_val = scaler.transform(x_val)
    # mergeddata2 is deliberately left untouched: re-assigning it here re-scaled
    # the already scaled test features once per fold and turned the frame into a
    # corrupted array. The test features must be scaled once, with the scaler
    # belonging to the model that produces the final predictions.
    m = LGBMClassifier(random_state=80) # 93.23
    m.fit(x_train, y_train)
    pred_prob1 = m.predict_proba(x_val)
    auc.append(roc_auc_score(y_val, pred_prob1[:, 1]))

print("AUC Score")
# Average over however many folds were actually run.
print(sum(auc)/len(auc))

AUC Score
0.927883427257051


In [18]:
# Technique-2:SMOTE
# Oversample the dataset.

from imblearn.over_sampling import SMOTE
from collections import Counter
# Seeded so the synthetic minority samples, and every score derived from them,
# are reproducible across runs.
oversample = SMOTE(random_state=2020)
# NOTE(review): X and Y are oversampled BEFORE the train/test split in the next
# cell, so synthetic rows interpolated from rows that end up in the hold-out split
# can land in the training split; the hold-out AUCs are then likely optimistic.
# Consider splitting first and oversampling the training part only.
X,Y = oversample.fit_resample(X,Y)
# Class counts after resampling.
counter = Counter(Y)
print(counter)

Counter({0: 77640, 1: 77640})


In [19]:
# Hold out 20% of the resampled rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=101
)

In [20]:
# Standardise the features: statistics come from the training split only and are
# then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
# AdaBoost baseline on the resampled split; prints the hold-out ROC AUC.  # 98.11
classifier_o = AdaBoostClassifier(random_state=20).fit(X_train, y_train)
pred_prob1 = classifier_o.predict_proba(X_test)
positive_scores = pred_prob1[:, 1]
print(roc_auc_score(y_test, positive_scores))

0.9817128986521052


In [22]:
# Gradient boosting baseline on the resampled split; prints the hold-out ROC AUC.  # 98.74
classifier_o = GradientBoostingClassifier(random_state=30).fit(X_train, y_train)
pred_prob1 = classifier_o.predict_proba(X_test)
positive_scores = pred_prob1[:, 1]
print(roc_auc_score(y_test, positive_scores))

0.9877396210118656


In [23]:
# LightGBM baseline on the resampled split; prints the hold-out ROC AUC.  # 99.78
classifier_o = LGBMClassifier(random_state=10).fit(X_train, y_train)
pred_prob1 = classifier_o.predict_proba(X_test)
positive_scores = pred_prob1[:, 1]
print(roc_auc_score(y_test, positive_scores))

0.9982604113043118


In [24]:
# LGBM Hyperparameter Tuning using Grid Search.
from sklearn.model_selection import GridSearchCV
# Only the first five entries are actually searched (2*3*2*2*2 = 48 candidates);
# the single-valued entries pin the remaining LightGBM settings.
param_grid = {'learning_rate': [0.1,0.5],
              'max_depth': [4,5,6],
              'num_leaves': [10,20],
              'feature_fraction': [0.6,0.8],
              'subsample': [0.2,0.6],
              'objective': ['binary'],
              'metric': ['auc'],
              'is_unbalance':[False],
              'boosting':['gbdt'],
              'num_boost_round':[100],
              'early_stopping_rounds':[30]}

# Build and fit the GridSearchCV.
# scoring='roc_auc' ranks candidates by the metric used everywhere else in this
# notebook; without it GridSearchCV falls back to the classifier's default score
# (accuracy), so best_score_ would not be comparable to the AUCs above.
grid = GridSearchCV(estimator=classifier_o, param_grid=param_grid,
                    scoring='roc_auc', cv=10, verbose=10)
# NOTE(review): the hold-out split (X_test, y_test) is used as the early-stopping
# set for every candidate, so it also influences model selection — confirm this is
# acceptable or carve out a separate validation split.
grid_results = grid.fit(X_train,y_train,eval_set = (X_test,y_test))
# Summarize the results in a readable format.
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))

Fitting 10 folds for each of 48 candidates, totalling 480 fits
[CV 1/10; 1/48] START boosting=gbdt, early_stopping_rounds=30, feature_fraction=0.6, is_unbalance=False, learning_rate=0.1, max_depth=4, metric=auc, num_boost_round=100, num_leaves=10, objective=binary, subsample=0.2
[1]	valid_0's auc: 0.896376
Training until validation scores don't improve for 30 rounds
[2]	valid_0's auc: 0.953092
[3]	valid_0's auc: 0.956684
[4]	valid_0's auc: 0.957942
[5]	valid_0's auc: 0.956987
[6]	valid_0's auc: 0.959825
[7]	valid_0's auc: 0.96026
[8]	valid_0's auc: 0.964144
[9]	valid_0's auc: 0.964179
[10]	valid_0's auc: 0.963692
[11]	valid_0's auc: 0.964363
[12]	valid_0's auc: 0.964965
[13]	valid_0's auc: 0.964359
[14]	valid_0's auc: 0.965783
[15]	valid_0's auc: 0.965999
[16]	valid_0's auc: 0.96719
[17]	valid_0's auc: 0.968271
[18]	valid_0's auc: 0.969923
[19]	valid_0's auc: 0.970188
[20]	valid_0's auc: 0.970214
[21]	valid_0's auc: 0.970544
[22]	valid_0's auc: 0.97126
[23]	valid_0's auc: 0.971846
[24]

In [25]:
# Refit LightGBM with a fixed, hand-entered hyperparameter set and print its
# hold-out ROC AUC.
tuned_params = dict(
    boosting='gbdt',
    feature_fraction=0.8,
    is_unbalance=False,
    learning_rate=0.5,
    max_depth=6,
    metric='auc',
    num_boost_round=100,
    num_leaves=20,
    objective='binary',
    subsample=0.2,
)
classifier_o = LGBMClassifier(random_state=10, **tuned_params)
classifier_o.fit(X_train, y_train)
pred_prob1 = classifier_o.predict_proba(X_test)
print(roc_auc_score(y_test, pred_prob1[:, 1]))  # 99.89

0.9989049120871412


In [26]:
# Score the test rows with the final model and write the submission file.
# The model was trained on features standardised by `scaler`, so the test features
# have to pass through that same scaler first. If an earlier cell has already
# replaced mergeddata2 with a scaled array, it is used as-is rather than scaled again.
if isinstance(mergeddata2, pd.DataFrame):
    test_features = scaler.transform(mergeddata2)
else:
    test_features = mergeddata2
# NOTE(review): predict() writes hard 0/1 labels, while every evaluation in this
# notebook uses ROC AUC on predicted probabilities — confirm which of the two the
# submission is expected to contain.
pred = classifier_o.predict(test_features)
sub_df['redemption_status'] = pred
sub_df.to_csv('final_predictions.csv', index=False)