Based on https://www.kaggle.com/rahul1394/amex-coupon-redeem-classification

In [78]:
import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler

## Data prep

In [16]:
campaign = pd.read_csv('original_data/campaign_data.csv')
coupons = pd.read_csv('original_data/coupon_item_mapping.csv')
cust_demo = pd.read_csv('original_data/customer_demographics.csv')
cust_tran = pd.read_csv('original_data/customer_transaction_data.csv')
items = pd.read_csv('original_data/item_data.csv')
train = pd.read_csv('original_data/train.csv')
test = pd.read_csv('original_data/test.csv')

In [18]:
campaign['start_date'] = pd.to_datetime(campaign['start_date'])
campaign['end_date'] = pd.to_datetime(campaign['end_date'])
campaign['campaign_duration'] = campaign['campaign_duration'].apply(lambda x: x.days)

In [25]:
cust_demo['family_size'] = cust_demo.family_size.apply(lambda x: int(x.replace('+', '')))
cust_demo['no_of_children'] = cust_demo.no_of_children.apply(lambda x: int(x.replace('+', '')) if pd.notna(x) else x)

cust_demo.loc[pd.isnull(cust_demo.marital_status) & (cust_demo.family_size == 1),
              'marital_status'] = 'Single'
cust_demo.loc[pd.isnull(cust_demo.marital_status) & ((cust_demo.family_size - cust_demo.no_of_children) == 1),
              'marital_status'] = 'Single'
cust_demo.loc[pd.isnull(cust_demo.marital_status) & ((cust_demo.family_size - cust_demo.no_of_children) == 2),
              'marital_status'] = 'Married'
cust_demo.loc[pd.isnull(cust_demo.marital_status) & pd.isnull(cust_demo.no_of_children) & (cust_demo.family_size == 2),
              'marital_status'] = 'Married'

cust_demo.loc[pd.isnull(cust_demo.no_of_children) & (cust_demo.marital_status == 'Married') & (cust_demo.family_size == 2),
              'no_of_children'] = 0
cust_demo.loc[pd.isnull(cust_demo.no_of_children) & (cust_demo.family_size == 1), 'no_of_children'] = 0
cust_demo.loc[pd.isnull(cust_demo.no_of_children) & (cust_demo.family_size == 2),'no_of_children'] = 1
cust_demo['no_of_children'] = cust_demo['no_of_children'].astype(np.int64)

customer_id       0
age_range         0
marital_status    0
rented            0
family_size       0
no_of_children    0
income_bracket    0
dtype: int64

In [31]:
cust_tran['date'] = pd.to_datetime(cust_tran['date'])
cust_tran = cust_tran.sort_values('date').reset_index(drop=True)

cust_tran['day'] = cust_tran['date'].apply(lambda x: x.day)
cust_tran['dow'] = cust_tran['date'].apply(lambda x: x.weekday())
cust_tran['month'] = cust_tran['date'].apply(lambda x: x.month)

cust_tran['selling_price'] = cust_tran['selling_price'] / cust_tran['quantity']
cust_tran['other_discount'] = cust_tran['other_discount'] / cust_tran['quantity']
cust_tran['selling_price'] = cust_tran['selling_price'] + cust_tran['other_discount']
cust_tran.drop(['other_discount'], axis=1, inplace=True)

cust_tran['coupon_used'] = cust_tran.coupon_discount.apply(lambda x: 1 if x != 0 else 0)

In [33]:
items.drop(['brand_type'], axis=1, inplace=True)

## Merging

In [48]:
coupons_items = pd.merge(coupons, items, on='item_id', how='left')

In [49]:
transactions1 = pd.pivot_table(cust_tran, index='item_id',
                               values=['customer_id', 'quantity', 'selling_price', 'coupon_discount', 'coupon_used'],
                               aggfunc={
                                   'customer_id': lambda x: len(set(x)),
                                   'quantity': np.mean,
                                   'selling_price': np.mean,
                                   'coupon_discount': np.mean,
                                   'coupon_used': np.sum
                               })
transactions1.reset_index(inplace=True)
transactions1.rename(columns={'customer_id': 'no_of_customers', 'selling_price': 'price_mean',
                              'coupon_discount': 'discount_mean', 'quantity': 'quantity_mean'}, inplace=True)

transactions2 = pd.pivot_table(cust_tran, index='item_id',
                               values=['customer_id', 'quantity', 'selling_price', 'coupon_discount'],
                               aggfunc={
                                   'customer_id': len,
                                   'quantity': np.sum,
                                   'selling_price': np.sum,
                                   'coupon_discount': np.sum
                               })
transactions2.reset_index(inplace=True)
transactions2.rename(columns={'customer_id': 'tran_counts', 'quantity': 'quantity_sum', 'selling_price': 'price_sum',
                              'coupon_discount': 'discount_sum'}, inplace=True)

transactions1 = pd.merge(transactions1, transactions2, on='item_id', how='left')

In [50]:
item_coupon_trans = pd.merge(coupons_items, transactions1, on='item_id', how='left')

In [53]:
coupon = pd.pivot_table(item_coupon_trans, index ='coupon_id',
                        values=['item_id', 'brand', 'category', 'discount_mean', 'coupon_used', 'no_of_customers',
                                'quantity_mean', 'price_mean', 'discount_sum', 'tran_counts', 'quantity_sum',
                                'price_sum'],
                        aggfunc={
                            'item_id': lambda x: len(set(x)),
                            'brand': lambda x: mode(x)[0][0],
                            'category': lambda x: mode(x)[0][0],
                            'discount_mean': np.mean,
                            'no_of_customers': np.mean,
                            'quantity_mean': np.mean,
                            'price_mean': np.mean,
                            'coupon_used': np.sum,
                            'discount_sum': np.sum,
                            'tran_counts': np.sum,
                            'quantity_sum': np.sum,
                            'price_sum': np.sum
                        })
coupon.reset_index(inplace=True)
coupon.rename(columns={'item_id': 'item_counts'}, inplace=True)

In [54]:
transactions3 = pd.pivot_table(cust_tran, index='customer_id',
                               values=['item_id', 'quantity','selling_price', 'coupon_discount',
                                       'coupon_used', 'day', 'dow', 'month'],
                               aggfunc={
                                   'item_id': lambda x: len(set(x)),
                                   'quantity': np.mean,
                                   'selling_price': np.mean,
                                   'coupon_discount': np.mean,
                                   'coupon_used': np.sum,
                                   'day': lambda x: mode(x)[0][0],
                                   'dow': lambda x: mode(x)[0][0],
                                   'month': lambda x: mode(x)[0][0]
                               })
transactions3.reset_index(inplace=True)
transactions3.rename(columns={'item_id': 'no_of_items', 'quantity': 'mean_quantity', 'selling_price': 'mean_price',
                              'coupon_discount': 'mean_discount'}, inplace=True)

transactions4 = pd.pivot_table(cust_tran, index='customer_id',
                               values=['item_id', 'quantity', 'selling_price', 'coupon_discount'],
                               aggfunc={
                                   'item_id': len,
                                   'quantity': np.sum,
                                   'selling_price': np.sum,
                                   'coupon_discount': np.sum
                               })
transactions4.reset_index(inplace=True)
transactions4.rename(columns={'item_id': 'customer_id_count','quantity': 'quantity_sum', 'selling_price':'pprice_sum',
                              'coupon_discount': 'ddiscount_sum'}, inplace=True)

transactions = pd.merge(transactions3, transactions4, on='customer_id', how='left')

In [55]:
def merge_all(df):
    df = pd.merge(df, coupon, on='coupon_id', how='left')
    df = pd.merge(df, campaign, on='campaign_id', how='left')
    df = pd.merge(df, cust_demo, on='customer_id', how='left')
    df = pd.merge(df, transactions, on='customer_id', how='left')
    return df

train = merge_all(train)
test = merge_all(test)

In [57]:
def fill_nans(df):
    for col in cust_demo.columns.tolist()[1:]:
        df[col].fillna(mode(df[col]).mode[0], inplace=True)
    return df

train = fill_nans(train)
test = fill_nans(test)

In [65]:
train.drop(['id','campaign_id','start_date','end_date', 'coupon_id', 'customer_id'], axis=1, inplace=True)
test.drop(['id','campaign_id','start_date','end_date', 'coupon_id', 'customer_id'], axis=1, inplace=True)

In [66]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78369 entries, 0 to 78368
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   redemption_status  78369 non-null  int64  
 1   brand              78369 non-null  int64  
 2   category           78369 non-null  object 
 3   coupon_used_x      78369 non-null  float64
 4   discount_mean      78369 non-null  float64
 5   discount_sum       78369 non-null  float64
 6   item_counts        78369 non-null  int64  
 7   no_of_customers    78369 non-null  float64
 8   price_mean         78369 non-null  float64
 9   price_sum          78369 non-null  float64
 10  quantity_mean      78369 non-null  float64
 11  quantity_sum_x     78369 non-null  float64
 12  tran_counts        78369 non-null  float64
 13  campaign_type      78369 non-null  object 
 14  age_range          78369 non-null  int64  
 15  marital_status     78369 non-null  object 
 16  rented             783

## Encoding

In [62]:
le = LabelEncoder()
train['age_range'] = le.fit_transform(train['age_range'])
test['age_range'] = le.fit_transform(test['age_range'])

In [63]:
train.head()

Unnamed: 0,coupon_id,customer_id,redemption_status,brand,category,coupon_used_x,discount_mean,discount_sum,item_counts,no_of_customers,...,day,dow,no_of_items,month,mean_quantity,mean_price,ddiscount_sum,customer_id_count,quantity_sum_y,pprice_sum
0,27,1053,0,1105,Grocery,191.0,-1.241694,-5349.93,125,19.224,...,11,2,208,5,340.487097,112.800627,-89.05,310,105551,34968.194289
1,116,48,0,56,Grocery,0.0,0.0,0.0,3,20.333333,...,4,5,244,6,31.54026,148.611104,-1237.79,385,12143,57215.27503
2,635,205,0,560,Pharmaceutical,15.0,-2.987321,-605.54,67,1.761194,...,27,2,533,3,1.392784,86.320926,-2145.72,970,1351,83731.298063
3,644,1050,0,611,Grocery,1.0,-4.4525,-17.81,4,3.5,...,9,4,216,8,1.291139,75.13526,-178.1,237,306,17807.056667
4,1017,1489,0,1558,Grocery,23.0,-0.198055,-569.92,32,27.3125,...,2,4,327,5,247.44306,60.488003,-265.01,562,139063,33994.257567


In [71]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)

## Scaling

In [77]:
rc = RobustScaler()
scaled_data = pd.DataFrame(rc.fit_transform(train), columns=train.columns)

## Model Building

In [80]:
y = scaled_data['redemption_status']
x = scaled_data.drop('redemption_status', axis=1)

In [81]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=0)

In [82]:
from sklearn.ensemble import AdaBoostClassifier

In [86]:
ada = AdaBoostClassifier()
ada.fit(xtrain, ytrain)

AdaBoostClassifier()

In [87]:
pred_y = ada.predict(xtest)

In [88]:
pd.crosstab(ytest, pred_y, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0.0,1.0,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,19418,4,19422
1.0,166,5,171
All,19584,9,19593


In [89]:
print(classification_report(ytest, pred_y))

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     19422
         1.0       0.56      0.03      0.06       171

    accuracy                           0.99     19593
   macro avg       0.77      0.51      0.53     19593
weighted avg       0.99      0.99      0.99     19593



In [90]:
train_y = train['redemption_status']
train_x = train.drop('redemption_status', axis=1)
pred_train = ada.predict(train_x)

In [None]:
train_y.value_counts()

In [91]:
pd.crosstab(pred_train, train_y, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,77640,729,78369
All,77640,729,78369


In [92]:
print(classification_report(train_y, pred_train))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     77640
           1       0.00      0.00      0.00       729

    accuracy                           0.99     78369
   macro avg       0.50      0.50      0.50     78369
weighted avg       0.98      0.99      0.99     78369



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
