In [1]:
import os
import pickle

import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the prepared CSV (it carries coupon_id and customer_id alongside the features) into `data`
data = pd.read_csv('prepped-data/train_with_ids.csv')

In [3]:
# Display the loaded frame (the rich display truncates the middle rows)
data

Unnamed: 0,coupon_id,customer_id,redemption_status,age_range,marital_status,family_size,no_of_children,income_bracket,gender,mean_discount_used_by_cust,unique_items_bought_by_cust,mean_selling_price_paid_by_cust,mean_quantity_bought_by_cust,total_discount_used_by_cust,total_coupons_used_by_cust,total_price_paid_by_cust,total_quantity_bought_by_cust,mean_coupon_discount,mean_item_price,category
0,27,1053,0,3,1,1,0,5,1,-0.29,208,164.25,1.08,-89.05,1,50918.77,335,-1.13,118.41,Women
1,27,1053,0,3,1,1,0,5,1,-0.29,208,164.25,1.08,-89.05,1,50918.77,335,-1.13,118.41,Men
2,27,1053,0,3,1,1,0,5,1,-0.29,208,164.25,1.08,-89.05,1,50918.77,335,-1.13,118.41,Sport
3,27,1053,0,3,1,1,0,5,1,-0.29,208,164.25,1.08,-89.05,1,50918.77,335,-1.13,118.41,Girls
4,27,1053,0,3,1,1,0,5,1,-0.29,208,164.25,1.08,-89.05,1,50918.77,335,-1.13,118.41,Boys
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296327,681,623,0,3,0,2,0,2,0,0.00,239,106.05,1.13,0.00,0,75509.00,804,-1.96,169.80,Boys
296328,681,623,0,3,0,2,0,2,0,0.00,239,106.05,1.13,0.00,0,75509.00,804,-1.96,169.80,Men
296329,681,623,0,3,0,2,0,2,0,0.00,239,106.05,1.13,0.00,0,75509.00,804,-1.96,169.80,Girls
296330,681,623,0,3,0,2,0,2,0,0.00,239,106.05,1.13,0.00,0,75509.00,804,-1.96,169.80,Women


In [4]:
# Remove the redemption_status column before scoring
data = data.drop(columns='redemption_status')

In [5]:
# Remove the two coupon-level aggregate columns
data = data.drop(columns=['mean_item_price', 'mean_coupon_discount'])

In [6]:
# Keep the identifier columns aside so they can be re-attached after prediction
ids = data.loc[:, ['coupon_id', 'customer_id']]

In [7]:
# Remove the identifier columns from the frame that is passed to the model
data = data.drop(columns=['coupon_id', 'customer_id'])

In [8]:
# Location of the pickled classifier, given relative to the notebook's working directory
model_path = '../../prediction-service/app/model_store/scikit_classifier'

In [9]:
# Deserialize the classifier from disk.
# NOTE: pickle.load can execute arbitrary code contained in the file — only load
# model files that come from a trusted source.
with open(model_path, 'rb') as f:
    model = pickle.load(f)

In [10]:
# Label-encode the demographic columns one after another with a single encoder,
# printing the code -> original value mapping learned for each column.
# NOTE(review): the encoder is fit on the data being scored; presumably the codes
# must match those used when the model was trained — confirm.
encoder = LabelEncoder()
encoding_headers = {
    'age_range': 'Age range encoding:',
    'marital_status': 'Marital status encoding:',
    'gender': 'Gender encoding:',
}
for column_name, header in encoding_headers.items():
    data[column_name] = encoder.fit_transform(data[column_name])
    print(header)
    for i, c in enumerate(encoder.classes_):
        print(f'{i}: {c}')

# One-hot encode the category column into category_<label> columns
data = pd.get_dummies(data, columns=['category'])

Age range encoding:
0: 0
1: 1
2: 2
3: 3
4: 4
5: 5
Marital status encoding:
0: 0
1: 1
Gender encoding:
0: 0
1: 1


In [11]:
# List the columns of the encoded feature frame
data.columns

Index(['age_range', 'marital_status', 'family_size', 'no_of_children',
       'income_bracket', 'gender', 'mean_discount_used_by_cust',
       'unique_items_bought_by_cust', 'mean_selling_price_paid_by_cust',
       'mean_quantity_bought_by_cust', 'total_discount_used_by_cust',
       'total_coupons_used_by_cust', 'total_price_paid_by_cust',
       'total_quantity_bought_by_cust', 'category_Boys', 'category_Girls',
       'category_Men', 'category_Sport', 'category_Women'],
      dtype='object')

In [12]:
# Run the loaded classifier over every row of the encoded feature frame.
# NOTE(review): presumably the column set and order must match what the model was trained on — confirm.
preds = model.predict(data)

In [13]:
# Append the model output as a new 'prediction' column
data = data.assign(prediction=preds)

In [14]:
# Bring back the identifier columns set aside earlier (aligned on the row index)
data = data.assign(
    customer_id=ids['customer_id'],
    coupon_id=ids['coupon_id'],
)

### Decode categories

In [15]:
# Collapse the one-hot category columns back into a single 'category' label column.
encoded_labels = ['category_Boys', 'category_Girls', 'category_Men', 'category_Sport', 'category_Women']
# idxmax yields, per row, the name of the dummy column holding the maximum;
# stripping the 'category_' prefix leaves the plain label.
category = data[encoded_labels].idxmax(axis=1).str.replace('category_', '', regex=False)
data = data.assign(category=category).drop(columns=encoded_labels)
data.sample(5)

Unnamed: 0,age_range,marital_status,family_size,no_of_children,income_bracket,gender,mean_discount_used_by_cust,unique_items_bought_by_cust,mean_selling_price_paid_by_cust,mean_quantity_bought_by_cust,total_discount_used_by_cust,total_coupons_used_by_cust,total_price_paid_by_cust,total_quantity_bought_by_cust,prediction,customer_id,coupon_id,category
5445,1,0,2,0,5,0,-0.23,136,104.38,1.19,-35.62,1,16387.1,187,0,77,751,Men
156004,4,1,1,0,1,0,-0.08,307,145.77,1.0,-35.62,1,66764.46,458,0,1092,726,Girls
201021,3,1,1,0,5,0,0.0,268,101.83,1.0,0.0,0,29836.31,293,0,477,998,Sport
77766,2,1,4,2,1,1,0.0,473,86.99,1.07,0.0,0,55673.44,682,0,1348,691,Women
9862,2,1,1,0,5,0,-1.45,658,117.49,1.0,-1979.65,81,160022.32,1366,0,967,850,Boys


In [16]:
# Keep only the identifiers, the prediction and the decoded category
data = data.loc[:, ['customer_id', 'coupon_id', 'prediction', 'category']]

In [17]:
# Count the positive predictions for each (customer, category) pair, ordered by customer
cust_cat = (
    data.loc[data['prediction'] == 1, ['customer_id', 'category']]
    .value_counts()
    .rename('hit_count')
    .reset_index()
    .sort_values(by='customer_id')
)

In [18]:
# Show the per-customer, per-category counts of positive predictions
cust_cat

Unnamed: 0,customer_id,category,hit_count
40,1,Women,84
320,3,Women,38
350,3,Sport,35
434,3,Boys,26
42,8,Women,84
...,...,...,...
382,1578,Women,32
480,1578,Boys,20
406,1578,Sport,29
131,1582,Men,60


In [19]:
# Summarise per customer:
#   'category'  -> number of distinct categories with at least one positive prediction
#   'hit_count' -> total positive predictions across those categories
# String aggregator names are used rather than a lambda and the builtin `sum`:
# 'nunique' is the built-in distinct count, and passing the builtin `sum` callable
# to a pandas aggregation emits a FutureWarning on recent pandas releases.
cust_cat_stats = pd.pivot_table(
    cust_cat,
    index='customer_id',
    values=['category', 'hit_count'],
    aggfunc={'category': 'nunique', 'hit_count': 'sum'},
)
# Customers covering the most categories come first; ties are broken by total hits
cust_cat_stats = cust_cat_stats.reset_index().sort_values(by=['category', 'hit_count'], ascending=False)

In [20]:
# Customers whose positive predictions span exactly 3 distinct categories
good_customers = cust_cat_stats.loc[cust_cat_stats['category'] == 3, 'customer_id']

In [21]:
# Pull every prediction row (hits and misses) belonging to the selected customers
good_customers_preds = good_customers.to_frame().merge(data, how='left', on='customer_id')

In [22]:
# Sanity check: the merge must neither lose nor invent customers
assert len(good_customers_preds['customer_id'].unique()) == len(good_customers)

In [23]:
# Count the positive predictions per selected customer, producing the columns
# ['customer_id', 'total_hits'].
# The index and the counts are named explicitly because the default labels from
# value_counts().reset_index() differ between pandas 1.x ('index', 'customer_id')
# and pandas 2.x ('customer_id', 'count'). Renaming by the 1.x labels would leave
# no 'customer_id' column on 2.x and break the later merge on that key.
cust_hits = (
    good_customers_preds.loc[good_customers_preds['prediction'] == 1, 'customer_id']
    .value_counts()
    .rename_axis('customer_id')
    .reset_index(name='total_hits')
)

In [24]:
# Attach each customer's total hit count to all of their prediction rows
good_customers_preds = good_customers_preds.merge(cust_hits, how='left', on='customer_id')

In [25]:
# Count the prediction rows per category for the selected customers
good_customers_preds.category.value_counts()

Women    5329
Men      5105
Sport    4885
Girls    4153
Boys     4012
Name: category, dtype: int64

In [26]:
# Number of distinct customers with more than 100 positive predictions
len(good_customers_preds.loc[good_customers_preds['total_hits'] > 100, 'customer_id'].unique())

68

In [27]:
# Keep only the customers with more than 100 positive predictions
good_customers_preds = good_customers_preds[good_customers_preds['total_hits'].gt(100)]

In [28]:
# Restrict to the rows predicted as hits (prediction == 1) and display them
good_customers_preds_hits_only = good_customers_preds[good_customers_preds['prediction'].eq(1)]
good_customers_preds_hits_only

Unnamed: 0,customer_id,coupon_id,prediction,category,total_hits
0,1467,3,1,Women,300
3,1467,3,1,Sport,300
5,1467,939,1,Women,300
6,1467,939,1,Sport,300
8,1467,939,1,Boys,300
...,...,...,...,...,...
17831,1542,362,1,Women,102
17832,1542,362,1,Sport,102
17835,1542,362,1,Boys,102
17837,1542,134,1,Sport,102


## Save data to files

In [29]:
# Output directory for the exported CSV files.
dirname = 'demo-data'
# exist_ok=True makes the cell safe to re-run and removes the gap between an
# existence check and the directory creation.
os.makedirs(dirname, exist_ok=True)

In [30]:
# Export the distinct (customer, category) pairs among the predicted hits
(
    good_customers_preds_hits_only.loc[:, ['customer_id', 'category']]
    .drop_duplicates()
    .to_csv(os.path.join(dirname, 'customers_categories.csv'), index=False)
)

# Export every predicted hit as a (customer, coupon, category) row
cust_coup_cat = good_customers_preds_hits_only.loc[:, ['customer_id', 'coupon_id', 'category']]
cust_coup_cat.to_csv(os.path.join(dirname, 'customers_coupons_categories.csv'), index=False)

### Save with full details

In [31]:
# Reload the source CSV from disk and discard its redemption_status column
data = pd.read_csv('prepped-data/train_with_ids.csv').drop(columns='redemption_status')

In [32]:
# Attach the reloaded source columns to each exported (customer, coupon, category) hit
details = cust_coup_cat.merge(data, how='left', on=['customer_id', 'coupon_id', 'category'])
# NOTE(review): this file is written with the row index (no index=False), unlike
# the other exports in this notebook — confirm that is intended.
details.to_csv(os.path.join(dirname, 'cust_coup_cat_details.csv'))

## Function to generate json payload