In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from scipy import stats

In [2]:
data_dir = 'data_0419_0'

In [3]:
DPT_BOYS = 'Boys'
DPT_GIRLS = 'Girls'
DPT_MEN = 'Men'
DPT_SPORT = 'Sport'
DPT_WOMEN = 'Women'
DEPARTMENTS = [DPT_BOYS, DPT_GIRLS, DPT_MEN, DPT_SPORT, DPT_WOMEN]

In [4]:
COUPON_TYPES = ['buy_all', 'department', 'buy_more', 'just_discount']

In [5]:
customers = pd.read_csv(os.path.join(data_dir, 'customers.csv'))
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           1000 non-null   int64 
 1   name         1000 non-null   object
 2   gender       1000 non-null   object
 3   age          1000 non-null   int64 
 4   phone        1000 non-null   object
 5   address      1000 non-null   object
 6   city         1000 non-null   object
 7   state        1000 non-null   object
 8   postalCode   1000 non-null   int64 
 9   country      1000 non-null   object
 10  creditLimit  1000 non-null   int64 
dtypes: int64(4), object(7)
memory usage: 86.1+ KB


In [6]:
customers.rename(columns={'id': 'customer_id'}, inplace=True)

### Let the model calculate coupon redemption probabilities

In [7]:
with open(os.path.join(data_dir, 'pickled_model_gbm_smote'), 'rb') as f:
    model = pickle.load(f)

In [8]:
df_probs_full = pd.read_csv(os.path.join(data_dir, 'train_before_encoding.csv')).drop(['coupon_used'], axis=1)

In [9]:
encoded_df = pd.get_dummies(df_probs_full, columns=['cust_gender', 'cust_age', 'coupon_type', 'coupon_dpt'])\
            .drop(['customer_id', 'coupon_id'], axis=1)
df_probs_full['probs'] = model.predict_proba(encoded_df)[:, 1]
df_probs_full

Unnamed: 0,coupon_id,customer_id,cust_gender,cust_age,cust_mean_buy_price,cust_total_coupons,cust_mean_discount,cust_unique_products,cust_unique_products_coupon,cust_total_products,coupon_type,coupon_dpt,coupon_discount,coupon_how_many,coupon_mean_prod_price,coupon_prods_avail,probs
0,1.0,9,M,young,12.67,337.0,8.32,930.0,283.0,1285.0,buy_all,Men,10,4,7.16,4,0.176736
1,2.0,9,M,young,12.67,337.0,8.32,930.0,283.0,1285.0,buy_more,Men,23,3,9.12,1,0.059867
2,3.0,9,M,young,12.67,337.0,8.32,930.0,283.0,1285.0,just_discount,Men,12,1,1.13,1,0.052781
3,13.0,9,M,young,12.67,337.0,8.32,930.0,283.0,1285.0,just_discount,Women,8,1,4.87,1,0.056935
4,14.0,9,M,young,12.67,337.0,8.32,930.0,283.0,1285.0,buy_all,Women,61,3,8.43,3,0.108412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2821563,967.0,995,F,mid,14.28,2873.0,10.81,2363.0,1113.0,8961.0,buy_more,Girls,33,2,4.81,1,0.074938
2821564,971.0,995,F,mid,14.28,2873.0,10.81,2363.0,1113.0,8961.0,just_discount,Girls,24,1,1993.67,1,0.160799
2821565,945.0,995,F,mid,14.28,2873.0,10.81,2363.0,1113.0,8961.0,buy_all,Women,65,5,8.67,5,0.362157
2821566,947.0,995,F,mid,14.28,2873.0,10.81,2363.0,1113.0,8961.0,just_discount,Women,12,1,57.58,1,0.122117


In [10]:
df_probs = df_probs_full[['coupon_id', 'coupon_dpt', 'coupon_type', 'customer_id', 'probs']].drop_duplicates()
df_probs

Unnamed: 0,coupon_id,coupon_dpt,coupon_type,customer_id,probs
0,1.0,Men,buy_all,9,0.176736
1,2.0,Men,buy_more,9,0.059867
2,3.0,Men,just_discount,9,0.052781
3,13.0,Women,just_discount,9,0.056935
4,14.0,Women,buy_all,9,0.108412
...,...,...,...,...,...
2821537,970.0,Boys,buy_all,972,0.258629
2821543,971.0,Girls,just_discount,972,0.075763
2821552,969.0,Women,buy_more,976,0.047352
2821555,970.0,Boys,buy_all,976,0.217969


### Find coupons which have various scores for various customers

In [11]:
high = df_probs.loc[df_probs.probs >= 0.5]
mid = df_probs.loc[(df_probs.probs < 0.5) & (df_probs.probs >= 0.3)]
low = df_probs.loc[df_probs.probs < 0.3]

coupons = high[['coupon_id', 'coupon_dpt', 'coupon_type']].merge(mid['coupon_id'], on='coupon_id', how='inner')\
    .merge(low['coupon_id'], on='coupon_id', how='inner')\
    .drop_duplicates()
coupons.coupon_dpt.value_counts()

Sport    41
Men      13
Boys     10
Girls     8
Women     3
Name: coupon_dpt, dtype: int64

### Split data into groups of 3 coupons per dpt (daily set of coupons)

In [12]:
coupons.sample(frac=1)
coupon_set_0 = coupons.groupby('coupon_dpt').apply(lambda x: x.iloc[:3]).reset_index(drop=True)
coupon_set_1 = coupons.groupby('coupon_dpt').apply(lambda x: x.iloc[3:6]).reset_index(drop=True)
coupon_set_2 = coupons.groupby('coupon_dpt').apply(lambda x: x.iloc[6:9]).reset_index(drop=True)

In [14]:
# Fill in with random coupons if a set has less than 3 coupons per dpt
coupons_to_fill_in = df_probs[['coupon_dpt', 'coupon_id', 'coupon_type']][~df_probs.coupon_id.isin(coupons['coupon_id'])]
def fill_in_missing_coupons(coupon_set):
    for dpt_name in DEPARTMENTS:
        missing = 3 - coupon_set.coupon_dpt.value_counts().get(dpt_name, 0)
        rows_to_add = coupons_to_fill_in.loc[coupons_to_fill_in.coupon_dpt == dpt_name].sample(missing)
        coupon_set = coupon_set.append(rows_to_add)
    return coupon_set
coupon_set_0 = fill_in_missing_coupons(coupon_set_0)
coupon_set_1 = fill_in_missing_coupons(coupon_set_1)
coupon_set_2 = fill_in_missing_coupons(coupon_set_2)

### Select customers who have high scores within a department

In [15]:
def select_customers(coupon_set):
    df = pd.merge(coupon_set[['coupon_id']], df_probs, on='coupon_id', how='left')
    customers = pd.DataFrame(columns=['customer_id', 'coupon_dpt'])
    for dpt_name in DEPARTMENTS:
        dpt_customers = df.loc[df.coupon_dpt == dpt_name]\
            .sort_values('probs', ascending=False)[['customer_id']].drop_duplicates().head(5)
        dpt_customers['coupon_dpt'] = dpt_name
        customers = customers.append(dpt_customers)
    return customers

customers_0 = select_customers(coupon_set_0)
customers_1 = select_customers(coupon_set_1)
customers_2 = select_customers(coupon_set_2)

### Combine the coupons and customers

In [16]:
def create_data_set(customers, coupons):
    data = pd.merge(customers['customer_id'], df_probs_full, on='customer_id', how='left')\
        .merge(coupons['coupon_id'], on='coupon_id', how='right').drop_duplicates()\
        .sort_values(['coupon_dpt', 'customer_id', 'coupon_id'])
    return data

data_set_0 = create_data_set(customers_0, coupon_set_0)
data_set_1 = create_data_set(customers_1, coupon_set_1)
data_set_2 = create_data_set(customers_2, coupon_set_2)
data_set_0

Unnamed: 0,customer_id,coupon_id,cust_gender,cust_age,cust_mean_buy_price,cust_total_coupons,cust_mean_discount,cust_unique_products,cust_unique_products_coupon,cust_total_products,coupon_type,coupon_dpt,coupon_discount,coupon_how_many,coupon_mean_prod_price,coupon_prods_avail,probs
122,25,116.0,F,mid,15.22,2009.0,12.17,2113.0,924.0,5841.0,department,Boys,64,1,11.53,609,0.965479
359,25,203.0,F,mid,15.22,2009.0,12.17,2113.0,924.0,5841.0,buy_all,Boys,65,4,7.85,4,0.411395
506,25,207.0,F,mid,15.22,2009.0,12.17,2113.0,924.0,5841.0,buy_all,Boys,69,5,66.62,5,0.467091
74,81,116.0,F,mid,14.02,2753.0,11.99,2338.0,1081.0,8048.0,department,Boys,64,1,11.53,609,0.972696
348,81,203.0,F,mid,14.02,2753.0,11.99,2338.0,1081.0,8048.0,buy_all,Boys,65,4,7.85,4,0.501159
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3305,948,765.0,F,mid,14.56,2050.0,10.29,2239.0,970.0,6725.0,buy_all,Women,47,5,5.33,5,0.406497
3624,948,945.0,F,mid,14.56,2050.0,10.29,2239.0,970.0,6725.0,buy_all,Women,65,5,8.67,5,0.387990
3214,974,155.0,F,mid,13.92,2787.0,10.66,2418.0,1110.0,8851.0,buy_all,Women,18,5,5.00,5,0.279379
3491,974,765.0,F,mid,13.92,2787.0,10.66,2418.0,1110.0,8851.0,buy_all,Women,47,5,5.33,5,0.400049


### Save data to csv files
- `customer_stats.csv` - additional statistical data which are missing from original data and are required as input to the model
- `coupon_stats.csv` - additional coupon data
- `good_customer_coupon_pairs.csv` - customers for which the model returns many coupon 'hits', paired with the 'hit' coupons
- `good_customer_id_coupon_data.csv` - the same set of customer-coupon pairs, but instead of coupon_ids, coupon stats are present
- `good_customer_coupon_full.csv` - 'good' customer-coupon pairs along with all the data required as input to the model

In [17]:
demo_data_dir = os.path.join(data_dir, 'demo_data')
if not os.path.exists(demo_data_dir):
    os.makedirs(demo_data_dir)

#### Coupon stats

In [18]:
for i, ds in enumerate([data_set_0, data_set_1, data_set_2]):
    coupon_stats = ds[['coupon_id', 'coupon_type', 'coupon_dpt', 'coupon_discount', 'coupon_how_many',
                       'coupon_mean_prod_price', 'coupon_prods_avail']].drop_duplicates()\
        .sort_values(by='coupon_id')
    coupon_stats.to_csv(os.path.join(demo_data_dir, f'coupon_stats_{i}.csv'), index=False)

#### Customer stats

In [19]:
for i, ds in enumerate([data_set_0, data_set_1, data_set_2]):
    customer_stats = ds[['customer_id', 'cust_gender', 'cust_age', 'cust_mean_buy_price', 'cust_total_coupons',
                         'cust_mean_discount', 'cust_unique_products', 'cust_unique_products_coupon',
                         'cust_total_products']].drop_duplicates()\
        .sort_values(by='customer_id')
    customer_stats.to_csv(os.path.join(demo_data_dir, f'customer_stats_{i}.csv'), index=False)

#### Full data - with probabilities

In [20]:
for i, ds in enumerate([data_set_0, data_set_1, data_set_2]):
    ds.to_csv(os.path.join(demo_data_dir, f'full_data_set_{i}.csv'), index=False)