# Prepare Data for Training with Remapped Categories

In [1]:
# Development switch: when True, the NumPy global RNG is seeded further down,
# which makes the random draws in this notebook repeatable.
DEVELOPMENT = True

In [2]:
import os

import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [3]:
# Seed NumPy's global RNG in development mode so every np.random.choice call
# below produces the same draws on each Restart & Run All.
if DEVELOPMENT:
    np.random.seed(0) # Don't be so random

## Prepare transaction data

In [4]:
# Load the raw customer transaction data and preview five random rows.
transactions = pd.read_csv('original_data/customer_transaction_data.csv')
transactions.sample(5)

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
36139,2012-02-28,214,32077,1,17.1,-3.92,0.0
402816,2012-08-05,1226,31138,1,14.25,-3.21,0.0
139405,2012-04-25,646,32721,1,88.69,0.0,0.0
615085,2012-10-19,1149,34853,1,99.38,0.0,0.0
795619,2012-12-22,1382,5798,1,41.32,0.0,0.0


In [5]:
# The purchase date is not referenced by any later step in this notebook.
transactions.drop(columns='date', inplace=True)

In [6]:
# Reconstruct the pre-discount price of each row by removing both discounts from the
# selling price, then express price and coupon discount per unit of quantity.
undiscounted_total = (
    transactions['selling_price']
    - transactions['other_discount']
    - transactions['coupon_discount']
)
transactions['original_price'] = undiscounted_total / transactions['quantity']
transactions['coupon_discount'] = transactions['coupon_discount'] / transactions['quantity']
transactions.drop(columns=['other_discount', 'selling_price'], inplace=True)

In [7]:
# Flag rows where a coupon was applied (any non-zero coupon discount) and show the share of each.
transactions['coupon_used'] = transactions['coupon_discount'].ne(0).astype(np.int64)
transactions['coupon_used'].value_counts(normalize=True)

0    0.98393
1    0.01607
Name: coupon_used, dtype: float64

#### Modify quantities to fit clothing store categories

In [8]:
def _print_quantity_summary(quantity):
    """Print the min, max, mean, median and mode of a quantity Series."""
    print(f'Min quantity: {quantity.min()}')
    print(f'Max quantity: {quantity.max()}')
    print(f'Mean quantity: {quantity.mean()}')
    print(f'Median quantity: {quantity.median()}')
    print(f'Mode quantity: {quantity.mode()[0]}')


# Scale quantity to something more realistic in a clothing store
_print_quantity_summary(transactions['quantity'])

# Target range: smallest value 1, largest value 23, integers only
scaler = MinMaxScaler(feature_range=(1, 23))
transactions['quantity'] = scaler.fit_transform(transactions[['quantity']])
transactions['quantity'] = transactions['quantity'].round(decimals=0).astype(np.int64)

print('\nNew values:')
_print_quantity_summary(transactions['quantity'])

Min quantity: 1
Max quantity: 89638
Mean quantity: 130.6633395391396
Median quantity: 1.0
Mode quantity: 1

New values:
Min quantity: 1
Max quantity: 23
Mean quantity: 1.0317772009850774
Median quantity: 1.0
Mode quantity: 1


## Prepare customer demographics data

In [9]:
# Load the customer demographics table and preview five random rows.
customers = pd.read_csv('original_data/customer_demographics.csv')
customers.sample(5)

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
61,119,46-55,,0,1,,3
174,352,46-55,,0,2,,5
63,124,46-55,Married,0,2,,6
717,1479,18-25,,0,1,,5
674,1392,36-45,,0,3,2.0,5


In [10]:
# Count missing values per column to see what needs imputing.
customers.isnull().sum()

customer_id         0
age_range           0
marital_status    329
rented              0
family_size         0
no_of_children    538
income_bracket      0
dtype: int64

In [11]:
# family_size / no_of_children values are handled as strings that may carry a '+'
# suffix: strip it and convert to int (missing no_of_children values are left as-is).
customers['family_size'] = customers.family_size.apply(lambda x: int(x.replace('+', '')))
customers['no_of_children'] = customers.no_of_children.apply(lambda x: int(x.replace('+', '')) if pd.notna(x) else x)
# --- Impute missing marital_status from household composition ---
# The rules run in order and each only touches rows that are still missing,
# so earlier rules take precedence over later ones.
# Rule 1: a household of one person -> Single.
customers.loc[pd.isnull(customers.marital_status) & (customers.family_size == 1),
              'marital_status'] = 'Single'
# Rule 2: exactly one non-child member -> Single
# (the comparison is False wherever no_of_children is missing).
customers.loc[pd.isnull(customers.marital_status) & ((customers.family_size - customers.no_of_children) == 1),
              'marital_status'] = 'Single'
# Rule 3: exactly two non-child members -> Married.
customers.loc[pd.isnull(customers.marital_status) & ((customers.family_size - customers.no_of_children) == 2),
              'marital_status'] = 'Married'
# Rule 4: two-person household with unknown number of children -> Married.
customers.loc[pd.isnull(customers.marital_status) & pd.isnull(customers.no_of_children) & (customers.family_size == 2),
              'marital_status'] = 'Married'
# --- Impute missing no_of_children ---
# Rule 5: married, two-person household -> 0 children.
customers.loc[pd.isnull(customers.no_of_children) & (customers.marital_status == 'Married') & (customers.family_size == 2),
              'no_of_children'] = 0
# Rule 6: one-person household -> 0 children.
customers.loc[pd.isnull(customers.no_of_children) & (customers.family_size == 1), 'no_of_children'] = 0
# Rule 7: remaining two-person households (not matched by rule 5) -> 1 child.
customers.loc[pd.isnull(customers.no_of_children) & (customers.family_size == 2), 'no_of_children'] = 1
# Integer cast; this fails if any no_of_children value is still missing after the rules above.
customers['no_of_children'] = customers['no_of_children'].astype(np.int64)

In [12]:
# 'rented' is not referenced by any later step in this notebook.
customers.drop(columns='rented', inplace=True)

In [13]:
# Number of customers that have a demographics record.
len(customers)

760

#### Fill in customer demographics data for customer ids absent from customers dataframe, but present in transactions dataframe

In [14]:
# Customer ids seen in transactions that have no demographics row yet.
known_ids = set(customers.customer_id.values)
ids_to_add = sorted(cid for cid in transactions.customer_id.unique() if cid not in known_ids)
len(ids_to_add)

822

In [15]:
# Calculate statistics for existing customers:
# every distinct demographic profile (all columns except customer_id) with its relative frequency.
demographics = customers.drop('customer_id', axis=1).value_counts(normalize=True).rename('count_normalized').reset_index()

# Fill in missing customer info based on the statistics
columns = demographics.drop('count_normalized', axis=1).columns
# Record array with one entry per profile. to_records() also stores the frame index as
# an extra field; it does not show up in new_customers because the DataFrame below is
# built with columns=columns.
values = demographics.drop('count_normalized', axis=1).to_records()
probs = demographics['count_normalized'].values
row_num = len(ids_to_add)

# Draw one profile per missing customer, weighted by the profile's observed share.
new_customers = pd.DataFrame(np.random.choice(values, row_num, p=probs), columns=columns)
new_customers['customer_id'] = ids_to_add
new_customers

Unnamed: 0,age_range,marital_status,family_size,no_of_children,income_bracket,customer_id
0,46-55,Married,2,0,1,2
1,18-25,Married,2,0,2,3
2,46-55,Married,5,3,3,4
3,26-35,Single,3,2,3,5
4,26-35,Single,2,1,4,9
...,...,...,...,...,...,...
817,36-45,Married,3,1,4,1570
818,36-45,Single,1,0,5,1571
819,46-55,Married,2,0,1,1575
820,26-35,Married,5,3,1,1576


In [16]:
# Combine real and synthesized customers into one table ordered by customer_id.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
customers = (
    pd.concat([customers, new_customers])
    .sort_values('customer_id')
    .reset_index(drop=True)
)

In [17]:
# Assign every customer a randomly drawn gender (48% 'M' / 52% 'F') and show the split.
gender_draws = np.random.choice(['M', 'F'], size=len(customers), p=[0.48, 0.52])
customers['gender'] = gender_draws
customers['gender'].value_counts()

F    841
M    741
Name: gender, dtype: int64

In [18]:
# Display the completed customer table (original plus synthesized rows, with gender).
customers

Unnamed: 0,customer_id,age_range,marital_status,family_size,no_of_children,income_bracket,gender
0,1,70+,Married,2,0,4,F
1,2,46-55,Married,2,0,1,F
2,3,18-25,Married,2,0,2,M
3,4,46-55,Married,5,3,3,F
4,5,26-35,Single,3,2,3,M
...,...,...,...,...,...,...,...
1577,1578,46-55,Married,3,1,6,F
1578,1579,46-55,Single,1,0,4,M
1579,1580,26-35,Married,2,0,5,F
1580,1581,26-35,Married,3,1,1,F


## Prepare item data

In [19]:
# Load the item catalogue and preview five random rows.
items = pd.read_csv('original_data/item_data.csv')
items.sample(5)

Unnamed: 0,item_id,brand,brand_type,category
3427,3428,57,Established,Grocery
36,37,56,Local,Grocery
52906,52907,584,Established,Grocery
6913,6914,989,Established,Packaged Meat
12788,12789,1212,Established,Grocery


In [20]:
# Brand information is not referenced by any later step in this notebook.
items.drop(columns=['brand_type', 'brand'], inplace=True)

In [21]:
# Order the catalogue by item_id (in place).
items.sort_values(by='item_id', inplace=True)

### Compute each item's mean price and mean coupon discount from the transactions table

In [22]:
def _rounded_mean(values):
    """Return the mean of ``values`` rounded to two decimals."""
    return np.round(np.mean(values), decimals=2)


# Per-item mean of the unit price and of the per-unit coupon discount.
price_columns = ['original_price', 'coupon_discount']
item_price = pd.pivot_table(
    data=transactions,
    index='item_id',
    values=price_columns,
    aggfunc={column: _rounded_mean for column in price_columns},
)
item_price = item_price.rename(
    columns={'original_price': 'mean_item_price', 'coupon_discount': 'mean_coupon_discount'}
)
item_price.sample(5)

Unnamed: 0_level_0,mean_coupon_discount,mean_item_price
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
25630,0.0,35.26
47807,0.0,142.12
7089,0.0,87.11
47184,0.0,124.31
39937,0.0,117.19


In [23]:
# Attach the per-item price statistics; items without any transaction get NaN.
items = items.merge(item_price, on='item_id', how='left').reset_index(drop=True)
items.sample(5)

Unnamed: 0,item_id,category,mean_coupon_discount,mean_item_price
33675,33676,Grocery,0.0,142.12
65963,65964,Meat,0.0,159.53
34184,34185,Grocery,0.0,106.5
10840,10841,Grocery,0.0,664.55
63854,63855,Grocery,0.0,248.98


In [24]:
# Items that never appear in transactions have no price statistics after the left merge.
items.isnull().sum()

item_id                 0
category                0
mean_coupon_discount    3
mean_item_price         3
dtype: int64

In [25]:
# Discard the rows (items) that have no price statistics, then re-check for gaps.
items.dropna(axis=0, inplace=True)
items.isnull().sum()

item_id                 0
category                0
mean_coupon_discount    0
mean_item_price         0
dtype: int64

In [26]:
# Remaining catalogue size as (rows, columns).
items.shape

(74063, 4)

## Category Mapping

### Merge customers, items, and transactions

In [27]:
# Item catalogue without the price statistics, used for the category analysis below.
item_categories = items.drop(columns=['mean_item_price', 'mean_coupon_discount'])

In [28]:
# One row per transaction, enriched with the buyer's demographics and the item's category.
df = (
    transactions
    .merge(customers, on='customer_id', how='left')
    .merge(item_categories, on='item_id', how='left')
)
df.sample(5)

Unnamed: 0,customer_id,item_id,quantity,coupon_discount,original_price,coupon_used,age_range,marital_status,family_size,no_of_children,income_bracket,gender,category
553071,1119,45785,1,0.0,195.55,0,36-45,Married,2,0,10,F,Prepared Food
99231,1170,62549,1,0.0,13.893333,0,36-45,Single,1,0,2,F,Grocery
274609,973,26648,1,0.0,248.27,0,56-70,Married,2,0,4,M,Meat
905433,1532,36243,1,0.0,59.49,0,26-35,Single,1,0,9,F,Grocery
977613,239,27071,1,0.0,37.4,0,36-45,Married,3,1,6,M,Grocery


In [29]:
# Confirm that both left merges found a match for every transaction (no missing values).
df.isnull().sum()

customer_id        0
item_id            0
quantity           0
coupon_discount    0
original_price     0
coupon_used        0
age_range          0
marital_status     0
family_size        0
no_of_children     0
income_bracket     0
gender             0
category           0
dtype: int64

### Category mapping

In [30]:
# Check which items are bought mainly by people with children
df['has_children'] = (df['no_of_children'] > 0).astype(np.int64)
df['no_children'] = (df['no_of_children'] == 0).astype(np.int64)

# Per item: number of purchase rows by customers with / without children.
# The string 'sum' is used instead of np.sum: same aggregation, but handing the NumPy
# callable to agg() raises a FutureWarning on pandas >= 2.1.
agg_items = df.groupby('item_id').agg({'has_children': 'sum', 'no_children': 'sum'}).reset_index()
more_with_children = agg_items.loc[agg_items['has_children'] > agg_items['no_children']]

# Items bought more often by customers with children get a random children's category.
items_for_children = more_with_children.drop(['has_children', 'no_children'], axis=1)
items_for_children['category'] = np.random.choice(['Boys', 'Girls'], len(items_for_children))
items_for_children.reset_index(drop=True, inplace=True)

In [31]:
# Check which items are more popular with each gender
df['male'] = (df['gender'] == 'M').astype(np.int64)
df['female'] = (df['gender'] == 'F').astype(np.int64)

# Per item: number of purchase rows by male / female customers.
# The string 'sum' is used instead of np.sum: same aggregation, but handing the NumPy
# callable to agg() raises a FutureWarning on pandas >= 2.1.
agg_items = df.groupby('item_id').agg({'male': 'sum', 'female': 'sum'}).reset_index()

# Ties (equally many male and female purchases) fall into the women's group.
more_by_men = agg_items.loc[agg_items['male'] > agg_items['female']]
more_by_women = agg_items.loc[agg_items['female'] >= agg_items['male']]
# Sanity check: the two groups together must account for every transaction row
more_by_men_transactions = more_by_men.male.sum() + more_by_men.female.sum()
more_by_women_transactions = more_by_women.female.sum() + more_by_women.male.sum()
assert len(transactions) == more_by_women_transactions + more_by_men_transactions, \
    'gender split does not cover every transaction'

# Items already placed in a children's category are excluded from both adult groups.
# The men's categories are drawn before the women's; keep this order so the seeded
# random stream yields the same assignment.
items_for_men = more_by_men.loc[~more_by_men.item_id.isin(items_for_children.item_id.values)].reset_index(drop=True)
items_for_men.drop(['male', 'female'], axis=1, inplace=True)
items_for_men['category'] = np.random.choice(['Men', 'Sport'], len(items_for_men), p=[0.7, 0.3])

items_for_women = more_by_women.loc[~more_by_women.item_id.isin(items_for_children.item_id.values)].reset_index(drop=True)
items_for_women.drop(['male', 'female'], axis=1, inplace=True)
items_for_women['category'] = np.random.choice(['Women', 'Sport'], len(items_for_women), p=[0.8, 0.2])

In [32]:
# Report how many items landed in each group and verify that none were left out.
n_children = len(items_for_children)
n_women = len(items_for_women)
n_men = len(items_for_men)

print(f'{len(items)} items total')
print(f'{n_children} items for children')
print(f'{n_women} items for women')
print(f'{n_men} items for men')
not_categorized = len(items) - (n_men + n_women + n_children)
print(f'{not_categorized} items not categorized')

74063 items total
16418 items for children
33822 items for women
23823 items for men
0 items not categorized


In [33]:
# Stack the three groups into a single item -> new category table.
# pd.concat replaces the chained DataFrame.append calls (append was deprecated in
# pandas 1.4 and removed in 2.0); each part keeps its own index, as before.
new_items = pd.concat([items_for_children, items_for_women, items_for_men])
new_items

Unnamed: 0,item_id,category
0,3,Girls
1,4,Girls
2,9,Girls
3,11,Girls
4,12,Girls
...,...,...
23818,74053,Men
23819,74055,Sport
23820,74056,Men
23821,74059,Men


In [34]:
# Number of items assigned to each new category.
new_items.category.value_counts()

Women    27023
Men      16667
Sport    13955
Boys      8212
Girls     8206
Name: category, dtype: int64

In [35]:
# Swap the original category for the remapped one while keeping the price statistics.
items = (
    new_items
    .merge(items.drop(columns='category'), on='item_id', how='left')
    .sort_values('item_id')
    .reset_index(drop=True)
)
items

Unnamed: 0,item_id,category,mean_coupon_discount,mean_item_price
0,1,Women,0.0,124.31
1,2,Women,0.0,35.26
2,3,Girls,0.0,56.64
3,4,Girls,0.0,54.85
4,5,Women,0.0,81.57
...,...,...,...,...
74058,74062,Women,0.0,447.03
74059,74063,Sport,0.0,414.97
74060,74064,Women,0.0,414.97
74061,74065,Girls,0.0,127.88


### Merge items with coupon info

In [36]:
# Load the coupon -> item mapping and preview five random rows.
coupons = pd.read_csv('original_data/coupon_item_mapping.csv')
coupons.sample(5)

Unnamed: 0,coupon_id,item_id
41624,22,15943
56060,23,63727
22613,6,41933
88892,29,68062
72736,29,50282


In [37]:
# Attach category and price statistics to every (coupon, item) pair.
coupons_items = pd.merge(coupons, items, on='item_id', how='left')
coupons_items

Unnamed: 0,coupon_id,item_id,category,mean_coupon_discount,mean_item_price
0,105,37,Women,0.0,57.25
1,107,75,Men,0.0,38.83
2,494,76,Girls,0.0,106.50
3,522,77,Men,0.0,87.27
4,518,77,Men,0.0,87.27
...,...,...,...,...,...
92658,32,69268,Women,0.0,355.84
92659,32,68502,Girls,0.0,355.84
92660,32,68612,Men,0.0,169.20
92661,33,71390,Boys,0.0,255.58


In [38]:
# Coupon items that are absent from the cleaned item table show up as missing values here.
coupons_items.isnull().sum()

coupon_id               0
item_id                 0
category                8
mean_coupon_discount    8
mean_item_price         8
dtype: int64

In [39]:
# Discard coupon/item pairs that found no match in the item table.
coupons_items.dropna(inplace=True)
coupons_items.shape

(92655, 5)

In [40]:
# Show the coupon/item pairs whose item has a negative mean coupon discount.
coupons_items[coupons_items['mean_coupon_discount'].lt(0)]

Unnamed: 0,coupon_id,item_id,category,mean_coupon_discount,mean_item_price
81,643,419,Women,-3.45,74.33
131,415,907,Sport,-11.87,158.75
138,527,934,Girls,-0.71,59.63
140,417,942,Women,-11.87,155.42
141,415,942,Women,-11.87,155.42
...,...,...,...,...,...
92578,32,71737,Girls,-17.81,378.99
92599,33,72996,Women,-19.59,121.93
92624,32,72159,Girls,-10.39,579.95
92627,32,73170,Men,-8.90,51.89


In [41]:
# Per coupon: mean (rounded to 2 decimals) of its items' mean discount and mean price.
stat_columns = ['mean_coupon_discount', 'mean_item_price']
coupon_stats = coupons_items.pivot_table(
    index=['coupon_id'],
    values=stat_columns,
    aggfunc=dict.fromkeys(stat_columns, lambda col: np.round(np.mean(col), decimals=2)),
)
coupon_stats.shape

(1116, 2)

In [42]:
# Coupons whose mean discount is exactly zero.
coupon_stats[coupon_stats['mean_coupon_discount'].eq(0)]

Unnamed: 0_level_0,mean_coupon_discount,mean_item_price
coupon_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,0.0,122.54
5,0.0,403.97
15,0.0,222.86
19,0.0,81.82
25,0.0,285.71
...,...,...
1111,0.0,118.08
1112,0.0,165.66
1113,0.0,125.97
1114,0.0,611.12


In [43]:
# Replace zero mean discounts with the overall mean discount across all coupons.
# NOTE(review): the mean on the right-hand side is computed over every coupon, including
# the zero rows being replaced — confirm this is intended rather than the mean of the
# non-zero discounts only.
coupon_stats.loc[coupon_stats.mean_coupon_discount == 0, 'mean_coupon_discount'] = coupon_stats.mean_coupon_discount.mean()

In [44]:
# Sanity check: number of coupons whose mean discount magnitude exceeds their mean item price.
len(coupon_stats.loc[np.absolute(coupon_stats.mean_coupon_discount) > coupon_stats.mean_item_price])

0

## Add customer stats

In [45]:
# Preview the prepared transactions that the per-customer statistics are built from.
transactions.sample(5)

Unnamed: 0,customer_id,item_id,quantity,coupon_discount,original_price,coupon_used
785606,1218,37218,1,0.0,167.06,0
519029,1084,17948,1,0.0,85.13,0
553927,1570,20897,1,0.0,42.39,0
662385,1367,9193,1,0.0,142.12,0
507196,1181,18919,1,0.0,193.77,0


In [46]:
# Per-customer totals over all of the customer's transaction rows.
# 'sum' (string) replaces np.sum as the aggregator: same result, but handing the NumPy
# callable to pandas raises a FutureWarning on pandas >= 2.1. The lambdas still round
# the monetary totals to two decimals.
# NOTE(review): 'total_price_paid_by_cust' is the sum of original_price (the per-unit,
# pre-discount price computed earlier), not of the amount actually paid — confirm naming.
pt_sums = pd.pivot_table(transactions, index='customer_id',
                         values=['quantity', 'original_price', 'coupon_discount', 'coupon_used'],
                         aggfunc={
                             'quantity': 'sum',
                             'original_price': lambda x: np.round(np.sum(x), decimals=2),
                             'coupon_discount': lambda x: np.round(np.sum(x), decimals=2),
                             'coupon_used': 'sum'
                         })
pt_sums.reset_index(inplace=True)
pt_sums.rename(columns={
    'quantity': 'total_quantity_bought_by_cust',
    'original_price': 'total_price_paid_by_cust',
    'coupon_discount': 'total_discount_used_by_cust',
    'coupon_used': 'total_coupons_used_by_cust'}, inplace=True)
pt_sums

Unnamed: 0,customer_id,total_discount_used_by_cust,total_coupons_used_by_cust,total_price_paid_by_cust,total_quantity_bought_by_cust
0,1,-1832.94,78,103982.01,1048
1,2,-189.97,4,45360.60,419
2,3,-1329.46,53,61312.59,707
3,4,-17.81,1,30434.30,220
4,5,-90.83,2,91553.24,814
...,...,...,...,...,...
1577,1578,-613.86,29,72719.29,810
1578,1579,-145.45,6,132804.28,1162
1579,1580,0.00,0,59651.72,534
1580,1581,0.00,0,45356.97,541


In [47]:
# Per-customer averages (rounded to 2 decimals) plus the number of distinct items bought.
mean_2dp = lambda x: np.round(np.mean(x), decimals=2)
pt_means = (
    transactions
    .pivot_table(
        index='customer_id',
        values=['item_id', 'quantity', 'original_price', 'coupon_discount'],
        aggfunc={
            'item_id': lambda x: len(set(x)),
            'quantity': mean_2dp,
            'original_price': mean_2dp,
            'coupon_discount': mean_2dp,
        },
    )
    .reset_index()
    .rename(columns={
        'item_id': 'unique_items_bought_by_cust',
        'quantity': 'mean_quantity_bought_by_cust',
        'original_price': 'mean_selling_price_paid_by_cust',
        'coupon_discount': 'mean_discount_used_by_cust',
    })
)
pt_means

Unnamed: 0,customer_id,mean_discount_used_by_cust,unique_items_bought_by_cust,mean_selling_price_paid_by_cust,mean_quantity_bought_by_cust
0,1,-1.75,463,99.22,1.00
1,2,-0.45,352,108.26,1.00
2,3,-1.89,406,86.97,1.00
3,4,-0.08,125,138.34,1.00
4,5,-0.11,490,115.60,1.03
...,...,...,...,...,...
1577,1578,-0.78,481,92.87,1.03
1578,1579,-0.13,639,114.29,1.00
1579,1580,0.00,422,112.76,1.01
1580,1581,0.00,390,89.82,1.07


In [48]:
# Combine the per-customer means and totals into one statistics table.
cust_tran_stats = pt_means.merge(pt_sums, on='customer_id')
cust_tran_stats

Unnamed: 0,customer_id,mean_discount_used_by_cust,unique_items_bought_by_cust,mean_selling_price_paid_by_cust,mean_quantity_bought_by_cust,total_discount_used_by_cust,total_coupons_used_by_cust,total_price_paid_by_cust,total_quantity_bought_by_cust
0,1,-1.75,463,99.22,1.00,-1832.94,78,103982.01,1048
1,2,-0.45,352,108.26,1.00,-189.97,4,45360.60,419
2,3,-1.89,406,86.97,1.00,-1329.46,53,61312.59,707
3,4,-0.08,125,138.34,1.00,-17.81,1,30434.30,220
4,5,-0.11,490,115.60,1.03,-90.83,2,91553.24,814
...,...,...,...,...,...,...,...,...,...
1577,1578,-0.78,481,92.87,1.03,-613.86,29,72719.29,810
1578,1579,-0.13,639,114.29,1.00,-145.45,6,132804.28,1162
1579,1580,0.00,422,112.76,1.01,0.00,0,59651.72,534
1580,1581,0.00,390,89.82,1.07,0.00,0,45356.97,541


In [49]:
# Output directory for the CSV exports written below.
# exist_ok=True makes the call a no-op when the directory already exists, replacing the
# separate exists-check + mkdir pair.
os.makedirs('csv_4_db', exist_ok=True)

In [50]:
# Export the distinct (coupon, item, category) triples, ordered by coupon.
coupon_categories = (
    coupons_items[['coupon_id', 'item_id', 'category']]
    .sort_values('coupon_id')
    .drop_duplicates()
)
coupon_categories.to_csv('csv_4_db/coupon_categories.csv', index=False)

In [51]:
# Round again: the zero-discount fill above wrote an unrounded mean into this column.
coupon_stats['mean_coupon_discount'] = coupon_stats['mean_coupon_discount'].round(2)
coupon_stats.reset_index().to_csv('csv_4_db/coupon_info.csv', index=False)

In [52]:
# Demographics plus transaction statistics per customer, exported for the database.
cust_stats = customers.merge(cust_tran_stats, on='customer_id', how='left')
cust_stats.to_csv('csv_4_db/customer_info.csv', index=False)

## Merge everything with train df

In [53]:
# Load the labelled training rows (campaign / coupon / customer with redemption_status).
train = pd.read_csv('original_data/train.csv')
train.sample(5)

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
54893,90208,8,550,364,0
39822,65403,13,166,1304,0
50057,82109,8,983,579,0
6845,11349,8,807,705,0
25687,42145,13,138,855,0


In [54]:
# Enrich every training row with customer statistics, coupon statistics and the
# items/categories covered by the coupon. The last merge yields one row per
# (training row, coupon item), so the frame grows considerably at this point.
coupon_item_categories = coupons_items[['coupon_id', 'item_id', 'category']]
train = (
    train
    .merge(cust_stats, on='customer_id', how='left')
    .merge(coupon_stats, on='coupon_id', how='left')
    .merge(coupon_item_categories, on='coupon_id', how='left')
)
train

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,age_range,marital_status,family_size,no_of_children,income_bracket,...,mean_selling_price_paid_by_cust,mean_quantity_bought_by_cust,total_discount_used_by_cust,total_coupons_used_by_cust,total_price_paid_by_cust,total_quantity_bought_by_cust,mean_coupon_discount,mean_item_price,item_id,category
0,1,13,27,1053,0,46-55,Single,1,0,5,...,164.25,1.08,-89.05,1,50918.77,335,-1.13,118.41,24775,Women
1,1,13,27,1053,0,46-55,Single,1,0,5,...,164.25,1.08,-89.05,1,50918.77,335,-1.13,118.41,14958,Women
2,1,13,27,1053,0,46-55,Single,1,0,5,...,164.25,1.08,-89.05,1,50918.77,335,-1.13,118.41,40431,Sport
3,1,13,27,1053,0,46-55,Single,1,0,5,...,164.25,1.08,-89.05,1,50918.77,335,-1.13,118.41,20749,Sport
4,1,13,27,1053,0,46-55,Single,1,0,5,...,164.25,1.08,-89.05,1,50918.77,335,-1.13,118.41,56860,Men
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6419922,128595,13,681,623,0,46-55,Married,4,2,8,...,106.05,1.13,0.00,0,75509.00,804,-1.96,169.80,16981,Women
6419923,128595,13,681,623,0,46-55,Married,4,2,8,...,106.05,1.13,0.00,0,75509.00,804,-1.96,169.80,44676,Women
6419924,128595,13,681,623,0,46-55,Married,4,2,8,...,106.05,1.13,0.00,0,75509.00,804,-1.96,169.80,38641,Women
6419925,128595,13,681,623,0,46-55,Married,4,2,8,...,106.05,1.13,0.00,0,75509.00,804,-1.96,169.80,34285,Women


In [55]:
# List the merged columns before pruning.
train.columns

Index(['id', 'campaign_id', 'coupon_id', 'customer_id', 'redemption_status',
       'age_range', 'marital_status', 'family_size', 'no_of_children',
       'income_bracket', 'gender', 'mean_discount_used_by_cust',
       'unique_items_bought_by_cust', 'mean_selling_price_paid_by_cust',
       'mean_quantity_bought_by_cust', 'total_discount_used_by_cust',
       'total_coupons_used_by_cust', 'total_price_paid_by_cust',
       'total_quantity_bought_by_cust', 'mean_coupon_discount',
       'mean_item_price', 'item_id', 'category'],
      dtype='object')

In [56]:
# Row id, campaign id and item id are removed; coupon/customer ids stay for now.
train.drop(columns=['id', 'campaign_id', 'item_id'], inplace=True)

In [57]:
# `train_full` is a second name for the same DataFrame object as `train` (no copy is
# made), so the in-place de-duplication below de-duplicates `train` as well.
train_full = train
train_full.drop_duplicates(inplace=True)
train_full.columns

Index(['coupon_id', 'customer_id', 'redemption_status', 'age_range',
       'marital_status', 'family_size', 'no_of_children', 'income_bracket',
       'gender', 'mean_discount_used_by_cust', 'unique_items_bought_by_cust',
       'mean_selling_price_paid_by_cust', 'mean_quantity_bought_by_cust',
       'total_discount_used_by_cust', 'total_coupons_used_by_cust',
       'total_price_paid_by_cust', 'total_quantity_bought_by_cust',
       'mean_coupon_discount', 'mean_item_price', 'category'],
      dtype='object')

In [58]:
# Size after de-duplication (still including coupon_id and customer_id).
train.shape

(293446, 20)

In [59]:
# Rebind `train` to a new frame without the id columns (`train_full` keeps them),
# then discard the rows that became identical once the ids were removed.
train = train.drop(columns=['coupon_id', 'customer_id'])
train.drop_duplicates(inplace=True)
train.shape

(289552, 18)

# Encoding

In [60]:
def encode(df):
    """Label-encode the demographic columns and one-hot encode ``category``.

    ``age_range``, ``marital_status`` and ``gender`` are overwritten with integer
    codes on the DataFrame that is passed in (the argument is modified in place),
    and the code -> label mapping of each column is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``age_range``, ``marital_status``,
        ``gender`` and ``category``.

    Returns
    -------
    pandas.DataFrame
        New frame produced by ``pd.get_dummies`` in which ``category`` is replaced
        by one indicator column per category value. The input frame keeps its
        ``category`` column.
    """
    encoder = LabelEncoder()

    # (column to encode, heading printed above its code table)
    labelled_columns = (
        ('age_range', 'Age range encoding:'),
        ('marital_status', 'Marital status encoding:'),
        ('gender', 'Gender encoding:'),
    )
    for column, heading in labelled_columns:
        # The single encoder is refit per column; classes_ then describes that column.
        df[column] = encoder.fit_transform(df[column])
        print(heading)
        for code, label in enumerate(encoder.classes_):
            print(f'{code}: {label}')

    return pd.get_dummies(df, columns=['category'])

In [61]:
# NOTE: encode() overwrites age_range / marital_status / gender on the frame it receives,
# so after this cell `train` and `train_full` themselves hold integer codes. Only the
# one-hot expansion of `category` exists exclusively in the returned frames.
train_encoded = encode(train)
train_with_ids_encoded = encode(train_full)

Age range encoding:
0: 18-25
1: 26-35
2: 36-45
3: 46-55
4: 56-70
5: 70+
Marital status encoding:
0: Married
1: Single
Gender encoding:
0: F
1: M
Age range encoding:
0: 18-25
1: 26-35
2: 36-45
3: 46-55
4: 56-70
5: 70+
Marital status encoding:
0: Married
1: Single
Gender encoding:
0: F
1: M


## Save to files

In [62]:
# Output directory for the prepared training files.
dirname = 'prepped-data'
# exist_ok=True makes the call a no-op when the directory already exists, replacing the
# separate exists-check + mkdir pair.
os.makedirs(dirname, exist_ok=True)

In [63]:
# Save the training data that still carries coupon_id / customer_id.
# NOTE(review): by this point encode() has label-encoded age_range / marital_status /
# gender in `train_full` in place, while `category` is still a plain column here. The
# one-hot variant `train_with_ids_encoded` is not written anywhere in the cells shown —
# confirm which frame is meant to be saved.
train_full.to_csv(os.path.join(dirname, 'train_full.csv'), index=False)

In [64]:
# Save the fully encoded training data (no id columns, one-hot category).
train_encoded.to_csv(os.path.join(dirname, 'train.csv'), index=False)