# Training AmEx dataset for product recommendations
Try reducing the number of products.

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the raw customer transaction table from the original data folder.
transactions = pd.read_csv('original_data/customer_transaction_data.csv')

In [3]:
# Preview the first rows of the raw transactions.
transactions.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0


In [4]:
# Column dtypes, non-null counts and memory footprint of the raw transactions.
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1324566 entries, 0 to 1324565
Data columns (total 7 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   date             1324566 non-null  object 
 1   customer_id      1324566 non-null  int64  
 2   item_id          1324566 non-null  int64  
 3   quantity         1324566 non-null  int64  
 4   selling_price    1324566 non-null  float64
 5   other_discount   1324566 non-null  float64
 6   coupon_discount  1324566 non-null  float64
dtypes: float64(3), int64(3), object(1)
memory usage: 70.7+ MB


In [5]:
# Number of transaction rows per item, as a two-column frame (item_id, tran_count).
item_frequencies = transactions.item_id.value_counts()
counts = (
    item_frequencies
    .to_frame(name='tran_count')
    .reset_index()
    .rename(columns={'index': 'item_id'})
)
counts

Unnamed: 0,item_id,tran_count
0,49009,13540
1,34047,6308
2,13174,3879
3,45502,3292
4,29847,2276
...,...,...
74058,26114,1
74059,66094,1
74060,35359,1
74061,41498,1


In [6]:
# Summarise how the per-item transaction counts are distributed.
print(f'Median number of transactions is {counts.tran_count.median()}')
print(f'There are {len(transactions.item_id.unique())} unique products')
for threshold in (100, 200, 500):
    n_products = len(counts.loc[counts.tran_count > threshold])
    print(f'There are {n_products} products with more than {threshold} transactions')

Median number of transactions is 3.0
There are 74063 unique products
There are 2743 products with more than 100 transactions
There are 922 products with more than 200 transactions
There are 160 products with more than 500 transactions


### Reduce original dataset to items with more than 100, 200, 500 transactions

In [7]:
# Item ids whose transaction count is strictly above each threshold.
items_100, items_200, items_500 = (
    counts.loc[counts.tran_count > threshold, 'item_id']
    for threshold in (100, 200, 500)
)

In [8]:
# Keep only the transactions whose item belongs to the corresponding frequent-item set.
# A boolean `isin` filter selects the same rows as the previous right-merge, but keeps
# them in their original transaction order and avoids a join. The earlier `.reindex()`
# call had no arguments and therefore did nothing; `reset_index(drop=True)` gives each
# subset a clean 0..n-1 index.
tran_100 = transactions[transactions.item_id.isin(items_100)].reset_index(drop=True)
tran_200 = transactions[transactions.item_id.isin(items_200)].reset_index(drop=True)
tran_500 = transactions[transactions.item_id.isin(items_500)].reset_index(drop=True)

In [9]:
# Report how many transaction rows survive each item-frequency threshold.
for threshold, subset in ((100, tran_100), (200, tran_200), (500, tran_500)):
    print(f'There are {len(subset)} transactions for items with more than {threshold} transactions')

There are 626185 transactions for items with more than 100 transactions
There are 377109 transactions for items with more than 200 transactions
There are 158320 transactions for items with more than 500 transactions


### Group by customer_id and item_id

In [10]:
def feature_engineer_transactions(df):
    """Aggregate raw transactions to one row per (customer_id, item_id).

    Per transaction row the function derives:
      * discount    = other_discount + coupon_discount
      * unit price  = (selling_price + discount) / quantity
      * coupon_used = 1 if coupon_discount is non-zero, else 0

    and then aggregates those per customer/item pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``customer_id``, ``item_id``, ``quantity``,
        ``selling_price``, ``other_discount`` and ``coupon_discount``.
        The frame is NOT modified, so the function can be called repeatedly
        on the same input.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``customer_id``, ``item_id``, ``mean_discount``,
        ``mean_quantity``, ``mean_price``, ``no_coupons_used``,
        ``total_discount``, ``total_quantity``, ``total_price``; rows sorted
        by (customer_id, item_id) with a fresh 0..n-1 index.
    """
    discount = df['other_discount'] + df['coupon_discount']

    # Build the per-row features on a fresh frame instead of adding/dropping
    # columns on the caller's frame.
    per_row = df[['customer_id', 'item_id', 'quantity']].assign(
        discount=discount,
        unit_price=(df['selling_price'] + discount) / df['quantity'],
        # Vectorised equivalent of `1 if x != 0 else 0`.
        coupon_used=df['coupon_discount'].ne(0).astype('int64'),
    )

    # One grouped pass with named aggregations replaces the two pivot tables
    # and the merge; string aggregator names avoid passing numpy callables.
    return per_row.groupby(['customer_id', 'item_id'], as_index=False).agg(
        mean_discount=('discount', 'mean'),
        mean_quantity=('quantity', 'mean'),
        mean_price=('unit_price', 'mean'),
        no_coupons_used=('coupon_used', 'sum'),
        total_discount=('discount', 'sum'),
        total_quantity=('quantity', 'sum'),
        total_price=('unit_price', 'sum'),
    )

In [11]:
# Aggregate the >500-transaction subset to one row per (customer_id, item_id).
# Note: this rebinds `tran_500` to the aggregated frame, which no longer has the raw
# discount columns, so this cell cannot be re-run without first rebuilding `tran_500`.
tran_500 = feature_engineer_transactions(tran_500)

In [12]:
# Preview the aggregated customer/item features.
tran_500

Unnamed: 0,customer_id,item_id,mean_discount,mean_quantity,mean_price,no_coupons_used,total_discount,total_quantity,total_price
0,1,7045,-22.800,1.50,60.552500,0,-45.60,3,121.105000
1,1,8292,-7.120,1.00,31.710000,0,-7.12,1,31.710000
2,1,9193,0.000,1.25,142.121250,0,0.00,5,568.485000
3,1,9281,-1.140,3.20,17.050000,0,-5.70,16,85.250000
4,1,9927,-21.370,1.00,96.885000,0,-42.74,2,193.770000
...,...,...,...,...,...,...,...,...,...
52645,1582,46274,0.000,1.00,88.690000,0,0.00,1,88.690000
52646,1582,46536,0.000,2.00,15.675000,0,0.00,2,15.675000
52647,1582,49009,-57.064,18236.40,0.094250,0,-570.64,182364,0.942505
52648,1582,57632,-9.260,1.00,57.710000,0,-9.26,1,57.710000


### Deal with customer demographics

In [13]:
# Load the raw customer demographics table.
cust_demo = pd.read_csv('original_data/customer_demographics.csv')

In [14]:
# Column dtypes and non-null counts for the demographics table.
cust_demo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 760 entries, 0 to 759
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   customer_id     760 non-null    int64 
 1   age_range       760 non-null    object
 2   marital_status  431 non-null    object
 3   rented          760 non-null    int64 
 4   family_size     760 non-null    object
 5   no_of_children  222 non-null    object
 6   income_bracket  760 non-null    int64 
dtypes: int64(3), object(4)
memory usage: 41.7+ KB


In [15]:
def _to_count(value):
    """Parse a household count such as '5+', '3', 3 or 3.0 into an int; keep missing values."""
    if pd.isna(value):
        return value
    # Going through str() keeps the parse working when the column already holds
    # numbers, e.g. when this cell is executed a second time.
    return int(float(str(value).replace('+', '')))


# Turn the textual counts ('5+' -> 5) into numbers; unknown child counts stay NaN for now.
cust_demo['family_size'] = cust_demo.family_size.apply(_to_count)
cust_demo['no_of_children'] = cust_demo.no_of_children.apply(_to_count)

# --- Impute missing marital_status from household composition (order matters) ---
# A one-person household is single.
cust_demo.loc[pd.isnull(cust_demo.marital_status) & (cust_demo.family_size == 1),
              'marital_status'] = 'Single'
# Exactly one non-child member -> single; exactly two non-child members -> married.
# Rows with unknown no_of_children never match here (NaN comparisons are False).
cust_demo.loc[pd.isnull(cust_demo.marital_status) & ((cust_demo.family_size - cust_demo.no_of_children) == 1),
              'marital_status'] = 'Single'
cust_demo.loc[pd.isnull(cust_demo.marital_status) & ((cust_demo.family_size - cust_demo.no_of_children) == 2),
              'marital_status'] = 'Married'
# Two-person household with unknown children and unknown status: treated as a married couple.
cust_demo.loc[pd.isnull(cust_demo.marital_status) & pd.isnull(cust_demo.no_of_children) & (cust_demo.family_size == 2),
              'marital_status'] = 'Married'

# --- Impute missing no_of_children (order matters) ---
# Married two-person households and one-person households have no children.
cust_demo.loc[pd.isnull(cust_demo.no_of_children) & (cust_demo.marital_status == 'Married') & (cust_demo.family_size == 2),
              'no_of_children'] = 0
cust_demo.loc[pd.isnull(cust_demo.no_of_children) & (cust_demo.family_size == 1), 'no_of_children'] = 0
# Any remaining two-person household (not a married couple, per the rule above) gets one child.
cust_demo.loc[pd.isnull(cust_demo.no_of_children) & (cust_demo.family_size == 2),'no_of_children'] = 1
# NOTE(review): households of size >= 3 with unknown children are not covered by the rules
# above; if any remain, this cast raises because NaN cannot be converted to int64.
cust_demo['no_of_children'] = cust_demo['no_of_children'].astype(np.int64)

### Items dataframe

In [16]:
# Load the item catalogue and display it.
items = pd.read_csv('original_data/item_data.csv')
items

Unnamed: 0,item_id,brand,brand_type,category
0,1,1,Established,Grocery
1,2,1,Established,Miscellaneous
2,3,56,Local,Bakery
3,4,56,Local,Grocery
4,5,56,Local,Grocery
...,...,...,...,...
74061,74062,5490,Established,Pharmaceutical
74062,74063,5497,Established,Pharmaceutical
74063,74064,5497,Established,Pharmaceutical
74064,74065,5520,Established,Pharmaceutical


In [17]:
# Drop the brand_type column. Rebinding instead of dropping in place, with
# errors='ignore', lets this cell be re-run without a KeyError once the
# column is already gone.
items = items.drop(columns='brand_type', errors='ignore')

## Merging tables

In [18]:
# Attach demographics to every customer/item row; customers without a
# demographics record keep their rows and get NaN in the demographic columns.
cust_tran_500 = pd.merge(tran_500, cust_demo, how='left', on='customer_id')

In [19]:
# Preview the customer/item features joined with demographics.
cust_tran_500

Unnamed: 0,customer_id,item_id,mean_discount,mean_quantity,mean_price,no_coupons_used,total_discount,total_quantity,total_price,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,7045,-22.800,1.50,60.552500,0,-45.60,3,121.105000,70+,Married,0.0,2.0,0.0,4.0
1,1,8292,-7.120,1.00,31.710000,0,-7.12,1,31.710000,70+,Married,0.0,2.0,0.0,4.0
2,1,9193,0.000,1.25,142.121250,0,0.00,5,568.485000,70+,Married,0.0,2.0,0.0,4.0
3,1,9281,-1.140,3.20,17.050000,0,-5.70,16,85.250000,70+,Married,0.0,2.0,0.0,4.0
4,1,9927,-21.370,1.00,96.885000,0,-42.74,2,193.770000,70+,Married,0.0,2.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52645,1582,46274,0.000,1.00,88.690000,0,0.00,1,88.690000,,,,,,
52646,1582,46536,0.000,2.00,15.675000,0,0.00,2,15.675000,,,,,,
52647,1582,49009,-57.064,18236.40,0.094250,0,-570.64,182364,0.942505,,,,,,
52648,1582,57632,-9.260,1.00,57.710000,0,-9.26,1,57.710000,,,,,,


In [20]:
# Missing values per column after the demographics join.
cust_tran_500.isna().sum()

customer_id            0
item_id                0
mean_discount          0
mean_quantity          0
mean_price             0
no_coupons_used        0
total_discount         0
total_quantity         0
total_price            0
age_range          25153
marital_status     25153
rented             25153
family_size        25153
no_of_children     25153
income_bracket     25153
dtype: int64

In [21]:
# Rows of customers without a demographics record are NaN in every demographic
# column after the left join; fill each column with its most frequent value.
#
# The filled Series is assigned back to the frame. Calling
# `frame[col].fillna(..., inplace=True)` acts on a column selection: recent pandas
# versions warn about this chained assignment, and with Copy-on-Write enabled the
# frame is left unchanged.
#
# NOTE(review): the mode is taken over customer/item rows, so customers with many
# rows weigh more than customers with few.
demographic_columns = ['age_range', 'marital_status', 'rented',
                       'family_size', 'no_of_children', 'income_bracket']
for column in demographic_columns:
    most_frequent = cust_tran_500[column].mode()[0]
    cust_tran_500[column] = cust_tran_500[column].fillna(most_frequent)

In [22]:
# Add the item attributes (brand, category) to every customer/item row.
cust_tran_item_500 = pd.merge(cust_tran_500, items, how='left', on='item_id')
cust_tran_item_500

Unnamed: 0,customer_id,item_id,mean_discount,mean_quantity,mean_price,no_coupons_used,total_discount,total_quantity,total_price,age_range,marital_status,rented,family_size,no_of_children,income_bracket,brand,category
0,1,7045,-22.800,1.50,60.552500,0,-45.60,3,121.105000,70+,Married,0.0,2.0,0.0,4.0,487,Grocery
1,1,8292,-7.120,1.00,31.710000,0,-7.12,1,31.710000,70+,Married,0.0,2.0,0.0,4.0,369,Grocery
2,1,9193,0.000,1.25,142.121250,0,0.00,5,568.485000,70+,Married,0.0,2.0,0.0,4.0,278,Grocery
3,1,9281,-1.140,3.20,17.050000,0,-5.70,16,85.250000,70+,Married,0.0,2.0,0.0,4.0,1,Bakery
4,1,9927,-21.370,1.00,96.885000,0,-42.74,2,193.770000,70+,Married,0.0,2.0,0.0,4.0,634,Grocery
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52645,1582,46274,0.000,1.00,88.690000,0,0.00,1,88.690000,46-55,Married,0.0,2.0,0.0,5.0,56,Natural Products
52646,1582,46536,0.000,2.00,15.675000,0,0.00,2,15.675000,46-55,Married,0.0,2.0,0.0,5.0,56,Grocery
52647,1582,49009,-57.064,18236.40,0.094250,0,-570.64,182364,0.942505,46-55,Married,0.0,2.0,0.0,5.0,56,Fuel
52648,1582,57632,-9.260,1.00,57.710000,0,-9.26,1,57.710000,46-55,Married,0.0,2.0,0.0,5.0,487,Grocery


In [23]:
# Confirm that no missing values remain after imputation and the item join.
cust_tran_item_500.isna().sum()

customer_id        0
item_id            0
mean_discount      0
mean_quantity      0
mean_price         0
no_coupons_used    0
total_discount     0
total_quantity     0
total_price        0
age_range          0
marital_status     0
rented             0
family_size        0
no_of_children     0
income_bracket     0
brand              0
category           0
dtype: int64

In [24]:
# Drop the customer identifier from the feature frame. Rebinding with
# errors='ignore' (rather than an in-place drop) keeps the cell re-runnable
# once the column has already been removed.
cust_tran_item_500 = cust_tran_item_500.drop(columns='customer_id', errors='ignore')

## Feature Encoding

In [25]:
# Inspect dtypes to see which columns are still non-numeric (object) and need encoding.
cust_tran_item_500.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52650 entries, 0 to 52649
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   item_id          52650 non-null  int64  
 1   mean_discount    52650 non-null  float64
 2   mean_quantity    52650 non-null  float64
 3   mean_price       52650 non-null  float64
 4   no_coupons_used  52650 non-null  int64  
 5   total_discount   52650 non-null  float64
 6   total_quantity   52650 non-null  int64  
 7   total_price      52650 non-null  float64
 8   age_range        52650 non-null  object 
 9   marital_status   52650 non-null  object 
 10  rented           52650 non-null  float64
 11  family_size      52650 non-null  float64
 12  no_of_children   52650 non-null  float64
 13  income_bracket   52650 non-null  float64
 14  brand            52650 non-null  int64  
 15  category         52650 non-null  object 
dtypes: float64(9), int64(4), object(3)
memory usage: 6.8+ MB


In [26]:
# Distinct age bands present in the data.
cust_tran_item_500['age_range'].unique()

array(['70+', '46-55', '26-35', '36-45', '18-25', '56-70'], dtype=object)

In [27]:
# Integer-encode the age bands; `le.classes_` keeps the band behind each code.
le = LabelEncoder()
le.fit(cust_tran_item_500['age_range'])
cust_tran_item_500['age_range'] = le.transform(cust_tran_item_500['age_range'])

In [28]:
# Show which integer code was assigned to each age band.
print('Age range labels:')
for code, label in enumerate(le.classes_):
    print(f'{label} - {code}')

Age range labels:
18-25 - 0
26-35 - 1
36-45 - 2
46-55 - 3
56-70 - 4
70+ - 5


In [29]:
# Distinct marital status values present in the data.
cust_tran_item_500['marital_status'].unique()

array(['Married', 'Single'], dtype=object)

In [30]:
# Re-fit the same encoder on marital status (this replaces the classes previously
# stored in `le`) and store the integer codes.
marital_codes = le.fit(cust_tran_item_500['marital_status']).transform(
    cust_tran_item_500['marital_status']
)
cust_tran_item_500['marital_status'] = marital_codes

In [31]:
# Show which integer code was assigned to each marital status.
print('Marital Status labels:')
for code, label in enumerate(le.classes_):
    print(f'{label} - {code}')

Marital Status labels:
Married - 0
Single - 1


In [32]:
# One-hot encode the item category. The dtype is pinned to uint8 so the dummy
# columns are 0/1 integers on every pandas version; newer pandas defaults to
# boolean dummies, which would change how the columns are passed on to H2O.
cust_tran_item_500 = pd.get_dummies(cust_tran_item_500, columns=['category'], dtype='uint8')
cust_tran_item_500

Unnamed: 0,item_id,mean_discount,mean_quantity,mean_price,no_coupons_used,total_discount,total_quantity,total_price,age_range,marital_status,...,income_bracket,brand,category_Bakery,category_Fuel,category_Grocery,category_Meat,category_Miscellaneous,category_Natural Products,category_Packaged Meat,category_Pharmaceutical
0,7045,-22.800,1.50,60.552500,0,-45.60,3,121.105000,5,0,...,4.0,487,0,0,1,0,0,0,0,0
1,8292,-7.120,1.00,31.710000,0,-7.12,1,31.710000,5,0,...,4.0,369,0,0,1,0,0,0,0,0
2,9193,0.000,1.25,142.121250,0,0.00,5,568.485000,5,0,...,4.0,278,0,0,1,0,0,0,0,0
3,9281,-1.140,3.20,17.050000,0,-5.70,16,85.250000,5,0,...,4.0,1,1,0,0,0,0,0,0,0
4,9927,-21.370,1.00,96.885000,0,-42.74,2,193.770000,5,0,...,4.0,634,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52645,46274,0.000,1.00,88.690000,0,0.00,1,88.690000,3,0,...,5.0,56,0,0,0,0,0,1,0,0
52646,46536,0.000,2.00,15.675000,0,0.00,2,15.675000,3,0,...,5.0,56,0,0,1,0,0,0,0,0
52647,49009,-57.064,18236.40,0.094250,0,-570.64,182364,0.942505,3,0,...,5.0,56,0,1,0,0,0,0,0,0
52648,57632,-9.260,1.00,57.710000,0,-9.26,1,57.710000,3,0,...,5.0,487,0,0,1,0,0,0,0,0


## Training

In [33]:
# H2O and its AutoML interface are imported here, at first use.
import h2o
from h2o.automl import H2OAutoML
# Connect to a local H2O instance, starting one if none is found.
h2o.init()

  "This may interfere with your H2O Connection." % name)
  "This may interfere with your H2O Connection." % name)


Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.10" 2021-01-19; OpenJDK Runtime Environment (build 11.0.10+9-Ubuntu-0ubuntu1.18.04); OpenJDK 64-Bit Server VM (build 11.0.10+9-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
  Starting server from /home/ugolowic/workspace/.venv-ml/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpzqqnhfg6
  JVM stdout: /tmp/tmpzqqnhfg6/h2o_ugolowic_started_from_python.out
  JVM stderr: /tmp/tmpzqqnhfg6/h2o_ugolowic_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Warsaw
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.4
H2O_cluster_version_age:,1 month and 7 days
H2O_cluster_name:,H2O_from_python_ugolowic_siwnll
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.938 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [34]:
# Convert the pandas feature frame into an H2OFrame for training.
train_500 = h2o.H2OFrame(cust_tran_item_500)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [35]:
# Per-column types and summary statistics as seen by H2O.
train_500.describe()

Rows:52650
Cols:23




Unnamed: 0,item_id,mean_discount,mean_quantity,mean_price,no_coupons_used,total_discount,total_quantity,total_price,age_range,marital_status,rented,family_size,no_of_children,income_bracket,brand,category_Bakery,category_Fuel,category_Grocery,category_Meat,category_Miscellaneous,category_Natural Products,category_Packaged Meat,category_Pharmaceutical
type,int,real,real,real,int,real,int,real,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int
mins,4500.0,-1463.1090909090908,1.0,-144.975,0.0,-18043.29,1.0,-156.02,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,25619.071965811894,-19.78288495933124,229.5412297809245,46.469900628240495,0.02676163342830011,-63.289279392212876,3163.48750237416,132.6341027888522,2.677967711301049,0.21135802469135803,0.03114909781576448,2.1328015194681686,0.3366001899335241,4.75489078822409,518.6793732193729,0.013599240265906932,0.017454890788224122,0.8050902184235518,0.016106362773029438,0.0044444444444444444,0.07363722697056031,0.046153846153846156,0.023513770180436846
maxs,64343.0,0.0,32586.0,355.845,10.0,0.0,2616809.0,10730.485000000006,5.0,1.0,1.0,5.0,3.0,12.0,4467.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
sigma,15456.673298492458,39.680549278814766,1660.622201740914,36.099561628664596,0.2163032079945717,256.34516437030965,37667.517203829964,281.7041295366496,0.9521490575461773,0.40827561287980063,0.17372220563041285,0.8946846546536616,0.7836819323910119,1.6419278592405284,701.6382854873268,0.11582122308748133,0.13096008293410324,0.396135001123279,0.1258858564082308,0.06651898525490912,0.26118208481194755,0.20982017255732147,0.15152989442538511
zeros,0,18275,0,178,51570,18275,0,178,1915,41522,51010,0,42574,0,0,51934,51731,10262,51802,52416,48773,50220,51412
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,7045.0,-22.799999999999997,1.5,60.5525,0.0,-45.599999999999994,3.0,121.105,5.0,0.0,0.0,2.0,0.0,4.0,487.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,8292.0,-7.12,1.0,31.709999999999997,0.0,-7.12,1.0,31.709999999999997,5.0,0.0,0.0,2.0,0.0,4.0,369.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,9193.0,0.0,1.25,142.12125,0.0,0.0,5.0,568.485,5.0,0.0,0.0,2.0,0.0,4.0,278.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# item_id was parsed as an integer column; convert it to a factor (categorical)
# column so it is treated as a class label rather than a number.
train_500['item_id'] = train_500['item_id'].asfactor()

In [37]:
# Target is the item; every other column of the frame is used as a predictor.
# NOTE(review): `brand` and the `category_*` dummies were joined from the items
# table on item_id, so they are attributes of the target itself and may leak it;
# confirm they are meant to be predictors.
y = 'item_id'
x = [column for column in train_500.columns if column != y]

In [None]:
# Run H2O AutoML on the full frame: max_models=5 limits how many models are built
# and seed=1 fixes the random seed. No separate validation frame is passed.
aml = H2OAutoML(max_models=5, seed=1)
aml.train(x=x, y=y, training_frame=train_500)

AutoML progress: |████████████████████████████████████████████████████████