# Training AmEx dataset for product recommendations
Try reducing the number of products by keeping only items with enough transactions.

In [5]:
import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the raw customer transaction records from the original data folder.
transactions = pd.read_csv('original_data/customer_transaction_data.csv')

In [None]:
# Preview the first rows to check that the file was parsed as expected.
transactions.head()

In [None]:
# Show column dtypes and non-null counts for the transaction table.
transactions.info()

In [None]:
# Tally the number of transactions per item as a two-column frame
# (item_id, tran_count); value_counts orders it from most to least frequent.
item_tallies = transactions['item_id'].value_counts()
counts = item_tallies.rename_axis('item_id').reset_index(name='tran_count')
counts

In [None]:
# Summarise how transaction volume is spread across products.
median_count = counts['tran_count'].median()
print(f'Median number of transactions is {median_count}')

n_products = len(transactions['item_id'].unique())
print(f'There are {n_products} unique products')

# Number of products that stay above each candidate cut-off.
for threshold in (100, 200, 500):
    n_above = len(counts[counts['tran_count'] > threshold])
    print(f'There are {n_above} products with more than {threshold} transactions')

### Reduce the original dataset by dropping items with 100, 200, or 500 transactions or fewer

In [None]:
# item_id values of the products whose transaction count exceeds each cut-off.
items_100 = counts.loc[counts['tran_count'] > 100, 'item_id']
items_200 = counts.loc[counts['tran_count'] > 200, 'item_id']
items_500 = counts.loc[counts['tran_count'] > 500, 'item_id']

In [None]:
# Keep only the transactions of the retained items. A right join on the
# item list orders the rows by that list; the merge result already carries
# a fresh 0..n-1 index.
tran_100 = pd.merge(transactions, items_100, how='right', on='item_id')
tran_200 = pd.merge(transactions, items_200, how='right', on='item_id')
tran_500 = pd.merge(transactions, items_500, how='right', on='item_id')

In [None]:
# Report how many transaction rows survive each cut-off.
for threshold, subset in ((100, tran_100), (200, tran_200), (500, tran_500)):
    print(f'There are {len(subset)} transactions for items with more than {threshold} transactions')

### Group by customer_id and item_id

In [None]:
def feature_engineer_transactions(df):
    """Aggregate raw transactions into one row per (customer_id, item_id).

    The input frame is never modified: all derived columns are built on a
    private copy, so the function can be called repeatedly on the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction rows containing the columns ``customer_id``, ``item_id``,
        ``quantity``, ``selling_price``, ``other_discount`` and
        ``coupon_discount``. Any other column (such as ``date``) is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per (customer_id, item_id) pair with the key columns plus
        ``mean_discount``, ``mean_quantity``, ``mean_price``,
        ``no_coupons_used``, ``total_discount``, ``total_quantity`` and
        ``total_price``.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    keys = ['customer_id', 'item_id']
    required = keys + ['quantity', 'selling_price', 'other_discount', 'coupon_discount']

    # Work on a copy of just the needed columns so the caller's frame keeps
    # its original columns and prices.
    work = df[required].copy()

    # 1 when a coupon discount was applied to the row, 0 otherwise.
    work['coupon_used'] = work['coupon_discount'].ne(0).astype('int64')
    work['discount'] = work['other_discount'] + work['coupon_discount']
    # Per-unit price: selling price plus the combined discount, divided by
    # the quantity bought.
    work['selling_price'] = (work['selling_price'] + work['discount']) / work['quantity']

    # Per-pair averages.
    means = pd.pivot_table(
        work,
        index=keys,
        values=['quantity', 'selling_price', 'discount'],
        aggfunc={'quantity': 'mean', 'selling_price': 'mean', 'discount': 'mean'},
    ).reset_index()
    means = means.rename(columns={
        'quantity': 'mean_quantity',
        'selling_price': 'mean_price',
        'discount': 'mean_discount',
    })

    # Per-pair totals, including how many rows used a coupon.
    totals = pd.pivot_table(
        work,
        index=keys,
        values=['quantity', 'selling_price', 'discount', 'coupon_used'],
        aggfunc={
            'quantity': 'sum',
            'selling_price': 'sum',
            'discount': 'sum',
            'coupon_used': 'sum',
        },
    ).reset_index()
    totals = totals.rename(columns={
        'quantity': 'total_quantity',
        'selling_price': 'total_price',
        'discount': 'total_discount',
        'coupon_used': 'no_coupons_used',
    })

    return means.merge(totals, how='left', on=keys)

In [None]:
# Replace the row-level transactions with per-(customer, item) aggregates.
tran_500 = feature_engineer_transactions(tran_500)

In [None]:
# Inspect the aggregated table.
tran_500

### Deal with customer demographics

In [None]:
# Load the customer demographics table.
cust_demo = pd.read_csv('original_data/customer_demographics.csv')

In [None]:
# Show column dtypes and non-null counts to see which fields have gaps.
cust_demo.info()

In [None]:
# Clean the demographics in place. The steps below are order-dependent: each
# rule only touches rows that are still missing after the earlier rules.

# Strip the '+' suffix and convert the counts to integers; missing
# no_of_children values are left as-is for the rules below.
cust_demo['family_size'] = cust_demo.family_size.apply(lambda x: int(x.replace('+', '')))
cust_demo['no_of_children'] = cust_demo.no_of_children.apply(lambda x: int(x.replace('+', '')) if pd.notna(x) else x)
# Missing marital status, family of one -> 'Single'.
cust_demo.loc[pd.isnull(cust_demo.marital_status) & (cust_demo.family_size == 1),
              'marital_status'] = 'Single'
# Missing marital status, exactly one non-child family member -> 'Single'.
cust_demo.loc[pd.isnull(cust_demo.marital_status) & ((cust_demo.family_size - cust_demo.no_of_children) == 1),
              'marital_status'] = 'Single'
# Missing marital status, exactly two non-child family members -> 'Married'.
cust_demo.loc[pd.isnull(cust_demo.marital_status) & ((cust_demo.family_size - cust_demo.no_of_children) == 2),
              'marital_status'] = 'Married'
# Marital status and children both missing, family of two -> 'Married'.
cust_demo.loc[pd.isnull(cust_demo.marital_status) & pd.isnull(cust_demo.no_of_children) & (cust_demo.family_size == 2),
              'marital_status'] = 'Married'
# Missing children, married, family of two -> 0 children.
cust_demo.loc[pd.isnull(cust_demo.no_of_children) & (cust_demo.marital_status == 'Married') & (cust_demo.family_size == 2),
              'no_of_children'] = 0
# Missing children, family of one -> 0 children.
cust_demo.loc[pd.isnull(cust_demo.no_of_children) & (cust_demo.family_size == 1), 'no_of_children'] = 0
# Any family of two still missing children (i.e. not handled as married above) -> 1 child.
cust_demo.loc[pd.isnull(cust_demo.no_of_children) & (cust_demo.family_size == 2),'no_of_children'] = 1
# NOTE(review): this cast raises if any no_of_children value is still missing
# (e.g. family_size above 2 with no value) - confirm the data never has that.
cust_demo['no_of_children'] = cust_demo['no_of_children'].astype(np.int64)

### Items dataframe

In [None]:
# Load the item table and display it.
items = pd.read_csv('original_data/item_data.csv')
items

In [None]:
# brand_type is not used as a feature; remove it from the item table.
items = items.drop(columns='brand_type')

## Merging tables

In [None]:
# Attach demographics to every aggregated row; customers without a
# demographics record keep their row with missing demographic fields.
cust_tran_500 = pd.merge(tran_500, cust_demo, how='left', on='customer_id')

In [None]:
# Inspect the merged transaction + demographics table.
cust_tran_500

In [None]:
# Count missing values per column after the left join.
cust_tran_500.isna().sum()

In [None]:
# Fill missing demographic fields with each column's most frequent value.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on a selected column is chained assignment, which
# newer pandas warns about and which no longer updates the parent frame
# once Copy-on-Write is enabled.
for col in ['age_range', 'marital_status', 'rented', 'family_size',
            'no_of_children', 'income_bracket']:
    cust_tran_500[col] = cust_tran_500[col].fillna(cust_tran_500[col].mode()[0])

In [None]:
# Attach the item attributes to every row, keeping rows whose item has no
# match in the item table.
cust_tran_item_500 = pd.merge(cust_tran_500, items, how='left', on='item_id')
cust_tran_item_500

In [None]:
# Check that nothing is missing after joining in the item attributes.
cust_tran_item_500.isna().sum()

In [None]:
# The customer identifier is removed from the table before encoding.
cust_tran_item_500 = cust_tran_item_500.drop(columns='customer_id')

## Feature Encoding

In [None]:
# Show dtypes to see which columns still need encoding.
cust_tran_item_500.info()

In [None]:
# List the distinct age-range values before encoding them.
cust_tran_item_500['age_range'].unique()

In [None]:
# Map each age-range value to an integer code.
le = LabelEncoder()
le.fit(cust_tran_item_500['age_range'])
cust_tran_item_500['age_range'] = le.transform(cust_tran_item_500['age_range'])

In [None]:
# Print the value -> code mapping held by the encoder's current fit.
print('Age range labels:')
for code, label in enumerate(le.classes_):
    print(f'{label} - {code}')

In [None]:
# List the distinct marital-status values before encoding them.
cust_tran_item_500['marital_status'].unique()

In [None]:
# Refit the shared encoder on marital status; this replaces the classes it
# held from its previous fit.
le.fit(cust_tran_item_500['marital_status'])
cust_tran_item_500['marital_status'] = le.transform(cust_tran_item_500['marital_status'])

In [None]:
# Print the value -> code mapping held by the encoder's current fit.
print('Marital Status labels:')
for code, label in enumerate(le.classes_):
    print(f'{label} - {code}')

In [None]:
# One-hot encode the item category column, then display the result.
cust_tran_item_500 = pd.get_dummies(cust_tran_item_500, columns=['category'])
cust_tran_item_500

## Training

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
train, test = train_test_split(cust_tran_item_500, test_size=0.2, random_state=42)