In [1]:
# !pip install imblearn

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import imblearn.over_sampling
import seaborn as sns
sns.set()
%matplotlib inline
plt.style.use('fivethirtyeight')


In [3]:
# Explicit (narrow) column dtypes for every CSV that is read below.
orders_products_dtype = dict(
    order_id=np.int32,
    product_id=np.int32,
    add_to_cart_order=np.int16,
    reordered=np.int8,
)
orders_dtype = dict(
    order_id=np.int32,
    user_id=np.int32,
    order_number=np.int16,
    order_dow=np.int8,
    order_hour_of_day=np.int8,
    days_since_prior_order=np.float16,
)
aisles_dtype = dict(aisle_id=np.int16)
departments_dtype = dict(department_id=np.int8)
products_dtype = dict(
    product_id=np.int32,
    aisle_id=np.int16,
    department_id=np.int8,
)

# Order line items (the "prior" and "train" files share one schema).
orders_products_prior_df = pd.read_csv(
    './data/order_products__prior.csv', dtype=orders_products_dtype
)
orders_products_train_df = pd.read_csv(
    './data/order_products__train.csv', dtype=orders_products_dtype
)

# One row per order.
orders_df = pd.read_csv('./data/orders.csv', dtype=orders_dtype)

# Product catalogue and its aisle / department lookup tables.
aisles_df = pd.read_csv('./data/aisles.csv', dtype=aisles_dtype)
departments_df = pd.read_csv('./data/departments.csv', dtype=departments_dtype)
products_df = pd.read_csv('./data/products.csv', dtype=products_dtype)

# Data Joining

In [4]:
# Attach the order columns (all of orders_df except eval_set) to each train line
# item, then collect the distinct user_ids that appear in the train set.
orders_products_train_df = pd.merge(
    orders_products_train_df,
    orders_df.drop('eval_set', axis=1),
    on='order_id',
)
unique_current_user_ids = orders_products_train_df['user_id'].unique()

In [5]:
# Attach the order columns to each prior line item and flag the rows whose user
# also appears in the train set.
orders_products_prior_df = pd.merge(
    orders_products_prior_df,
    orders_df.drop('eval_set', axis=1),
    on='order_id',
)
orders_products_prior_df['has_current_order'] = (
    orders_products_prior_df['user_id'].isin(unique_current_user_ids)
)

# Filtered copy: only users with a train basket, without the helper flag column.
orders_products_prior_filtered_df = (
    orders_products_prior_df
    .loc[orders_products_prior_df['has_current_order']]
    .drop('has_current_order', axis=1)
)

# Feature Engineering

In [6]:
# USER PRODUCT FEATURES
from collections import OrderedDict
from itertools import groupby
# user_product_features = ['user_total_orders','user_avg_cartsize','user_total_products','user_avg_days_since_prior_order']

# One row per (user_id, product_id) pair, aggregated from the prior line items.
# The pre-filtered frame (users that also have a train basket) is used here: rows
# of other users are dropped by the later inner merge with the train carts anyway,
# so the final table is the same and the aggregation touches fewer rows.
# NOTE(review): the 'reordered' flags are collected in the frame's current row
# order; nothing sorts by order_number first, so confirm that order is the
# chronological one the "last reorder" / "streak" features below rely on.
df_X = (orders_products_prior_filtered_df
        .groupby(['user_id', 'product_id'], as_index=False)
        .agg(OrderedDict(
                [('order_id', 'nunique'),
                 ('add_to_cart_order', 'median'),
                 ('order_hour_of_day', 'mean'),
                 ('days_since_prior_order', 'mean'),
                 ('reordered', (lambda x: tuple(x)))]
        )))

# Positional rename: the two group keys, then one name per aggregation above.
df_X.columns = ['user_id', 'product_id', 'user_product_times_ordered', 'user_product_add_to_cart_order_median', 'user_product_hour_of_day_mean', 'user_product_average_days_since_prior_order', 'reordered_history'] 


### Calculate whether the product was reordered the last time and the longest reorder streak

In [7]:

df_X['last_reorder'] = df_X['reordered_history'].map(lambda x: x[-1])
def find_max_reorders(reorder_tuple):
    """Return the length of the longest run of consecutive truthy flags.

    Args:
        reorder_tuple: Iterable of reorder flags (e.g. a tuple of 0/1 values).

    Returns:
        The longest streak of adjacent truthy entries, or 0 when there is no
        truthy entry at all (including an empty iterable).
    """
    # groupby() splits the sequence into runs of equal adjacent values; only the
    # truthy runs count as reorder streaks.
    streak_lengths = (sum(1 for _ in run)
                      for flag, run in groupby(reorder_tuple) if flag)
    # default=0 handles "no streak" explicitly instead of catching the
    # ValueError from max([]) with a bare except.
    return max(streak_lengths, default=0)
df_X['max_consec_reorders'] = df_X['reordered_history'].map(find_max_reorders)


In [8]:
# The raw flag tuples are no longer needed once the derived columns exist.
df_X.drop('reordered_history', axis=1, inplace=True)

## Apply Training Label Column

In [9]:
# Apply Training Label Column
# Build each user's train basket as a set of product_ids and attach it to every
# (user, product) feature row; the merge is an inner join on user_id.
train_carts = (orders_products_train_df.groupby('user_id', as_index=False)
                                      .agg({'product_id': (lambda x: set(x))})
                                      .rename(columns={'product_id': 'latest_cart'}))
df_X = df_X.merge(train_carts, on='user_id')

# Label is 1 when the product is in the user's train basket, else 0. Zipping the
# two columns gives the same values as DataFrame.apply(axis=1) without building a
# row Series for every (user, product) pair.
df_X['in_cart'] = pd.Series(
    [product_id in cart
     for product_id, cart in zip(df_X['product_id'], df_X['latest_cart'])],
    index=df_X.index,
).astype(int)

In [None]:
# Re-read the prior line items into a separate frame for inspection
# (orders_products_prior_df has had the order columns merged into it by now).
df_prior = pd.read_csv('./data/order_products__prior.csv', dtype=orders_products_dtype)

In [None]:
# Re-read the train line items into a separate frame for inspection.
df_train = pd.read_csv('./data/order_products__train.csv', dtype=orders_products_dtype)

In [None]:
# Re-read the orders table (same file and dtypes as orders_df) for inspection.
df_temp = pd.read_csv('./data/orders.csv', dtype=orders_dtype)

In [None]:
# The ten orders with the smallest order_id among those marked eval_set == 'test'.
df_temp[df_temp['eval_set'] == 'test'].sort_values('order_id').head(10)

In [None]:
# Preview the sample submission file.
# NOTE(review): orders_products_dtype is reused as the dtype map here; confirm
# which of its columns actually exist in this file.
pd.read_csv('./data/sample_submission.csv', dtype=orders_products_dtype).head()

In [None]:
# Print every orders row for user 1, then for user 3.
print(df_temp[df_temp['user_id'] == 1])
print(df_temp[df_temp['user_id'] == 3])

In [None]:
# First rows of the train line items belonging to order 1492625.
df_train[df_train['order_id'] == 1492625].head()

In [None]:
# First rows of the prior line items belonging to order 1374495.
df_prior[df_prior['order_id'] == 1374495].head()


In [None]:
# First rows of the prior line items belonging to order 444309.
df_prior[df_prior['order_id'] == 444309].head()


In [None]:
# First rows of the prior line items belonging to order 3002854.
df_prior[df_prior['order_id'] == 3002854].head()

## More (User Specific) Features...

In [None]:
# USER FEATURES
from collections import OrderedDict

# Output column names, in the same order as the aggregations below:
# distinct orders, line items per distinct order, distinct products, and the
# mean of days_since_prior_order over the user's prior line items.
user_features = ['user_total_orders','user_avg_cartsize','user_total_products','user_avg_days_since_prior_order']

# One row per user. 'order_id' gets two aggregations (a list), so the result has
# one output column per (source column, aggregation) pair; the positional rename
# further down relies on that order.
df_user_features = (orders_products_prior_df.groupby(['user_id'],as_index=False)
                                           .agg(OrderedDict(
                                                   [('order_id',['nunique', (lambda x: x.shape[0] / x.nunique())]),
                                                    ('product_id','nunique'),
                                                    ('days_since_prior_order','mean')]
                                           )))

# Replace the aggregated column labels with flat names (group key first).
df_user_features.columns = ['user_id'] + user_features
# df_user_features.head()

In [None]:
# Merge user feature columns
# (default join type, keyed on user_id; every df_X row of a user receives the
# same user-level values).
df_X = df_X.merge(df_user_features, on=['user_id'])

### Data Cleaning

In [None]:
# Count the missing values in each df_X column and draw them as horizontal bars.
df_X.isnull().sum().plot.barh()
plt.title('Number of NaN values per feature')

In [None]:
# Distribution of the per-(user, product) mean of days_since_prior_order.
df_X['user_product_average_days_since_prior_order'].hist();

### Create Bins for `user_product_average_days_since_prior_order`

In [None]:
def convert_to_bin(x):
    """Map a days-since-prior-order value to a bin index from 0 to 6.

    Bins 0-4 are the half-open ranges [.., 5), [5, 10), [10, 15), [15, 20) and
    [20, 25); bin 5 covers 25 up to and including 30. Anything else lands in
    bin 6 - that includes values above 30 and NaN, because every comparison
    with NaN is false.
    """
    for bin_index, upper_bound in enumerate((5, 10, 15, 20, 25)):
        if x < upper_bound:
            return bin_index
    return 5 if x <= 30 else 6

# Bucket each pair's average days-since-prior-order (bin index 0..6 from
# convert_to_bin) and one-hot encode the bucket.
mapped_df = df_X['user_product_average_days_since_prior_order'].map(convert_to_bin)
bin_column_names = ['usadspo_0-5', 'usadspo_5-10', 'usadspo_10-15', 'usadspo_15-20', 'usadspo_20-25', 'usadspo_25-30', 'usadspo_firstTime']
# reindex() pins the columns to bins 0..6 in order, adding an all-zero column for
# any bin that has no rows, so the positional rename below can neither fail on a
# length mismatch nor attach a name to the wrong bin.
up_adspo_bins = (pd.get_dummies(mapped_df)
                   .reindex(columns=range(len(bin_column_names)), fill_value=0))
up_adspo_bins.columns = bin_column_names
df_X = pd.concat([df_X, up_adspo_bins], axis=1)
# df_X.drop(['user_product_average_days_since_prior_order'], axis=1, inplace=True)

In [None]:
# Number of df_X rows that fall into each days-since-prior-order bin.
plot_labels = ['usadspo_0-5', 'usadspo_5-10', 'usadspo_10-15', 
         'usadspo_15-20', 'usadspo_20-25', 'usadspo_25-30', 
         'usadspo_firstTime']
plt.figure(figsize=(8,5))
# Each bin column is a 0/1 indicator, so its column sum is the row count of that bin.
df_X[plot_labels].sum(axis=0).plot(kind='barh');
plt.title('Bin Distribution of Average Days Since Prior Order')
# Tick text in the same order as plot_labels ('15-20' previously read 'l5-20',
# with a lowercase L instead of the digit 1).
plt.yticks(np.arange(len(plot_labels)), ['0-5', '5-10', '10-15', '15-20', '20-25','25-30', 'First Time Ordering'])
plt.ylabel('Days Since Last Order')
plt.xlabel('Order Count')


## Product Detail Features

In [None]:
# Create Feature Dataframe for Products
# Resolve each product's department and aisle via the lookup tables, then keep
# only product_id plus the two columns that came from those tables.
products_df = (
    products_df
    .merge(departments_df, on='department_id')
    .merge(aisles_df, on='aisle_id')
    .drop(['product_name', 'aisle_id', 'department_id'], axis=1)
)

# product_id + one indicator column per distinct department value.
products_df_departments = pd.concat(
    [products_df, pd.get_dummies(products_df['department'])],
    axis=1,
).drop(['department', 'aisle'], axis=1)

# product_id + one indicator column per distinct aisle value.
products_df_aisles = pd.concat(
    [products_df, pd.get_dummies(products_df['aisle'])],
    axis=1,
).drop(['department', 'aisle'], axis=1)

In [None]:
# Merge department feature columns
df_X = df_X.merge(products_df_departments, on=['product_id'])
# Merge aisle feature columns
# NOTE(review): a column name that exists in both indicator tables receives
# pandas' default '_x' / '_y' suffixes in this second merge; the feature lists
# further down reference 'missing_y' and 'other_y', presumably for that reason.
# Swapping the two merges would change which table gets which suffix.
df_X = df_X.merge(products_df_aisles, on=['product_id'])

# Data Subsetting

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix,accuracy_score, recall_score

In [None]:
# CREATE SUBSET OF USERS
# Draw 5% of the distinct users (without replacement, fixed seed) and keep all
# of their rows.
np.random.seed(153)
total_users = df_X['user_id'].unique()
user_subset = np.random.choice(
    total_users,
    size=int(total_users.shape[0] * .05),
    replace=False,
)

df_subset = df_X.loc[df_X['user_id'].isin(user_subset)]


### Randomized Train Test Split with subset

In [None]:
# Generate test train split with subset 70/30
# The split is made on users, not rows: 30% of the subset's users (fixed seed)
# go to the test frame together with all of their rows.
np.random.seed(48)
subset_total_users = df_subset['user_id'].unique()
test_set = np.random.choice(
    subset_total_users,
    size=int(subset_total_users.shape[0] * .30),
    replace=False,
)

df_X_tr = df_subset[~df_subset['user_id'].isin(test_set)]
df_X_te = df_subset[df_subset['user_id'].isin(test_set)]


In [None]:
# Check weights: share of each label value over all rows of df_X.
df_X['in_cart'].value_counts(normalize=True).plot(kind='barh');
plt.xlabel('Proportion')
plt.title('Proportion of Products Reordered (Label)')
# NOTE(review): value_counts() orders by frequency, so tick 0 is the most common
# label; the tick text assumes that is label 0.
plt.yticks([0,1], ['Not Reordered', 'Reordered'])

In [None]:
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Logistic Regression with one feature

In [None]:
# Create training with single feature
feature_columns = ['user_total_orders']

# Feature matrices (kept two-dimensional by selecting with a list) and labels.
X_tr = df_X_tr[feature_columns]
X_te = df_X_te[feature_columns]
y_tr = df_X_tr['in_cart']
y_te = df_X_te['in_cart']


### Scaling Data for Regression

In [None]:
# Scale Data
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_tr)
X_tr = scaler.transform(X_tr)
X_te = scaler.transform(X_te)

In [None]:
# Baseline: logistic regression on the scaled feature with the original class mix.
lr = LogisticRegression()
lr.fit(X_tr, y_tr)
# scikit-learn metrics take the true labels first, then predictions / scores.
print('Simple Logistic Regression; Test F1: %.3f, Test AUC: %.3f' % \
      (f1_score(y_te, lr.predict(X_te)), roc_auc_score(y_te, lr.predict_proba(X_te)[:,1]))) 


# Randomly oversample the minority class of the training data only.
# NOTE: no sampling ratio is passed, so the sampler's default applies (classes
# end up equally sized per the imbalanced-learn docs) - not a 40% positive share.
ROS = imblearn.over_sampling.RandomOverSampler(random_state=159)
# imbalanced-learn renamed fit_sample to fit_resample and later removed the old
# name; use whichever the installed version provides.
oversample = ROS.fit_resample if hasattr(ROS, 'fit_resample') else ROS.fit_sample
X_tr_rs, y_tr_rs = oversample(X_tr, y_tr)

lr_os = LogisticRegression() 
lr_os.fit(X_tr_rs, y_tr_rs)

# Both models are scored on the same, un-resampled test split.
print('Logistic Regression on Oversampled Train Data; Test F1: %.3f, Test AUC: %.3f' % \
      (f1_score(y_te, lr_os.predict(X_te)), roc_auc_score(y_te, lr_os.predict_proba(X_te)[:,1])))


In [None]:
# Feature groups evaluated one at a time further down: a plain string is a
# single column, a nested list is a group of columns that are used together.
all_features = [
    'user_product_times_ordered',
    'user_product_add_to_cart_order_median',
    'user_product_hour_of_day_mean',
    'user_total_orders',
    'user_avg_cartsize',
    'user_total_products',
    'user_avg_days_since_prior_order', 
    # One-hot bins of user_product_average_days_since_prior_order.
    ['usadspo_0-5', 'usadspo_5-10', 'usadspo_10-15', 
         'usadspo_15-20', 'usadspo_20-25', 'usadspo_25-30', 
         'usadspo_firstTime'],
    'last_reorder',
    'max_consec_reorders',
    # Department indicator columns.
    ['alcohol', 'babies', 'bakery', 'beverages',
        'breakfast', 'bulk', 'canned goods', 'dairy eggs', 'deli',
        'dry goods pasta', 'frozen', 'household', 'international',
        'meat seafood', 'pantry', 'personal care', 'pets',
        'produce', 'snacks'],
    # Aisle indicator columns ('missing_y' / 'other_y' carry a merge suffix).
    ['air fresheners candles', 'asian foods', 
    'baby accessories', 'baby bath body care', 
    'baby food formula', 'bakery desserts', 
    'baking ingredients', 'baking supplies decor', 
    'beauty', 'beers coolers', 'body lotions soap', 
    'bread', 'breakfast bakery', 'breakfast bars pastries', 
    'bulk dried fruits vegetables', 'bulk grains rice dried goods', 
    'buns rolls', 'butter', 'candy chocolate', 
    'canned fruit applesauce', 'canned jarred vegetables', 
    'canned meals beans', 'canned meat seafood', 
    'cat food care', 'cereal', 'chips pretzels', 
    'cleaning products', 'cocoa drink mixes', 
    'coffee', 'cold flu allergy', 'condiments', 
    'cookies cakes', 'crackers', 'cream', 'deodorants', 
    'diapers wipes', 'digestion', 'dish detergents', 
    'dog food care', 'doughs gelatins bake mixes', 
    'dry pasta', 'eggs', 'energy granola bars', 
    'energy sports drinks', 'eye ear care', 
    'facial care', 'feminine care', 'first aid', 
    'food storage', 'fresh dips tapenades', 
    'fresh fruits', 'fresh herbs', 'fresh pasta', 
    'fresh vegetables', 'frozen appetizers sides', 
    'frozen breads doughs', 'frozen breakfast', 
    'frozen dessert', 'frozen juice', 'frozen meals', 
    'frozen meat seafood', 'frozen pizza', 
    'frozen produce', 'frozen vegan vegetarian', 
    'fruit vegetable snacks', 'grains rice dried goods', 
    'granola', 'hair care', 'honeys syrups nectars', 
    'hot cereal pancake mixes', 'hot dogs bacon sausage', 
    'ice cream ice', 'ice cream toppings', 'indian foods', 
    'instant foods', 'juice nectars', 'kitchen supplies', 
    'kosher foods', 'latino foods', 'laundry', 
    'lunch meat', 'marinades meat preparation', 
    'meat counter', 'milk', 'mint gum', 'missing_y', 
    'more household', 'muscles joints pain relief', 
    'nuts seeds dried fruit', 'oils vinegars', 
    'oral hygiene', 'other_y', 'other creams cheeses', 
    'packaged cheese', 'packaged meat', 'packaged poultry', 
    'packaged produce', 'packaged seafood', 
    'packaged vegetables fruits', 'paper goods', 
    'pasta sauce', 'pickled goods olives', 
    'plates bowls cups flatware', 'popcorn jerky', 
    'poultry counter', 'prepared meals', 'prepared soups salads', 
    'preserved dips spreads', 'protein meal replacements', 
    'red wines', 'refrigerated', 'refrigerated pudding desserts', 
    'salad dressing toppings', 'seafood counter', 
    'shave needs', 'skin care', 'soap', 'soft drinks', 
    'soup broth bouillon', 'soy lactosefree', 
    'specialty cheeses', 'specialty wines champagnes', 
    'spices seasonings', 'spirits', 'spreads', 'tea', 
    'tofu meat alternatives', 'tortillas flat bread', 
    'trail mix snack mix', 'trash bags liners', 
    'vitamins supplements', 'water seltzer sparkling water', 
    'white wines', 'yogurt']
]

In [None]:
# Fit one oversampled logistic regression per feature group and record its test
# F1, in the same order as all_features.
f1_scores = []
# The labels do not depend on the feature group, so they are taken once.
y_tr, y_te = df_X_tr['in_cart'], df_X_te['in_cart']
for feature_group in all_features:
    # A bare string is a single column; anything else is treated as a group of
    # columns. (Previously an entry that was neither str nor list silently
    # reused the columns of the preceding iteration.)
    feature_columns = [feature_group] if isinstance(feature_group, str) else list(feature_group)
    X_tr, X_te = df_X_tr[feature_columns], \
                 df_X_te[feature_columns]

    # Scale with statistics from the training rows only.
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_tr)
    X_te = scaler.transform(X_te)

    # Randomly oversample the minority class of the training data. No sampling
    # ratio is passed, so the sampler's default applies (not a 40% share).
    ROS = imblearn.over_sampling.RandomOverSampler(random_state=159)
    # fit_sample was renamed to fit_resample in imbalanced-learn and later
    # removed; use whichever the installed version provides.
    oversample = ROS.fit_resample if hasattr(ROS, 'fit_resample') else ROS.fit_sample
    X_tr_rs, y_tr_rs = oversample(X_tr, y_tr)

    lr = LogisticRegression()
    lr.fit(X_tr_rs, y_tr_rs)
    # scikit-learn metrics take (y_true, y_pred).
    f1_scr = f1_score(y_te, lr.predict(X_te))
    f1_scores.append(f1_scr)

In [None]:
# Display names for the feature groups, in the same order as all_features /
# f1_scores.
x_labels = [
    'P Order Frequency',
    'UP Add to cart order',
    'UP Mean Hour Of Day',
    'U Total Orders',
    'U Avg Cartsize',
    'U Total Products',
    'U Prior Orders Days',
    'UP Prior Orders Days',
    'Last Reorder',
    'Max Consec Reorders',
    'Departments',
    'Aisles',
]
y_pos = np.arange(len(x_labels))

# One bar per feature group, height = test F1 of the model trained on that group.
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(y_pos, f1_scores, align='center', alpha=0.5)
ax.set_xticks(y_pos)
ax.set_xticklabels(x_labels)
ax.set_ylabel('F1 Score')
ax.set_title('F1 Score For Varying Features')
plt.show()

In [None]:
# (label, F1) pairs ranked from best to worst F1.
sorted(zip(x_labels, f1_scores), key=lambda pair: pair[1], reverse=True)

In [None]:
# Flat column list for the all-features logistic regression: every name in
# all_features, with grouped entries expanded in place, minus the raw
# 'user_avg_days_since_prior_order' column. Deriving it from all_features keeps
# the two lists in sync instead of maintaining a second copy of every name.
excluded_log_features = {'user_avg_days_since_prior_order'}
all_log_features = [
    column
    for entry in all_features
    for column in ([entry] if isinstance(entry, str) else entry)
    if column not in excluded_log_features
]

In [None]:
# Logistic regression on all_log_features, trained on oversampled data.
y_tr, y_te = df_X_tr['in_cart'], df_X_te['in_cart']
X_tr, X_te = df_X_tr[all_log_features], \
             df_X_te[all_log_features]

# Scale with statistics from the training rows only.
scaler = StandardScaler()
X_tr = scaler.fit_transform(X_tr)
X_te = scaler.transform(X_te)

# Randomly oversample the minority class of the training data. No sampling ratio
# is passed, so the sampler's default applies (not a 40% positive share).
ROS = imblearn.over_sampling.RandomOverSampler(random_state=159)
# fit_sample was renamed to fit_resample in imbalanced-learn and later removed;
# use whichever the installed version provides.
oversample = ROS.fit_resample if hasattr(ROS, 'fit_resample') else ROS.fit_sample
X_tr_rs, y_tr_rs = oversample(X_tr, y_tr)

lr_af = LogisticRegression(n_jobs=-1)
lr_af.fit(X_tr_rs, y_tr_rs)
y_pred_af = lr_af.predict(X_te)
# scikit-learn metrics take (y_true, y_pred). F1 is unaffected by the order, but
# recall is not: with the arguments reversed the printed value was precision.
f1_scr = f1_score(y_te, y_pred_af)
print('F1 Score for All Features: {}'.format(f1_scr))
print('Recall Score for All Features: {}'.format(recall_score(y_te, y_pred_af)))

In [None]:
# Ten largest coefficients of the all-features model, paired with their column
# names. lr_af is the model fitted on all_log_features; `lr` is a different,
# earlier fit whose coefficients do not line up with these names.
pd.DataFrame(sorted(zip(lr_af.coef_[0], all_log_features), key=lambda pair: pair[0], reverse=True)[:10])

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Column list for the tree models: the same columns as all_features, flattened,
# except that the one-hot 'usadspo_*' bin group is replaced by the raw
# 'user_product_average_days_since_prior_order' value it was derived from.
# Building it from all_features avoids a third hand-maintained copy of every
# department and aisle name.
rf_features = []
for entry in all_features:
    if isinstance(entry, str):
        rf_features.append(entry)
    elif all(name.startswith('usadspo_') for name in entry):
        rf_features.append('user_product_average_days_since_prior_order')
    else:
        rf_features.extend(entry)

# Labels and (copied) feature frames for the tree models; no scaling is applied.
y_tr, y_te = df_X_tr['in_cart'], df_X_te['in_cart']
X_tr, X_te = df_X_tr[rf_features].copy(), \
             df_X_te[rf_features].copy()
# Missing averages are replaced by the sentinel -1 in both splits.
X_tr[['user_product_average_days_since_prior_order']]=X_tr[['user_product_average_days_since_prior_order']].fillna(-1)
X_te[['user_product_average_days_since_prior_order']]=X_te[['user_product_average_days_since_prior_order']].fillna(-1)

# oversample positive samples (training data only)
ROS = imblearn.over_sampling.RandomOverSampler(random_state=159)
# fit_sample was renamed to fit_resample in imbalanced-learn and later removed;
# use whichever the installed version provides.
oversample = ROS.fit_resample if hasattr(ROS, 'fit_resample') else ROS.fit_sample
X_tr_rs, y_tr_rs = oversample(X_tr, y_tr)

In [None]:
# Convert back to dataframe
# Wrap the resampled training features in a DataFrame labelled with rf_features.
X_tr_rs = pd.DataFrame(X_tr_rs, columns=rf_features)

In [None]:
# Reset dtypes
# Explicit dtypes for the numeric feature columns.
temp_dtypes = {
    'user_product_times_ordered': np.int32,
    'user_product_add_to_cart_order_median': np.float32,
    'user_product_hour_of_day_mean': np.float32,
    'user_total_orders': np.int32,
    'user_avg_cartsize': np.float32,
    'user_total_products': np.int32,
    'user_avg_days_since_prior_order': np.float16,
    'user_product_average_days_since_prior_order': np.float16,
    'last_reorder': np.int64,
    'max_consec_reorders': np.int64,
}
# Every other rf_features column is a department or aisle indicator and is cast
# to uint8. Taking the names from rf_features replaces a second hand-written
# copy of that list, which had to be kept in sync manually.
temp_dtypes.update(
    {column: np.uint8 for column in rf_features if column not in temp_dtypes}
)

# Cast all listed columns in one call.
X_tr_rs = X_tr_rs.astype(temp_dtypes)

In [None]:
# Random forest: 750 trees, min 20 samples per leaf, sqrt feature subsampling.
# NOTE(review): no random_state is passed, so scores can differ between runs.
rfmodel = RandomForestClassifier(
    n_estimators=750,
    min_samples_leaf=20,
    max_features='sqrt',
    n_jobs=-1,
)
rfmodel.fit(X_tr_rs, y_tr_rs)

# Score on the test split and on the original (not oversampled) training rows.
y_pred = rfmodel.predict(X_te)
print('Test F1 Score: {}'.format(f1_score(y_pred, y_te)))
y_tr_pred = rfmodel.predict(X_tr)
print('Training F1 Score: {}'.format(f1_score(y_tr_pred, y_tr)))

In [None]:
# Random forest: 500 trees, min 20 samples per leaf, sqrt feature subsampling.
# NOTE(review): no random_state is passed, so scores can differ between runs.
rfmodel = RandomForestClassifier(
    n_estimators=500,
    min_samples_leaf=20,
    max_features='sqrt',
    n_jobs=-1,
)
rfmodel.fit(X_tr_rs, y_tr_rs)

# Score on the test split and on the original (not oversampled) training rows.
y_pred = rfmodel.predict(X_te)
print('Test F1 Score: {}'.format(f1_score(y_pred, y_te)))
y_tr_pred = rfmodel.predict(X_tr)
print('Training F1 Score: {}'.format(f1_score(y_tr_pred, y_tr)))

In [None]:
# Random forest: 1000 trees, min 20 samples per leaf, sqrt feature subsampling.
# NOTE(review): no random_state is passed, so scores can differ between runs.
rfmodel = RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=20,
    max_features='sqrt',
    n_jobs=-1,
)
rfmodel.fit(X_tr_rs, y_tr_rs)

# Score on the test split and on the original (not oversampled) training rows.
y_pred = rfmodel.predict(X_te)
print('Test F1 Score: {}'.format(f1_score(y_pred, y_te)))
y_tr_pred = rfmodel.predict(X_tr)
print('Training F1 Score: {}'.format(f1_score(y_tr_pred, y_tr)))

In [None]:
# Random forest: 500 trees, min 15 samples per leaf, sqrt feature subsampling.
# NOTE(review): no random_state is passed, so scores can differ between runs.
rfmodel = RandomForestClassifier(
    n_estimators=500,
    min_samples_leaf=15,
    max_features='sqrt',
    n_jobs=-1,
)
rfmodel.fit(X_tr_rs, y_tr_rs)

# Score on the test split and on the original (not oversampled) training rows.
y_pred = rfmodel.predict(X_te)
print('Test F1 Score: {}'.format(f1_score(y_pred, y_te)))
y_tr_pred = rfmodel.predict(X_tr)
print('Training F1 Score: {}'.format(f1_score(y_tr_pred, y_tr)))

In [None]:
# Random forest: 500 trees, min 10 samples per leaf, sqrt feature subsampling.
# NOTE(review): no random_state is passed, so scores can differ between runs.
rfmodel = RandomForestClassifier(
    n_estimators=500,
    min_samples_leaf=10,
    max_features='sqrt',
    n_jobs=-1,
)
rfmodel.fit(X_tr_rs, y_tr_rs)

# Score on the test split and on the original (not oversampled) training rows.
y_pred = rfmodel.predict(X_te)
print('Test F1 Score: {}'.format(f1_score(y_pred, y_te)))
y_tr_pred = rfmodel.predict(X_tr)
print('Training F1 Score: {}'.format(f1_score(y_tr_pred, y_tr)))

In [None]:
# Random forest: 500 trees, min 5 samples per leaf, sqrt feature subsampling.
# NOTE(review): no random_state is passed, so scores can differ between runs.
rfmodel = RandomForestClassifier(n_estimators = 500,
                                min_samples_leaf = 5, n_jobs=-1, max_features='sqrt')
rfmodel.fit(X_tr_rs, y_tr_rs)
y_pred = rfmodel.predict(X_te)
# scikit-learn metrics take (y_true, y_pred). F1 is unaffected by the order, but
# recall is not: with the arguments reversed the printed value was precision.
print('Test F1 Score: {}'.format(f1_score(y_te, y_pred)))
print('Test Recall Score: {}'.format(recall_score(y_te, y_pred)))

# Training score is computed on the original (not oversampled) training rows.
y_tr_pred = rfmodel.predict(X_tr)
print('Training F1 Score: {}'.format(f1_score(y_tr, y_tr_pred)))


In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted trees: depth 7, min_child_weight 12.
gbm = XGBClassifier(
    objective='binary:logistic',
    n_estimators=40000,  # upper bound only; early stopping ends training sooner
    learning_rate=.05,
    max_depth=7,
    min_child_weight=12,
    subsample=.8,
    colsample_bytree=.8,
    n_jobs=-1,
)

# Classification error is tracked on the oversampled training data and on the
# test split.
# NOTE(review): the test split is part of eval_set while early stopping is on,
# and it is also the split the test scores are reported on.
eval_set = [(X_tr_rs, y_tr_rs), (X_te, y_te)]

fit_model = gbm.fit(
    X_tr_rs,
    y_tr_rs,
    eval_set=eval_set,
    eval_metric='error',
    early_stopping_rounds=50,
    verbose=True,
)

In [None]:
# Predict with the trees up to the best early-stopping iteration and report F1.
best_tree_count = gbm.best_ntree_limit
y_pred = gbm.predict(X_te, ntree_limit=best_tree_count)
print('Test F1 Score: {}'.format(f1_score(y_pred, y_te)))

In [None]:
# Gradient-boosted trees: depth 6, min_child_weight 10.
gbm = XGBClassifier(
    objective='binary:logistic',
    n_estimators=40000,  # upper bound only; early stopping ends training sooner
    learning_rate=.05,
    max_depth=6,
    min_child_weight=10,
    subsample=.8,
    colsample_bytree=.8,
    n_jobs=-1,
)

# Classification error is tracked on the oversampled training data and on the
# test split.
# NOTE(review): the test split is part of eval_set while early stopping is on,
# and it is also the split the test scores are reported on.
eval_set = [(X_tr_rs, y_tr_rs), (X_te, y_te)]

fit_model = gbm.fit(
    X_tr_rs,
    y_tr_rs,
    eval_set=eval_set,
    eval_metric='error',
    early_stopping_rounds=50,
    verbose=True,
)

In [None]:
# Predict with the trees up to the best early-stopping iteration and report F1.
best_tree_count = gbm.best_ntree_limit
y_pred = gbm.predict(X_te, ntree_limit=best_tree_count)
print('Test F1 Score: {}'.format(f1_score(y_pred, y_te)))

In [None]:
# Gradient-boosted trees: depth 8, min_child_weight 20.
gbm = XGBClassifier(
    objective='binary:logistic',
    n_estimators=40000,  # upper bound only; early stopping ends training sooner
    learning_rate=.05,
    max_depth=8,
    min_child_weight=20,
    subsample=.8,
    colsample_bytree=.8,
    n_jobs=-1,
)

# Classification error is tracked on the oversampled training data and on the
# test split.
# NOTE(review): the test split is part of eval_set while early stopping is on,
# and it is also the split the test scores are reported on.
eval_set = [(X_tr_rs, y_tr_rs), (X_te, y_te)]

fit_model = gbm.fit(
    X_tr_rs,
    y_tr_rs,
    eval_set=eval_set,
    eval_metric='error',
    early_stopping_rounds=50,
    verbose=True,
)


In [None]:
# Predict with the trees up to the best early-stopping iteration.
y_pred = gbm.predict(X_te, ntree_limit=gbm.best_ntree_limit)
# scikit-learn metrics take (y_true, y_pred). F1 is unaffected by the order, but
# recall is not: with the arguments reversed the printed value was precision.
print('Test F1 Score: {}'.format(f1_score(y_te, y_pred)))
print('Test Recall Score: {}'.format(recall_score(y_te, y_pred)))

In [None]:
# Gain-based importance scores from the fitted booster, ten largest first.
booster = gbm.get_booster()
gbm_scores = booster.get_score(importance_type='gain')
sorted(gbm_scores.items(), key=lambda item: item[1], reverse=True)[:10]

In [None]:
# Plot the gain-based importance of at most 15 features of the last fitted gbm.
xgb.plot_importance(gbm, importance_type='gain', max_num_features=15)

In [None]:
# Share of each label value over all rows of df_X (full table, not the subset).
df_X.in_cart.value_counts(normalize=True)