# HA3 "Recommender System" take 07

## Setup

In [1]:
import numpy as np
import pandas as pd

from random import randrange

## Load Data

In [2]:
file_path = "../../data/assignment_03/"

baskets_pq = "baskets.parquet"
coupons_pq = "coupons.parquet"
prediction_index_pq = "prediction_index.parquet"

b_df = pd.read_parquet(file_path + baskets_pq)
c_df = pd.read_parquet(file_path + coupons_pq)
pi_df = pd.read_parquet(file_path + prediction_index_pq)

print(f"baskets_df: {b_df.shape}")
print(f"coupons_df: {c_df.shape}")
print(f"prediction_index_df: {pi_df.shape}")

baskets_df: (68841598, 4)
coupons_df: (45000000, 4)
prediction_index_df: (500000, 3)


## Data Preprocessing

### A. Reduce number of shoppers

In [3]:
shoppers = [randrange(0, 2000) for i in range(3)]
shoppers

[577, 1799, 1580]

### B. Original Prices

In [4]:
orig_price = b_df.groupby(["product"])["price"].agg(["max"]).reset_index()
print(f"orig_price.shape: {orig_price.shape}")
orig_price.head(2)

orig_price.shape: (250, 2)


Unnamed: 0,product,max
0,0,688
1,1,560


### C. Generate master dataframe with selected shoppers

In [5]:
def gen_complete_week_prod_df_for_shoppers(list_shoppers, orig_df):
    products = np.array([p for p in range(250)])
    orig_price_list = orig_df["max"].values
    
    weeks = [i for i in range(90)]
    
    product_price_pair = np.stack((products, orig_price_list), axis=0).T
    
    prod_in_weeks = []
    for w in weeks:
        for p in product_price_pair:
            prod_in_weeks.append([w, *p])
    
    df_template = pd.DataFrame(data=prod_in_weeks, columns=["week", "product", "orig_price"])
    
    df_master = None;
    
    for idx, shoppers in enumerate(list_shoppers):

        df_local = df_template.copy()
        df_local["shopper"] = shoppers
        
        if idx == 0:
            df_master = df_local
        else:
            # stack it
            df_master = pd.concat([df_master, df_local], axis=0)
    return df_master

In [6]:
df = gen_complete_week_prod_df_for_shoppers(shoppers, orig_price)
print(f"df.shape: {df.shape}")
df.head(3)

df.shape: (67500, 4)


Unnamed: 0,week,product,orig_price,shopper
0,0,0,688,577
1,0,1,560,577
2,0,2,773,577


### D. Target Column added

In [7]:
b_df["target"] = 1
df = pd.merge(df, b_df, how="left", on=["week", "shopper", "product"])
df["target"] = df["target"].fillna(0)
print(f"df.shape: {df.shape}")
df.head(3)

df.shape: (67500, 6)


Unnamed: 0,week,product,orig_price,shopper,price,target
0,0,0,688,577,,0.0
1,0,1,560,577,,0.0
2,0,2,773,577,,0.0


### E. Discount Given Column added

In [8]:
c_df["coupon_given"] = 1
df = df.merge(right=c_df, how="left", on=["week", "shopper", "product"])
df["discount"] = df["discount"].fillna(0)
df["coupon_given"] = df["coupon_given"].fillna(0)
print(f"df.shape: {df.shape}")
df.head(3)

df.shape: (67500, 8)


Unnamed: 0,week,product,orig_price,shopper,price,target,discount,coupon_given
0,0,0,688,577,,0.0,0.0,0.0
1,0,1,560,577,,0.0,0.0,0.0
2,0,2,773,577,,0.0,0.0,0.0


### F. Price with discount

In [9]:
df["price_w_discount"] = df["orig_price"] - df["discount"]
df.head(3)

Unnamed: 0,week,product,orig_price,shopper,price,target,discount,coupon_given,price_w_discount
0,0,0,688,577,,0.0,0.0,0.0,688.0
1,0,1,560,577,,0.0,0.0,0.0,560.0
2,0,2,773,577,,0.0,0.0,0.0,773.0


In [10]:
# price column is no longer necessary so removing it
df = df.drop(columns=["price"])
df.head(3)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount
0,0,0,688,577,0.0,0.0,0.0,688.0
1,0,1,560,577,0.0,0.0,0.0,560.0
2,0,2,773,577,0.0,0.0,0.0,773.0


## Split Data Before Feature Engineering

In [11]:
# Train: 0 - 84
# Test: 85 - 89

# Separate train and test
train = df[df["week"] < 85]
test = df[df["week"] >= 85]

b_train = b_df[b_df["week"] < 85]
b_test = b_df[b_df["week"] >= 85]

c_train = c_df[c_df["week"] < 85]
c_test = c_df[c_df["week"] >= 85]

print(f"Master: {df.shape}, Train: {train.shape}, Test: {test.shape}")

Master: (67500, 8), Train: (63750, 8), Test: (3750, 8)


## Feature Engineering

### A. Shopper's Perspective - Average Basket Size per Shopper per Week

In [12]:
num_prod_per_week_per_shopper = b_train.groupby(["shopper", "week"], as_index=False)["product"].count()
num_prod_per_week_per_shopper = num_prod_per_week_per_shopper.rename({"product": "avg_basket_size"}, axis=1)
avg_baskets = num_prod_per_week_per_shopper.groupby(["shopper"], as_index=False)["avg_basket_size"].mean()

In [13]:
# Merge to shoppers on train and test

train = train.merge(avg_baskets, how="left", on="shopper")
test = test.merge(avg_baskets, how="left", on="shopper")

In [14]:
train.sample(3)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,avg_basket_size
6536,26,36,696,577,0.0,0.0,0.0,696.0,6.964706
49843,29,93,577,1580,0.0,0.0,0.0,577.0,6.929412
28981,30,231,452,1799,0.0,0.0,0.0,452.0,7.223529


In [15]:
test.sample(3)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,avg_basket_size
2996,86,246,702,1580,0.0,0.0,0.0,702.0,6.929412
3400,88,150,503,1580,0.0,0.0,0.0,503.0,6.929412
2771,86,21,463,1580,1.0,0.0,0.0,463.0,6.929412


### B Shopper's Perspective - Average price per Shopper per Week 
- How much on average did a shopper spend a week?

In [16]:
avg_price_per_shopper_per_week = b_train.groupby(["shopper"], as_index=False)["price"].mean()

In [17]:
# Include it to the shoppers

train = train.merge(avg_price_per_shopper_per_week, how="left", on="shopper")
test = test.merge(avg_price_per_shopper_per_week, how="left", on="shopper")

In [18]:
train = train.rename({"price": "avg_price"}, axis=1)
test = test.rename({"price": "avg_price"}, axis=1)

In [19]:
train.sample(3)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,avg_basket_size,avg_price
35440,56,190,721,1799,0.0,0.0,0.0,721.0,7.223529,597.037459
12199,48,199,604,577,0.0,0.0,0.0,604.0,6.964706,584.422297
45491,11,241,699,1580,0.0,0.0,0.0,699.0,6.929412,592.933786


In [20]:
test.sample(3)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,avg_basket_size,avg_price
3344,88,94,443,1580,0.0,0.0,0.0,443.0,6.929412,592.933786
3644,89,144,562,1580,0.0,0.0,0.0,562.0,6.929412,592.933786
2088,88,88,471,1799,0.0,0.0,0.0,471.0,7.223529,597.037459


### C. Shopper's Perspective on each product - Reorder (boolean)
- Did a shopper reorder a product?

In [21]:
reordered = (b_train.groupby(['shopper']).product.value_counts()>1).astype(int)
reordered = reordered.to_frame('reordered').reset_index()
reordered.head(3)

Unnamed: 0,shopper,product,reordered
0,0,91,1
1,0,71,1
2,0,225,1


In [22]:
train = train.merge(reordered, how="left", on=["shopper","product"])
test = test.merge(reordered, how="left", on=["shopper","product"])

train["reordered"] = train["reordered"].fillna(0)
test["reordered"] = test["reordered"].fillna(0)

In [23]:
train.sample(3)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,avg_basket_size,avg_price,reordered
13324,53,74,779,577,0.0,0.0,0.0,779.0,6.964706,584.422297,0.0
20898,83,148,463,577,0.0,0.0,0.0,463.0,6.964706,584.422297,1.0
54659,48,159,667,1580,0.0,0.0,0.0,667.0,6.929412,592.933786,0.0


In [24]:
test.sample(3)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,avg_basket_size,avg_price,reordered
1338,85,88,471,1799,0.0,0.0,0.0,471.0,7.223529,597.037459,0.0
1597,86,97,632,1799,0.0,0.0,0.0,632.0,7.223529,597.037459,0.0
2186,88,186,499,1799,0.0,0.0,0.0,499.0,7.223529,597.037459,0.0


### D. Num of Weeks since last purchase

In [42]:
nrordered_product_week = pd.DataFrame(train.groupby(['week', 'shopper','product']).size()).reset_index()
nrordered_product_week.columns = ['week', 'shopper', 'product','number']
nrordered_product_week.loc[nrordered_product_week['number'] > 1]

new_test = pd.DataFrame(train.groupby(['shopper','product','week','target']).size().reset_index())
new_test.head(n=10)

mask = new_test.groupby(['shopper', 'product'])['target'].cumsum().replace(0, False).astype(bool) # Mask starting zeros as NaN
df_out = new_test.assign(last_prod_order = new_test.groupby(['shopper','product', new_test["target"].astype(bool).cumsum()]).cumcount().where(mask))
df_out.last_prod_order[df_out.last_prod_order.isna()]= 0
df_out = df_out[['shopper', 'product','week','last_prod_order']]

train = pd.merge(train, df_out, on = ['shopper', 'week', 'product'], how = 'left')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out.last_prod_order[df_out.last_prod_order.isna()]= 0


In [44]:
train.sample(3)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,avg_basket_size,avg_price,reordered,last_prod_order
60409,71,159,667,1580,0.0,0.0,0.0,667.0,6.929412,592.933786,0.0,0.0
45789,13,39,593,1580,0.0,0.0,0.0,593.0,6.929412,592.933786,0.0,0.0
62590,80,90,549,1580,0.0,0.0,0.0,549.0,6.929412,592.933786,0.0,0.0


In [None]:
test["last_prod_order"] = 

### D. Product's Perspective - Purchased Frequency per Week

## Model Declaration and Setup for Predictive Analysis

### A. X_train, X_test, y_train, y_test split

In [25]:
train.head(2)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,avg_basket_size,avg_price,reordered
0,0,0,688,577,0.0,0.0,0.0,688.0,6.964706,584.422297,0.0
1,0,1,560,577,0.0,0.0,0.0,560.0,6.964706,584.422297,1.0


In [26]:
test.head(2)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,avg_basket_size,avg_price,reordered
0,85,0,688,577,0.0,0.0,0.0,688.0,6.964706,584.422297,0.0
1,85,1,560,577,1.0,0.0,0.0,560.0,6.964706,584.422297,1.0


In [27]:
# Features to drop for X
features_to_drop = ["target", "week"]

X_train = train.drop(columns=features_to_drop)
y_train = train[["target"]]

X_test = test.drop(columns=features_to_drop)
y_test = test[["target"]]

In [28]:
X_train.shape

(63750, 9)

### B. Model Declaration

In [29]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import mean_absolute_error
# from sklearn import tree

#### B - 1. RandomForestClassifier

In [30]:
rfc = RandomForestClassifier(random_state=888, max_depth=10, verbose=1)
rfc.fit(X_train, y_train)

  rfc.fit(X_train, y_train)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.1s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=888,
                       verbose=1, warm_start=False)

In [31]:
rfc_pred_test = rfc.predict(X_test)
rfc_pred_test

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


array([0., 0., 0., ..., 0., 0., 0.])

In [32]:
mean_absolute_error(rfc_pred_test, y_test)

0.027733333333333332

In [33]:
rfc_pred_test_prob = rfc.predict_proba(X_test)
rfc_pred_test_prob[:,0]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


array([0.99974917, 0.7859498 , 0.99992213, ..., 0.99981205, 0.99994039,
       0.99995841])

#### B - 2. DecisionTreeClassifier

In [34]:
dtc = DecisionTreeClassifier(criterion="gini", max_depth=10)
dtc.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [35]:
dtc_pred_test = dtc.predict(X_test)
dtc_pred_test

array([0., 0., 0., ..., 0., 0., 0.])

In [36]:
mean_absolute_error(dtc_pred_test, y_test)

0.025866666666666666

In [37]:
dtc_pred_test_prob = dtc.predict_proba(X_test)
dtc_pred_test_prob[:,0]

array([1.        , 0.80478088, 1.        , ..., 1.        , 1.        ,
       1.        ])

#### B - 3. LGBM

In [38]:
import lightgbm as lgb

lgb_clf = lgb.LGBMClassifier()
lgb_clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [39]:
lgb_pred_test = lgb_clf.predict(X_test)
lgb_pred_test

array([0., 0., 0., ..., 0., 0., 0.])

In [40]:
mean_absolute_error(lgb_pred_test, y_test)

0.0248

In [41]:
lgb_pred_test_prob = clf.predict_proba(X_test)
lgb_pred_test_prob[:,0]

NameError: name 'clf' is not defined

In [None]:
lgb_clf.score(X_train, y_train)

In [None]:
lgb_clf.score(X_test, y_test)

In [None]:
lgb.plot_importance(lgb_clf, max_num_features=10)

In [None]:
lgb.plot_importance(lgb_clf, max_num_features=10, importance_type='gain')

## Model Assessment - ROC AUC

In [None]:
from sklearn import metrics  # Important lib where we find various performance measures

# cmat = metrics.confusion_matrix(y_test, pred_y)
# print(cmat)

In [None]:
metrics.plot_roc_curve(rfc, X_test, y_test)

In [None]:
metrics.plot_roc_curve(dtc, X_test, y_test)

In [None]:
metrics.plot_roc_curve(clf, X_test, y_test)

In [None]:
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

y_pred = lgb_clf.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(estimator=lgb_clf, X=X_test, y_true=y_test, display_labels=['No', 'Yes'], normalize='true')

In [None]:
confusion

In [None]:
FP = confusion[0][1]
FN = confusion[1][0]
TP = confusion[1][1]
TN = confusion[0][0]

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

In [None]:
print(f'TPR: {TPR:.4f}')
print(f'TNR: {TNR:.4f}')
print(f'FPR: {FPR:.4f}')
print(f'FNR: {FNR:.4f}')
print(f'Precision: {PPV:.4f}')
print(f'False discovery rate: {FDR:.4f}')