<a href="https://colab.research.google.com/github/uhdang/mlim_group_3_final_assignment/blob/main/master_copy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Master Copy

## IDEAS

#### Feature Ideas
- Exercise #7 - slide 5
    - own-product moving window purchase frequencies
    - customer embedding (matrix-factorization)

## Setup

In [64]:
import numpy as np
import pandas as pd

from random import randrange

## Load Data

In [65]:
# Directory that holds the parquet inputs, relative to the notebook.
file_path = "../data/"

# File names of the three input tables.
baskets_pq, coupons_pq, coupon_index_pq = (
    "baskets.parquet",
    "coupons.parquet",
    "coupon_index.parquet",
)

# Read every table into its own frame: baskets, coupons, coupon index.
b_df, c_df, ci_df = (
    pd.read_parquet(file_path + parquet_name)
    for parquet_name in (baskets_pq, coupons_pq, coupon_index_pq)
)

# Report the dimensions of each loaded table, one per line.
print(
    f"baskets_df: {b_df.shape}",
    f"coupons_df: {c_df.shape}",
    f"coupon_index_df: {ci_df.shape}",
    sep="\n",
)

baskets_df: (68841598, 4)
coupons_df: (45000000, 4)
coupon_index_df: (10000, 3)


## Data Preprocessing

### A. Number of Target Shoppers

In [66]:
# Size of the shopper subset the master frame is built for.
num_of_shoppers = 100

# Shopper ids 0 .. num_of_shoppers - 1.
# Note: despite the "shape" label, the print below shows the list itself.
shoppers = list(range(num_of_shoppers))
print(f"shoppers.shape: {shoppers}")

shoppers.shape: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


### B. Generate Master Dataframe

In [67]:
def get_original_prices(basket_df):
    """Return the highest observed price of every product.

    Parameters
    ----------
    basket_df : pandas.DataFrame
        Frame with at least the columns ``product`` and ``price``.

    Returns
    -------
    pandas.DataFrame
        One row per product (ordered by product) with the columns
        ``product`` and ``max``, where ``max`` is the maximum ``price``
        seen for that product.
    """
    price_by_product = basket_df.groupby(["product"])["price"]
    highest_price = price_by_product.max().rename("max")
    return highest_price.reset_index()

In [68]:
def gen_complete_week_prod_df_for_shoppers(list_shoppers, orig_df, num_products=250, num_weeks=90):
    """Build the complete (shopper, week, product) grid with each product's original price.

    The frame is assembled in one vectorised step instead of growing it with
    ``pd.concat`` once per shopper, which was quadratic in the number of shoppers.

    Parameters
    ----------
    list_shoppers : iterable
        Shopper ids to include. One full week x product block is emitted per
        id, in the order given.
    orig_df : pandas.DataFrame
        Must provide a ``"max"`` column with one price per product. Prices are
        matched to product ids by position (row ``i`` -> product ``i``), so the
        frame is expected to be ordered by product id.
    num_products : int, default 250
        Number of products; product ids are ``0 .. num_products - 1``.
    num_weeks : int, default 90
        Number of weeks; week ids are ``0 .. num_weeks - 1``.

    Returns
    -------
    pandas.DataFrame
        Columns ``week``, ``product``, ``orig_price``, ``shopper``; rows are
        ordered by shopper, then week, then product. The index is a fresh
        0..N-1 range (the previous implementation repeated the same index
        labels for every shopper). An empty ``list_shoppers`` yields an empty
        frame with the same columns rather than ``None``.

    Raises
    ------
    ValueError
        If ``orig_df`` does not hold exactly ``num_products`` prices.
    """
    orig_price_list = orig_df["max"].to_numpy()
    if len(orig_price_list) != num_products:
        raise ValueError(
            f"orig_df holds {len(orig_price_list)} prices but num_products is {num_products}"
        )

    products = np.arange(num_products)
    weeks = np.arange(num_weeks)
    shopper_ids = np.asarray(list(list_shoppers))

    n_shoppers = len(shopper_ids)
    rows_per_shopper = num_weeks * num_products

    # Layout of one shopper block: week varies slowest, product fastest.
    week_block = np.repeat(weeks, num_products)

    return pd.DataFrame(
        {
            "week": np.tile(week_block, n_shoppers),
            "product": np.tile(products, num_weeks * n_shoppers),
            "orig_price": np.tile(orig_price_list, num_weeks * n_shoppers),
            "shopper": np.repeat(shopper_ids, rows_per_shopper),
        }
    )

In [69]:
# Reference price per product: the maximum price observed in the baskets.
orig_price_tb = get_original_prices(b_df)
# Master frame for the selected shoppers: one row per (shopper, week, product).
df = gen_complete_week_prod_df_for_shoppers(shoppers, orig_price_tb)
print(f"df.shape: {df.shape}")
df.head(3)

df.shape: (2250000, 4)


Unnamed: 0,week,product,orig_price,shopper
0,0,0,688,0
1,0,1,560,0
2,0,2,773,0


### C. Target Column added while adding Basket data

In [70]:
# Mark every recorded purchase with target = 1 (this adds the column to b_df
# itself), left-join the purchases onto the grid, and give grid rows without
# a matching purchase target = 0.
b_df["target"] = 1
df = pd.merge(df, b_df, how="left", on=["week", "shopper", "product"]).fillna({"target": 0})
print(f"df.shape: {df.shape}")

df.shape: (2250000, 6)


In [71]:
# df.sample(50)
df.head(2)

Unnamed: 0,week,product,orig_price,shopper,price,target
0,0,0,688,0,,0.0
1,0,1,560,0,,0.0


### D. `coupon_given` column added while adding Discount data

In [72]:
# Flag every coupon row (this adds the column to c_df itself), left-join the
# coupons onto the grid, and fill rows that received no coupon with
# discount = 0 and coupon_given = 0.
c_df["coupon_given"] = 1
df = pd.merge(df, c_df, how="left", on=["week", "shopper", "product"]).fillna(
    {"discount": 0, "coupon_given": 0}
)
print(f"df.shape: {df.shape}")
df.head(3)

df.shape: (2250000, 8)


Unnamed: 0,week,product,orig_price,shopper,price,target,discount,coupon_given
0,0,0,688,0,,0.0,0.0,0.0
1,0,1,560,0,,0.0,0.0,0.0
2,0,2,773,0,,0.0,0.0,0.0


### E. Price with discount

In [73]:
# Effective price = original price minus the coupon discount; the raw basket
# price column is no longer necessary afterwards, so it is dropped.
# NOTE(review): the discount is subtracted as an absolute amount - confirm that
# coupons["discount"] is not expressed as a percentage.
df = df.assign(price_w_discount=df["orig_price"] - df["discount"]).drop(columns=["price"])

In [74]:
# df.head(60)

## Split Data Before Feature Engineering

In [75]:
# Train: weeks 0 - 88 (every week strictly before the cut)
# Test:  week 89 (every week from the cut onwards)
cut_day = 89

# Apply the same week cut-off to the master frame, the baskets and the coupons.
train, test = df.loc[df["week"] < cut_day], df.loc[df["week"] >= cut_day]
b_train, b_test = b_df.loc[b_df["week"] < cut_day], b_df.loc[b_df["week"] >= cut_day]
c_train, c_test = c_df.loc[c_df["week"] < cut_day], c_df.loc[c_df["week"] >= cut_day]

print(f"Master: {df.shape}, Train: {train.shape}, Test: {test.shape}")

Master: (2250000, 8), Train: (2225000, 8), Test: (25000, 8)


### A. Average Basket Size per Shopper per week

In [76]:
# Basket size = number of product rows per (shopper, week) in the training
# baskets; the feature is the mean of those sizes over each shopper's weeks.
basket_sizes = b_train.groupby(["shopper", "week"])["product"].count()
average_basket_size = (
    basket_sizes.groupby("shopper")
    .mean()
    .to_frame("average_basket_size")
    .reset_index()
)

In [77]:
# Attach the per-shopper average basket size to both splits.
train, test = (
    split.merge(average_basket_size, how="left", on="shopper")
    for split in (train, test)
)

In [78]:
train.head(2)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,average_basket_size
0,0,0,688,0,0.0,0.0,0.0,688.0,8.539326
1,0,1,560,0,0.0,0.0,0.0,560.0,8.539326


### B. Average Price per Shopper

In [79]:
# Mean price paid per shopper over all training basket rows.
average_price_per_shopper = (
    b_train.groupby(["shopper"])["price"]
    .mean()
    .rename("average_price_per_shopper")
    .reset_index()
)

In [80]:
# Attach the per-shopper average price to both splits.
train = pd.merge(train, average_price_per_shopper, how="left", on="shopper")
test = pd.merge(test, average_price_per_shopper, how="left", on="shopper")

In [81]:
train.head(2)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,average_basket_size,average_price_per_shopper
0,0,0,688,0,0.0,0.0,0.0,688.0,8.539326,587.203947
1,0,1,560,0,0.0,0.0,0.0,560.0,8.539326,587.203947


### C. Number of Unique Products purchased per Shopper

In [82]:
# Number of distinct products each shopper bought in the training baskets.
unique_products_per_shopper = b_train.groupby(["shopper"])["product"].nunique()
# Same values as a two-column frame (shopper, num_unique_products) for merging.
unique_products_per_shopper_df = unique_products_per_shopper.reset_index(name="num_unique_products")

In [83]:
# Attach the per-shopper distinct-product count to both splits.
train, test = (
    split.merge(unique_products_per_shopper_df, how="left", on="shopper")
    for split in (train, test)
)

In [84]:
train.head(2)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,average_basket_size,average_price_per_shopper,num_unique_products
0,0,0,688,0,0.0,0.0,0.0,688.0,8.539326,587.203947,54
1,0,1,560,0,0.0,0.0,0.0,560.0,8.539326,587.203947,54


### D. Total Counts for Product

In [85]:
# How many training basket rows exist for every (shopper, product) pair.
purchases_by_pair = b_train.groupby(["shopper", "product"])["product"]
total_count_of_product = purchases_by_pair.count().reset_index(name="total_count_of_product")

In [86]:
train

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,average_basket_size,average_price_per_shopper,num_unique_products
0,0,0,688,0,0.0,0.0,0.0,688.0,8.539326,587.203947,54
1,0,1,560,0,0.0,0.0,0.0,560.0,8.539326,587.203947,54
2,0,2,773,0,0.0,0.0,0.0,773.0,8.539326,587.203947,54
3,0,3,722,0,0.0,0.0,0.0,722.0,8.539326,587.203947,54
4,0,4,620,0,0.0,0.0,0.0,620.0,8.539326,587.203947,54
...,...,...,...,...,...,...,...,...,...,...,...
2224995,88,245,549,99,0.0,0.0,0.0,549.0,8.797753,618.717752,76
2224996,88,246,702,99,0.0,0.0,0.0,702.0,8.797753,618.717752,76
2224997,88,247,670,99,0.0,0.0,0.0,670.0,8.797753,618.717752,76
2224998,88,248,490,99,0.0,0.0,0.0,490.0,8.797753,618.717752,76


In [87]:
# Attach the (shopper, product) purchase counts to both splits; pairs that were
# never bought in the training baskets come out as NaN.
train = pd.merge(train, total_count_of_product, how="left", on=["shopper", "product"])
test = pd.merge(test, total_count_of_product, how="left", on=["shopper", "product"])

In [88]:
# Rows with a missing purchase count get a count of 0.
for split in (train, test):
    split["total_count_of_product"] = split["total_count_of_product"].fillna(0)

### G. Weeks Since Prior Order

#### a. train

In [89]:
# Train set
# Segment id per (shopper, product): the number of purchases strictly before the
# current row. It is computed as "running total minus the current row", which
# equals the shifted cumulative sum used previously but stays aligned with
# train's index on every pandas version (groupby.apply with a transform-like
# lambda prepends the group keys to the index on pandas >= 2.0).
# Relies on the rows of each (shopper, product) pair being in week order.
purchased = train["target"].eq(1).astype(int)
addkey = purchased.groupby([train["shopper"], train["product"]]).cumsum() - purchased
# Within each segment, count rows starting at 1: weeks elapsed since the previous
# purchase (or since week 0 when there has been none).
train['weeks_since_prior_order'] = train["target"].eq(0).groupby([train['shopper'], train['product'], addkey]).cumcount().add(1)

#### b. test

In [90]:
# Test set
# Take the last training row of every (shopper, product) pair. If that row is
# itself a purchase, the test week comes exactly 1 week after the prior order;
# otherwise the running counter simply advances by one. (Previously a purchase
# in the last training week was ignored and the counter kept growing.)
last_train_rows = train.groupby(['shopper', 'product'])[["target", "weeks_since_prior_order"]].last()
last_weeks_since_prior_order = (
    (last_train_rows["weeks_since_prior_order"] + 1)
    .where(last_train_rows["target"].ne(1), 1)
    .rename("weeks_since_prior_order")
)
test = test.merge(last_weeks_since_prior_order, on=['shopper', 'product'])

In [91]:
train.sample(2)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,average_basket_size,average_price_per_shopper,num_unique_products,total_count_of_product,weeks_since_prior_order
1579381,87,131,519,70,0.0,0.0,0.0,519.0,6.651685,587.494932,76,0.0,88
1853007,25,7,772,83,0.0,0.0,0.0,772.0,6.955056,588.52504,69,0.0,26


In [92]:
test.sample(2)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,average_basket_size,average_price_per_shopper,num_unique_products,total_count_of_product,weeks_since_prior_order
4674,89,174,753,18,0.0,0.0,0.0,753.0,8.146067,597.177931,78,0.0,90
11363,89,113,710,45,0.0,0.0,0.0,710.0,7.168539,579.360502,67,0.0,90


### H. Category 

#### H - 1. gensim Word2Vec

In [93]:
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

ModuleNotFoundError: No module named 'gensim'

In [None]:
# Caution ! Takes REAALLY Long !!
# TEMPORARY - only the baskets of shoppers with id below `num_shoppers` are used
# to build the product "sentences".

# by basket => groupby(["week", "shopper"])

num_shoppers = 5_000

# Filter, cast and group in one chain on a fresh frame, so no column is ever
# assigned on a filtered slice of b_df (that raised SettingWithCopyWarning).
# Result: one row per (week, shopper) with the basket as a list of product-id strings.
by_basket_str = (
    b_df.loc[b_df["shopper"] < num_shoppers, ["week", "shopper", "product"]]
    .assign(product=lambda frame: frame["product"].astype(str))
    .groupby(["week", "shopper"])["product"]
    .apply(list)
    .reset_index(name="list_prod")
)

In [None]:
by_basket_str.head()

In [None]:
# basket list: one list of product-id strings per (week, shopper) basket
by_basket = by_basket_str["list_prod"].tolist()

# Create a Word2Vec model, treating every basket as a sentence and every
# product id as a word.
# NOTE(review): min_count=10 presumably drops products seen fewer than 10 times;
# the later lookup of all 250 product keys would then fail - confirm.
# NOTE(review): with workers=10 the training is presumably not reproducible
# run-to-run - confirm whether that matters for the derived categories.
w2v_model = Word2Vec(
    sentences=by_basket,
    min_count=10,
    negative=5,
    sample=0,
    sg=1,
    workers=10,
)

# summarize the loaded model
print(w2v_model)

In [None]:
# Product ids 0..249 as strings, i.e. in the form used as Word2Vec tokens.
product_keys = list(map(str, range(250)))
# Embedding vectors looked up for those keys, in product-id order.
product_vectors = w2v_model.wv[product_keys]

In [None]:
from sklearn.cluster import KMeans

# Cluster the product embeddings into 25 groups; random_state pins the result.
kmeans = KMeans(n_clusters=25, random_state=0)
kmeans.fit(product_vectors)
kmeans.labels_

In [None]:
# Generate a product category
# One row per product id (0..249); the k-means cluster label is its category.
products = [product for product in range(250)]
prods_cat_table = pd.DataFrame(data=products, columns=["product"])
prods_cat_table["category"] = kmeans.labels_
# Both columns are cast to pandas' category dtype.
# NOTE(review): "product" in train/test is built as an integer column, so the
# later merge joins an integer key with a categorical key - confirm the dtype
# of "product" after that merge is the intended one.
prods_cat_table[['product', 'category']] = prods_cat_table[['product', 'category']].astype('category')

In [None]:
prods_cat_table

In [None]:
# Attach the product -> category lookup to both splits.
train, test = (
    split.merge(right=prods_cat_table, how="left", on=["product"])
    for split in (train, test)
)

In [None]:
train.head(2)

### I. Weeks since prior order from the Same Category

#### I - 1. train

In [None]:
# Number of purchases per (shopper, category, week) in the training frame.
new_test_cat = pd.DataFrame(train.groupby(['shopper','category','week'])['target'].sum().reset_index())
# True from the first week in which the (shopper, category) pair has a purchase.
mask = new_test_cat.groupby(['shopper', 'category'])['target'].cumsum().gt(0)
# Weeks elapsed since the most recent purchase week of the category: the counter
# restarts at 0 in every purchase week and is NaN before the first purchase.
df_out_cat = new_test_cat.assign(last_cat_order = new_test_cat.groupby(['shopper','category', new_test_cat.target.astype(bool).cumsum()]).cumcount().where(mask))
# Weeks before the first purchase get 0 as well. This is a plain column
# assignment; the previous chained indexing (df[col][mask] = 0) triggers
# SettingWithCopyWarning and does nothing under copy-on-write.
# NOTE(review): 0 therefore means both "never bought yet" and "bought this
# week", and the weekly sum includes the row's own target - check for leakage.
df_out_cat["last_cat_order"] = df_out_cat["last_cat_order"].fillna(0)
df_out_cat = df_out_cat[['shopper', 'category','week','last_cat_order']]
train = train.merge(df_out_cat, on = ['shopper', 'week', 'category'], how = 'left')

In [None]:
train.head(2)

#### I - 2. test

In [None]:
# Test set
# Carry the category counter forward: last training value per (shopper, product)
# plus one week.
last_cat_order_by_pair = train.groupby(["shopper", "product"])["last_cat_order"].last()
last_weeks_since_prior_order_from_same_cat = last_cat_order_by_pair + 1
test = pd.merge(test, last_weeks_since_prior_order_from_same_cat, on=["shopper", "product"])

In [None]:
test.sample(2)

## Model Declaration and Setup for Predictive Analysis

### A. X_train, X_test, y_train, y_test split

In [None]:
# Columns excluded from the feature matrices: the label and the week id.
features_to_drop = ["target", "week"]

# Feature matrix / label vector for each split.
X_train, y_train = train.drop(columns=features_to_drop), train["target"]
X_test, y_test = test.drop(columns=features_to_drop), test["target"]

In [None]:
X_train

In [None]:
y_train

### B. Model Declaration

In [None]:
import lightgbm as lgb
from lightgbm.sklearn import LGBMClassifier

In [None]:
from collections import Counter

# Class frequencies of the training labels (0 = not bought, 1 = bought).
counter = Counter(y_train)
n_negative, n_positive = counter[0], counter[1]
# Negative-to-positive ratio: the candidate value for scale_pos_weight.
estimate = n_negative / n_positive

# The classifier currently keeps the neutral weight 1; `estimate` is only
# computed for inspection and is not passed in.
lgb_clf = LGBMClassifier(scale_pos_weight=1)

In [None]:
counter

In [None]:
estimate

In [None]:
# Every non-numeric feature column is treated as categorical.
categorical = list(X_train.select_dtypes(exclude=np.number).columns)
# Cast those columns to pandas' category dtype in both feature matrices.
for frame in (X_train, X_test):
    for column in categorical:
        frame[column] = frame[column].astype('category')

In [None]:
y_train

In [None]:
X_train.info()

In [None]:
# Train the classifier; the non-numeric columns collected in `categorical`
# are passed explicitly as categorical features.
lgb_clf.fit(X_train, y_train, categorical_feature=categorical)

In [None]:
lgb_clf.score(X_train, y_train)

In [None]:
lgb_clf.score(X_test, y_test)

### Evaluation

In [None]:
lgb.plot_importance(lgb_clf, max_num_features=10)

In [None]:
lgb.plot_importance(lgb_clf, max_num_features=10, importance_type='gain')

In [None]:
# plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay.from_estimator
# (available since 1.0) draws the same ROC curve for the fitted classifier on
# the test split.
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(estimator=lgb_clf, X=X_test, y=y_test)

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator (available since 1.0) replaces it.
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix

# Hard class predictions and the raw (count) confusion matrix on the test split.
y_pred = lgb_clf.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
# Plot of the confusion matrix, normalised over the true labels (rows).
ConfusionMatrixDisplay.from_estimator(estimator=lgb_clf, X=X_test, y=y_test, display_labels=['No', 'Yes'], normalize='true')

In [None]:
y_pred

In [None]:
X_test.info()

In [None]:
confusion

In [None]:
# Unpack the 2x2 confusion matrix row by row: [[TN, FP], [FN, TP]].
(TN, FP), (FN, TP) = confusion

# Rates conditioned on the true class.
TPR = TP/(TP+FN)  # sensitivity, hit rate, recall, or true positive rate
TNR = TN/(TN+FP)  # specificity or true negative rate
FPR = FP/(FP+TN)  # fall out or false positive rate
FNR = FN/(TP+FN)  # false negative rate

# Rates conditioned on the predicted class.
PPV = TP/(TP+FP)  # precision
NPV = TN/(TN+FN)  # negative predictive value
FDR = FP/(TP+FP)  # false discovery rate -> 1-Precision

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)