# Master Copy

## IDEAS

#### Feature Ideas
- Exercise #7 - slide 5
    - own-product moving window purchase frequencies
    - customer embedding (matrix-factorization)

## Setup

In [3]:
import numpy as np
import pandas as pd

from random import randrange

## Load Data

In [6]:
# Directory that holds the parquet inputs, and the file name of each table.
file_path = "./data/"

baskets_pq = "baskets.parquet"
coupons_pq = "coupons.parquet"
coupon_index_pq = "coupon_index.parquet"

# Read the three tables in one pass: baskets, coupons, coupon index.
b_df, c_df, ci_df = (
    pd.read_parquet(file_path + parquet_name)
    for parquet_name in (baskets_pq, coupons_pq, coupon_index_pq)
)

# Report (rows, columns) of every table as a quick sanity check.
print(f"baskets_df: {b_df.shape}")
print(f"coupons_df: {c_df.shape}")
print(f"coupon_index_df: {ci_df.shape}")

baskets_df: (68841598, 4)
coupons_df: (45000000, 4)
coupon_index_df: (10000, 3)


## Data Preprocessing

### A. Number of Target Shoppers

In [19]:
# How many shoppers the master frame is built for.
num_of_shoppers = 100

# Shopper ids are the consecutive integers 0 .. num_of_shoppers - 1.
shoppers = list(range(num_of_shoppers))
print(f"shoppers.shape: {shoppers}")

shoppers.shape: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


### B. Generate Master Dataframe

In [20]:
def get_original_prices(basket_df):
    """Return the highest observed price of every product.

    Parameters
    ----------
    basket_df : pandas.DataFrame
        Must contain the columns ``product`` and ``price``.

    Returns
    -------
    pandas.DataFrame
        One row per product with the columns ``product`` and ``max``,
        where ``max`` is the largest ``price`` seen for that product.
    """
    highest_price = basket_df.groupby("product")["price"].max()
    return highest_price.to_frame("max").reset_index()

In [21]:
def gen_complete_week_prod_df_for_shoppers(list_shoppers, orig_df, num_products=250, num_weeks=90):
    """Build the full (shopper, week, product) grid with each product's price.

    Parameters
    ----------
    list_shoppers : iterable
        Shopper ids to generate rows for.
    orig_df : pandas.DataFrame
        Must contain a ``max`` column holding one price per product; row ``i``
        is taken as the price of product id ``i``.
    num_products : int, default 250
        Number of products; product ids are ``0 .. num_products - 1``.
    num_weeks : int, default 90
        Number of weeks; week ids are ``0 .. num_weeks - 1``.

    Returns
    -------
    pandas.DataFrame
        Columns ``week``, ``product``, ``orig_price``, ``shopper`` with
        ``num_weeks * num_products`` rows per shopper, ordered by shopper, then
        week, then product. The index is NOT reset: it restarts at 0 for every
        shopper. An empty frame with those four columns is returned when
        ``list_shoppers`` is empty.

    Raises
    ------
    ValueError
        If ``orig_df`` does not hold exactly ``num_products`` prices.
    """
    orig_price_list = np.asarray(orig_df["max"].values)
    if len(orig_price_list) != num_products:
        raise ValueError(
            f"orig_df holds {len(orig_price_list)} prices but num_products={num_products}"
        )

    products = np.arange(num_products, dtype=np.int64)
    weeks = np.arange(num_weeks, dtype=np.int64)

    # Week-major layout: all products of week 0, then all products of week 1, ...
    # Building the columns separately keeps product ids integral even when the
    # prices are floats.
    df_template = pd.DataFrame(
        {
            "week": np.repeat(weeks, num_products),
            "product": np.tile(products, num_weeks),
            "orig_price": np.tile(orig_price_list, num_weeks),
        }
    )

    # One copy of the template per shopper, concatenated once (linear, not quadratic).
    shopper_frames = [df_template.assign(shopper=shopper) for shopper in list_shoppers]
    if not shopper_frames:
        return pd.DataFrame(columns=["week", "product", "orig_price", "shopper"])
    return pd.concat(shopper_frames, axis=0)

In [103]:
# Per-product price table (columns: product, max), computed from all baskets.
orig_price_tb = get_original_prices(b_df)
# Master frame: one row per (shopper, week, product) for the selected shoppers.
df = gen_complete_week_prod_df_for_shoppers(shoppers, orig_price_tb)
print(f"df.shape: {df.shape}")
df.head(3)

df.shape: (2250000, 4)


Unnamed: 0,week,product,orig_price,shopper
0,0,0,688,0
1,0,1,560,0
2,0,2,773,0


### C. Target Column added while adding Basket data

In [104]:
# Flag every basket row as a purchase, then left-join it onto the full grid.
b_df["target"] = 1
df = pd.merge(df, b_df, how="left", on=["week", "shopper", "product"])
# Grid rows with no matching basket row were not purchased -> target 0.
df = df.fillna({"target": 0})
print(f"df.shape: {df.shape}")

df.shape: (2250000, 6)


In [105]:
# df.sample(50)
# Peek at the merged frame: `price` stays NaN and `target` is 0 on rows
# that had no matching basket entry.
df.head(2)

Unnamed: 0,week,product,orig_price,shopper,price,target
0,0,0,688,0,,0.0
1,0,1,560,0,,0.0


### D. Discount_given column added while adding Discount data

In [106]:
# Flag every coupon row, then left-join the coupons onto the master frame.
c_df["coupon_given"] = 1
df = pd.merge(df, c_df, how="left", on=["week", "shopper", "product"])
# Rows without a coupon get discount 0 and coupon_given 0.
df = df.fillna({"discount": 0, "coupon_given": 0})
print(f"df.shape: {df.shape}")
df.head(3)

df.shape: (2250000, 8)


Unnamed: 0,week,product,orig_price,shopper,price,target,discount,coupon_given
0,0,0,688,0,,0.0,0.0,0.0
1,0,1,560,0,,0.0,0.0,0.0
2,0,2,773,0,,0.0,0.0,0.0


### E. Price with discount

In [107]:
# NOTE(review): `discount` is subtracted as an absolute amount in the same unit
# as `orig_price`. If the coupon table stores percentages, this should be
# orig_price * (1 - discount / 100) instead — confirm against the coupon schema.
df["price_w_discount"] = df["orig_price"] - df["discount"]

# price column is no longer necessary so removing it
# (it came from the basket merge and is NaN on every non-purchased row)
df = df.drop(columns=["price"])

In [108]:
# df.head(60)

## Split Data Before Feature Engineering

In [109]:
# Train: weeks 0 - 88 (week < cut_day)
# Test: week 89 only (week >= cut_day)
# NOTE: despite its name, `cut_day` is compared against the `week` column.
cut_day = 89

# Separate train and test
train = df[df["week"] < cut_day]
test = df[df["week"] >= cut_day]

# Same split applied to the raw basket table ...
b_train = b_df[b_df["week"] < cut_day]
b_test = b_df[b_df["week"] >= cut_day]

# ... and to the raw coupon table.
c_train = c_df[c_df["week"] < cut_day]
c_test = c_df[c_df["week"] >= cut_day]

print(f"Master: {df.shape}, Train: {train.shape}, Test: {test.shape}")

Master: (2250000, 8), Train: (2225000, 8), Test: (25000, 8)


### A. Average Basket Size per Shopper per week

In [110]:
# Items per (shopper, week) basket, averaged per shopper. Only weeks in which
# the shopper has at least one basket row contribute to the mean.
average_basket_size = (
    b_train
    .groupby(["shopper", "week"])["product"]
    .count()
    .groupby(level="shopper")
    .mean()
    .rename("average_basket_size")
    .reset_index()
)

In [111]:
# Attach the per-shopper average basket size to both splits (left join on shopper).
train, test = (
    split.merge(average_basket_size, how="left", on="shopper")
    for split in (train, test)
)

In [112]:
# Confirm that `average_basket_size` is now a column of the training frame.
train.head(2)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,average_basket_size
0,0,0,688,0,0.0,0.0,0.0,688.0,8.539326
1,0,1,560,0,0.0,0.0,0.0,560.0,8.539326


### B. Average Price per Shopper per Week

In [113]:
# Mean paid price over all of a shopper's training basket rows
# (a single value per shopper; the week is not part of the grouping).
average_price_per_shopper = (
    b_train
    .groupby("shopper")["price"]
    .mean()
    .rename("average_price_per_shopper")
    .reset_index()
)

In [114]:
# Attach the per-shopper average price to both splits (left join on shopper).
train, test = (
    split.merge(average_price_per_shopper, how="left", on="shopper")
    for split in (train, test)
)

In [115]:
# Confirm that `average_price_per_shopper` is now a column of the training frame.
train.head(2)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,average_basket_size,average_price_per_shopper
0,0,0,688,0,0.0,0.0,0.0,688.0,8.539326,587.203947
1,0,1,560,0,0.0,0.0,0.0,560.0,8.539326,587.203947


### C. Number of Unique Products purchased per Shopper

In [116]:
# Number of distinct products each shopper bought during the training weeks.
unique_products_per_shopper = b_train.groupby(['shopper'])["product"].nunique()
unique_products_per_shopper_df = unique_products_per_shopper.to_frame("num_unique_products").reset_index()

# The merge cell below joins `num_unique_prods_per_shopper`, which was never
# defined anywhere in the notebook (it only existed as leftover kernel state),
# so a fresh Restart & Run All failed with a NameError. Define it here with the
# `num_unique_prods` column name that the recorded outputs show.
num_unique_prods_per_shopper = unique_products_per_shopper.to_frame("num_unique_prods").reset_index()

In [117]:
# Attach the per-shopper distinct-product count to both splits (left join on shopper).
train, test = (
    split.merge(num_unique_prods_per_shopper, how="left", on="shopper")
    for split in (train, test)
)

In [118]:
# Confirm that the distinct-product count is now a column of the training frame.
train.head(2)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,average_basket_size,average_price_per_shopper,num_unique_prods
0,0,0,688,0,0.0,0.0,0.0,688.0,8.539326,587.203947,54
1,0,1,560,0,0.0,0.0,0.0,560.0,8.539326,587.203947,54


### D. Total Counts for Product

In [119]:
# How many training basket rows exist for each (shopper, product) pair.
# NOTE(review): the count covers the whole training window, so on a training row
# it also includes that row's own week and later weeks.
total_count_of_product = (
    b_train
    .groupby(["shopper", "product"])["product"]
    .count()
    .rename("total_count_of_product")
    .reset_index()
)

In [120]:
# Attach the per-(shopper, product) purchase count to both splits.
train, test = (
    split.merge(total_count_of_product, how="left", on=["shopper", "product"])
    for split in (train, test)
)

In [121]:
# Pairs that never appear in the training baskets have no count after the
# left join; treat them as zero purchases.
train = train.fillna({"total_count_of_product": 0})
test = test.fillna({"total_count_of_product": 0})

### G. Weeks Since Prior Order

#### a. train

In [123]:
# Train set
# `addkey` numbers the "spells" between purchases for every (shopper, product):
# it increases by one on the row right after a purchase, so a purchase row is the
# last row of its own spell.
#
# This is computed with groupby.shift / groupby.cumsum instead of
# groupby.apply(lambda ...): on pandas >= 2.0 a transform-like apply prepends the
# group keys to the result index, so the result no longer aligns with `train`.
# The vectorised form keeps train's index and avoids one Python call per group.
# Rows are assumed to be in ascending week order within each (shopper, product),
# which is how the master frame is generated.
purchased_prev_row = (
    train.groupby(['shopper', 'product'])["target"]
    .shift(fill_value=0)
    .eq(1)
    .astype(int)
)
addkey = purchased_prev_row.groupby([train['shopper'], train['product']]).cumsum()

# Position inside the current spell, starting at 1 = weeks since the prior order
# (or since week 0 if the pair has not been purchased yet).
train['weeks_since_prior_order'] = (
    train["target"].eq(0)
    .groupby([train['shopper'], train['product'], addkey])
    .cumcount()
    .add(1)
)

#### b. test

In [125]:
# Test set
# Continue each (shopper, product) counter from its last training row.
last_train_rows = train.groupby(['shopper', 'product'])[["target", "weeks_since_prior_order"]].last()

# If the last training week was itself a purchase, the counter restarts at 1 in
# the following week (exactly as it does inside the training data); otherwise it
# keeps counting up. Previously every pair received last + 1, which overstated
# the gap for pairs bought in the final training week.
last_weeks_since_prior_order = (last_train_rows["weeks_since_prior_order"] + 1).where(
    last_train_rows["target"].ne(1), 1
)

# Left join, like every other feature merge, so no test row can be dropped.
test = test.merge(last_weeks_since_prior_order, how="left", on=['shopper', 'product'])

In [126]:
# Random spot check of the training features; no random_state is passed, so the
# rows shown differ on every run.
train.sample(2)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,average_basket_size,average_price_per_shopper,num_unique_prods,total_count_of_product,weeks_since_prior_order
1168348,45,98,481,52,0.0,0.0,0.0,481.0,6.078652,573.565619,74,0.0,46
2144112,32,112,727,96,0.0,0.0,0.0,727.0,7.033708,565.047923,80,0.0,33


In [127]:
# Random spot check of the test features; no random_state is passed, so the
# rows shown differ on every run.
test.sample(2)

Unnamed: 0,week,product,orig_price,shopper,target,discount,coupon_given,price_w_discount,average_basket_size,average_price_per_shopper,num_unique_prods,total_count_of_product,weeks_since_prior_order
12548,89,48,465,50,0.0,0.0,0.0,465.0,9.348315,563.960337,85,0.0,90
20564,89,64,575,82,0.0,0.0,0.0,575.0,7.775281,572.184971,88,0.0,90


### H. Category 

#### f - 1. gensim Word2Vec

In [128]:
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [129]:
# Caution: this cell is slow.
# TEMPORARY - only the baskets of shoppers with id < num_shoppers are used to
# keep the Word2Vec corpus manageable.
# NOTE(review): the corpus is built from b_df (all weeks), not b_train, so it
# also contains baskets from the held-out week — confirm that this is intended.

# by basket => groupby(["week", "shopper"])

num_shoppers = 5_000

# Take an explicit copy of just the needed columns, so the dtype change below
# writes to an independent frame instead of a slice of b_df
# (this removes the SettingWithCopyWarning).
by_basket_str = b_df.loc[b_df["shopper"] < num_shoppers, ["week", "shopper", "product"]].copy()
# Word2Vec expects string tokens, so product ids are cast to str.
by_basket_str["product"] = by_basket_str["product"].astype(str)
# One "sentence" per basket: the list of products bought by a shopper in a week.
by_basket_str = by_basket_str.groupby(["week", "shopper"])["product"].apply(list).reset_index(name="list_prod")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  by_basket_str["product"] = by_basket_str["product"].astype(str)


In [130]:
# Each row is one basket: (week, shopper) plus the list of product ids as strings.
by_basket_str.head()

Unnamed: 0,week,shopper,list_prod
0,0,0,"[71, 91, 116, 123, 157, 167, 171, 184, 207, 225]"
1,0,1,"[22, 35, 92, 146, 168, 200, 211, 229]"
2,0,2,"[14, 121, 164, 218, 230, 249]"
3,0,3,"[6, 67, 98, 137, 145, 192, 222]"
4,0,4,"[25, 76, 156, 188, 234]"


In [131]:
# basket list
# Each element is one basket, i.e. a list of product-id strings.
by_basket = by_basket_str["list_prod"].tolist()

# Create a Word2Vec model
# Parameter notes (per the gensim Word2Vec documentation — verify against the
# installed gensim version):
#   min_count=10 : ignore products that occur fewer than 10 times in the corpus
#   negative=5   : negative sampling with 5 noise words
#   sample=0     : no down-sampling of frequent products
#   sg=1         : skip-gram training (rather than CBOW)
#   workers=10   : number of worker threads
# NOTE(review): no `seed` is passed and several workers are used, so the learned
# embeddings are presumably not reproducible between runs — confirm whether
# reproducibility matters here.
w2v_model = Word2Vec(
    sentences=by_basket,
    min_count=10,
    negative=5,
    sample=0,
    sg=1,
    workers=10,
)

# summarize the loaded model
print(w2v_model)

Word2Vec(vocab=250, size=100, alpha=0.025)


#### 