# HA 3 "Recommender System" take 05

## Setup

In [1]:
import numpy as np
import pandas as pd

from random import randrange
from gensim.models import Word2Vec

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error

## Load Data

In [2]:
file_path = "../../data/assignment_03/"

baskets_pq = "baskets.parquet"
coupons_pq = "coupons.parquet"
prediction_index_pq = "prediction_index.parquet"

b_df = pd.read_parquet(file_path + baskets_pq)
c_df = pd.read_parquet(file_path + coupons_pq)
pi_df = pd.read_parquet(file_path + prediction_index_pq)

print(f"baskets_df: {b_df.shape}")
print(f"coupons_df: {c_df.shape}")
print(f"prediction_index_df: {pi_df.shape}")

baskets_df: (68841598, 4)
coupons_df: (45000000, 4)
prediction_index_df: (500000, 3)


In [3]:
b_df.sample(1)

Unnamed: 0,week,shopper,product,price
67213102,87,87154,248,490


In [4]:
c_df.sample(1)

Unnamed: 0,week,shopper,product,discount
17111519,34,22303,88,15


In [5]:
pi_df.sample(1)

Unnamed: 0,week,shopper,product
54552,90,218,52


## Data Preprocessing

### A. dfs with 2000 Shoppers

In [6]:
# 2000 shoppers
b_df_2000 = b_df[b_df["shopper"] <= 2000]
print(f"b_df_2000: {b_df_2000.shape}")

b_df_2000: (1379319, 4)


In [7]:
c_df_2000 = c_df[c_df["shopper"] <= 2000]
print(f"c_df_2000: {c_df_2000.shape}")

c_df_2000: (900450, 4)


### B. dfs with 3 Shoppers

In [8]:
# Choose 3 shoppers within 2000 randomly
# random_three_shoppers = [randrange(2000) for shopper in range(3)]
random_three_shoppers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
random_three_shoppers

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

### C. Get Original Price to include it to master DF

In [9]:
orig_price = b_df.groupby(["product"])["price"].agg(["max"]).reset_index()
print(f"b_df.shape: {orig_price.shape}")
orig_price.head()

b_df.shape: (250, 2)


Unnamed: 0,product,max
0,0,688
1,1,560
2,2,773
3,3,722
4,4,620


In [10]:
orig_price["max"].values

array([688, 560, 773, 722, 620, 721, 568, 772, 669, 759, 711, 611, 781,
       788, 755, 620, 743, 725, 665, 744, 563, 463, 528, 495, 558, 540,
       628, 582, 442, 542, 644, 784, 684, 665, 706, 602, 696, 772, 699,
       593, 470, 660, 527, 546, 485, 590, 523, 592, 465, 613, 673, 640,
       607, 672, 773, 757, 761, 716, 713, 527, 795, 822, 573, 719, 575,
       719, 743, 637, 748, 752, 737, 629, 788, 837, 779, 654, 582, 721,
       625, 736, 614, 513, 598, 637, 501, 550, 481, 521, 471, 700, 549,
       605, 543, 577, 443, 586, 502, 632, 481, 519, 472, 506, 576, 539,
       637, 504, 478, 646, 528, 667, 646, 699, 727, 710, 579, 674, 715,
       737, 750, 532, 639, 659, 660, 483, 707, 657, 598, 619, 630, 540,
       613, 519, 646, 509, 528, 684, 652, 660, 581, 605, 580, 589, 551,
       470, 562, 506, 596, 453, 463, 623, 503, 496, 571, 501, 569, 613,
       575, 592, 566, 667, 679, 697, 510, 656, 628, 550, 707, 582, 588,
       673, 615, 639, 653, 765, 753, 692, 681, 694, 569, 593, 55

### D. Generate a master dataframe with 3 generated shoppers

In [11]:
def gen_complete_week_prod_df_for_shoppers(list_shoppers):
    products = np.array([p for p in range(250)])
    orig_price_list = orig_price["max"].values
    
    weeks = [i for i in range(90)]
    
    product_price_pair = np.stack((products, orig_price_list), axis=0).T
    
    prod_in_weeks = []
    for w in weeks:
        for p in product_price_pair:
            prod_in_weeks.append([w, *p])
    
    df_template = pd.DataFrame(data=prod_in_weeks, columns=["week", "product", "orig_price"])
    
    df_master = None;
    
    for idx, shoppers in enumerate(list_shoppers):

        df_local = df_template.copy()
        df_local["shopper"] = shoppers
        
        if idx == 0:
            df_master = df_local
        else:
            # stack it
            df_master = pd.concat([df_master, df_local], axis=0)
    return df_master
    

In [12]:
df_3 = gen_complete_week_prod_df_for_shoppers(random_three_shoppers)
print(f"df_3.shape: {df_3.shape}")
df_3.head(3)

df_3.shape: (225000, 4)


Unnamed: 0,week,product,orig_price,shopper
0,0,0,688,0
1,0,1,560,0
2,0,2,773,0


In [13]:
df_3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 225000 entries, 0 to 22499
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   week        225000 non-null  int64
 1   product     225000 non-null  int64
 2   orig_price  225000 non-null  int64
 3   shopper     225000 non-null  int64
dtypes: int64(4)
memory usage: 8.6 MB


### E. Generate Bakset and Coupon df for selected shoppers
- Bakset DF for selected shoppers: b_df_3
- Coupon DF for selected shoppers: c_df_#

In [14]:
b_df_3 = b_df_2000[b_df_2000["shopper"].isin(random_three_shoppers)]
print(f"b_df_3.shape: {b_df_3.shape}")
b_df_3.head()

b_df_3.shape: (7151, 4)


Unnamed: 0,week,shopper,product,price
0,0,0,71,629
1,0,0,91,605
2,0,0,116,715
3,0,0,123,483
4,0,0,157,592


In [15]:
# rahdom_three_shoppers: random_three_shoppers

c_df_3 = c_df_2000[c_df_2000["shopper"].isin(random_three_shoppers)]
print(f"c_df_3.shape: {c_df_3.shape}")
c_df_3.head()

c_df_3.shape: (4500, 4)


Unnamed: 0,week,shopper,product,discount
0,0,0,35,35
1,0,0,193,40
2,0,0,27,30
3,0,0,177,35
4,0,0,5,30


## Feature Engineering - Applicable regardless of split data set

### Feature 1. Column Bought

In [16]:
b_df_3["bought"] = 1
b_df_3.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  b_df_3["bought"] = 1


Unnamed: 0,week,shopper,product,price,bought
0,0,0,71,629,1
1,0,0,91,605,1


In [17]:
# Merge b_df_3 to df_3
df_3_b = df_3.merge(right=b_df_3, how="left", on=["week", "shopper", "product"])
df_3_b.head(3)

Unnamed: 0,week,product,orig_price,shopper,price,bought
0,0,0,688,0,,
1,0,1,560,0,,
2,0,2,773,0,,


In [18]:
df_3_b["bought"] = df_3_b["bought"].fillna(0)

In [19]:
df_3_b.sample(5)

Unnamed: 0,week,product,orig_price,shopper,price,bought
169637,48,137,660,7,,0.0
130741,72,241,699,5,,0.0
96936,27,186,499,4,,0.0
9415,37,165,550,0,,0.0
91468,5,218,739,4,,0.0


### Feature 2. Discount Given

In [20]:
c_df_3["coupon_given"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c_df_3["coupon_given"] = 1


In [21]:
df_3_b_c = df_3_b.merge(right=c_df_3, how="left", on=["week", "shopper", "product"])
df_3_b_c.head(2)

Unnamed: 0,week,product,orig_price,shopper,price,bought,discount,coupon_given
0,0,0,688,0,,0.0,,
1,0,1,560,0,,0.0,,


In [22]:
df_3_b_c["discount"] = df_3_b_c["discount"].fillna(0)
df_3_b_c["coupon_given"] = df_3_b_c["coupon_given"].fillna(0)

In [23]:
df_3_b_c.sample(3)

Unnamed: 0,week,product,orig_price,shopper,price,bought,discount,coupon_given
113071,2,71,629,5,,0.0,0.0,0.0
38485,63,235,458,1,458.0,1.0,0.0,0.0
108724,74,224,712,4,,0.0,0.0,0.0


### Feature 3. Price with discount
- originla price - discount

In [24]:
df_3_b_c["price_w_discount"] = df_3_b_c["orig_price"] - df_3_b_c["discount"]
df_3_b_c.head()

Unnamed: 0,week,product,orig_price,shopper,price,bought,discount,coupon_given,price_w_discount
0,0,0,688,0,,0.0,0.0,0.0,688.0
1,0,1,560,0,,0.0,0.0,0.0,560.0
2,0,2,773,0,,0.0,0.0,0.0,773.0
3,0,3,722,0,,0.0,0.0,0.0,722.0
4,0,4,620,0,,0.0,0.0,0.0,620.0


In [25]:
# Check orig_price != price_w_discount
df_3_b_c[df_3_b_c["discount"] != 0].head()

Unnamed: 0,week,product,orig_price,shopper,price,bought,discount,coupon_given,price_w_discount
5,0,5,721,0,,0.0,30.0,1.0,691.0
27,0,27,582,0,,0.0,30.0,1.0,552.0
35,0,35,602,0,,0.0,35.0,1.0,567.0
177,0,177,694,0,,0.0,35.0,1.0,659.0
193,0,193,574,0,,0.0,40.0,1.0,534.0


In [26]:
# drop price column
df = df_3_b_c.drop(columns="price")
df.head(2)

Unnamed: 0,week,product,orig_price,shopper,bought,discount,coupon_given,price_w_discount
0,0,0,688,0,0.0,0.0,0.0,688.0
1,0,1,560,0,0.0,0.0,0.0,560.0


### Feature 4. Bought with Coupon - USELESS

In [27]:
# # if bought == 1 and coupon_given == 1, bought WITH coupon
# bought_with_coupon = (df["bought"] == 1) & (df["coupon_given"] == 1)
# # bought_with_coupon

# df.loc[bought_with_coupon, "bought_with_coupon"] = 1
# df["bought_with_coupon"] = df["bought_with_coupon"].fillna(0)

### Feature 5. Bought without Coupon - USELESS

In [28]:
# # bought == 1 and coupon_given == 0
# bought_wo_coupon = (df["bought"] == 1) & (df["coupon_given"] == 0)
# # bought_wo_coupon

# df.loc[bought_wo_coupon, "bought_wo_coupon"] = 1
# df["bought_wo_coupon"] = df["bought_wo_coupon"].fillna(0)

### Feature 6. Category - UPCOMING

## Predictive Analysis

### A. Train Set and Test Set Prep

In [29]:
# Split data into train + test
# train: week 0 - 79
# test: week 80 - 89

# X_train, X_test, y_train, y_test

# 1. Separate train and test
train = df[df["week"] < 85]
test = df[df["week"] >= 85]

# We drop "week" column
train = train.drop(columns=["week"])
test = test.drop(columns=["week"])


print(f"Master: {df.shape}, Train: {train.shape}, Test: {test.shape}")

Master: (225000, 8), Train: (212500, 7), Test: (12500, 7)


In [30]:
# 2. Features and Target separation 
X_train = train.drop(columns=["bought"])
y_train = train[["bought"]]
X_test = test.drop(columns=["bought"])
y_test = test[["bought"]]

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (212500, 6), y_train: (212500, 1)
X_test: (12500, 6), y_test: (12500, 1)


### B. Valid Set (Week 90) Setup

In [31]:
df.columns

Index(['week', 'product', 'orig_price', 'shopper', 'bought', 'discount',
       'coupon_given', 'price_w_discount'],
      dtype='object')

In [32]:
# Prediction Index DF for random_three_shoppers

# rahdom_three_shoppers: random_three_shoppers

valid_set = pi_df[pi_df["shopper"].isin(random_three_shoppers)]
print(f"valid_set.shape: {valid_set.shape}")
valid_set.head()

valid_set.shape: (2500, 3)


Unnamed: 0,week,shopper,product
0,90,0,0
1,90,0,1
2,90,0,2
3,90,0,3
4,90,0,4


In [33]:
# orig_price["max"].values.tolist()

In [34]:
# Populate original price
original_price_list = []

for i in range(len(random_three_shoppers)):
    original_price_list.append(orig_price["max"].values.tolist())

o_p_list = list(np.concatenate(original_price_list).flat)

valid_set["orig_price"] = o_p_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_set["orig_price"] = o_p_list


In [35]:
valid_set.head()

Unnamed: 0,week,shopper,product,orig_price
0,90,0,0,688
1,90,0,1,560
2,90,0,2,773
3,90,0,3,722
4,90,0,4,620


In [36]:
vs_master = valid_set.copy()

##### 5 Scenarios 
- 1. No Discount
- 2. 15%
- 3. 20%
- 4. 25%
- 5. 30%

#### 5 - 1. No Discount

In [37]:
vs_nd = valid_set.copy()

In [38]:
# Features to add
# 1. bought: 1
# 2. discount: 0
# 3. coupon_given: 0
# 4. price_w_discount

In [39]:
vs_nd["discount"] = 0.0
vs_nd["coupon_given"] = 0
vs_nd["price_w_discount"] = vs_nd["orig_price"]

In [40]:
vs_master = vs_nd
print(f"vs_master.shape: {vs_master.shape}")
vs_master.head()

vs_master.shape: (2500, 7)


Unnamed: 0,week,shopper,product,orig_price,discount,coupon_given,price_w_discount
0,90,0,0,688,0.0,0,688
1,90,0,1,560,0.0,0,560
2,90,0,2,773,0.0,0,773
3,90,0,3,722,0.0,0,722
4,90,0,4,620,0.0,0,620


#### 5 - 2. 15% Discount

In [41]:
# Features to add
# 1. bought: 1
# 2. discount: 15%
# 3. coupon_given: 1
# 4. price_w_discount: orig_price - discount

In [42]:
vs_15d = valid_set.copy()
vs_15d["discount"] = vs_15d["orig_price"] * 0.15
vs_15d["coupon_given"] = 1
vs_15d["price_w_discount"] = vs_15d["orig_price"] - vs_15d["discount"]

#### 5 - 3. 20% Discount

In [43]:
vs_20d = valid_set.copy()
vs_20d["discount"] = vs_20d["orig_price"] * 0.20
vs_20d["coupon_given"] = 1
vs_20d["price_w_discount"] = vs_20d["orig_price"] - vs_20d["discount"]

#### 5 - 4. 25% Discount

In [44]:
vs_25d = valid_set.copy()
vs_25d["discount"] = vs_25d["orig_price"] * 0.25
vs_25d["coupon_given"] = 1
vs_25d["price_w_discount"] = vs_25d["orig_price"] - vs_25d["discount"]

In [45]:
frames = [vs_nd, vs_15d, vs_20d, vs_25d]
vs_master = pd.concat(frames)

In [46]:
vs_master.shape

(10000, 7)

In [47]:
vs_master.head()

Unnamed: 0,week,shopper,product,orig_price,discount,coupon_given,price_w_discount
0,90,0,0,688,0.0,0,688.0
1,90,0,1,560,0.0,0,560.0
2,90,0,2,773,0.0,0,773.0
3,90,0,3,722,0.0,0,722.0
4,90,0,4,620,0.0,0,620.0


In [48]:
# vs_master is the week 90 valid set

### C. Feature Engineering - Applicable to Test data Split

### Feature 7. Mean Basket Size per Shopper (Train Test split required)

In [None]:
X_train.head(2)

In [None]:
X_test.head(2)

In [51]:
df.head(2)

Unnamed: 0,week,product,orig_price,shopper,bought,discount,coupon_given,price_w_discount
0,0,0,688,0,0.0,0.0,0.0,688.0
1,0,1,560,0,0.0,0.0,0.0,560.0


In [52]:
# i.e.
b_df[(b_df["week"] == 0) & (b_df["shopper"] == 0)]["product"].count()

10

In [53]:
baskets_agg = b_df_2000.groupby(['shopper','week'], as_index=False)['product'].count()
baskets_agg = baskets_agg.rename({'product':'basket_size'}, axis = 1)
basket_mean = baskets_agg.groupby(['shopper'], as_index = False)['basket_size'].mean()
basket_mean

Unnamed: 0,shopper,basket_size
0,0,8.555556
1,1,7.388889
2,2,5.788889
3,3,8.355556
4,4,6.200000
...,...,...
1996,1996,9.011111
1997,1997,6.211111
1998,1998,5.022222
1999,1999,8.877778


In [54]:
df = pd.merge(basket_mean, df, on="shopper")
df.head()

Unnamed: 0,shopper,basket_size,week,product,orig_price,bought,discount,coupon_given,price_w_discount
0,0,8.555556,0,0,688,0.0,0.0,0.0,688.0
1,0,8.555556,0,1,560,0.0,0.0,0.0,560.0
2,0,8.555556,0,2,773,0.0,0.0,0.0,773.0
3,0,8.555556,0,3,722,0.0,0.0,0.0,722.0
4,0,8.555556,0,4,620,0.0,0.0,0.0,620.0


### Feature 8. Frequency of Product Purchase (Train Test split required)

In [55]:
# How often is the product bought in a week
total_bought = b_df.groupby(['product', "week"], as_index = False).agg('count')
total_bought = total_bought.rename({"shopper": "total_bought"}, axis=1)
total_bought = total_bought.drop(columns="price")
print(f"total_bought.shape: {total_bought.shape}")

total_bought.shape: (22500, 3)


In [56]:
total_bought.head()

Unnamed: 0,product,week,total_bought
0,0,0,1700
1,0,1,1844
2,0,2,1788
3,0,3,1830
4,0,4,1796


In [57]:
df = pd.merge(total_bought, df, on=["product", "week"])
print(f"df.shape: {df.shape}")
df.head(2)

df.shape: (225000, 10)


Unnamed: 0,product,week,total_bought,shopper,basket_size,orig_price,bought,discount,coupon_given,price_w_discount
0,0,0,1700,0,8.555556,688,0.0,0.0,0.0,688.0
1,0,0,1700,1,7.388889,688,0.0,0.0,0.0,688.0


In [58]:
df.sample(5)

Unnamed: 0,product,week,total_bought,shopper,basket_size,orig_price,bought,discount,coupon_given,price_w_discount
220428,244,82,1830,8,9.766667,694,0.0,0.0,0.0,694.0
221014,245,51,1501,4,6.2,549,0.0,0.0,0.0,549.0
193474,214,87,3370,4,6.2,505,0.0,0.0,0.0,505.0
191921,213,22,4197,1,7.388889,592,0.0,0.0,0.0,592.0
157446,174,84,708,6,7.522222,753,0.0,0.0,0.0,753.0


### Feature 9. Frequency Of Product Purchase per Shopper (Train Test Split required)

In [59]:
b_df["week"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89])

In [60]:
# For each product, how many times a week has it been bought?
num_weeks = len(b_df["week"].unique())
total_bought_shopper = b_df.groupby(["product", "shopper"], as_index=False).agg("count")
total_bought_shopper = total_bought_shopper.rename({"week": "total_bought"}, axis=1)
total_bought_shopper = pd.merge(total_bought_shopper, b_df, on=["product", "shopper"])

total_bought_shopper['total_bought_s'] = total_bought_shopper["total_bought"] / num_weeks

total_bought_shopper = total_bought_shopper[['shopper', 'product', 'total_bought_s']]
total_bought_shopper = total_bought_shopper.drop_duplicates()

print(f"total_bought_shopper.shape: {total_bought_shopper.shape}")

total_bought_shopper.shape: (7086910, 3)


In [61]:
# df = pd.merge(df, total_bought_shopper, on=["shopper", "product"], how="left")
df

Unnamed: 0,product,week,total_bought,shopper,basket_size,orig_price,bought,discount,coupon_given,price_w_discount
0,0,0,1700,0,8.555556,688,0.0,0.0,0.0,688.0
1,0,0,1700,1,7.388889,688,0.0,0.0,0.0,688.0
2,0,0,1700,2,5.788889,688,0.0,0.0,0.0,688.0
3,0,0,1700,3,8.355556,688,0.0,0.0,0.0,688.0
4,0,0,1700,4,6.200000,688,0.0,0.0,0.0,688.0
...,...,...,...,...,...,...,...,...,...,...
224995,249,89,6253,5,7.933333,499,0.0,0.0,0.0,499.0
224996,249,89,6253,6,7.522222,499,0.0,0.0,0.0,499.0
224997,249,89,6253,7,9.566667,499,1.0,0.0,0.0,499.0
224998,249,89,6253,8,9.766667,499,0.0,0.0,0.0,499.0


### Feature 8. num of weeks since last purchase (train test split required)

In [62]:
# for Each Shopper, go through each product, for each week

In [63]:
df

Unnamed: 0,product,week,total_bought,shopper,basket_size,orig_price,bought,discount,coupon_given,price_w_discount
0,0,0,1700,0,8.555556,688,0.0,0.0,0.0,688.0
1,0,0,1700,1,7.388889,688,0.0,0.0,0.0,688.0
2,0,0,1700,2,5.788889,688,0.0,0.0,0.0,688.0
3,0,0,1700,3,8.355556,688,0.0,0.0,0.0,688.0
4,0,0,1700,4,6.200000,688,0.0,0.0,0.0,688.0
...,...,...,...,...,...,...,...,...,...,...
224995,249,89,6253,5,7.933333,499,0.0,0.0,0.0,499.0
224996,249,89,6253,6,7.522222,499,0.0,0.0,0.0,499.0
224997,249,89,6253,7,9.566667,499,1.0,0.0,0.0,499.0
224998,249,89,6253,8,9.766667,499,0.0,0.0,0.0,499.0


In [64]:
nrordered_product_week = pd.DataFrame(df.groupby(['week', 'shopper','product']).size()).reset_index()
nrordered_product_week.columns = ['week', 'shopper', 'product','number']
nrordered_product_week.loc[nrordered_product_week['number'] > 1]

new_test = pd.DataFrame(df.groupby(['shopper','product','week','bought']).size().reset_index())
new_test.head(n=10)

mask = new_test.groupby(['shopper', 'product'])['bought'].cumsum().replace(0, False).astype(bool) # Mask starting zeros as NaN
df_out = new_test.assign(last_prod_order = new_test.groupby(['shopper','product', new_test["bought"].astype(bool).cumsum()]).cumcount().where(mask))
df_out.last_prod_order[df_out.last_prod_order.isna()]= 0
df_out = df_out[['shopper', 'product','week','last_prod_order']]

df = pd.merge(df, df_out, on = ['shopper', 'week', 'product'], how = 'left')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out.last_prod_order[df_out.last_prod_order.isna()]= 0


In [65]:
df

Unnamed: 0,product,week,total_bought,shopper,basket_size,orig_price,bought,discount,coupon_given,price_w_discount,last_prod_order
0,0,0,1700,0,8.555556,688,0.0,0.0,0.0,688.0,0.0
1,0,0,1700,1,7.388889,688,0.0,0.0,0.0,688.0,0.0
2,0,0,1700,2,5.788889,688,0.0,0.0,0.0,688.0,0.0
3,0,0,1700,3,8.355556,688,0.0,0.0,0.0,688.0,0.0
4,0,0,1700,4,6.200000,688,0.0,0.0,0.0,688.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
224995,249,89,6253,5,7.933333,499,0.0,0.0,0.0,499.0,0.0
224996,249,89,6253,6,7.522222,499,0.0,0.0,0.0,499.0,0.0
224997,249,89,6253,7,9.566667,499,1.0,0.0,0.0,499.0,0.0
224998,249,89,6253,8,9.766667,499,0.0,0.0,0.0,499.0,1.0


In [66]:
df.sample(10)

Unnamed: 0,product,week,total_bought,shopper,basket_size,orig_price,bought,discount,coupon_given,price_w_discount,last_prod_order
151232,168,3,3474,2,5.788889,588,0.0,0.0,0.0,588.0,0.0
129246,143,54,5530,6,7.522222,470,0.0,0.0,0.0,470.0,0.0
5885,6,48,2608,5,7.933333,568,0.0,35.0,1.0,533.0,0.0
65507,72,70,2150,7,9.566667,788,0.0,0.0,0.0,788.0,0.0
29769,33,6,2985,9,8.377778,665,0.0,0.0,0.0,665.0,0.0
84911,94,31,1467,1,7.388889,443,0.0,0.0,0.0,443.0,4.0
164426,182,62,2153,6,7.522222,656,0.0,0.0,0.0,656.0,0.0
52895,58,69,2046,5,7.933333,713,0.0,0.0,0.0,713.0,22.0
47083,52,28,1494,3,8.355556,607,0.0,0.0,0.0,607.0,0.0
46577,51,67,1716,7,9.566667,640,0.0,0.0,0.0,640.0,0.0


### D. Model Declaration and Setup

In [67]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import mean_absolute_error
# from sklearn import tree

In [68]:
X_test

Unnamed: 0,product,orig_price,shopper,discount,coupon_given,price_w_discount
21250,0,688,0,0.0,0.0,688.0
21251,1,560,0,0.0,0.0,560.0
21252,2,773,0,0.0,0.0,773.0
21253,3,722,0,0.0,0.0,722.0
21254,4,620,0,0.0,0.0,620.0
...,...,...,...,...,...,...
224995,245,549,9,0.0,0.0,549.0
224996,246,702,9,0.0,0.0,702.0
224997,247,670,9,0.0,0.0,670.0
224998,248,490,9,0.0,0.0,490.0


#### D - 1. RandomForestClassifier

In [69]:

rfc = RandomForestClassifier(random_state=888, max_depth=10)
rfc.fit(X_train, y_train)

  rfc.fit(X_train, y_train)


RandomForestClassifier(max_depth=10, random_state=888)

In [70]:
rfc_pred_test = rfc.predict(X_test)
rfc_pred_test

array([0., 0., 0., ..., 0., 0., 0.])

In [71]:
mean_absolute_error(rfc_pred_test, y_test)

0.03088

In [72]:
rfc_pred_test_prob = rfc.predict_proba(X_test)
rfc_pred_test_prob[:,0]

array([0.98944432, 0.96630579, 0.99701385, ..., 0.99735097, 0.99015862,
       0.78489126])

#### D - 2. DecisionTreeClassifer

In [73]:
dtc = DecisionTreeClassifier(criterion="gini", max_depth=10)
dtc.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=10)

In [74]:
dtc_pred_test = dtc.predict(X_test)
dtc_pred_test

array([0., 0., 0., ..., 0., 0., 0.])

In [75]:
mean_absolute_error(dtc_pred_test, y_test)

0.0296

In [76]:
dtc_pred_test_prob = dtc.predict_proba(X_test)
dtc_pred_test_prob[:,0]

array([1.        , 0.9567703 , 1.        , ..., 1.        , 1.        ,
       0.77108434])

### E. Valid Set

In [77]:
X_valid = vs_master

In [78]:
X_valid.columns

Index(['week', 'shopper', 'product', 'orig_price', 'discount', 'coupon_given',
       'price_w_discount'],
      dtype='object')

In [79]:
X_test.columns

Index(['product', 'orig_price', 'shopper', 'discount', 'coupon_given',
       'price_w_discount'],
      dtype='object')

In [80]:
X_valid.head(2)

Unnamed: 0,week,shopper,product,orig_price,discount,coupon_given,price_w_discount
0,90,0,0,688,0.0,0,688.0
1,90,0,1,560,0.0,0,560.0


#### E - 1. rfc 

In [81]:
rfc_valid_pred = rfc.predict(X_valid)
rfc_valid_pred

ValueError: Number of features of the model must match the input. Model n_features is 6 and input n_features is 7 

In [None]:
rfc_pred_prob = rfc.predict_proba(X_valid)
rfc_pred_prob[:,0]

#### E - 2. dtc

In [None]:
dtc_valid_pred = dtc.predict(X_valid)
dtc_valid_pred

In [None]:
dtc_pred_prob = dtc.predict_proba(X_valid)
dtc_pred_prob

In [None]:
# LGVM

### F. Model Assessment - AUC

In [None]:
dtc_pred_test

In [None]:
y_test

In [None]:
foo = pd.DataFrame(data=rfc_pred_test)
foo.

In [None]:
y_test

In [None]:
from sklearn import metrics  # Important lib where we find various performance measures

rfc_roc_plot = metrics.plot_roc_curve(dtc, foo["0"], y_test["bought"])

In [None]:
# 

# NOTE
- Remove Week