# Base model 

It is created by calculating the average number of items sold per week for each product.
The model takes a product_id and the week for which to make a prediction as input, and returns the prediction (the average calculated from the training set).

In [1]:
import pandas as pd

In [48]:
# Load the raw session log (one JSON record per line).
sessions = pd.read_json("../data/raw/second/sessions.jsonl", lines=True)

# Keep only the BUY_PRODUCT events, derive a calendar date from the timestamp
# for the later grouping, and drop the columns that would vanish in the
# aggregation anyway.
buy_sessions = (
    sessions[sessions['event_type'] == 'BUY_PRODUCT']
    .assign(date=lambda events: events['timestamp'].dt.date)
    .drop(columns=['user_id', 'event_type', 'offered_discount', 'session_id', 'timestamp'])
)

# Count purchases per (date, product). The unstack/stack round trip inserts an
# explicit 0 for every product that was not sold on a given date.
# product_id is cast to str at the end: an ID does not have to be a number.
df = (
    buy_sessions
    .groupby(by=['date', 'product_id'])
    .count()
    .unstack(fill_value=0)
    .stack()
    .rename(columns={'purchase_id': 'amount'})
    .reset_index()
    .astype({'product_id': str})
)

In [49]:
# The dates were reduced to plain `date` objects earlier; pd.Grouper needs real
# timestamps, so convert them back before bucketing the rows into weeks and
# summing per (week, product_id).
df = (
    df
    .assign(date=lambda daily: pd.to_datetime(daily['date']))
    .groupby(by=[pd.Grouper(key='date', freq='W'), pd.Grouper(key='product_id')])
    .sum()
)

In [53]:
# Move product_id out of the index back into a regular column; the weekly date
# stays as the index.
df = df.reset_index('product_id')

In [54]:
# Build the training and test sets: weeks up to and including the split date
# go to train, later weeks go to test.
split_date = pd.Timestamp('2021-09-01')
train = df[df.index <= split_date].copy()
test = df[df.index > split_date].copy()

In [122]:
# Model definition
class BaseModel:
    """Baseline predictor that returns a per-product average of `amount`.

    The averages are computed once, at construction time, from the rows of the
    training frame (in this notebook: one row per product and week), rounded
    to the nearest integer and stored in `avg_per_prod`.
    """

    def __init__(self, train_data):
        """Compute the rounded mean `amount` for every product.

        Args:
            train_data: DataFrame with at least the columns `product_id` and
                `amount`.
        """
        # Use the frame passed in by the caller (not a notebook global) and
        # aggregate only `amount`, so extra columns cannot affect the result.
        self.avg_per_prod = (
            train_data
            .groupby(by='product_id')[['amount']]
            .mean()
            .round()
            .astype({'amount': int})
        )

    def predict(self, prod_id):
        """Return the rounded average `amount` for one product.

        Args:
            prod_id: Product identifier, as it appears in the training data.

        Returns:
            The integer average computed for that product.

        Raises:
            KeyError: If `prod_id` was not present in the training data.
        """
        # Label-based lookup; avoids positional `[0]` on a label-indexed Series.
        return self.avg_per_prod.at[prod_id, 'amount']

In [123]:
# Create the model from the training set.
model = BaseModel(train_data=train)

In [126]:
# Sanity check: predictions for three known products match the expected
# rounded averages.
assert [model.predict(pid) for pid in ('1002', '1006', '1314')] == [2, 1, 0]

In [113]:
# Attach the model's prediction for every (week, product) row of the test set.
test = test.assign(amount_prediction=test['product_id'].apply(model.predict))
test

Unnamed: 0_level_0,product_id,amount,amount_prediction
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-09-05,1002,0,2
2021-09-05,1003,0,2
2021-09-05,1004,0,2
2021-09-05,1005,0,2
2021-09-05,1006,0,1
...,...,...,...
2021-12-26,1310,0,0
2021-12-26,1311,0,0
2021-12-26,1312,0,0
2021-12-26,1313,0,0


In [114]:
from sklearn.metrics import mean_squared_error as MSE
# RMSE is the quality measure used to compare different versions of the model.
# The square root is taken explicitly instead of passing `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, while
# this form works on every version.
rmse = MSE(test['amount'], test['amount_prediction']) ** 0.5
rmse

2.6559454030299237

In [128]:
import pickle
file_name = "base_model.pkl"

# save -- the context manager flushes and closes the file before it is re-read
with open(f"../models/{file_name}", "wb") as model_file:
    pickle.dump(model, model_file)

# load -- only unpickle files you trust; this one was written just above
with open(f"../models/{file_name}", "rb") as model_file:
    base_model_loaded = pickle.load(model_file)
print(base_model_loaded)

<__main__.BaseModel object at 0x7fd7540e3590>
