<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data-preparation" data-toc-modified-id="Data-preparation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data preparation</a></span><ul class="toc-item"><li><span><a href="#Join-shops-with-different-ids-but-same-name-into-one-shop" data-toc-modified-id="Join-shops-with-different-ids-but-same-name-into-one-shop-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Join shops with different ids but same name into one shop</a></span></li><li><span><a href="#Join-items-with-same-name-into-one-based-on-the-one-used-in-test-dataset" data-toc-modified-id="Join-items-with-same-name-into-one-based-on-the-one-used-in-test-dataset-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Join items with same name into one based on the one used in test dataset</a></span><ul class="toc-item"><li><span><a href="#Impute-negative-price-value" data-toc-modified-id="Impute-negative-price-value-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Impute negative price value</a></span></li></ul></li></ul></li><li><span><a href="#Models" data-toc-modified-id="Models-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Models</a></span><ul class="toc-item"><li><span><a href="#Average-of-last-3-months" data-toc-modified-id="Average-of-last-3-months-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Average of last 3 months</a></span></li><li><span><a href="#Baseline:-average-of-last-3-months" data-toc-modified-id="Baseline:-average-of-last-3-months-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Baseline: average of last 3 months</a></span></li><li><span><a href="#Add-lag-features" data-toc-modified-id="Add-lag-features-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Add lag features</a></span></li></ul></li><li><span><a href="#LigthGBM" data-toc-modified-id="LigthGBM-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>LigthGBM</a></span></li></ul></div>

In [1]:
import warnings

warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import seaborn as sns
import datetime
import calendar
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelBinarizer, LabelEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor

%load_ext autoreload
%autoreload 2

from data_preparation import get_shops, get_item_categories, get_items, prepare_full_dataset, get_duplicate_dict, generate_all_ids_dataset, prepare_zero_dataset
from time_series_feats import get_avg_metric, get_lag_metric, calculate_and_add_lag

## Data preparation

In [2]:
pd.read_csv('sales_train.csv.zip')

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.00,1.0
1,03.01.2013,0,25,2552,899.00,1.0
2,05.01.2013,0,25,2552,899.00,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.00,1.0
...,...,...,...,...,...,...
2935844,10.10.2015,33,25,7409,299.00,1.0
2935845,09.10.2015,33,25,7460,299.00,1.0
2935846,14.10.2015,33,25,7459,349.00,1.0
2935847,22.10.2015,33,25,7440,299.00,1.0


In [2]:
shops = get_shops()
item_categories = get_item_categories()
items = get_items(item_categories)
train_df = prepare_full_dataset(pd.read_csv('sales_train.csv.zip'), items, shops, item_categories)
test_df = prepare_full_dataset(pd.read_csv('test.csv.zip'), items, shops, item_categories, default_date='01.11.2015')

### Join shops with different ids but same name into one shop

In [5]:
#shops[shops.shop_id.isin([0,1,10,11,57,58])]

Unnamed: 0,shop_name,shop_id,shop_type
0,"!Якутск Орджоникидзе, 56 фран",0,МАГАЗИН
1,"!Якутск ТЦ ""Центральный"" фран",1,ТЦ
10,Жуковский ул. Чкалова 39м?,10,МАГАЗИН
11,Жуковский ул. Чкалова 39м²,11,МАГАЗИН
57,"Якутск Орджоникидзе, 56",57,МАГАЗИН
58,"Якутск ТЦ ""Центральный""",58,ТЦ


### Join items with same name into one based on the one used in test dataset

In [3]:
dup_item_dict = get_duplicate_dict(items, test_df)
train_df['item_id'] = train_df['item_id'].apply(lambda x: dup_item_dict[x] if x in dup_item_dict else x)

#### Impute negative price value

In [5]:
train_df[train_df.item_price < 0]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,shop_name,shop_type,item_name,item_category_id,item_name_correct,item_category_name,category,sub_category,city,month,year
1158744,2013-05-15,4,32,2973,-1.0,1.0,"Москва ТЦ ""Серебряный Дом""",ТЦ,"DmC Devil May Cry [PS3, русские субтитры]",19,"dmc devil may cry [ps3, русские субтитры]",Игры - PS3,Игры,PS3,Москва,5,2013


In [4]:
emply_values = train_df[(train_df.item_id == 2973) & (train_df.year== 2013) & (train_df.month == 5) & (train_df.city == 'Москва')]

In [5]:
item_price_to_impute = emply_values[emply_values.item_price > 0]['item_price'].mean()

In [6]:
train_df['item_price'] = train_df['item_price'].apply(lambda x: x if x > 0 else item_price_to_impute)

In [8]:
train_df[train_df.item_price < 0]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,shop_name,shop_type,item_name,item_category_id,item_name_correct,item_category_name,category,sub_category,city,month,year


## Models

### Average of last 3 months 

In [19]:
train_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,shop_name,shop_type,item_name,item_category_id,item_name_correct,item_category_name,category,sub_category,city,month,year
0,2013-01-02,0,59,22154,999.0,1.0,"Ярославль ТЦ ""Альтаир""",ТЦ,ЯВЛЕНИЕ 2012 (BD),37,явление 2012 (bd),Кино - Blu-Ray,Кино,Blu-Ray,Ярославль,1,2013
1,2013-01-02,0,25,22154,999.0,1.0,"Москва ТРК ""Атриум""",ТРК,ЯВЛЕНИЕ 2012 (BD),37,явление 2012 (bd),Кино - Blu-Ray,Кино,Blu-Ray,Москва,1,2013
2,2013-01-03,0,25,22154,999.0,1.0,"Москва ТРК ""Атриум""",ТРК,ЯВЛЕНИЕ 2012 (BD),37,явление 2012 (bd),Кино - Blu-Ray,Кино,Blu-Ray,Москва,1,2013
3,2013-01-20,0,25,22154,999.0,1.0,"Москва ТРК ""Атриум""",ТРК,ЯВЛЕНИЕ 2012 (BD),37,явление 2012 (bd),Кино - Blu-Ray,Кино,Blu-Ray,Москва,1,2013
4,2013-01-23,0,25,22154,999.0,1.0,"Москва ТРК ""Атриум""",ТРК,ЯВЛЕНИЕ 2012 (BD),37,явление 2012 (bd),Кино - Blu-Ray,Кино,Blu-Ray,Москва,1,2013


In [7]:
baseline_train = train_df.groupby( ['date_block_num', 'shop_id', 'shop_type', 'item_id', 'month', 'year', 
                                    'category', 'item_category_id', 'sub_category', 'city']).agg({'item_cnt_day': 'sum', 'item_price':'mean'}).reset_index()
baseline_train = baseline_train.rename(columns={'item_cnt_day':'item_cnt_month'})

In [10]:
baseline_train.head()

Unnamed: 0,date_block_num,shop_id,shop_type,item_id,month,year,category,item_category_id,sub_category,city,item_cnt_month,item_price
0,0,2,ТЦ,27,1,2013,Игры,19,PS3,Адыгея,1.0,2499.0
1,0,2,ТЦ,33,1,2013,Кино,37,Blu-Ray,Адыгея,1.0,499.0
2,0,2,ТЦ,317,1,2013,Книги,45,Аудиокниги 1С,Адыгея,1.0,299.0
3,0,2,ТЦ,438,1,2013,Книги,45,Аудиокниги 1С,Адыгея,1.0,299.0
4,0,2,ТЦ,471,1,2013,Книги,49,Методические материалы 1С,Адыгея,2.0,399.0


In [16]:
def get_last_x_month_df(df, test_df, num_of_months):
    max_month = df['date_block_num'].max()
    max_month_used = max_month + 1 - num_of_months
    
    df_last_x_month = df[df.date_block_num >= max_month_used]
    df_last_x_month = df_last_x_month.groupby(['item_id', 'shop_id', 'month', 'year']).agg({'item_cnt_month': 'sum'}).reset_index()
    
    max_month_row = df[df.date_block_num == max_month].head(1)
    min_month_row = df[df.date_block_num == max_month_used].head(1)
    start_month, start_year = min_month_row.month.values[0], min_month_row.year.values[0]
    end_month, end_year = max_month_row.month.values[0], max_month_row.year.values[0]
    
    item_id_dataset_reduced = generate_all_ids_dataset(start_year, start_month, end_year, end_month, test_df)
    full_last_x_df = item_id_dataset_reduced.merge(df_last_x_month, how='left', on=['item_id', 'shop_id',
                                                                                    'month', 'year'])
    full_last_x_df = full_last_x_df.fillna(-100)
    return full_last_x_df

In [17]:
def get_avg_prediction(df):
    avg_results = df.groupby(['item_id', 'shop_id', 'ID']).agg({'item_cnt_month':'mean'}).reset_index()
    #avg_results['item_cnt_month'] = avg_results['item_cnt_month'].apply(lambda x: float(int(x + 0.5)))
    return avg_results

### Baseline: average of last 3 months

In [40]:
#last_3_month = get_last_x_month_df(baseline_train, test_df, 3)

In [41]:
#last_3_month['item_cnt_month'] = last_3_month['item_cnt_month'].apply(lambda x: x if x >= 0 else 0)

In [11]:
#last_3_month.tail(10)

In [43]:
#baseline_predict = get_avg_prediction(last_3_month)

In [45]:
def save_prediction(predicted_df, filename):
    subm = pd.read_csv('sample_submission.csv.zip')[['ID']]
    res = subm.merge(predicted_df, on='ID', how='left')
    res = res.fillna(0)
    res = res.drop(columns=['item_id', 'shop_id'])
    res.to_csv(filename, index=False)
    return res

In [12]:
#save_prediction(baseline_predict, 'avg_without_round.csv')

In [8]:
item_ids = items[~items.item_id.isin(dup_item_dict.keys())]['item_id'].unique()
shop_ids = shops[~shops.shop_id.isin([0, 1, 10])]['shop_id'].unique()

In [9]:
%%time
all_ids_all = generate_all_ids_dataset(2013, 1, 2015, 10, item_ids, shop_ids)
#all_ids_all = generate_all_ids_dataset(2013, 1, 2015, 10, test_df)

Wall time: 6min 53s


In [18]:
%%time
all_ids_for_test = generate_all_ids_dataset(2015, 11, 2015, 11, item_ids, shop_ids)

Wall time: 9.52 s


In [24]:
all_ids_all_extended

Unnamed: 0,item_id,shop_id,year,month,shop_type,item_category_id,category,sub_category,city,date_block_num
0,0,2,2013,1,ТЦ,40,Кино,DVD,Адыгея,0
1,0,2,2013,2,ТЦ,40,Кино,DVD,Адыгея,1
2,0,2,2013,3,ТЦ,40,Кино,DVD,Адыгея,2
3,0,2,2013,4,ТЦ,40,Кино,DVD,Адыгея,3
4,0,2,2013,5,ТЦ,40,Кино,DVD,Адыгея,4
...,...,...,...,...,...,...,...,...,...,...
42851113,22099,59,2015,6,ТЦ,83,Элементы питания,Элементы питания,Ярославль,29
42851114,22099,59,2015,7,ТЦ,83,Элементы питания,Элементы питания,Ярославль,30
42851115,22099,59,2015,8,ТЦ,83,Элементы питания,Элементы питания,Ярославль,31
42851116,22099,59,2015,9,ТЦ,83,Элементы питания,Элементы питания,Ярославль,32


In [10]:
%%time
all_ids_all_extended = prepare_zero_dataset(all_ids_all, items, shops, item_categories)

Wall time: 10min 25s


In [20]:
%%time
all_ids_test_extended = prepare_zero_dataset(all_ids_for_test, items, shops, item_categories)

Wall time: 5.29 s


In [21]:
all_ids_test_extended.head()

Unnamed: 0,item_id,shop_id,year,month,shop_type,item_category_id,category,sub_category,city
0,0,2,2015,11,ТЦ,40,Кино,DVD,Адыгея
1,0,3,2015,11,ТРК,40,Кино,DVD,Балашиха
2,0,4,2015,11,ТЦ,40,Кино,DVD,Волжский
3,0,5,2015,11,ТРЦ,40,Кино,DVD,Вологда
4,0,6,2015,11,МАГАЗИН,40,Кино,DVD,Воронеж


In [16]:
all_ids_all_extended

Unnamed: 0,item_id,shop_id,year,month,shop_type,item_category_id,category,sub_category,city
0,0,2,2013,1,ТЦ,40,Кино,DVD,Адыгея
1,0,2,2013,2,ТЦ,40,Кино,DVD,Адыгея
2,0,2,2013,3,ТЦ,40,Кино,DVD,Адыгея
3,0,2,2013,4,ТЦ,40,Кино,DVD,Адыгея
4,0,2,2013,5,ТЦ,40,Кино,DVD,Адыгея
...,...,...,...,...,...,...,...,...,...
42851113,22099,59,2015,6,ТЦ,83,Элементы питания,Элементы питания,Ярославль
42851114,22099,59,2015,7,ТЦ,83,Элементы питания,Элементы питания,Ярославль
42851115,22099,59,2015,8,ТЦ,83,Элементы питания,Элементы питания,Ярославль
42851116,22099,59,2015,9,ТЦ,83,Элементы питания,Элементы питания,Ярославль


In [11]:
block_num_df = baseline_train[['date_block_num', 'month', 'year']].drop_duplicates()

In [12]:
all_ids_all_extended_with_date = all_ids_all_extended.merge(block_num_df, how='left', on=['month', 'year'])

In [26]:
test_df.head()

Unnamed: 0,ID,shop_id,item_id,shop_name,shop_type,item_name,item_category_id,item_name_correct,item_category_name,category,sub_category,city,date,month,year
0,0,5,5037,"Вологда ТРЦ ""Мармелад""",ТРЦ,"NHL 15 [PS3, русские субтитры]",19,"nhl 15 [ps3, русские субтитры]",Игры - PS3,Игры,PS3,Вологда,2015-11-01,11,2015
1,5100,4,5037,"Волжский ТЦ ""Волга Молл""",ТЦ,"NHL 15 [PS3, русские субтитры]",19,"nhl 15 [ps3, русские субтитры]",Игры - PS3,Игры,PS3,Волжский,2015-11-01,11,2015
2,10200,6,5037,"Воронеж (Плехановская, 13)",МАГАЗИН,"NHL 15 [PS3, русские субтитры]",19,"nhl 15 [ps3, русские субтитры]",Игры - PS3,Игры,PS3,Воронеж,2015-11-01,11,2015
3,15300,3,5037,"Балашиха ТРК ""Октябрь-Киномир""",ТРК,"NHL 15 [PS3, русские субтитры]",19,"nhl 15 [ps3, русские субтитры]",Игры - PS3,Игры,PS3,Балашиха,2015-11-01,11,2015
4,20400,2,5037,"Адыгея ТЦ ""Мега""",ТЦ,"NHL 15 [PS3, русские субтитры]",19,"nhl 15 [ps3, русские субтитры]",Игры - PS3,Игры,PS3,Адыгея,2015-11-01,11,2015


In [27]:
%%time
full_test_with_zeroes = all_ids_test_extended.merge(
    test_df.drop(columns=['ID', 'shop_name', 'item_name', 'item_name_correct',
                          'item_category_name', 'date']), 
                 how='left', on=[ 'shop_id', 'shop_type', 'item_id', 'month', 'year', 
                                 'category', 'sub_category', 'city',  'item_category_id'])

Wall time: 3.74 s


In [28]:
full_test_with_zeroes.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1260327 entries, 0 to 1260326
Data columns (total 9 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   item_id           1260327 non-null  object
 1   shop_id           1260327 non-null  object
 2   year              1260327 non-null  int64 
 3   month             1260327 non-null  int64 
 4   shop_type         1260327 non-null  object
 5   item_category_id  1260327 non-null  int64 
 6   category          1260327 non-null  object
 7   sub_category      1260327 non-null  object
 8   city              1260327 non-null  object
dtypes: int64(3), object(6)
memory usage: 96.2+ MB


In [33]:
all_ids_all_extended.head()

Unnamed: 0,item_id,shop_id,year,month,shop_type,item_category_id,category,sub_category,city
0,0,2,2013,1,ТЦ,40,Кино,DVD,Адыгея
1,0,2,2013,2,ТЦ,40,Кино,DVD,Адыгея
2,0,2,2013,3,ТЦ,40,Кино,DVD,Адыгея
3,0,2,2013,4,ТЦ,40,Кино,DVD,Адыгея
4,0,2,2013,5,ТЦ,40,Кино,DVD,Адыгея


In [13]:
%%time
full_train_with_zeroes = all_ids_all_extended_with_date.merge(baseline_train, how='left', 
                                                    on=[ 'shop_id', 'shop_type', 'item_id', 'month', 'year', 
                                                         'category', 'sub_category', 'city', 'date_block_num', 'item_category_id'])

Wall time: 4min 30s


In [14]:
full_train_with_zeroes.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42851118 entries, 0 to 42851117
Data columns (total 12 columns):
 #   Column            Non-Null Count     Dtype  
---  ------            --------------     -----  
 0   item_id           42851118 non-null  object 
 1   shop_id           42851118 non-null  object 
 2   year              42851118 non-null  int64  
 3   month             42851118 non-null  int64  
 4   shop_type         42851118 non-null  object 
 5   item_category_id  42851118 non-null  int64  
 6   category          42851118 non-null  object 
 7   sub_category      42851118 non-null  object 
 8   city              42851118 non-null  object 
 9   date_block_num    42851118 non-null  int64  
 10  item_cnt_month    1608811 non-null   float64
 11  item_price        1608811 non-null   float64
dtypes: float64(2), int64(4), object(6)
memory usage: 4.2+ GB


In [36]:
del all_ids_all
del all_ids_all_extended
del all_ids_all_extended_with_date
#del all_ids_test_extended
del block_num_df
del item_ids
del shop_ids

In [15]:
category_encoder = LabelEncoder()
full_train_with_zeroes['category_code'] = category_encoder.fit_transform(full_train_with_zeroes['category'])
test_df['category_code'] = category_encoder.transform(test_df['category'])

sub_category_encoder = LabelEncoder()
full_train_with_zeroes['sub_category_code'] = sub_category_encoder.fit_transform(full_train_with_zeroes['sub_category'])
test_df['sub_category_code'] = sub_category_encoder.transform(test_df['sub_category'])

city_encoder = LabelEncoder()
full_train_with_zeroes['city_code'] = city_encoder.fit_transform(full_train_with_zeroes['city'])
test_df['city_code'] = city_encoder.transform(test_df['city'])

shop_encoder = LabelEncoder()
full_train_with_zeroes['shop_type_code'] = shop_encoder.fit_transform(full_train_with_zeroes['shop_type'])
test_df['shop_type_code'] = shop_encoder.transform(test_df['shop_type'])

In [16]:
full_train_with_zeroes.drop(columns=['category', 'sub_category', 'city', 'shop_type'], inplace=True)
test_df.drop(columns=['category', 'sub_category', 'city', 'shop_type'], inplace=True)

In [39]:
full_test_with_zeroes['date_block_num'] = 34
full_df = pd.concat([full_train_with_zeroes, full_test_with_zeroes])

In [40]:
del full_train_with_zeroes
del full_test_with_zeroes

In [18]:
full_train_with_zeroes.sort_values(by='date_block_num', inplace=True)

In [19]:
full_train_with_zeroes['item_price'] = full_train_with_zeroes.groupby(['item_id', 'shop_id'])['item_price'].ffill().bfill()
full_train_with_zeroes['item_price'] = full_train_with_zeroes.groupby(['item_id'])['item_price'].ffill().bfill()

In [20]:
full_train_with_zeroes.head(5)

Unnamed: 0,item_id,shop_id,year,month,item_category_id,date_block_num,item_cnt_month,item_price,category_code,sub_category_code,city_code,shop_type_code
0,0,2,2013,1,40,0,,149.0,11,6,0,4
7730002,18884,40,2013,1,40,0,,149.0,11,6,18,2
34666400,8060,44,2013,1,20,0,,149.0,5,13,20,4
7730036,18884,41,2013,1,40,0,,149.0,11,6,18,4
34666366,8060,43,2013,1,20,0,,149.0,5,13,19,1


In [24]:
full_train_with_zeroes.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42851118 entries, 0 to 42851117
Data columns (total 12 columns):
 #   Column             Non-Null Count     Dtype  
---  ------             --------------     -----  
 0   item_id            42851118 non-null  int64  
 1   shop_id            42851118 non-null  int64  
 2   year               42851118 non-null  int64  
 3   month              42851118 non-null  int64  
 4   item_category_id   42851118 non-null  int64  
 5   date_block_num     42851118 non-null  int64  
 6   item_cnt_month     42851118 non-null  float64
 7   item_price         42851118 non-null  float64
 8   category_code      42851118 non-null  int32  
 9   sub_category_code  42851118 non-null  int32  
 10  city_code          42851118 non-null  int32  
 11  shop_type_code     42851118 non-null  int32  
dtypes: float64(2), int32(4), int64(6)
memory usage: 3.5 GB


In [23]:
full_train_with_zeroes.fillna(0, inplace=True)

In [25]:
real_train = full_train_with_zeroes[full_train_with_zeroes.date_block_num > 11]

In [27]:
def enrich_real_train(full_df, real_df, cols):
    group_by_cols, group_cols = calculate_and_add_lag(full_df, cols)
    real_df = pd.merge(real_df, group_by_cols, on=group_cols, how='left')
    return real_df

### Add lag features

In [28]:
%%time
real_train = enrich_real_train(full_train_with_zeroes, real_train, ['date_block_num', 'item_id'])

Wall time: 23.8 s


In [29]:
%time
real_train = enrich_real_train(full_train_with_zeroes, real_train, ['date_block_num', 'shop_id'])

Wall time: 0 ns


In [30]:
%%time
real_train = enrich_real_train(full_train_with_zeroes, real_train, ['date_block_num', 'shop_id', 'item_id'])

Wall time: 12min 20s


In [31]:
%%time
real_train = enrich_real_train(full_train_with_zeroes, real_train, ['date_block_num',  'item_category_id'])

Wall time: 27.2 s


In [32]:
%%time
real_train = enrich_real_train(full_train_with_zeroes, real_train, ['date_block_num', 'category_code'])
#full_df = calculate_and_add_lag(full_df, ['date_block_num', 'shop_id', 'category_code'])

Wall time: 36.9 s


In [33]:
%%time
real_train = enrich_real_train(full_train_with_zeroes, real_train, ['date_block_num', 'sub_category_code'])

Wall time: 39 s


In [35]:
del full_train_with_zeroes

In [34]:
real_train.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27727194 entries, 0 to 27727193
Data columns (total 48 columns):
 #   Column                                 Non-Null Count     Dtype  
---  ------                                 --------------     -----  
 0   item_id                                27727194 non-null  int64  
 1   shop_id                                27727194 non-null  int64  
 2   year                                   27727194 non-null  int64  
 3   month                                  27727194 non-null  int64  
 4   item_category_id                       27727194 non-null  int64  
 5   date_block_num                         27727194 non-null  int64  
 6   item_cnt_month                         27727194 non-null  float64
 7   item_price                             27727194 non-null  float64
 8   category_code                          27727194 non-null  int32  
 9   sub_category_code                      27727194 non-null  int32  
 10  city_code                   

In [48]:

# %%time
# full_train_with_zeroes = calculate_and_add_lag(full_train_with_zeroes, ['date_block_num', 'shop_id', 'sub_category'])

In [49]:
# %%time
# full_train_with_zeroes = calculate_and_add_lag(full_train_with_zeroes, ['date_block_num', 'sub_category'])

In [50]:
# %%time
# full_train_with_zeroes = calculate_and_add_lag(full_train_with_zeroes, ['date_block_num', 'category'])

## LigthGBM

In [36]:
y = real_train['item_cnt_month']
real_train.drop(columns= ['item_cnt_month'], inplace=True)

In [38]:
X = real_train

In [45]:
X.to_csv('real_X.csv')
y.to_csv('real_y.csv')

MemoryError: Unable to allocate 9.61 MiB for an array with shape (37, 2127) and data type <U32

In [43]:
def cross_val_ts(df_x, df_y, date_block_start, model):
    scores = []
    for i in range(date_block_start, 33):
        X_train = df_x[df_x.date_block_num < i]
        y_train = df_y[df_x.date_block_num < i]
        
        X_test = df_x[df_x.date_block_num == i]
        y_test = df_y[df_x.date_block_num == i]
        
        model.fit(df_x[df_x.date_block_num < i], df_y[df_x.date_block_num < i])
        y_pred = model.predict(X_test)
        score = mean_squared_error(y_test, y_pred, squared=False)
        scores.append(score)
        print(f"{i} date_block_num: score {score}")
    return scores

In [44]:
lgbm = LGBMRegressor()


cv_score = cross_val_ts(X, y, 19, lgbm)


MemoryError: Unable to allocate 26.4 MiB for an array with shape (27727194,) and data type bool

In [None]:
cv_score

In [121]:
X = baseline_train.drop(columns=['item_cnt_month']).sort_values(by="date_block_num")
y = baseline_train['item_cnt_month']
lr = LinearRegression()
tscv = TimeSeriesSplit(5)
cv_score = cross_val_score(lr, X, y, cv=tscv, scoring='r2')