In [3]:
import numpy as np
import pandas as pd

from random import random
import os

from sklearn.metrics import mean_squared_log_error as msle
from dateutil.relativedelta import relativedelta
import statsmodels.api as sm

from matplotlib import pyplot as plt, style
style.use('seaborn-darkgrid')
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as px
from tqdm import tqdm

from ydata_profiling import ProfileReport

import gc
gc.enable()
from warnings import filterwarnings, simplefilter
filterwarnings('ignore')
simplefilter('ignore')

In [4]:
test = pd.read_csv('test_features.csv')
train = pd.read_csv('train_features.csv')

In [5]:
train_test = pd.concat([train, test], axis=0)

In [6]:
train_test

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,day_of_week,weekend,month,day_of_year,...,year,dcoilwtico,city,state,type,cluster,transactions,national_day,regional_day,local_day
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,Holiday,,
1,1,2013-01-01,1,BABY CARE,0.0,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,Holiday,,
2,2,2013-01-01,1,BEAUTY,0.0,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,Holiday,,
3,3,2013-01-01,1,BEVERAGES,0.0,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,Holiday,,
4,4,2013-01-01,1,BOOKS,0.0,0,2,0,1,1,...,2013,0.00,Quito,Pichincha,D,13,0.0,Holiday,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28507,3029263,2017-08-31,54,POULTRY,,0,4,0,8,243,...,2017,47.26,El Carmen,Manabi,C,3,0.0,,,
28508,3029264,2017-08-31,54,PREPARED FOODS,,0,4,0,8,243,...,2017,47.26,El Carmen,Manabi,C,3,0.0,,,
28509,3029265,2017-08-31,54,PRODUCE,,1,4,0,8,243,...,2017,47.26,El Carmen,Manabi,C,3,0.0,,,
28510,3029266,2017-08-31,54,SCHOOL AND OFFICE SUPPLIES,,0,4,0,8,243,...,2017,47.26,El Carmen,Manabi,C,3,0.0,,,


In [None]:
alphas = [0.95, 0.9, 0.8, 0.5]
#lags =[1, 7,30]

In [7]:
lags=[*range(1, 16), 16, 17, 18, 19, 20, 21, 22, 30, 31, 90, 180, 364]

In [8]:
for lag in lags:
        train_test[f"sales_t-{lag}"] = train_test.groupby(['store_nbr', 'family'])['sales'].transform(
        lambda x: x.shift(lag))

In [9]:
train_test

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,day_of_week,weekend,month,day_of_year,...,sales_t-18,sales_t-19,sales_t-20,sales_t-21,sales_t-22,sales_t-30,sales_t-31,sales_t-90,sales_t-180,sales_t-364
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,2,0,1,1,...,,,,,,,,,,
1,1,2013-01-01,1,BABY CARE,0.0,0,2,0,1,1,...,,,,,,,,,,
2,2,2013-01-01,1,BEAUTY,0.0,0,2,0,1,1,...,,,,,,,,,,
3,3,2013-01-01,1,BEVERAGES,0.0,0,2,0,1,1,...,,,,,,,,,,
4,4,2013-01-01,1,BOOKS,0.0,0,2,0,1,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28507,3029263,2017-08-31,54,POULTRY,,0,4,0,8,243,...,79.062996,91.671,80.759,49.666,51.015,72.303,96.937996,78.164,34.394,34.800
28508,3029264,2017-08-31,54,PREPARED FOODS,,0,4,0,8,243,...,91.000000,81.000,54.000,61.000,91.000,114.000,100.000000,66.000,101.000,66.000
28509,3029265,2017-08-31,54,PRODUCE,,1,4,0,8,243,...,877.304000,696.920,546.250,498.122,595.185,1045.444,633.389000,669.161,635.936,513.323
28510,3029266,2017-08-31,54,SCHOOL AND OFFICE SUPPLIES,,0,4,0,8,243,...,0.000000,0.000,0.000,0.000,0.000,0.000,0.000000,0.000,0.000,0.000


In [10]:
windows = [16, 17, 18, 30]

In [None]:
def roll_mean_features(dataframe, windows):
    def add_noise(x):
        return x + np.random.normal(size=len(x))

    for window in windows:
        dataframe['sales_roll_mean_' + str(window)] = dataframe.groupby(["store_nbr", "family"])['sales']. \
                                                          transform(
            lambda x: x.shift(16).rolling(window=window, min_periods=7, win_type="triang").mean())
        dataframe['sales_roll_mean_' + str(window)] = dataframe.groupby(["store_nbr", "family"])['sales_roll_mean_' + str(window)].transform(add_noise)
    return dataframe

In [15]:
def add_noise(x):
    return x + np.random.normal(size=len(x))

In [16]:
for window in range(windows):
        train_test['sales_roll_mean_' + str(window)] = train_test.groupby(["store_nbr", "family"])['sales']. \
                                                      transform(
            lambda x: x.shift(16).rolling(window=window, min_periods=0, win_type="triang").mean())
        train_test['sales_roll_mean_' + str(window)] = train_test.groupby(["store_nbr", "family"])['sales_roll_mean_' + str(window)].transform(add_noise)

In [17]:
train_test

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,day_of_week,weekend,month,day_of_year,...,sales_roll_mean_6,sales_roll_mean_7,sales_roll_mean_8,sales_roll_mean_9,sales_roll_mean_10,sales_roll_mean_11,sales_roll_mean_12,sales_roll_mean_13,sales_roll_mean_14,sales_roll_mean_15
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,2,0,1,1,...,,,,,,,,,,
1,1,2013-01-01,1,BABY CARE,0.0,0,2,0,1,1,...,,,,,,,,,,
2,2,2013-01-01,1,BEAUTY,0.0,0,2,0,1,1,...,,,,,,,,,,
3,3,2013-01-01,1,BEVERAGES,0.0,0,2,0,1,1,...,,,,,,,,,,
4,4,2013-01-01,1,BOOKS,0.0,0,2,0,1,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28507,3029263,2017-08-31,54,POULTRY,,0,4,0,8,243,...,78.412919,72.922771,73.212148,70.396023,66.778018,70.381253,67.003839,68.420401,68.854212,67.949804
28508,3029264,2017-08-31,54,PREPARED FOODS,,0,4,0,8,243,...,89.803064,85.640183,80.803345,80.873462,80.247505,82.335871,84.690871,86.195953,86.422247,85.452530
28509,3029265,2017-08-31,54,PRODUCE,,1,4,0,8,243,...,704.790541,670.175961,652.907163,648.504022,644.405117,662.669088,662.776601,666.759124,666.147988,670.415513
28510,3029266,2017-08-31,54,SCHOOL AND OFFICE SUPPLIES,,0,4,0,8,243,...,0.252993,1.316606,-0.330108,-0.101776,1.660569,-0.613346,0.856926,0.861250,-0.747044,-1.501694


In [19]:
df_pred = test.copy()

In [25]:
for dirname, _, filenames in os.walk('./submissions/turkish/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        df_pred['sub_'+filename.split('_')[1].split('.')[0]] = pd.read_csv(os.path.join(dirname, filename), index_col=['id']).sales

./submissions/turkish/test_features.csv


AttributeError: 'DataFrame' object has no attribute 'sales'

In [23]:
df_pred

Unnamed: 0,id,date,store_nbr,family,onpromotion,day_of_week,weekend,month,day_of_year,week_of_year,...,dcoilwtico,city,state,type,cluster,transactions,national_day,regional_day,local_day,sub_features
0,3000888,2017-08-16,1,AUTOMOTIVE,0,3,0,8,228,33,...,46.80,Quito,Pichincha,D,13,0.0,,,,0.000
1,3000889,2017-08-16,1,BABY CARE,0,3,0,8,228,33,...,46.80,Quito,Pichincha,D,13,0.0,,,,0.000
2,3000890,2017-08-16,1,BEAUTY,2,3,0,8,228,33,...,46.80,Quito,Pichincha,D,13,0.0,,,,0.000
3,3000891,2017-08-16,1,BEVERAGES,20,3,0,8,228,33,...,46.80,Quito,Pichincha,D,13,0.0,,,,0.000
4,3000892,2017-08-16,1,BOOKS,0,3,0,8,228,33,...,46.80,Quito,Pichincha,D,13,0.0,,,,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28507,3029263,2017-08-31,54,POULTRY,0,4,0,8,243,35,...,47.26,El Carmen,Manabi,C,3,0.0,,,,310.617
28508,3029264,2017-08-31,54,PREPARED FOODS,0,4,0,8,243,35,...,47.26,El Carmen,Manabi,C,3,0.0,,,,54.000
28509,3029265,2017-08-31,54,PRODUCE,1,4,0,8,243,35,...,47.26,El Carmen,Manabi,C,3,0.0,,,,0.000
28510,3029266,2017-08-31,54,SCHOOL AND OFFICE SUPPLIES,0,4,0,8,243,35,...,47.26,El Carmen,Manabi,C,3,0.0,,,,0.000


In [24]:
test_sub = pd.read_csv('./sumbissions/turkish/test.csv')

FileNotFoundError: [Errno 2] No such file or directory: './sumbissions/turkish/test.csv'