In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import os
from tqdm.notebook import tqdm
from sklearn.preprocessing import OrdinalEncoder
import random 

import torch
import pandas as pd
import numpy as np
import gc
import math
import datetime
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error

import os
import time
import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader,Dataset
from tqdm import tqdm_notebook as tqdm

device = torch.device('cuda')
# device = torch.device('cpu')

NUM_ITEMS = 30490
DAYS_PRED = 28


def seed_everything(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything()    

# Make data

In [None]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
path = "../data"
calendar = pd.read_csv(os.path.join(path, "calendar.csv"))
selling_prices = pd.read_csv(os.path.join(path, "sell_prices.csv"))
sample_submission = pd.read_csv(os.path.join(path, "sample_submission.csv"))
sales = pd.read_csv(os.path.join(path, "sales_train_evaluation.csv"))

In [None]:
def prep_calendar(df):
    df = df.drop(["date", "weekday"], axis=1)
    df = df.assign(d = df.d.str[2:].astype(int))
    df = df.fillna("missing")
    cols = list(set(df.columns) - {"wm_yr_wk", "d"})
    df[cols] = OrdinalEncoder(dtype="int").fit_transform(df[cols])
    df = reduce_mem_usage(df)
    return df

calendar = prep_calendar(calendar)

In [None]:
def prep_selling_prices(df):
    gr = df.groupby(["store_id", "item_id"])["sell_price"]
    df["sell_price_rel_diff"] = gr.pct_change()
    df["sell_price_roll_sd7"] = gr.transform(lambda x: x.rolling(7).std())
    df["sell_price_cumrel"] = (gr.shift(0) - gr.cummin()) / (1 + gr.cummax() - gr.cummin())
    df["price_unique"] = gr.transform('nunique')
    df = reduce_mem_usage(df)
    return df

selling_prices = prep_selling_prices(selling_prices)

In [None]:
def reshape_sales(df, drop_d = None):
    if drop_d is not None:
        df = df.drop(["d_" + str(i + 1) for i in range(drop_d)], axis=1)
    df = df.assign(id=df.id.str.replace("_evaluation", ""))
    df = df.reindex(columns=df.columns.tolist() + ["d_" + str(1913 + i + 28  + 1) for i in range(28)])
    df = df.melt(id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
                 var_name='d', value_name='demand')
    df = df.assign(d=df.d.str[2:].astype("int16"))
    return df

sales = reshape_sales(sales, 1000)

In [None]:
def prep_sales(df):
    df['lag_t28'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28))
    df['lag_t90'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(90))
    df['lag_t180'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(180))
    df['lag_t365'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(365))
    df['rolling_mean_t7'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(7).mean())
    df['rolling_mean_t30'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(30).mean())
    df['rolling_mean_t60'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(60).mean())
    df['rolling_mean_t90'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(90).mean())
    df['rolling_mean_t180'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(180).mean())
    df['rolling_std_t7'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(7).std())
    df['rolling_std_t30'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(30).std())
    df['rolling_std_t90'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(90).std())

    # Remove rows with NAs except for submission rows. rolling_mean_t180 was selected as it produces most missings
    df = df[(df.d >= 1941) | (pd.notna(df.rolling_mean_t180))]
    df = reduce_mem_usage(df)

    return df

sales = prep_sales(sales)

In [None]:
sales = sales.merge(calendar, how="left", on="d")
gc.collect()
sales.head()

In [None]:
sales = sales.merge(selling_prices, how="left", on=["wm_yr_wk", "store_id", "item_id"])
sales.drop(["wm_yr_wk"], axis=1, inplace=True)
gc.collect()
sales.head()

In [None]:
del selling_prices; gc.collect()

In [None]:
cat_id_cols = ["item_id", "dept_id", "store_id", "cat_id", "state_id"]
cat_cols = cat_id_cols + ["wday", "month", "year", "event_name_1", 
                          "event_type_1", "event_name_2", "event_type_2"]

for i, v in tqdm(enumerate(cat_id_cols)):
    sales[v] = OrdinalEncoder(dtype="int").fit_transform(sales[[v]])

sales = reduce_mem_usage(sales)
sales.head()
gc.collect()

In [None]:
num_cols = ["sell_price", "sell_price_rel_diff", "sell_price_roll_sd7", "sell_price_cumrel",
            "lag_t28", "rolling_mean_t7", "rolling_mean_t30", "rolling_mean_t60", 
            "rolling_mean_t90", "rolling_mean_t180", "rolling_std_t7", "rolling_std_t30"]
bool_cols = ["snap_CA", "snap_TX", "snap_WI"]

dense_cols = num_cols + bool_cols

# Need to do column by column due to memory constraints
for i, v in tqdm(enumerate(num_cols)):
    sales[v] = sales[v].fillna(sales[v].median())
    
sales.head()

In [None]:
test = sales[sales.d >= 1914]
test = test.assign(id=test.id + "_" + np.where(test.d <= 1941, "validation", "evaluation"),
                   F="F" + (test.d - 1913 - 28 * (test.d > 1941)).astype("str"))
test.head()
gc.collect()

In [None]:
def make_X(df):
    X = {"dense1": df[dense_cols].to_numpy()}
    for i, v in enumerate(cat_cols):
        X[v] = df[[v]].to_numpy()
    X['id'] = df[['id']]        
    X['d'] = df[['d']]        
    return X

# Submission data
X_test = make_X(test)

# One month of validation data
flag = (sales.d < 1942) & (sales.d >= 1942 - 28)
valid = (make_X(sales[flag]),
         sales["demand"][flag])

# Rest is used for training
# flag = sales.d < 1942 - 28
flag = sales.d < 1942 
X_train = make_X(sales[flag])
y_train = sales["demand"][flag]
                             
del sales, flag
gc.collect()

In [None]:
import pickle
def save(x, fname):
    with open(fname, "wb") as handle:
        pickle.dump(x, handle)
        
save(X_train, "X_train_final.tmp")
del X_train; gc.collect()
save(y_train, "y_train_final.tmp")
del y_train; gc.collect()
save(X_test, "X_test.tmp")
del X_test; gc.collect()
save(valid, "valid.tmp")
del valid; gc.collect()
save(test, "test.tmp")

In [None]:
!ls *tmp -algh

# Load data

In [2]:
import pickle
def load(fname):
    with open(fname, "rb") as handle:
        return pickle.load(handle)            

In [3]:
# save(X_test, "X_test.tmp")
# save(valid, "valid.tmp")
# save(X_train, "X_train.tmp")
# save(y_train, "y_train.tmp")
# save(test, "test.tmp")

In [4]:
X_test = load("X_test.tmp")
valid = load("valid.tmp")
X_train = load("X_train_final.tmp")
y_train = load("y_train_final.tmp")
test = load("test.tmp")
fday = X_train['d'].values.flatten().min()

In [5]:
X_train['d'].tail()

Unnamed: 0,d
22379655,1941
22379656,1941
22379657,1941
22379658,1941
22379659,1941


In [6]:
valid[0]['d'].tail()

Unnamed: 0,d
22379655,1941
22379656,1941
22379657,1941
22379658,1941
22379659,1941


In [7]:
path = "../data"
sales_df = pd.read_csv(os.path.join(path, "sales_train_evaluation.csv"))
sales_df.id = sales_df.id.apply(lambda x : "_".join(x.split("_")[:-1]))
sales_df.index = sales_df.id
sales_df = sales_df.drop(["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"], axis=1)
cols = [i for i in range(1941)]
sales_df.columns = cols
sales_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1931,1932,1933,1934,1935,1936,1937,1938,1939,1940
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HOBBIES_1_001_CA_1,0,0,0,0,0,0,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
HOBBIES_1_002_CA_1,0,0,0,0,0,0,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
HOBBIES_1_003_CA_1,0,0,0,0,0,0,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
HOBBIES_1_004_CA_1,0,0,0,0,0,0,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
HOBBIES_1_005_CA_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0


In [8]:
cat_id_cols = ["item_id", "dept_id", "store_id", "cat_id", "state_id"]
cat_cols = cat_id_cols + ["wday", "month", "year", "event_name_1", 
                          "event_type_1", "event_name_2", "event_type_2"]

# DataLoader

In [9]:
class M5Loader:
    def __init__(self, X, y, sales_df, shuffle=True, 
                 batch_size=10000, seq_len=56, 
                 cat_cols=[], ret_garbage=False,
                 reduce=None):
        
        if reduce is not None:
            n = X["dense1"].shape[0]
            k = int((1-reduce)*n)
            reduced_idxs = np.random.choice([i for i in range(n)], k, replace=False)
            
        self.X_cont = X["dense1"]
        self.X_cat = np.concatenate([X[k] for k in cat_cols], axis=1)
        self.ids = X["id"].values.flatten()
        self.ds = X["d"].values.flatten()
        self.y = y
        
        if reduce is not None:
            self.X_cont = self.X_cont[[reduced_idxs]]
            self.X_cat = self.X_cat[[reduced_idxs]]
            self.ids = self.ids[[reduced_idxs]]
            self.ds = self.ds[[reduced_idxs]]
            self.y = self.y[[reduced_idxs]]

        self.sales_df = sales_df
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.n_conts = self.X_cont.shape[1]
        self.len = self.X_cont.shape[0]
        n_batches, remainder = divmod(self.len, self.batch_size)
        
        if remainder > 0:
            n_batches += 1
        self.n_batches = n_batches
        self.remainder = remainder #for debugging
        
        self.idxes = np.array([i for i in range(self.len)])
        self.ret_garbage = ret_garbage #For last 28/56 days of test set which cant be predicted right now
        self.always_garbage = False
        
        
    def __iter__(self):
        self.i = 0
        if self.shuffle:
            ridxes = self.idxes
            np.random.shuffle(ridxes)
            self.X_cat = self.X_cat[[ridxes]]
            self.X_cont = self.X_cont[[ridxes]]
            self.ids = self.ids[[ridxes]]
            self.ds = self.ds[[ridxes]]
            if self.y is not None:
                self.y = self.y[[ridxes]]
                
        return self

    def __next__(self):
        if self.i  >= self.len:
            raise StopIteration
        
        if self.always_garbage:
            self.i += self.batch_size
            return None, None, None, None
            
        if self.y is not None:
            y = torch.FloatTensor(self.y[self.i:self.i+self.batch_size].astype(np.float32))
        else:
            y = None
        
         
        ids = self.ids[self.i:self.i+self.batch_size]      
        ds = self.ds[self.i:self.i+self.batch_size]
        cur_batch_size = ids.shape[0]
        hist = np.zeros((cur_batch_size, self.seq_len))
        horizon = 28 
        
        if self.ret_garbage:
            try:
                for past in range(self.seq_len):
                    hist[:, past] = self.sales_df.lookup(ids, ds-horizon-self.seq_len+past) #TODO: pandas lookup is slow, maybe hash ids and do a npy lookup
            except:
                print("NOOOOOOO This should not happen")
                self.always_garbage = True
                return None, None, None, None
        else:
              for past in range(self.seq_len):
                    hist[:, past] = self.sales_df.lookup(ids, ds-horizon-self.seq_len+past) #TODO: pandas lookup is slow, maybe hash ids and do a npy lookup
                    
        xcont = torch.FloatTensor(self.X_cont[self.i:self.i+self.batch_size])
        xcat = torch.LongTensor(self.X_cat[self.i:self.i+self.batch_size])
        xhist = torch.FloatTensor(hist)
        
        batch = (xcont, xcat, xhist, y)
        self.i += self.batch_size
        return batch

    def __len__(self):
        return self.n_batches           

In [10]:
# bs = 10000
bs = 2**11
# bs = 2**7
shuffle = True
seq_len = int(28*4)
reduce = None

train_loader = M5Loader(X_train, y_train.values, 
                                                sales_df, cat_cols=cat_cols, 
                                                batch_size=bs, seq_len=seq_len,
                                                shuffle=shuffle, reduce=reduce)

val_loader = M5Loader(valid[0], valid[1].values,
                                                sales_df, cat_cols=cat_cols, 
                                                batch_size=bs, seq_len=seq_len,
                                                shuffle=False)

X_test['id'].id = X_test['id'].id.apply(lambda x : "_".join(x.split("_")[:-1]))
test_loader = M5Loader(X_test, y=None,
                                                sales_df=sales_df, cat_cols=cat_cols, 
                                                batch_size=NUM_ITEMS, seq_len=seq_len,
                                                shuffle=False, ret_garbage=False)

print(f"Train loader len: {train_loader.len}")
print(f"Val loader len: {val_loader.len}")
print(f"Test loader len: {test_loader.len}")

Train loader len: 22379660
Val loader len: 853720
Test loader len: 1707440


In [12]:
# for i, col in enumerate(cat_cols):
#     print(col, "\t", np.unique(train_loader.X_cat[:, i]).shape)

In [13]:
uniques = [3049, 7, 10, 3, 3, 7, 12, 6, 31, 5, 5, 5]
# dims = [3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
dims = [5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
emb_dims = [(x, y) for x, y in zip(uniques, dims)]
print(emb_dims)
n_cont = train_loader.n_conts

[(3049, 5), (7, 1), (10, 1), (3, 1), (3, 1), (7, 1), (12, 1), (6, 1), (31, 1), (5, 1), (5, 1), (5, 1)]


In [14]:
%%time

for i, (X_cont, X_cat, xhist, y) in enumerate(tqdm(train_loader)):
    break

HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))

CPU times: user 8.34 s, sys: 172 ms, total: 8.51 s
Wall time: 8.54 s


# Loss functions and metrics

In [14]:
calc_wrmsse = True

class RMSE(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
    
    def forward(self, y_pred, y_true):
        y_pred = y_pred.squeeze()
        y_true = torch.FloatTensor(y).to(device)
        return torch.sqrt(self.mse(y_pred, y_true))    
    
class MSE(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
    
    def forward(self, y_pred, y_true):
        y_pred = y_pred.squeeze()
        y_true = torch.FloatTensor(y).to(device)
        return self.mse(y_pred, y_true)  

def rmse_metric(y_pred, y_true):
    y_pred = np.array(y_pred)
    y_true = np.array(y_true)
    return np.sqrt(np.mean((y_pred-y_true)**2))       

class Assymetric_RMSE(nn.Module):
    def __init__(self, penalty=1.5):
        super().__init__()
        self.mse = nn.MSELoss()
        self.penalty = penalty
        
    def forward(self, y_pred, y_true):
        y_pred = y_pred.squeeze()
        y_true = torch.FloatTensor(y).to(device)
#         error = torch.where(y_true==0, (y_true-y_pred)**2, self.penalty*(y_true-y_pred)**2)
        error = torch.where(y_true==0, self.penalty*(y_true-y_pred)**2, (y_true-y_pred)**2)
        return torch.sqrt(torch.mean(error))
    
class Tweedie(nn.Module):
    def __init__(self, rho=1.5):
        super().__init__()
        self.rho = rho
    
    def forward(self, y_pred, y_true):
        eps = 1e-10
        y_pred = y_pred.squeeze() + eps
        y_true = torch.FloatTensor(y).to(device) + eps
        rho = self.rho
        a = y_true * torch.pow(y_pred, (1-rho))  / (1-rho) 
        b = torch.pow(y_pred, (2-rho))  / (2-rho) 
        tweedie = -a + b
        loss = torch.mean(tweedie)
        return loss       
    
if calc_wrmsse:
    roll_mat_df = pd.read_pickle('../data/roll_mat_df.pkl')
    roll_index = roll_mat_df.index
    roll_mat_csr = csr_matrix(roll_mat_df.values)
    del roll_mat_df; gc.collect()

    sw_df = pd.read_pickle('../data/sw_df.pkl')
    s = sw_df.s.values
    w = sw_df.w.values
    sw = sw_df.sw.values   
    
def rollup(v, roll_mat_csr):
    return roll_mat_csr*v #(v.T*roll_mat_csr.T).T

def wrmsse_metric(preds, y_true, score_only=True, npy=True, roll_mat_csr=None, sw=None, verbose=False):
    preds = np.array(preds).reshape(NUM_ITEMS, -1)
    y_true = np.array(y_true).reshape(NUM_ITEMS, -1)
    
    if verbose:
        print(preds.shape)
        print(y_true.shape)
    
    if roll_mat_csr is None:
        roll_mat_df = pd.read_pickle('../data/roll_mat_df.pkl')
        roll_index = roll_mat_df.index
        roll_mat_csr = csr_matrix(roll_mat_df.values)
        del roll_mat_df; gc.collect()

    if sw is None:
        sw_df = pd.read_pickle('../data/sw_df.pkl')
        s = sw_df.s.values
        w = sw_df.w.values
        sw = sw_df.sw.values

    if not npy:
        preds = preds.values
        y_true = y_true.values
    
    if score_only:
        return np.sum(
                np.sqrt(
                    np.mean(
                        np.square(rollup(preds-y_true, roll_mat_csr))
                            ,axis=1)) * sw)/12 
    else: 
        score_matrix = (np.square(rollup(preds-y_true, roll_mat_csr)) * np.square(w)[:, None])/ s[:, None]
        score = np.sum(np.sqrt(np.mean(score_matrix,axis=1)))/12 
        return score, score_matrix

def rmse_metric(y_pred, y_true):
    y_pred = np.array(y_pred)
    y_true = np.array(y_true)
    return np.sqrt(np.mean((y_pred-y_true)**2))                                

In [15]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform(m.weight)
        m.bias.data.fill_(0.01)
        
class LinearBlock(nn.Module):
    def __init__(self, in_d, out_d, p=0):
        super().__init__()
        self.block = nn.Sequential(
                            nn.Linear(in_d, out_d),
                            nn.ReLU(),
#                             nn.LeakyReLU(),
#                             nn.Tanh(),
        )
    
    def forward(self, x):
        return self.block(x)
    
class ConvModule(nn.Module):
    def __init__(self, seq_len=56):
        super().__init__()
        self.seq_len = seq_len
        
        self.global_conv = nn.Conv1d(1, 1, kernel_size=(seq_len))
        self.week_conv = nn.Conv1d(1, 1, kernel_size=(7))
        self.biweek_conv = nn.Conv1d(1, 1, kernel_size=(14))
        self.month_conv = nn.Conv1d(1, 1, kernel_size=(24)) #yea, yea, a month is not 28 days. But quadweek is a big word. 
        self.bimonth_conv = nn.Conv1d(1, 1, kernel_size=(48)) 

        self.last_week_conv = nn.Conv1d(1, 1, kernel_size=(7))
        self.last_biweek_conv = nn.Conv1d(1, 1, kernel_size=(14))
        
        self.drop_large = nn.Dropout(0.35)
        self.drop_small = nn.Dropout(0.15)
        
    def forward(self, x):
        x = x[:, None, :] #insert 1 channel (bs, channel, timesteps)
        bs = x.shape[0]
        
        out1 = self.global_conv(x).view(bs, -1)
        out1 = self.drop_large(out1)
        
        out2 = self.week_conv(x).view(bs, -1)
        out2 = self.drop_large(out2)
        
        out3 = self.biweek_conv(x).view(bs, -1)
        out3 = self.drop_large(out3)
        
        out4 = self.month_conv(x).view(bs, -1)
        out4 = self.drop_large(out4)
        
        out6 = self.last_week_conv(x[:, :, -7:]).view(bs, -1)
        out6 = self.drop_small(out6)     
        
        out7 = self.last_biweek_conv(x[:, :, -14:]).view(bs, -1)
        out7 = self.drop_small(out7)             
        
        out = torch.cat([out1, out2, out3, out4, out6, out7], axis=1)
        return out

class M5Net(nn.Module):
    def __init__(self, emb_dims, n_cont, seq_len=56, device=device):
        super().__init__()
        self.device = device
        self.convs = ConvModule(seq_len)

        # Embedding layers
        self.emb_layers = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims])
        n_embs = sum([y for x, y in emb_dims])
        
        self.n_embs = n_embs
        self.n_cont = n_cont
        
        inp_dim = 384 #got this via an error, todo: write a lazy loader
#         hidden_dim = 300
        hidden_dim = inp_dim//2
        
        self.fn = nn.Sequential(
                 LinearBlock(inp_dim, hidden_dim),
                 LinearBlock(hidden_dim, hidden_dim//2),
                 LinearBlock(hidden_dim//2, hidden_dim//4),
        )          
        
        self.out = nn.Linear(hidden_dim//4, 1)

        self.fn.apply(init_weights)
        self.out.apply(init_weights)
        

    def encode_and_combine_data(self, cont_data, cat_data):
        xcat = [el(cat_data[:, k]) for k, el in enumerate(self.emb_layers)]
        xcat = torch.cat(xcat, 1)
        x = torch.cat([xcat, cont_data], axis=1)
        return x   
    
    def forward(self, cont_data, cat_data, hist_data):
        cont_data = cont_data.to(self.device)
        cat_data = cat_data.to(self.device)
        hist_data = hist_data.to(self.device)
        
        x1 = self.encode_and_combine_data(cont_data, cat_data)
        x2 = self.convs(hist_data)
        x = torch.cat([x1, x2], axis=1)
        x = self.fn(x)
        x = self.out(x)
        return x

In [16]:
model = M5Net(emb_dims, n_cont).to(device)

In [17]:
model

M5Net(
  (convs): ConvModule(
    (global_conv): Conv1d(1, 1, kernel_size=(56,), stride=(1,))
    (week_conv): Conv1d(1, 1, kernel_size=(7,), stride=(1,))
    (biweek_conv): Conv1d(1, 1, kernel_size=(14,), stride=(1,))
    (month_conv): Conv1d(1, 1, kernel_size=(24,), stride=(1,))
    (bimonth_conv): Conv1d(1, 1, kernel_size=(48,), stride=(1,))
    (last_week_conv): Conv1d(1, 1, kernel_size=(7,), stride=(1,))
    (last_biweek_conv): Conv1d(1, 1, kernel_size=(14,), stride=(1,))
    (drop_large): Dropout(p=0.35, inplace=False)
    (drop_small): Dropout(p=0.15, inplace=False)
  )
  (emb_layers): ModuleList(
    (0): Embedding(3049, 5)
    (1): Embedding(7, 1)
    (2): Embedding(10, 1)
    (3): Embedding(3, 1)
    (4): Embedding(3, 1)
    (5): Embedding(7, 1)
    (6): Embedding(12, 1)
    (7): Embedding(6, 1)
    (8): Embedding(31, 1)
    (9): Embedding(5, 1)
    (10): Embedding(5, 1)
    (11): Embedding(5, 1)
  )
  (fn): Sequential(
    (0): LinearBlock(
      (block): Sequential(
       

In [19]:
%%time
criterion = RMSE()
for i, (X_cont, X_cat, X_hist, y) in enumerate(train_loader): #most of the time is spent in fresh randomzing inputs at every __iter__ call so dw
    out = model(X_cont, X_cat, X_hist)
    loss = criterion(out, y)   
    loss.backward()
    print(loss)
    break

tensor(6.2008, device='cuda:0', grad_fn=<SqrtBackward>)
CPU times: user 5.86 s, sys: 160 ms, total: 6.02 s
Wall time: 6.05 s


# Train

In [19]:
epochs = 40
optim = "adam"
lr_adam = 3e-4
lr_sgd = 1e-3
criterion = RMSE()
torch.manual_seed(777)
model = M5Net(emb_dims, n_cont).to(device)
model_name = "final_submit"

if optim == "adam":
    print("Using adam optimizer.")
    optimizer = torch.optim.Adam(model.parameters(), lr=lr_adam)
else:
    print("Not using adam optimizer.")
    optimizer = torch.optim.SGD(model.parameters(), lr=lr_sgd, momentum=0.9)
    
# scheduler = torch.optim.lr_scheduler.MultiStepLR(
#                     optimizer, [20, 25, 30], gamma=0.5, 
#                     last_epoch=-1)

scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
                    optimizer, 5, T_mult=2, 
                    eta_min=3e-5, last_epoch=-1)

def save(m, fname, dirname='/home/timetraveller/Work/M5Models-Final/'):
    with open(os.path.join(dirname, fname), 'wb') as handle:
        pickle.dump(m, handle)
    print("saved model")    
        
def zero_percentage(q):
    l = len(q)
    q = np.array(q)
    return sum(q<1)/l        

Using adam optimizer.


In [None]:
train_losses = []
val_losses = []

for epoch in tqdm(range(epochs)):
    train_loss, val_loss = 0, 0
   
    #Training phase
    model.train()
    ypreds = [] 
    ytrue = []
    bar = tqdm(train_loader)
    
    for i, (X_cont, X_cat, X_hist, y) in enumerate(bar):
        optimizer.zero_grad()
        out = model(X_cont, X_cat, X_hist)
        loss = criterion(out, y)   
        loss.backward()
        optimizer.step()
        scheduler.step(epoch + i/len(train_loader))

        with torch.no_grad():
            train_loss += loss.item()/len(train_loader)
            ypreds += list(out.detach().cpu().numpy().flatten())
            ytrue += list(y.cpu().numpy())
            bar.set_description(f"{loss.item():.3f}")
    
    with torch.no_grad():
        rrmse = rmse_metric(ypreds, ytrue)
        print(f"[Train] Epoch: {epoch} | Loss: {train_loss:.4f} | RMSE: {rrmse:.4f}")
    
    #Validation phase      
    with torch.no_grad():
            model.eval()
            ytrue = []
            ypreds = []
            
            for i, (X_cont, X_cat, X_hist, y) in enumerate(val_loader):
                out = model(X_cont, X_cat, X_hist)
                val_loss += criterion(out, y).item()/len(val_loader)
                ypreds += list(out.detach().cpu().numpy().flatten())
                ytrue += list(y.cpu().numpy())
                
            rrmse = rmse_metric(ypreds, ytrue)    
            wrmsse = wrmsse_metric(ypreds, ytrue, roll_mat_csr=roll_mat_csr, sw=sw)
            zc = zero_percentage(ypreds)
            
            print(f"[Valid] Epoch: {epoch} | Loss: {val_loss:.4f} | RMSE: {rrmse:.4f} | WRMSSE: {wrmsse:.4f} | zc: {zc:.3f}/0.544")
            
            #Test's zc
            for i, (X_cont, X_cat, X_hist, y) in enumerate(test_loader):
                out = model(X_cont, X_cat, X_hist)
                ypreds += list(out.detach().cpu().numpy().flatten())
            zc = zero_percentage(ypreds[-NUM_ITEMS*28:]) #Last 28 days only
            print(f"[Test] Epoch: {epoch} | zc: {zc:.3f}/???")            
      
    train_losses.append(train_loss)    
    val_losses.append(val_loss)   
    save_data = {
            'model' : model,
            'optimizer' : optimizer,
            'scheduler' : scheduler,
            'epoch' : epoch,
            'train_losses' : train_losses,
            'val_losses' : val_losses,
        }
    save(save_data, f"{model_name}_{epoch}")    

HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 0 | Loss: 2.3136 | RMSE: 2.3558
[Valid] Epoch: 0 | Loss: 2.0829 | RMSE: 2.1702 | WRMSSE: 0.4707 | zc: 0.642/0.544
[Test] Epoch: 0 | zc: 0.632/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 1 | Loss: 2.2637 | RMSE: 2.3045
[Valid] Epoch: 1 | Loss: 2.0667 | RMSE: 2.1532 | WRMSSE: 0.6456 | zc: 0.678/0.544
[Test] Epoch: 1 | zc: 0.669/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 2 | Loss: 2.2393 | RMSE: 2.2795
[Valid] Epoch: 2 | Loss: 2.0596 | RMSE: 2.1430 | WRMSSE: 0.6824 | zc: 0.679/0.544
[Test] Epoch: 2 | zc: 0.667/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 3 | Loss: 2.2222 | RMSE: 2.2619
[Valid] Epoch: 3 | Loss: 2.0507 | RMSE: 2.1312 | WRMSSE: 0.4367 | zc: 0.651/0.544
[Test] Epoch: 3 | zc: 0.636/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 4 | Loss: 2.2121 | RMSE: 2.2509
[Valid] Epoch: 4 | Loss: 2.0451 | RMSE: 2.1258 | WRMSSE: 0.4633 | zc: 0.651/0.544
[Test] Epoch: 4 | zc: 0.635/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 5 | Loss: 2.2205 | RMSE: 2.2597
[Valid] Epoch: 5 | Loss: 2.0654 | RMSE: 2.1508 | WRMSSE: 0.5321 | zc: 0.652/0.544
[Test] Epoch: 5 | zc: 0.638/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 6 | Loss: 2.2035 | RMSE: 2.2424
[Valid] Epoch: 6 | Loss: 2.0352 | RMSE: 2.1127 | WRMSSE: 0.4917 | zc: 0.620/0.544
[Test] Epoch: 6 | zc: 0.595/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 7 | Loss: 2.1864 | RMSE: 2.2246
[Valid] Epoch: 7 | Loss: 2.0283 | RMSE: 2.1043 | WRMSSE: 0.5085 | zc: 0.656/0.544
[Test] Epoch: 7 | zc: 0.636/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 8 | Loss: 2.1696 | RMSE: 2.2068
[Valid] Epoch: 8 | Loss: 2.0236 | RMSE: 2.0981 | WRMSSE: 0.4229 | zc: 0.638/0.544
[Test] Epoch: 8 | zc: 0.612/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 9 | Loss: 2.1533 | RMSE: 2.1892
[Valid] Epoch: 9 | Loss: 2.0162 | RMSE: 2.0889 | WRMSSE: 0.5036 | zc: 0.649/0.544
[Test] Epoch: 9 | zc: 0.624/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 10 | Loss: 2.1392 | RMSE: 2.1745
[Valid] Epoch: 10 | Loss: 2.0110 | RMSE: 2.0814 | WRMSSE: 0.4492 | zc: 0.645/0.544
[Test] Epoch: 10 | zc: 0.621/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 11 | Loss: 2.1287 | RMSE: 2.1631
[Valid] Epoch: 11 | Loss: 2.0064 | RMSE: 2.0759 | WRMSSE: 0.4156 | zc: 0.641/0.544
[Test] Epoch: 11 | zc: 0.617/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 12 | Loss: 2.1183 | RMSE: 2.1514
[Valid] Epoch: 12 | Loss: 2.0034 | RMSE: 2.0716 | WRMSSE: 0.4265 | zc: 0.646/0.544
[Test] Epoch: 12 | zc: 0.619/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 13 | Loss: 2.1117 | RMSE: 2.1441
[Valid] Epoch: 13 | Loss: 1.9996 | RMSE: 2.0667 | WRMSSE: 0.4579 | zc: 0.651/0.544
[Test] Epoch: 13 | zc: 0.626/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 14 | Loss: 2.1086 | RMSE: 2.1405
[Valid] Epoch: 14 | Loss: 1.9967 | RMSE: 2.0633 | WRMSSE: 0.4205 | zc: 0.643/0.544
[Test] Epoch: 14 | zc: 0.619/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 15 | Loss: 2.1332 | RMSE: 2.1678
[Valid] Epoch: 15 | Loss: 2.0077 | RMSE: 2.0762 | WRMSSE: 0.4250 | zc: 0.647/0.544
[Test] Epoch: 15 | zc: 0.619/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 16 | Loss: 2.1272 | RMSE: 2.1623
[Valid] Epoch: 16 | Loss: 2.0035 | RMSE: 2.0727 | WRMSSE: 0.5330 | zc: 0.665/0.544
[Test] Epoch: 16 | zc: 0.642/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 17 | Loss: 2.1190 | RMSE: 2.1526
[Valid] Epoch: 17 | Loss: 2.0073 | RMSE: 2.0774 | WRMSSE: 0.4189 | zc: 0.635/0.544
[Test] Epoch: 17 | zc: 0.600/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 18 | Loss: 2.1098 | RMSE: 2.1421
[Valid] Epoch: 18 | Loss: 1.9970 | RMSE: 2.0644 | WRMSSE: 0.4106 | zc: 0.643/0.544
[Test] Epoch: 18 | zc: 0.610/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 19 | Loss: 2.1022 | RMSE: 2.1346
[Valid] Epoch: 19 | Loss: 1.9938 | RMSE: 2.0608 | WRMSSE: 0.5703 | zc: 0.669/0.544
[Test] Epoch: 19 | zc: 0.638/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 20 | Loss: 2.0928 | RMSE: 2.1245
[Valid] Epoch: 20 | Loss: 1.9901 | RMSE: 2.0553 | WRMSSE: 0.4226 | zc: 0.653/0.544
[Test] Epoch: 20 | zc: 0.614/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 21 | Loss: 2.0853 | RMSE: 2.1157
[Valid] Epoch: 21 | Loss: 1.9828 | RMSE: 2.0465 | WRMSSE: 0.4691 | zc: 0.661/0.544
[Test] Epoch: 21 | zc: 0.631/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 22 | Loss: 2.0801 | RMSE: 2.1099
[Valid] Epoch: 22 | Loss: 1.9787 | RMSE: 2.0410 | WRMSSE: 0.4160 | zc: 0.649/0.544
[Test] Epoch: 22 | zc: 0.609/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))


[Train] Epoch: 23 | Loss: 2.0708 | RMSE: 2.0995
[Valid] Epoch: 23 | Loss: 1.9747 | RMSE: 2.0365 | WRMSSE: 0.4197 | zc: 0.651/0.544
[Test] Epoch: 23 | zc: 0.619/???
saved model


HBox(children=(IntProgress(value=0, max=10928), HTML(value='')))

In [None]:
plt.plot(train_losses)
plt.plot(val_losses)
plt.show()

# Predict

In [None]:
X_test['dense1'].shape[0]/NUM_ITEMS

In [22]:
X_test['id'].id = X_test['id'].id.apply(lambda x : "_".join(x.split("_")[:-1]))

In [23]:
test_loader = M5Loader(X_test, y=None,
                                                sales_df=sales_df, cat_cols=cat_cols, 
                                                batch_size=NUM_ITEMS, seq_len=seq_len,
                                                shuffle=False, ret_garbage=True)

In [24]:
pred = []
with torch.no_grad():
#         model.eval()
        for i, (X_cont, X_cat, X_hist, y) in enumerate(tqdm(test_loader)):
            if X_cont is None:
                out_npy = np.zeros_like(out_npy)
            else:    
                out = model(X_cont, X_cat, X_hist)
                out_npy = out.cpu().numpy().flatten()
            pred += list(out_npy)    
pred = np.array(pred)            

HBox(children=(IntProgress(value=0, max=56), HTML(value='')))

NOOOOOOO This should not happen


NameError: name 'out_npy' is not defined

In [None]:
print(pred.max())
print(pred.min())
print(pred.mean())
print((pred[:NUM_ITEMS*28]<1).sum()/pred[:NUM_ITEMS*28].shape[0])
print((pred<1).sum()/pred.shape[0])

In [None]:
pred.shape[0]/NUM_ITEMS

In [None]:
path = "../data/"
sample_submission = pd.read_csv(os.path.join(path, "sample_submission.csv"))
test["demand"] = pred.clip(0)
submission = test.pivot(index="id", columns="F", values="demand").reset_index()[sample_submission.columns]
submission = sample_submission[["id"]].merge(submission, how="left", on="id")
submission.head()

In [None]:
submission.tail()

In [None]:
submission.to_csv("/home/timetraveller/Desktop/kek.csv", index=False)

In [None]:
pd.read_csv("/home/timetraveller/Desktop/kek.csv")

In [None]:
!kaggle competitions submit -f /home/timetraveller/Desktop/kek.csv -m "nn" -c m5-forecasting-accuracy