In [1]:
from sklearn.metrics import root_mean_squared_error

In [2]:
import pandas as pd
import numpy as np

In [3]:
data_train = pd.read_csv('../data_cleaned/data_train.csv')

In [4]:
test = pd.read_csv('../data_cleaned/test.csv')
test['date_block_num'] = 34

In [5]:
data_train = pd.concat([data_train,test ], ignore_index=True).drop('ID', axis=1).fillna(0)

In [6]:
data_train

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.00,1.0
1,03.01.2013,0,25,2552,899.00,1.0
2,06.01.2013,0,25,2554,1709.05,1.0
3,15.01.2013,0,25,2555,1099.00,1.0
4,10.01.2013,0,25,2564,349.00,1.0
...,...,...,...,...,...,...
3142680,0,34,45,18454,0.00,0.0
3142681,0,34,45,16188,0.00,0.0
3142682,0,34,45,15757,0.00,0.0
3142683,0,34,45,19648,0.00,0.0


In [7]:
def prepare_past_ID_s(data_train):
    data_train['shop_item'] = [tuple([shop, item]) for shop, item in zip(data_train['shop_id'], data_train['item_id'])]
    #34 block contains A LOT more shop_item than others
    shop_item_pairs_in_dbn = data_train.groupby('date_block_num')['shop_item'].apply(np.unique)
    data_train = data_train.drop(['shop_item'], axis=1)
    
    shop_item_pairs_WITH_PREV_in_dbn = shop_item_pairs_in_dbn.copy()
    
    print(np.array(shop_item_pairs_WITH_PREV_in_dbn.index))
    arr = np.array(shop_item_pairs_WITH_PREV_in_dbn.index)
    for block in arr[arr>=0]:
        if block == 0:
            continue
        shop_item_pairs_WITH_PREV_in_dbn[block] = np.unique(np.append(shop_item_pairs_WITH_PREV_in_dbn[block -1],
                                                                      #shop_item_pairs_WITH_PREV_in_dbn[block]))
                                                                      shop_item_pairs_in_dbn[block-1]))
        print(len(shop_item_pairs_WITH_PREV_in_dbn[block]))

    return shop_item_pairs_in_dbn, shop_item_pairs_WITH_PREV_in_dbn

In [8]:

def prepare_past_ID_s_CARTESIAN(data_train):
    data_train['shop_item'] = [tuple([shop, item]) for shop, item in zip(data_train['shop_id'], data_train['item_id'])]
    #34 block contains A LOT more shop_item than others
    shop_item_pairs_in_dbn = data_train.groupby('date_block_num')['shop_item'].apply(np.unique)
    data_train = data_train.drop(['shop_item'], axis=1)
    
    shop_item_pairs_WITH_PREV_in_dbn = np.array([None] * len(shop_item_pairs_in_dbn))
    
    #print(np.array(shop_item_pairs_WITH_PREV_in_dbn.index))
    

    cartesians = []
    for dbn in shop_item_pairs_in_dbn.index:
        val = shop_item_pairs_in_dbn[dbn]

        shops = np.unique(list(zip(*val))[0])
        items = np.unique(list(zip(*val))[1])
    
        cartesian_product = np.random.permutation (np.array(np.meshgrid(shops, items)).T.reshape(-1, 2))
        #print(cartesian_product)
        cartesians.append(cartesian_product)
        
    
    shop_item_pairs_WITH_PREV_in_dbn[0] = cartesians[0]
    
    for block in shop_item_pairs_in_dbn.index:
        if block == 0:
            continue
        arr = np.append(shop_item_pairs_WITH_PREV_in_dbn[block - 1],
                             cartesians[block - 1], axis=0)
        
        shop_item_pairs_WITH_PREV_in_dbn[block] = np.unique(arr, axis=0)
        print(len(shop_item_pairs_WITH_PREV_in_dbn[block]))
        
    return shop_item_pairs_in_dbn, shop_item_pairs_WITH_PREV_in_dbn


In [9]:
shop_item_pairs_in_dbn, shop_item_pairs_WITH_PREV_in_dbn = prepare_past_ID_s(data_train)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34]
63170
97186
126662
146380
162679
178831
192610
205423
216508
228547
240348
255924
263803
271937
281288
290301
299290
307259
314673
322344
330092
338979
349486
362849
368644
374512
380928
386411
391327
396282
401230
406603
411762
418882


In [10]:
from sklearn.metrics import root_mean_squared_error

In [11]:
from collections import defaultdict

In [12]:
def make_X_lag_format(data, dbn):
    """
    transform X to lag format
    columns with dbn in names become lag_0, dbn-1 - lag_1 etc.
    """
    
    lag_cols = defaultdict()
    for col in data.columns:
        splitted = col.split('$')
        
        if len(splitted) == 1:
                continue
        
        
        lag_cols[col] = splitted[0] + '_lag_' + str(dbn - int(splitted[1]))

    #print(lag_cols)
    data = data.rename(columns=dict(lag_cols))
    #print(data.columns)
    return data

In [13]:
def prepare_train(data, valid ):
    """
    returns one batch of merged data with required IDs from valid
    """
    #print(data)
    valid_shop_item = valid
    valid_shop_item = list(zip(*valid_shop_item))
    df = pd.DataFrame({'item_id':valid_shop_item[1],'shop_id':valid_shop_item[0]} )
    data = df.merge(data, on=['shop_id','item_id'], how='left').fillna(0)
    
    return data

In [14]:

def prepare_val(data, valid ):
    """
    returns one batch of merged data with required IDs from valid
    """
    
    df = pd.DataFrame({'item_id':valid[:,1],'shop_id':valid[:,0]} )
    data = df.merge(data, on=['shop_id','item_id'], how='left').fillna(0)
    return data

In [15]:
import re


In [16]:
def prepare_data_train_LSTM(data, valid, dbn):
    """
    
    """
    train = prepare_train (data, valid)
    lag_cols = []
    for col in data.columns:

        splitted = col.split('$')
        if len(splitted)==1:
            lag_cols.append(col)
            continue
        #if 'shop_item_cnt' not in col:
        #    continue
            
        for db in range(0,dbn-1):
            
            if db == int(splitted[1]):
                lag_cols.append(col)

    X = train[lag_cols]

    
    Y = train[f'shop_item_cnt${dbn-1}']
    
    return X, Y
        

In [17]:
def prepare_data_validation_LSTM(data, valid, dbn):
    """
    
    """
    test = prepare_val (data, valid)
    
    lag_cols = []
    
    for col in test.columns:
        
        splitted = col.split('$')
            
        if len(splitted) == 1:
            lag_cols.append(col)
            continue
        #if 'shop_item_cnt' not in col:
        #    continue
        for db in range(1,dbn):
            
            if db == int(splitted[1]):
                #print(db, int(''.join(re.findall(r'\d+', col))))
                lag_cols.append(col)

    X = test[lag_cols]
    Y = test[f'shop_item_cnt${dbn}']
    
    return X, Y

In [18]:
np.random.permutation([[1,2],[3,4]])

array([[1, 2],
       [3, 4]])

In [19]:
def create_batch_train(merged, batch_size, dbn):
    """
    
    """
    #merged = pd.read_csv('data/merged.csv', chunksize=500000)
    #merged = pd.read_csv('data/merged.csv')
    train = np.random.permutation (shop_item_pairs_WITH_PREV_in_dbn[dbn])
    #train = shop_item_pairs_WITH_PREV_in_dbn[dbn]
    chunck_num = (len(train)  // batch_size) + 1
    
    for idx in range(chunck_num):#split shop_item_pairs_WITH_PREV_in_dbn into chuncks
        #for chunck in merged:#split merged into chuncks
        train_ret = prepare_data_train_LSTM(merged,train[idx*batch_size:(idx+1)*batch_size], dbn)
       
        if  train_ret[0].empty:
            yield [None, None]
        
        yield train_ret#, test

In [20]:
def create_batch_val(merged, batch_size, dbn):
    """
    
    """
    #merged = pd.read_csv('data/merged.csv', chunksize=500000) - (DOESNT WORK PROPERLY))))) - use it if merged doesnt fit memory
    #merged = pd.read_csv('data/merged.csv')
    val = shop_item_pairs_in_dbn[dbn]

    shops = np.unique(list(zip(*val))[0])
    items = np.unique(list(zip(*val))[1])

    cartesian_product = np.random.permutation (np.array(np.meshgrid(shops, items)).T.reshape(-1, 2))
    
    chunck_num = (len(cartesian_product)  // batch_size) + 1
    for idx in range(chunck_num):
        #for chunck in merged:
        train_ret = prepare_data_validation_LSTM(merged,cartesian_product[idx*batch_size:(idx+1)*batch_size], dbn)
        #When in batches idx no elements that are in (shop, item) in batch of merged
        if  train_ret[0].empty:
            
            yield [None, None]
        #print(len(train_ret))
        
        yield train_ret#, test

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim



class CustomLSTM(nn.Module):

    def __init__(self, embedding_dim=1, hidden_dim=64,hidden_linear=64,  tagset_size=1, N_LEVELS=None, device=None):
        super(CustomLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        
        self.lstm = nn.LSTM(input_size=self.embedding_dim,
                            hidden_size = self.hidden_dim,
                            batch_first=True,
                            proj_size=tagset_size,
                            num_layers=N_LEVELS,
                           device=device)

        self.linear_layers = nn.Sequential(
            nn.Linear(tagset_size, hidden_linear),
            nn.ReLU(),
            nn.Linear(hidden_linear, 1)
        ).to(device)

        # The linear layer that maps from hidden state space to tag space
        

    def forward(self, data):
        
        lstm_out, _ = self.lstm(data)

        
        linear_out = self.linear_layers(lstm_out)
        
        return linear_out

In [22]:
SELECTED_COLUMNS  = ['shop_item_cnt','shop_item_price','shop_category_cnt','shop_item_date_block_num_diff']

In [23]:
def preapre_X_LSTM_format(X_train, dbn, device):
    #Batch come here
    #one row - one train example
    cols=[]
    data = defaultdict(list)
    for col in X_train.columns:
        if 'change' in col:
            continue
        #if 'avg_item_price' in col:
        #   cols.append(col)
        #   continue

        if 'shop_item_cnt' in col:
            cols.append(col)
            continue

        
            
        """
        if not col[-1].isdigit():
            continue

        

        if not any(c in col for c in SELECTED_COLUMNS) :
            continue
        """
        
        
    #print('COLUMNS',list(X_train[cols].columns))
    #print(X_train[cols].columns)
    X_train = X_train[cols].values.reshape(len(X_train), dbn-1, -1)
    #print(X_train)
    return torch.tensor(X_train, device=device).to(dtype=torch.float32)
            

In [24]:
def preapre_Y_LSTM_format(Y_train, device):
    return torch.tensor(Y_train, device=device).to(dtype=torch.float32)

In [25]:
def append_some_columns(X_train, dbn):
    X_train['date_block_num'] = dbn
    X_train['month'] = dbn%12
    return X_train

In [26]:
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression

In [27]:

device='cuda' if torch.cuda.is_available() else 'cpu'

In [28]:
from torch.nn.functional import normalize

In [29]:
import torch

In [30]:
def train_lstm(model,optimizer,loss_fn,   merged,batch_size, val_month, epochs=20):
    
    first=True
    rmse = 0
    c=0
    
    preds_l=[]
    y_true_l=[]

    grads = []
    for X_train,Y_train  in create_batch_train(merged,batch_size, val_month):
        
        if X_train is None:
            print('None')
            continue
        Y_train = np.clip(Y_train,0,20)
        
        if X_train.empty:
            print('None')
            continue
        X_train = make_X_lag_format(X_train, val_month-1)
        X_train=preapre_X_LSTM_format(X_train, val_month-1, device)
        Y_train = preapre_Y_LSTM_format(Y_train, device)
        
        
        optimizer.zero_grad()

        preds = model(X_train)[:,-1,:]
        
        preds_l.append(torch.squeeze(preds))
        y_true_l.append(torch.squeeze(Y_train))
        
        loss_train = loss_fn(torch.squeeze(preds), 
                             torch.squeeze(Y_train))
        
        
        
        loss_train.backward()
        
        total_norm = torch.max( torch.stack([p.grad.detach().abs().max() for p in model.parameters()]) )
        grads.append(total_norm)
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        
        c+=1

    preds_l = torch.concat(preds_l)
    y_true_l = torch.concat(y_true_l)
    print('mean of max grad,',torch.mean(torch.tensor(grads)))
    with torch.no_grad():
        metric = torch.sqrt(loss_fn(torch.clamp(preds_l,0,20), y_true_l))
        #print(pd.DataFrame(torch.clamp(y_true_l,0,20).numpy(force=True)).describe())
        
    return model, metric

In [31]:
def validate_lstm(model,merged,batch_size, val_month):
    val_error = 0
    c=0
    val_preds=[]
    preds_l=[]
    y_true_l=[]
    #create_batch_train(merged,batch_size, val_month) - return train set, where Y_val
    #is shop_item_cnt_month{val_month}
    loss_fn = nn.MSELoss()

    for X_val, Y_val in create_batch_val(merged,batch_size, val_month):#but then cartesian product used
        
        if X_val is None:
            continue
        if X_val.empty:
            print('None')
            continue
        Y_val = np.clip(Y_val,0,20)        
        X_val = make_X_lag_format(X_val, val_month)
        
        
        X_val=preapre_X_LSTM_format(X_val, val_month, device)
        Y_val = preapre_Y_LSTM_format(Y_val, device)

        #print(X_val.size())
        #X_val = torch.nn.functional.normalize(X_val, p=1, dim=1)
        #X_val /= torch.max(X_val, dim=1, keepdim=True)[0]
        #X_val = torch.nan_to_num(X_val, nan=0.0)
        #X_val /= 20.0
        #Y_val /= 20.0
        #X_val = torch.nn.functional.normalize(X_val, p=1, dim=1)
        #Y_val = torch.nn.functional.normalize(Y_val, p=1, dim=1)
        if c==0:
            pass
            #print('train columns',X_val.columns)
            #print(X_val.date_block_num)

        with torch.no_grad():
            y_val_pred = model(X_val)[:,-1,:]
            loss_rmse = torch.sqrt(loss_fn(torch.squeeze(y_val_pred),
                                                 torch.squeeze(Y_val)))
            
            
            preds_l.append(torch.squeeze(y_val_pred))
            y_true_l.append(torch.squeeze(Y_val))
            val_preds.append(y_val_pred)
            c+=1
            
    preds_l = torch.concat(preds_l)
    y_true_l = torch.concat(y_true_l)

    
    with torch.no_grad():
        metric = torch.sqrt(loss_fn(torch.clamp(preds_l,0,20)*20, y_true_l*20))
        #print(pd.DataFrame(torch.clamp(y_true_l,0,20).numpy(force=True)).describe())
        
            
    return preds_l, metric

In [32]:
def validate_LSTM(merged, epochs=None,start_val_month =None):
    """
    Function for validating model
    
    """
    
    val_errors = []
    batch_size =20000
    val_preds=[]
    lr = 0.003
    step_size=40
    gamma=0.1
    
    for val_month in range(start_val_month, 34):

        
        loss_fn = nn.MSELoss()
        print('date_block_num', val_month)
        print('month', val_month%12)
        
        model = CustomLSTM(embedding_dim=EMBEDDING_DIM,
                   hidden_dim=HIDDEN_DIM,
                   hidden_linear=64,
                   tagset_size=TARGET_SIZE,
                   N_LEVELS=N_LEVELS,
                   device=device)
        
        optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.8, 0.999))
        scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)
        
        for epoch in range(epochs):
        
            
            
            model,train_error = train_lstm(model,
                                           optimizer,
                                           loss_fn, 
                                           merged,
                                           batch_size, 
                                           val_month)
            
            scheduler.step()
            if epoch % 3 == 0:
                val_pred, val_error = validate_lstm(model,
                                                    merged,
                                                    batch_size,
                                                    val_month)

                print('prediction')
                print('mean', torch.mean(val_pred))
                print('max', torch.max(val_pred))
                print('min', torch.min(val_pred))
                print('std', torch.std(val_pred))
                print('mean train rmse on epoch', epoch,':',train_error )
                print('mean val rmse on epoch', epoch,':',val_error )
                

            if epoch % 13 == 0:
                print('lr:',scheduler.get_last_lr())
                
            
            val_errors.append(val_error)
            val_preds.append(val_pred)
        

    return val_errors, val_preds

In [33]:
chunksize = 50000
l=[]
with pd.read_csv('data/merged.csv', chunksize=chunksize) as reader:
    for chunk in reader:
        l.append(chunk)

merged = pd.concat(l)

In [34]:
from torch.optim.lr_scheduler import StepLR

In [35]:
EMBEDDING_DIM=1
HIDDEN_DIM=128#512
TARGET_SIZE=2
N_LEVELS=1

epochs=500
start_val_month=22

In [36]:
np.log(0)

  np.log(0)


np.float64(-inf)

In [37]:
def normalize(data):
    data[[col for col in merged.columns if 'cnt' in col ]] = np.clip(data[[col for col in merged.columns if 'cnt' in col ]],0,20)
    data[[col for col in merged.columns if 'price' in col ]] = \
    np.clip(np.nan_to_num( np.log(data[[col for col in merged.columns if 'price' in col ]]),  posinf=0, neginf=0),
            0,
            10000) / 10000
    data[[col for col in merged.columns if 'cnt' in col ]] /= 20
normalize(merged)

  result = func(self.values, **kwargs)


In [38]:
val_errors, val_preds = validate_LSTM(merged, epochs, start_val_month)

date_block_num 22
month 10
mean of max grad, tensor(0.0899)
prediction
mean tensor(-0.0050, device='cuda:0')
max tensor(0.0008, device='cuda:0')
min tensor(-0.0050, device='cuda:0')
std tensor(0.0003, device='cuda:0')
mean train rmse on epoch 0 : tensor(0.0679, device='cuda:0')
mean val rmse on epoch 0 : tensor(1.3628, device='cuda:0')
lr: [0.003]
mean of max grad, tensor(0.0125)
mean of max grad, tensor(0.0026)
mean of max grad, tensor(0.0023)
prediction
mean tensor(0.0116, device='cuda:0')
max tensor(1.0224, device='cuda:0')
min tensor(0.0024, device='cuda:0')
std tensor(0.0373, device='cuda:0')
mean train rmse on epoch 3 : tensor(0.0570, device='cuda:0')
mean val rmse on epoch 3 : tensor(1.1223, device='cuda:0')
mean of max grad, tensor(0.0012)
mean of max grad, tensor(0.0026)
mean of max grad, tensor(0.0013)
prediction
mean tensor(0.0151, device='cuda:0')
max tensor(0.8820, device='cuda:0')
min tensor(0.0076, device='cuda:0')
std tensor(0.0365, device='cuda:0')
mean train rmse on e

KeyboardInterrupt: 

In [None]:
1.1211(best 1.1169, epoch 21),
1.3282, 
1.0123(best 0.9, epoch 9),
0.8688,
0.8658, 
0.9597, 
0.8992,
0.8306,
0.7505,
0.8229,
0.9836   

In [4]:
np. mean([1.1211,  1.3282,  1.0123, 0.8688, 0.8658, 0.9597, 0.8992, 0.8306,0.7505,0.8229,0.9836]   )

np.float64(0.9493363636363639)

In [None]:
from collections import defaultdict
errs = defaultdict()

In [None]:
!pwd

In [None]:
errors = torch.tensor(val_errors).numpy(force=True).reshape(12,-1)
np.array(errors[:,-1]).mean()

In [None]:
def create_submission(model,merged,batch_size):
    val_month = 34
    test = pd.read_csv('../data_cleaned/test.csv')
    
    data_test = test
    PREDICTION = pd.DataFrame(columns=['shop_id','item_id','item_cnt_month'])
    
    print('date_block_num', val_month)
    print('month', val_month%12)
    for X_val, Y_val in create_batch_val(merged,batch_size, val_month):#but then cartesian product used
        shops= X_val.shop_id
        items = X_val.item_id
        if X_val is None:
            continue
        if X_val.empty:
            print('None')
            continue
        Y_val = np.clip(Y_val,0,20)        
        X_val = make_X_lag_format(X_val, val_month)
        
        
        X_val=preapre_X_LSTM_format(X_val, val_month, device)
        Y_val = preapre_Y_LSTM_format(Y_val, device)

        
        with torch.no_grad():
            y_val_pred = model(X_val)[:,-1,:]
        
            app = pd.DataFrame({'item_id':items,
                                'shop_id': shops,
                                'item_cnt_month':y_val_pred.numpy(force=True).flatten()})
            PREDICTION = pd.concat([PREDICTION, app],ignore_index=True)


    
    data_test = data_test.merge(PREDICTION,on=['shop_id','item_id'])[['ID','item_cnt_month']]
    return data_test
    

In [None]:
def create_submission_pipeline(merged):
    batch_size=10000
    model = CustomLSTM(EMBEDDING_DIM, HIDDEN_DIM, TARGET_SIZE, N_LEVELS,device=device)
    
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    loss_fn = nn.MSELoss()
    
    val_month=34
    epochs=50
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = StepLR(optimizer, step_size=13, gamma=0.3)
    for epoch in range(epochs):
        print('training epoch',epoch)
        model,columns_order = train_lstm(model,optimizer,loss_fn,   merged,batch_size, val_month, epochs=20)
        scheduler.step()
    
    
    data_test = create_submission(model,merged,batch_size)

    return data_test

In [None]:
submission = create_submission_pipeline(merged)

In [None]:
submission.describe()

In [40]:
submission.to_csv('submission.csv', index=False)

In [None]:
12 epochs
validation - 0.9546194
test - 1.04
35 epochs
test - 1.03