In [1]:
import pandas as pd
import numpy as np



In [2]:
# Seed NumPy's global RNG: the np.random.permutation calls used below to shuffle
# (shop, item) pairs draw from it, so batch composition depends on this seed.
np.random.seed(42)

In [3]:
import sklearn

In [4]:
# Load the cleaned training sales table.
data_train = pd.read_csv('../data_cleaned/data_train.csv')

In [5]:
# Load the test rows and label them as date_block_num 34 so they can be stacked
# with the training rows as one more block.
test = pd.read_csv('../data_cleaned/test.csv')
test['date_block_num'] = 34

In [6]:
# Stack the test rows under the training rows, drop the test-only 'ID' column and
# zero-fill the columns the test rows do not have. Note that fillna(0) also puts
# 0 into 'date' for the block-34 rows (visible in the preview below).
data_train = pd.concat([data_train,test ], ignore_index=True).drop('ID', axis=1).fillna(0)

In [7]:
# Preview the combined frame (training rows followed by the block-34 test rows).
data_train

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.00,1.0
1,03.01.2013,0,25,2552,899.00,1.0
2,06.01.2013,0,25,2554,1709.05,1.0
3,15.01.2013,0,25,2555,1099.00,1.0
4,10.01.2013,0,25,2564,349.00,1.0
...,...,...,...,...,...,...
3142680,0,34,45,18454,0.00,0.0
3142681,0,34,45,16188,0.00,0.0
3142682,0,34,45,15757,0.00,0.0
3142683,0,34,45,19648,0.00,0.0


In [8]:

def prepare_past_ID_s(data_train):
    """
    Collect, per date_block_num, the (shop_id, item_id) pairs seen in that block
    and the pairs seen in the blocks before it.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Must contain 'date_block_num', 'shop_id' and 'item_id' columns.
        The frame is left untouched.

    Returns
    -------
    shop_item_pairs_in_dbn : pandas.Series
        Indexed by date_block_num; each value is the sorted array of unique
        (shop_id, item_id) tuples present in that block.
    shop_item_pairs_WITH_PREV_in_dbn : pandas.Series
        Same index. Block 0 keeps its own pairs; every later block ``b`` holds
        the unique pairs accumulated from blocks ``0 .. b-1``.

    Notes
    -----
    The accumulation looks up ``block - 1`` by label, so block numbers are
    expected to be consecutive integers. Progress is printed: first the array
    of block labels, then the accumulated pair count for every block > 0.
    """
    # Build the helper column on a one-column copy so the caller's DataFrame
    # does not silently gain a 'shop_item' column.
    pairs = data_train[['date_block_num']].assign(
        shop_item=list(zip(data_train['shop_id'], data_train['item_id']))
    )
    #34 block contains A LOT more shop_item than others
    shop_item_pairs_in_dbn = pairs.groupby('date_block_num')['shop_item'].apply(np.unique)

    shop_item_pairs_WITH_PREV_in_dbn = shop_item_pairs_in_dbn.copy()

    blocks = np.array(shop_item_pairs_WITH_PREV_in_dbn.index)
    print(blocks)

    for block in blocks[blocks >= 0]:
        if block == 0:
            # Nothing precedes the first block; it keeps its own pairs.
            continue

        # Pairs known before `block` = pairs known before `block - 1`
        # plus the pairs that appeared in `block - 1` itself.
        seen_so_far = np.append(shop_item_pairs_WITH_PREV_in_dbn[block - 1],
                                shop_item_pairs_in_dbn[block - 1])
        shop_item_pairs_WITH_PREV_in_dbn[block] = np.unique(seen_so_far)
        print(len(shop_item_pairs_WITH_PREV_in_dbn[block]))

    return shop_item_pairs_in_dbn, shop_item_pairs_WITH_PREV_in_dbn


In [9]:

def prepare_past_ID_s_CARTESIAN(data_train):
    """
    Like ``prepare_past_ID_s`` but the "previous" pairs are built from the full
    shop x item cartesian product of each block rather than only the observed pairs.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Must contain 'date_block_num', 'shop_id' and 'item_id' columns.
        The frame is left untouched.

    Returns
    -------
    shop_item_pairs_in_dbn : pandas.Series
        Indexed by date_block_num; each value is the sorted array of unique
        (shop_id, item_id) tuples observed in that block.
    shop_item_pairs_WITH_PREV_in_dbn : numpy.ndarray (dtype=object)
        One entry per block, each a 2-column array of [shop_id, item_id] rows.
        Entry 0 is the (shuffled) cartesian product of block 0's shops and
        items; entry ``b > 0`` is the sorted union of the cartesian products of
        blocks ``0 .. b-1``.

    Notes
    -----
    Block labels are used as positions (``cartesians[block - 1]``), so they are
    expected to be the consecutive integers 0 .. n-1. Each cartesian product is
    shuffled with the global NumPy RNG. The accumulated row count of every
    block > 0 is printed.
    """
    # Build the helper column on a one-column copy so the caller's DataFrame
    # does not silently gain a 'shop_item' column.
    pairs = data_train[['date_block_num']].assign(
        shop_item=list(zip(data_train['shop_id'], data_train['item_id']))
    )
    #34 block contains A LOT more shop_item than others
    shop_item_pairs_in_dbn = pairs.groupby('date_block_num')['shop_item'].apply(np.unique)

    cartesians = []
    for dbn in shop_item_pairs_in_dbn.index:
        # Unzip the observed pairs once into parallel shop / item sequences.
        shop_ids, item_ids = zip(*shop_item_pairs_in_dbn[dbn])
        shops = np.unique(shop_ids)
        items = np.unique(item_ids)

        # Every (shop, item) combination as rows of a 2-column array.
        grid = np.array(np.meshgrid(shops, items)).T.reshape(-1, 2)
        cartesians.append(np.random.permutation(grid))

    shop_item_pairs_WITH_PREV_in_dbn = np.array([None] * len(shop_item_pairs_in_dbn))
    shop_item_pairs_WITH_PREV_in_dbn[0] = cartesians[0]

    for block in shop_item_pairs_in_dbn.index:
        if block == 0:
            continue
        stacked = np.append(shop_item_pairs_WITH_PREV_in_dbn[block - 1],
                            cartesians[block - 1], axis=0)

        shop_item_pairs_WITH_PREV_in_dbn[block] = np.unique(stacked, axis=0)
        print(len(shop_item_pairs_WITH_PREV_in_dbn[block]))
    return shop_item_pairs_in_dbn, shop_item_pairs_WITH_PREV_in_dbn


In [10]:
# Build the per-block pair collections used by the batch generators below.
# The numbers printed underneath are the accumulated pair counts for blocks 1..34.
shop_item_pairs_in_dbn, shop_item_pairs_WITH_PREV_in_dbn = prepare_past_ID_s_CARTESIAN(data_train)

364950
428871
466086
494493
532909
566259
587979
609623
627192
664844
686985
719696
730116
746129
775024
799403
814628
828506
851457
871899
890066
928598
952398
976804
987057
997953
1013772
1025692
1035736
1046582
1055558
1067461
1080188
1110590


In [11]:
#pd.concat([shop_item_pairs_WITH_PREV_in_dbn.map(len),shop_item_pairs_in_dbn.map(len)], axis=1)

In [12]:
from sklearn.metrics import root_mean_squared_error

In [13]:
from collections import defaultdict

In [14]:
def make_X_lag_format(data, dbn):
    """
    Rename block-stamped columns to relative-lag names.

    A column called ``<name>$<block>`` becomes ``<name>_lag;<dbn - block>``
    (so the column for block ``dbn`` is lag 0, ``dbn - 1`` is lag 1, ...).
    Columns without a ``$`` keep their name. A renamed copy is returned.
    """
    renames = {}
    for column in data.columns:
        parts = column.split('$')
        if len(parts) > 1:
            lag = dbn - int(parts[1])
            renames[column] = f"{parts[0]}_lag;{lag}"

    return data.rename(columns=renames)

In [15]:
def prepare_train(data, valid):
    """
    Build one batch: the rows of ``data`` for the requested (shop, item) pairs.

    ``valid`` is an iterable of (shop_id, item_id) pairs. The result has one
    row per pair match (left join on 'shop_id' and 'item_id'); pairs missing
    from ``data`` are kept with every other column filled with 0.
    """
    unzipped = list(zip(*valid))  # [all shop ids, all item ids]
    keys = pd.DataFrame({'item_id': unzipped[1], 'shop_id': unzipped[0]})
    batch = keys.merge(data, how='left', on=['shop_id', 'item_id'])
    return batch.fillna(0)

In [16]:

def prepare_val(data, valid):
    """
    Build one batch: the rows of ``data`` for the requested (shop, item) pairs.

    ``valid`` is indexed as a 2-D array whose column 0 holds shop ids and
    column 1 holds item ids. Pairs missing from ``data`` are kept (left join)
    with every other column filled with 0.
    """
    shop_column = valid[:, 0]
    item_column = valid[:, 1]
    keys = pd.DataFrame({'item_id': item_column, 'shop_id': shop_column})
    return keys.merge(data, how='left', on=['shop_id', 'item_id']).fillna(0)

In [17]:
import re


In [18]:
def prepare_data_train_boosting(data, valid, dbn):
    """
    Build the training features and target for a model validated on block ``dbn``.

    Parameters
    ----------
    data : pandas.DataFrame
        Wide table keyed by 'shop_id' / 'item_id'. Block-specific columns are
        named ``<name>$<block>``; columns without ``$`` are static.
    valid : iterable of (shop_id, item_id) pairs
        Pairs to include in this batch (see ``prepare_train``).
    dbn : int
        Block that will later be validated on.

    Returns
    -------
    X : pandas.DataFrame
        Static columns plus every ``$``-column whose block is in ``0 .. dbn-2``.
    Y : pandas.Series
        The ``shop_item_cnt$<dbn-1>`` column, i.e. the last block before ``dbn``.
    """
    train = prepare_train(data, valid)

    # Blocks allowed as features: everything strictly before the target block dbn-1.
    feature_blocks = range(0, dbn - 1)

    feature_cols = []
    for col in data.columns:
        parts = col.split('$')
        # Membership in a range is O(1); no need to scan every block per column.
        if len(parts) == 1 or int(parts[1]) in feature_blocks:
            feature_cols.append(col)

    X = train[feature_cols]
    Y = train[f'shop_item_cnt${dbn-1}']

    return X, Y
        

In [19]:
def prepare_data_validation_boosting(data, valid, dbn):
    """
    Build the validation features and target for block ``dbn``.

    Parameters
    ----------
    data : pandas.DataFrame
        Wide table keyed by 'shop_id' / 'item_id'. Block-specific columns are
        named ``<name>$<block>``; columns without ``$`` are static.
    valid : 2-D array of [shop_id, item_id] rows
        Pairs to include in this batch (see ``prepare_val``).
    dbn : int
        Block to validate on.

    Returns
    -------
    X : pandas.DataFrame
        Static columns plus every ``$``-column whose block is in ``1 .. dbn-1``
        (the same number of lags the training features cover, shifted by one block).
    Y : pandas.Series
        The ``shop_item_cnt$<dbn>`` column.
    """
    test = prepare_val(data, valid)

    # Blocks allowed as features: everything before the validated block,
    # starting at 1 so the lag count matches the training features.
    feature_blocks = range(1, dbn)

    feature_cols = []
    for col in test.columns:
        parts = col.split('$')
        # Membership in a range is O(1); no need to scan every block per column.
        if len(parts) == 1 or int(parts[1]) in feature_blocks:
            feature_cols.append(col)

    X = test[feature_cols]
    Y = test[f'shop_item_cnt${dbn}']

    return X, Y

In [20]:
# Scratch check: np.random.permutation shuffles a 2-D array along its first axis
# only, so each [shop, item] row stays intact when the pair arrays are shuffled.
np.random.permutation([[1,2],[3,4]])

array([[1, 2],
       [3, 4]])

In [21]:
def create_batch_train(merged, batch_size, dbn):
    """
    Yield ``(X, Y)`` training batches for a model validated on block ``dbn``.

    The pairs stored in the module-level ``shop_item_pairs_WITH_PREV_in_dbn[dbn]``
    are shuffled with the global NumPy RNG and cut into consecutive slices of
    ``batch_size`` pairs; each slice is turned into features and target by
    ``prepare_data_train_boosting``.

    Only full slices are produced: the trailing ``len(pairs) % batch_size``
    pairs are not used. The exception is when there are fewer pairs than
    ``batch_size``, in which case a single batch with all of them is produced.

    Batches whose feature frame is empty are skipped. Previously a
    ``[None, None]`` sentinel was yielded and then, because of a missing
    ``continue``, the empty batch as well.
    """
    #merged = pd.read_csv('data/merged.csv', chunksize=500000)
    #merged = pd.read_csv('data/merged.csv')
    train = np.random.permutation(shop_item_pairs_WITH_PREV_in_dbn[dbn])
    chunck_num = (len(train) // batch_size) if batch_size <= len(train) else 1

    for idx in range(chunck_num):  # split the shuffled pairs into chunks
        batch_pairs = train[idx * batch_size:(idx + 1) * batch_size]
        X, Y = prepare_data_train_boosting(merged, batch_pairs, dbn)

        if X.empty:
            continue

        yield X, Y

In [22]:
def create_batch_val(merged, batch_size, dbn):
    """
    Yield ``(X, Y)`` validation batches for block ``dbn``.

    Candidates are the full cartesian product of the shops and the items that
    occur in the module-level ``shop_item_pairs_in_dbn[dbn]``. The product is
    shuffled with the global NumPy RNG, cut into slices of at most
    ``batch_size`` pairs (the last slice may be shorter) and each slice is
    turned into features and target by ``prepare_data_validation_boosting``.

    Batches whose feature frame is empty are skipped. Previously the chunk
    count was ``len // batch_size + 1``, which produced an empty trailing
    chunk whenever the length was an exact multiple of ``batch_size``; that
    chunk was yielded as a ``[None, None]`` sentinel followed by an empty frame.
    """
    #merged = pd.read_csv('data/merged.csv', chunksize=500000) - (DOESNT WORK PROPERLY))))) - use it if merged doesnt fit memory
    #merged = pd.read_csv('data/merged.csv')
    val = shop_item_pairs_in_dbn[dbn]

    # Unzip the observed pairs once into parallel shop / item sequences.
    shop_ids, item_ids = zip(*val)
    shops = np.unique(shop_ids)
    items = np.unique(item_ids)

    cartesian_product = np.random.permutation(
        np.array(np.meshgrid(shops, items)).T.reshape(-1, 2)
    )

    # Ceiling division: exactly as many chunks as needed, never an empty one.
    chunck_num = (len(cartesian_product) + batch_size - 1) // batch_size
    for idx in range(chunck_num):
        batch_pairs = cartesian_product[idx * batch_size:(idx + 1) * batch_size]
        X, Y = prepare_data_validation_boosting(merged, batch_pairs, dbn)

        if X.empty:
            continue

        yield X, Y

In [1]:
# Feature-name lists (presumably the column stems of the merged feature table).
# NOTE(review): `names` is not referenced by any other cell visible in this notebook.
names_changed = ['item_price_change', 'shop_item_price_change']

names_base = [
    'ema_6_item_cnt_month_item_id',
    'ema_6_item_cnt_month_item_id_shop_id',
    'ema_6_item_cnt_month_item_category_id_cat_city',
    'ema_6_item_cnt_month_item_category_id_cat_shop_id',
    'date_block_num_diff',
    'avg_item_priceitem_id_lag_1',
    'item_cnt_monthitem_category_id_cat_lag_1',
    'item_cnt_month_lag_1',
]

names = [*names_changed, *names_base]
names

['item_price_change',
 'shop_item_price_change',
 'ema_6_item_cnt_month_item_id',
 'ema_6_item_cnt_month_item_id_shop_id',
 'ema_6_item_cnt_month_item_category_id_cat_city',
 'ema_6_item_cnt_month_item_category_id_cat_shop_id',
 'date_block_num_diff',
 'avg_item_priceitem_id_lag_1',
 'item_cnt_monthitem_category_id_cat_lag_1',
 'item_cnt_month_lag_1']

In [26]:
def select_columns(X_train, dbn):
    """
    Keep only the feature columns the model should see.

    ``append_some_columns`` is applied first, adding 'date_block_num' and
    'month' to ``X_train`` in place. (Author's note: when using linear models,
    that call is meant to be commented out.)

    Columns without a ';' in their name are always kept. A column named
    ``<name>;<lag>`` is kept according to the first rule that matches ``name``:

    * contains 'ema_6_item_cnt_month_item_category_id_cat_city' or
      'ema_6_item_cnt_month_item_id_shop_id', and lag == 1 -> keep
    * contains 'diff'          -> keep only lag == 1
    * contains 'change'        -> keep only lag <= 3
    * contains 'ema'           -> keep only lag <= 1
    * contains 'price'         -> keep only lag <= 3
    * contains 'shop_item_cnt' -> keep every lag
    * anything else            -> drop
    """
    X_train = append_some_columns(X_train, dbn)

    def _wanted(name, lag):
        # The two explicit EMA features only short-circuit for lag 1; any other
        # lag falls through to the generic rules below.
        if lag == 1 and (
            'ema_6_item_cnt_month_item_category_id_cat_city' in name
            or 'ema_6_item_cnt_month_item_id_shop_id' in name
        ):
            return True
        if 'diff' in name:
            return lag == 1
        if 'change' in name:
            return lag <= 3
        if 'ema' in name:
            return lag <= 1
        if 'price' in name:
            return lag <= 3
        return 'shop_item_cnt' in name

    selected = []
    for column in X_train.columns:
        pieces = column.split(';')
        if len(pieces) == 1 or _wanted(pieces[0], int(pieces[1])):
            selected.append(column)

    return X_train[selected]

In [27]:
def append_some_columns(X_train, dbn):
    """
    Add calendar columns to ``X_train`` in place and return the same frame.

    'date_block_num' is set to ``dbn`` and 'month' to ``dbn % 12`` on every row.
    """
    calendar = {'date_block_num': dbn, 'month': dbn % 12}
    for column, value in calendar.items():
        X_train[column] = value
    return X_train

In [28]:
from sklearn.svm import SVC

In [29]:
from sklearn.ensemble import RandomForestRegressor

In [30]:
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression

In [31]:

#model=xgb.XGBRegressor(tree_method="hist",
#                       #early_stopping_rounds=15,
#                       enable_categorical=True,
#                       #max_depth = 10,
#                       max_leaves=512,
#                       n_estimators = 100,
#                       learning_rate = 0.005
#                       )


#model =RandomForestRegressor(max_depth = 11, n_estimators = 150,n_jobs=8)
# Active estimator for this cell; the commented-out lines around it are other
# estimators left switched off.
# NOTE(review): validate_ML and create_submission_pipeline construct their own
# LGBMRegressor, so this instance is not the one they train.
model = LGBMRegressor(verbose=-1,n_jobs=8, num_leaves=340, n_estimators = 500,  learning_rate=0.005)
#model = Lasso()
#model = SVC(kernel='linear')

In [32]:
# Show the estimator class; train_model / validate_model branch on type(model).
type(model)

lightgbm.sklearn.LGBMRegressor

In [33]:
def train_model(model, merged,batch_size, val_month):
    """
    Fit ``model`` batch by batch on the training data preceding ``val_month``.

    Parameters
    ----------
    model : Lasso, SVC, LGBMRegressor, xgb.XGBRegressor or RandomForestRegressor
        Estimator to train. Any other type raises ``TypeError``.
    merged : pandas.DataFrame
        Wide feature table passed through to ``create_batch_train``.
    batch_size : int
        Number of (shop, item) pairs per batch.
    val_month : int
        Block the model will be validated on; the training target is block
        ``val_month - 1``.

    Returns
    -------
    model
        The fitted estimator.
    columns_order : pandas.Index
        Feature columns (in order) of the last training batch; validation
        frames must be re-ordered to match.

    Raises
    ------
    TypeError
        If ``model`` is of an unsupported type.
    ValueError
        If no non-empty training batch was produced.

    Notes
    -----
    LGBMRegressor / XGBRegressor continue training from the previous batch
    (``init_model`` / ``xgb_model``). Lasso, SVC and RandomForestRegressor are
    simply re-fitted on every batch.
    NOTE(review): for those three a plain ``fit`` presumably discards what was
    learned from earlier batches, so only the last batch would matter -
    confirm this is intended.
    The training RMSE over all batches is printed at the end.
    """
    first = True
    columns_order = None

    Y_true_l = []
    preds_l = []
    for X_train, Y_train in create_batch_train(merged, batch_size, val_month):

        # Check for sentinel / empty batches BEFORE touching the frame;
        # otherwise a None batch crashes in the dtype handling below.
        if X_train is None or X_train.empty:
            print('None')
            continue

        if type(model) in [Lasso, SVC]:
            # Identifier columns are dropped for these two models.
            X_train = X_train.drop(columns=['shop_id', 'item_category_id', 'item_id'])
        else:
            # Other models get the identifier-like columns as pandas categoricals.
            X_train = X_train.astype({
                'shop_id': 'category',
                'item_category_id': 'category',
                'city': 'category',
                'super_category': 'category',
            })

        Y_train = np.clip(Y_train, 0, 20)

        X_train = make_X_lag_format(X_train, val_month - 1)
        X_train = select_columns(X_train, val_month - 1)

        columns_order = X_train.columns

        if not preds_l:
            # First processed batch.
            print('train columns')

        if type(model) in [Lasso, SVC]:
            model.fit(X_train, Y_train)
            y_train_pred = model.predict(X_train)

        elif type(model) == LGBMRegressor:
            if first:
                model.fit(X_train, Y_train)
                first = False
            else:
                # Continue boosting from the trees fitted on earlier batches.
                model.fit(X_train, Y_train, init_model=model)
            y_train_pred = model.predict(X_train, validate_features=True)

        elif type(model) == xgb.XGBRegressor:
            if first:
                model = model.fit(X_train, Y_train)
                first = False
            else:
                print(model.get_booster())
                model = model.fit(X_train, Y_train, xgb_model=model.get_booster())
                print('n_estimators:', model.n_estimators)

            y_train_pred = model.predict(X_train)

        elif type(model) == RandomForestRegressor:
            model.fit(X_train, Y_train)
            y_train_pred = model.predict(X_train)

        else:
            # Fail loudly instead of hitting an unbound y_train_pred below.
            raise TypeError(
                f'train_model does not support models of type {type(model).__name__}'
            )

        Y_true_l.append(Y_train)
        preds_l.append(y_train_pred)

    if not Y_true_l:
        raise ValueError('train_model: no non-empty training batch was produced')

    # np.concatenate works on every NumPy version (np.concat exists only in NumPy >= 2.0).
    train_rmse = root_mean_squared_error(pd.concat(Y_true_l), np.concatenate(preds_l))
    print('train_rmse, ',train_rmse)

    return model, columns_order

In [34]:
def validate_model(model,merged,batch_size, val_month, columns_order):
    """
    Predict block ``val_month`` batch by batch and report the validation RMSE.

    Parameters
    ----------
    model : fitted estimator
        As returned by ``train_model``.
    merged : pandas.DataFrame
        Wide feature table passed through to ``create_batch_val``.
    batch_size : int
        Number of (shop, item) pairs per batch.
    val_month : int
        Block to validate on.
    columns_order : sequence of column names
        Feature order used during training; every batch is re-ordered to it.

    Returns
    -------
    val_preds : list of numpy.ndarray
        Predictions of every batch, clipped to [0, 20].
    val_rmse : float
        RMSE between the clipped targets and clipped predictions of all batches.

    Raises
    ------
    ValueError
        If no non-empty validation batch was produced.
    """
    val_preds = []
    Y_true_l = []

    # Validation pairs are the cartesian product of the shops and items of val_month.
    for X_val, Y_val in create_batch_val(merged, batch_size, val_month):

        # Check for sentinel / empty batches BEFORE touching the frame.
        if X_val is None or X_val.empty:
            continue

        if type(model) in [Lasso, SVC]:
            # Identifier columns are dropped for these two models.
            X_val = X_val.drop(columns=['shop_id', 'item_category_id', 'item_id'])
        else:
            # Other models get the identifier-like columns as pandas categoricals.
            X_val = X_val.astype({
                'shop_id': 'category',
                'item_category_id': 'category',
                'city': 'category',
                'super_category': 'category',
            })

        Y_val = np.clip(Y_val, 0, 20)

        X_val = make_X_lag_format(X_val, val_month)
        X_val = select_columns(X_val, val_month)
        X_val = X_val[columns_order]

        if type(model) == LGBMRegressor:
            # Let LightGBM verify that feature names match the training frame.
            y_val_pred = model.predict(X_val, validate_features=True)
        else:
            y_val_pred = model.predict(X_val)

        y_val_pred = np.clip(y_val_pred, 0, 20)

        val_preds.append(y_val_pred)
        Y_true_l.append(Y_val)

    if not Y_true_l:
        raise ValueError('validate_model: no non-empty validation batch was produced')

    # np.concatenate works on every NumPy version (np.concat exists only in NumPy >= 2.0).
    val_rmse = root_mean_squared_error(pd.concat(Y_true_l), np.concatenate(val_preds))
    print('val rmse, ',val_rmse)

    return val_preds, val_rmse

In [35]:
def validate_ML(merged, start_val_month, end_val_month=34, batch_size=100000, model_factory=None):
    """
    Walk-forward validation: for every block in ``start_val_month .. end_val_month - 1``
    train a fresh model on the preceding data and validate it on that block.

    Parameters
    ----------
    merged : pandas.DataFrame
        Wide feature table passed through to ``train_model`` / ``validate_model``.
    start_val_month : int
        First block to validate on.
    end_val_month : int, default 34
        Exclusive upper bound of the validated blocks.
    batch_size : int, default 100000
        Number of (shop, item) pairs per batch.
    model_factory : callable, optional
        Zero-argument callable returning a new, unfitted estimator for each
        validated block. Defaults to the LGBMRegressor configuration this
        notebook was validated with.

    Returns
    -------
    val_errors : list of float
        Validation RMSE per validated block.
    val_preds : list
        Per-block lists of batch predictions, as returned by ``validate_model``.
    """
    if model_factory is None:
        def model_factory():
            return LGBMRegressor(verbose=-1, n_jobs=8, num_leaves=456,
                                 n_estimators=500, learning_rate=0.005)

    val_errors = []
    val_preds = []

    for val_month in range(start_val_month, end_val_month):

        model = model_factory()

        print('date_block_num', val_month)
        print('month', val_month%12)

        model, columns_order = train_model(model, merged, batch_size, val_month)
        print('feature importances, ')
        print(list(model.feature_names_in_[np.argsort( model.feature_importances_)][::-1]))

        # LightGBM exposes the fitted tree count as n_estimators_; fall back to
        # the configured n_estimators for estimators without that attribute.
        print('n_estimators:', getattr(model, 'n_estimators_', getattr(model, 'n_estimators', None)))
        val_pred, val_error = validate_model(model, merged, batch_size, val_month, columns_order)

        val_errors.append(val_error)
        val_preds.append(val_pred)

    return val_errors, val_preds

In [38]:
# Walk-forward validation starting at block 22.
# NOTE(review): `merged` is not defined in any cell visible in this notebook
# (the batch helpers hint at 'data/merged.csv'); it must be loaded before this
# cell can run on a fresh kernel.
# The captured output below ends in a KeyboardInterrupt, i.e. this run was stopped manually.
start_val_month=22

val_errors, val_preds = validate_ML(merged,start_val_month)
# Some feature names contain the word 'lag' twice: the first 'lag' comes from the
# column names in ./data.csv, the second one is appended by make_X_lag_format.

date_block_num 22
month 10
train columns
train_rmse,  0.4335074696504726
feature importances, 
[np.str_('item_id'), np.str_('ema_6_item_cnt_month_item_category_id_cat_shop_id_lag;1'), np.str_('ema_6_item_cnt_month_item_category_id_cat_city_lag;1'), np.str_('ema_6_item_cnt_month_item_id_lag;1'), np.str_('avg_item_priceitem_id_lag_1_lag;1'), np.str_('shop_id'), np.str_('date_block_num_diff_lag;1'), np.str_('avg_item_priceitem_id_lag_1_lag;3'), np.str_('shop_item_cnt_lag;1'), np.str_('item_category_id'), np.str_('item_price_change_lag;1'), np.str_('avg_item_priceitem_id_lag_1_lag;2'), np.str_('item_price_change_lag;2'), np.str_('item_price_change_lag;3'), np.str_('ema_6_item_cnt_month_item_id_shop_id_lag;1'), np.str_('shop_item_cnt_lag;2'), np.str_('shop_item_cnt_lag;3'), np.str_('shop_item_cnt_lag;12'), np.str_('shop_item_cnt_lag;4'), np.str_('shop_item_cnt_lag;10'), np.str_('shop_item_cnt_lag;5'), np.str_('shop_item_cnt_lag;6'), np.str_('shop_item_cnt_lag;7'), np.str_('shop_item_cnt_lag

KeyboardInterrupt: 

In [None]:
# Validation RMSEs pasted from an earlier run (the same numbers appear in the
# LGBMRegressor notes at the end of the notebook) - presumably one value per
# validated block 22..33.
[1.1131333259192753, 
1.280538425526827,
1.1267832346268418,
0.8577197073585947,
0.852271386468949, 
0.955775029714284,
0.892993924504934,
0.8300112585408684,
0.7460129792132149,
0.8224050978044093,
0.979549517863459,
1.0014096241176884]

In [None]:
# Per-block validation RMSEs returned by validate_ML.
val_errors

In [None]:
# Mean validation RMSE over all validated blocks.
np.array(val_errors).mean()

In [36]:
def create_submission(model,merged,batch_size, columns_order):
    """
    Predict block 34 (the test block) and return the submission table.

    Parameters
    ----------
    model : fitted estimator
        As returned by ``train_model``.
    merged : pandas.DataFrame
        Wide feature table passed through to ``create_batch_val``.
    batch_size : int
        Number of (shop, item) pairs per batch.
    columns_order : sequence of column names
        Feature order used during training; every batch is re-ordered to it.

    Returns
    -------
    pandas.DataFrame
        Columns 'ID' and 'item_cnt_month' (predictions clipped to [0, 20]) for
        the rows of '../data_cleaned/test.csv' that received a prediction
        (inner join on 'shop_id' / 'item_id').
    """
    val_month = 34
    test = pd.read_csv('../data_cleaned/test.csv')

    batch_predictions = []
    for X_val, _ in create_batch_val(merged, batch_size, val_month):

        # Check for sentinel / empty batches BEFORE touching the frame.
        if X_val is None:
            continue
        if X_val.empty:
            print('None')
            continue

        # Remember the keys now: the Lasso/SVC branch drops these columns from
        # the feature frame, and the other branch turns shop_id into a categorical.
        shop_ids = X_val['shop_id'].to_numpy()
        item_ids = X_val['item_id'].to_numpy()

        if type(model) in [Lasso, SVC]:
            X_val = X_val.drop(columns=['shop_id', 'item_category_id', 'item_id'])
        else:
            X_val = X_val.astype({
                'shop_id': 'category',
                'item_category_id': 'category',
                'city': 'category',
                'super_category': 'category',
            })

        X_val = make_X_lag_format(X_val, val_month)
        X_val = select_columns(X_val, val_month)
        X_val = X_val[columns_order]

        y_val_pred = np.clip(model.predict(X_val), 0, 20)

        batch_predictions.append(pd.DataFrame({
            'item_id': item_ids,
            'shop_id': shop_ids,
            'item_cnt_month': y_val_pred,
        }))

    # Concatenate once instead of growing a frame inside the loop: that was
    # quadratic and concatenated onto an empty frame, the statement echoed in
    # the warning captured in the output below.
    if batch_predictions:
        PREDICTION = pd.concat(batch_predictions, ignore_index=True)
    else:
        PREDICTION = pd.DataFrame(columns=['shop_id','item_id','item_cnt_month'])

    return test.merge(PREDICTION, on=['shop_id','item_id'])[['ID','item_cnt_month']]
    

In [37]:
def create_submission_pipeline(merged):
    """
    Train the final LightGBM model for block 34 and build the submission table.

    The model is trained with ``train_model`` (val_month 34, batches of 100000
    pairs), its features are printed from most to least important, and the
    result of ``create_submission`` is returned.
    """
    batch_size = 100000

    lgbm = LGBMRegressor(verbose=-1, n_jobs=8, num_leaves=512,
                         n_estimators=1000, learning_rate=0.003)
    fitted, feature_order = train_model(lgbm, merged, batch_size, 34)

    print('Feature importnaces in lgb:')

    feature_names = fitted.feature_names_in_
    ranking = np.argsort(fitted.feature_importances_)
    print(feature_names[ranking][::-1])

    return create_submission(fitted, merged, batch_size, feature_order)

In [38]:
# Train on everything before block 34 and predict the test block.
# NOTE(review): `merged` is not defined in any cell visible in this notebook.
submission = create_submission_pipeline(merged)

train columns
train_rmse,  0.3129824042781292
Feature importnaces in lgb:
['item_id' 'ema_6_item_cnt_month_item_category_id_cat_shop_id_lag;1'
 'ema_6_item_cnt_month_item_category_id_cat_city_lag;1'
 'ema_6_item_cnt_month_item_id_lag;1' 'avg_item_priceitem_id_lag_1_lag;1'
 'shop_id' 'date_block_num_diff_lag;1' 'avg_item_priceitem_id_lag_1_lag;3'
 'avg_item_priceitem_id_lag_1_lag;2' 'item_category_id'
 'item_price_change_lag;1' 'item_price_change_lag;2'
 'item_price_change_lag;3' 'shop_item_cnt_lag;1'
 'ema_6_item_cnt_month_item_id_shop_id_lag;1' 'shop_item_cnt_lag;2'
 'shop_item_cnt_lag;3' 'shop_item_cnt_lag;12' 'shop_item_cnt_lag;10'
 'shop_item_cnt_lag;4' 'shop_item_cnt_lag;5' 'city' 'shop_item_cnt_lag;7'
 'shop_item_cnt_lag;8' 'shop_item_cnt_lag;6' 'shop_item_cnt_lag;9'
 'shop_item_cnt_lag;11' 'shop_item_cnt_lag;17' 'shop_item_cnt_lag;24'
 'shop_item_cnt_lag;14' 'shop_item_cnt_lag;13' 'shop_item_cnt_lag;16'
 'shop_item_cnt_lag;33' 'shop_item_cnt_lag;18' 'shop_item_cnt_lag;15'
 'shop

  PREDICTION = pd.concat([PREDICTION, app],ignore_index=True)


In [40]:
# Sanity check: row count and the range of the predictions.
submission.describe()

Unnamed: 0,ID,item_cnt_month
count,214200.0,214200.0
mean,107099.5,0.293744
std,61834.358168,0.803725
min,0.0,0.0
25%,53549.75,0.0
50%,107099.5,0.079759
75%,160649.25,0.283214
max,214199.0,20.0


In [41]:
# Write the submission (columns ID, item_cnt_month) without the DataFrame index.
submission.to_csv('submission.csv', index=False)

In [None]:
result:
LGBM default:
???

Lasso
validation - 1.122829



In [None]:
!pwd

In [None]:
RF on :
batch_size=100000???
['shop_item_cnt_lag;1',
 'item_id', 
 'ema_6_item_cnt_month_item_category_id_cat_shop_id_lag;1',
 'shop_item_cnt_lag;2', 
 'item_category_id', 
 'shop_item_cnt_lag;3', 
 'avg_item_priceitem_id_lag_1_lag;1',
 'ema_6_item_cnt_month_item_id_lag;1',
 'shop_id', 
 'date_block_num_diff_lag;1',
 'item_price_change_lag;1',
 'shop_item_cnt_lag;12',
 'avg_item_priceitem_id_lag_1_lag;3', 
 'city', 
 'avg_item_priceitem_id_lag_1_lag;2', 
 'super_category',
 'shop_item_cnt_lag;6', 
 'shop_item_cnt_lag;4', 
 'item_price_change_lag;2', 
 'shop_item_cnt_lag;5', 
 'item_price_change_lag;3', 
 'month', 
 'date_block_num']

validation - [np.float64(1.1030454661151439),
 np.float64(1.2808061846792664),
 np.float64(1.0932161531758702),
 np.float64(0.8539095663659091),
 np.float64(0.8556563562276699),
 np.float64(0.9702615886206524),
 np.float64(0.9432473239256615),
 np.float64(0.8404985919353657),
 np.float64(0.7832955927106592),
 np.float64(0.8417092000723038),
 np.float64(0.9909527270877753),
 np.float64(0.9851377342980254)]
mean - np.float64(0.9618113737678584)
test - 1.05


In [None]:
RF on :
batch_size=100000???
['shop_item_cnt_lag;1', 
 'item_id', 
 'item_category_id',
 'shop_item_cnt_lag;2', 
 'shop_item_cnt_lag;3', 
 'shop_id',
 'super_category', 
 'city', 
 'shop_item_cnt_lag;4', 
 'shop_item_cnt_lag;9', 
 'shop_item_cnt_lag;5', 
 'shop_item_cnt_lag;21', 
 'shop_item_cnt_lag;6', 
 'shop_item_cnt_lag;8',
 'shop_item_cnt_lag;7',
 'shop_item_cnt_lag;10', 
 'shop_item_cnt_lag;14', 
 'shop_item_cnt_lag;17', 
 'shop_item_cnt_lag;12',
 'shop_item_cnt_lag;16', 
 'shop_item_cnt_lag;11',
 'shop_item_cnt_lag;19', 
 'shop_item_cnt_lag;13', 
 'shop_item_cnt_lag;31',
 'shop_item_cnt_lag;22', 
 'shop_item_cnt_lag;23', 
 'shop_item_cnt_lag;15',
 'shop_item_cnt_lag;28', 
 'shop_item_cnt_lag;18', 
 'shop_item_cnt_lag;20', 
 'shop_item_cnt_lag;30',
 'shop_item_cnt_lag;24', 
 'shop_item_cnt_lag;27', 
 'shop_item_cnt_lag;26', 
 'shop_item_cnt_lag;29',
 'shop_item_cnt_lag;25', 
 'month',
 'date_block_num']


validation - [np.float64(1.1261279577971253),
 np.float64(1.3063675989401422),
 np.float64(1.107639823687741),
 np.float64(0.8568042503316228),
 np.float64(0.8558897249837448),
 np.float64(1.0032960741790946),
 np.float64(0.9291427800382513),
 np.float64(0.8312809789838309),
 np.float64(0.7627208542403842),
 np.float64(0.8455873646630931),
 np.float64(0.9902778716165955),
 np.float64(0.9974386033703456)]
mean - np.float64(0.9677144902359976)

test - 1.08

In [None]:
Boosting (LGBMRegressor(verbose=-1,n_jobs=8, num_leaves=512, n_estimators = 150,  learning_rate=0.005)) on:
batch_size = 100000

[1.1131333259192753, 
1.280538425526827,
1.1267832346268418,
0.8577197073585947,
0.852271386468949, 
0.955775029714284,
0.892993924504934,
0.8300112585408684,
0.7460129792132149,
0.8224050978044093,
0.979549517863459,
1.0014096241176884]
mean - 0.9548836259716121
test - 1.03