In [96]:
import gc
import shutil

import catboost
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from tqdm.notebook import tqdm

In [97]:
# Read the training history and the submission template from the local data folder.
train, sub = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/sample_submission.csv')
)

In [98]:
# Order the history by house, then by date within each house.
# NOTE(review): 'date' is still the raw CSV value at this point (it is converted
# with pd.to_datetime in a later cell), so this relies on the raw values sorting
# chronologically - confirm the CSV date format.
sort_keys = ['id_house', 'date']
train = train.sort_values(by=sort_keys)

In [99]:
# Submission ids are '<date>_<id_house>': split each id once and reuse both parts.
id_parts = [raw_id.split('_') for raw_id in sub['id']]
sub['date'] = pd.to_datetime([parts[0] for parts in id_parts])
sub['id_house'] = [int(parts[1]) for parts in id_parts]

# Bring the training dates to the same datetime type so both frames can be joined on 'date'.
train['date'] = pd.to_datetime(train['date'])

In [100]:
# Distinct months present in the submission template.
sub['date'].unique()
# 3 months to predict

<DatetimeArray>
['2022-06-01 00:00:00', '2022-07-01 00:00:00', '2022-08-01 00:00:00']
Length: 3, dtype: datetime64[ns]

In [101]:
# Build the full (house x month) grid: every house in the submission gets one row
# per month, from the first training month through the last month to predict.
date_range = pd.date_range(train['date'].min(), sub['date'].max(), freq='MS').tolist()
house_ids = sub['id_house'].unique()
data = pd.DataFrame({
    # Each house id repeated once per month, houses in submission order ...
    'id_house': [house for house in house_ids for _ in date_range],
    # ... paired with the full month list repeated once per house.
    'date': date_range * len(house_ids),
})

# Attach the observed training values; grid rows without an observation stay NaN.
data = data.merge(train, on=['id_house', 'date'], how='left')
data_other_columns = [x for x in data.columns if x not in ['id_house', 'date']]

# Fill missing data per house: backward-fill first, then forward-fill what is left.
# Series.bfill()/ffill() replace the deprecated fillna(method=...) spelling.
# NOTE: the months to predict have no training rows, so every non-key column
# (including the price columns) is forward-filled into them as well.
for col in tqdm(data_other_columns):
    data[col] = (
        data.groupby('id_house', group_keys=False)[col]
        .apply(lambda x: x.bfill().ffill())
    )

# Placeholder for the forecasts; -1 marks rows that never received a prediction.
data['preds'] = -1.

  0%|          | 0/20 [00:00<?, ?it/s]

  data[col] = data.groupby('id_house', group_keys = False)[col].apply(lambda x:x.fillna(method = 'bfill').fillna(method = 'ffill'))


In [102]:
# One coordinate pair per house, in order of first appearance in `data`
# (the same order as data['id_house'].unique(), which dict_ind_house relies on).
coords_data = data.drop_duplicates('id_house')[['lat', 'lng']]
nn = NearestNeighbors(n_neighbors=10)
nn.fit(coords_data.values)

col = 'med_price'
# (id_house, date) -> med_price, and row position in coords_data -> id_house.
dict_vals = data.groupby(['id_house', 'date'])[col].mean().to_dict()
dict_ind_house = dict(enumerate(data['id_house'].unique()))
list_month = data['date'].unique()

dict_mean = {}
dict_std = {}  # never filled in this cell; kept so the name stays defined

# Query with the fitted points themselves, so each house's own point is normally
# among its 10 neighbours. The result is keyed by the QUERY position rather than
# by neighbors[0]: when several houses share the same coordinates the first
# returned neighbour is not guaranteed to be the query house itself.
neighbor_positions = nn.kneighbors(coords_data.values, return_distance=False)
for house_pos, neighbors in enumerate(neighbor_positions):
    house = dict_ind_house[house_pos]
    neighbor_houses = [dict_ind_house[neighbor] for neighbor in neighbors]
    for month in list_month:
        vals = [
            dict_vals[(neighbor_house, month)]
            for neighbor_house in neighbor_houses
            if (neighbor_house, month) in dict_vals
        ]
        if vals:
            dict_mean[(house, month)] = np.mean(vals)

# Average med_price over the neighbourhood for every (house, month) row.
data['med_price_mean'] = [
    dict_mean.get((c, m), None) for c, m in zip(data['id_house'], data['date'])
]

In [103]:
def catboost_train(train, target, split_list, param,
                   iterations=1000, early_stopping_rounds=200, verbose=300):
    """Fit one CatBoost model per split, early-stopped on that split's validation rows.

    Parameters
    ----------
    train : array indexable by the index sets in ``split_list``
        Feature matrix.
    target : array indexable by the same index sets
        Target values aligned with ``train``.
    split_list : iterable of (train_index, val_index, test_index)
        Row selections for each fold; ``test_index`` is not used here.
    param : dict
        Parameters forwarded to ``catboost.train``.
    iterations : int, default 1000
        Maximum number of boosting iterations.
    early_stopping_rounds : int, default 200
        Forwarded to ``catboost.train`` as ``early_stopping_rounds``.
    verbose : int, default 300
        Forwarded to ``catboost.train`` as ``verbose``.

    Returns
    -------
    list
        The trained models, in the order of ``split_list``.
    """
    bst_list = []

    for train_index, val_index, _ in split_list:
        tr = catboost.Pool(train[train_index], label=target[train_index])
        te = catboost.Pool(train[val_index], label=target[val_index])

        bst = catboost.train(
            tr, param, eval_set=te,
            iterations=iterations,
            early_stopping_rounds=early_stopping_rounds,
            verbose=verbose,
        )
        bst_list.append(bst)

        # Drop the pool references first, then collect; collecting before the
        # `del` cannot reclaim pools that are still bound to local names.
        del tr, te
        gc.collect()

    return bst_list

# CatBoost settings passed to catboost_train: trained on MAE, evaluated on MAPE.
params_cat = dict(
    loss_function='MAE',
    max_depth=6,
    eval_metric='MAPE',
    learning_rate=0.04,
    l2_leaf_reg=20,
    random_state=42,
    verbose=0,
)

def lgb_train(train, target, split_list, param,
              num_boost_round=1000, early_stopping_rounds=100, log_period=5000):
    """Fit one LightGBM model per split, early-stopped on that split's validation rows.

    Parameters
    ----------
    train : array indexable by the index sets in ``split_list``
        Feature matrix.
    target : array indexable by the same index sets
        Target values aligned with ``train``.
    split_list : iterable of (train_index, val_index, test_index)
        Row selections for each fold; ``test_index`` is not used here.
    param : dict
        Parameters forwarded to ``lgb.train``.
    num_boost_round : int, default 1000
        Maximum number of boosting rounds.
    early_stopping_rounds : int, default 100
        Argument of the ``lgb.early_stopping`` callback.
    log_period : int, default 5000
        Argument of the ``lgb.log_evaluation`` callback.

    Returns
    -------
    list
        The trained boosters, in the order of ``split_list``.
    """
    bst_list = []

    for train_index, val_index, _ in split_list:
        tr = lgb.Dataset(train[train_index], target[train_index])
        te = lgb.Dataset(train[val_index], target[val_index], reference=tr)

        bst = lgb.train(
            param, tr, num_boost_round=num_boost_round,
            valid_sets=[te],  # documented form is a list of Datasets
            callbacks=[
                lgb.early_stopping(early_stopping_rounds),
                lgb.log_evaluation(log_period),
            ],
        )
        bst_list.append(bst)

        # Drop the dataset references first, then collect; collecting before the
        # `del` cannot reclaim datasets that are still bound to local names.
        del tr, te
        gc.collect()

    return bst_list

# LightGBM settings passed to lgb_train: MAE objective, MAPE as the validation metric.
params_lgb = dict(
    objective='mae',
    verbosity=-1,
    boosting_type='gbdt',
    metric='mape',
    lambda_l1=5,
    learning_rate=0.01,
    num_leaves=64,
)

In [104]:
train_month = sorted(train['date'].unique())
test_month = sorted(sub['date'].unique())

# Validation MAE per (horizon, validation month) for each library, plus the
# largest pairwise feature correlation seen for each horizon.
cbstats = []
lgbstats = []
corrs = []


def _score_and_predict(bst_list, split_list, features, base_price, actual_price, label, stats):
    """Score each fold model on its validation rows and average the test forecasts.

    The models predict a price ratio; multiplying by ``base_price`` (the lagged
    price) turns it back into a price.

    Parameters
    ----------
    bst_list : list
        Trained models, one per entry of ``split_list``; each exposes ``predict``.
    split_list : list of (train_index, val_index, test_index)
    features : numpy.ndarray
        Feature matrix for all rows of the frame the indices refer to.
    base_price, actual_price : pandas.Series
        Lagged price and true price, indexed like the frame.
    label : str
        Model name used in the printed score line.
    stats : list
        Receives one validation MAE per fold (appended in place).

    Returns
    -------
    numpy.ndarray
        Test-row price forecasts averaged over the fold models.
    """
    test_preds = []
    for num_, (bst, (_, val_index, test_index)) in enumerate(zip(bst_list, split_list)):
        val_pred = bst.predict(features[val_index]) * base_price[val_index]
        score = (actual_price[val_index] - val_pred).abs().mean()
        print(f'VAL SCORE {label} MONTH {num_}:', score)
        stats.append(score)

        test_preds.append(bst.predict(features[test_index]) * base_price[test_index])
    return np.mean(test_preds, axis=0)


# One pair of model ensembles per forecast horizon (1, 2 and 3 months ahead).
for m_predict in [1, 2, 3]:
    shift_col = f'mean_price_shift_{m_predict}'

    # Price of the same house m_predict months earlier.
    data[shift_col] = data.groupby('id_house')['mean_price'].shift(m_predict)

    # The models learn the price ratio relative to that lagged price.
    data['new_target'] = data['mean_price'] / data[shift_col]

    # Split the calendar months into two clusters by their average ratio.
    # NOTE(review): `data` also contains the months to predict, whose mean_price was
    # forward-filled when the grid was built, so these per-month averages include
    # those filled rows (and the validation months) - confirm this is intended.
    monthstats = data.groupby(data['date'].dt.month)[['new_target']].mean()
    month_to_cluster = pd.Series(
        KMeans(n_clusters=2, random_state=42).fit_predict(monthstats),
        index=monthstats.index,
    )
    data['date_cluster'] = data['date'].dt.month.map(month_to_cluster)

    # Rolling std (window 4) of month-to-month price changes, lagged by the horizon.
    data['diff_rw_std'] = (
        data.groupby('id_house', group_keys=False)['mean_price']
        .apply(lambda x: x.diff().rolling(4, min_periods=1).std().shift(m_predict))
    )

    # Per-house mean of the ratio. It is not added to train_cols below, so the
    # models never see it.
    data['house_mean'] = data.groupby('id_house', group_keys=False)['new_target'].transform('mean')

    train_cols = ['date_cluster', 'diff_rw_std']
    feature_cols = [
        'build_year_median', 'vc_city_quadkey', 'number_total', 'new_target', 'mean_price',
        'num_builds_live', 'room_one', 'med_price_mean'
    ]

    # 3-month rolling mean of each raw column, lagged by the horizon so that a row
    # only uses values from at least m_predict months before it.
    for col in tqdm(feature_cols):
        data[f'{col}_shift_rm_{m_predict}'] = (
            data.groupby('id_house', group_keys=False)[col]
            .apply(lambda x: x.rolling(3, min_periods=1).mean().shift(m_predict))
        )
        train_cols.append(f'{col}_shift_rm_{m_predict}')

    split_list = []
    list_val_months = [-1, -2]
    # Position in train_month after which rows are used for training.
    # NOTE(review): presumably a warm-up for the lagged/rolling features - confirm.
    first_train_pos = 5

    # The rows to forecast are the same for every fold of this horizon.
    test_index = data[data['date'] == test_month[m_predict - 1]].index

    for val_month in list_val_months:
        # Train on everything strictly before the validation month.
        train_index = data[
            (data['date'] > train_month[first_train_pos])
            & (data['date'] <= train_month[val_month - 1])
        ].index
        val_index = data[data['date'] == train_month[val_month]].index
        split_list.append((train_index, val_index, test_index))

    # Checking correlations: largest absolute off-diagonal feature correlation.
    vals = data[train_cols].corr().abs().values
    max_corr = vals[~np.eye(vals.shape[0], dtype=bool)].max()
    print('CHECK CORR COLS:', max_corr < 0.95, 'MAX CORR:', max_corr)
    corrs.append(max_corr)

    # Build the model inputs once; `data` is not modified again until the
    # forecasts are written back at the end of the iteration.
    features = data[train_cols].values
    target = data['new_target'].values

    # Train CatBoost
    bst_list_catboost = catboost_train(features, target, split_list, params_cat)
    catboost_preds = _score_and_predict(
        bst_list_catboost, split_list, features,
        data[shift_col], data['mean_price'], 'CATBOOST', cbstats,
    )

    # Train LightGBM
    bst_list_lgb = lgb_train(features, target, split_list, params_lgb)
    lgb_preds = _score_and_predict(
        bst_list_lgb, split_list, features,
        data[shift_col], data['mean_price'], 'LIGHTGBM', lgbstats,
    )

    # Store predictions: equal-weight blend of the two libraries.
    cb_weight, lgb_weight = 0.5, 0.5
    data.loc[test_index, 'preds'] = (
        (catboost_preds * cb_weight + lgb_preds * lgb_weight) / (cb_weight + lgb_weight)
    )

  0%|          | 0/8 [00:00<?, ?it/s]

CHECK CORR COLS: True MAX CORR: 0.9260504972238871
0:	learn: 0.0170445	test: 0.0161071	best: 0.0161071 (0)	total: 8.02ms	remaining: 8.01s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.01598873209
bestIteration = 83

Shrink model to first 84 iterations.
0:	learn: 0.0170237	test: 0.0174779	best: 0.0174779 (0)	total: 9.67ms	remaining: 9.66s
300:	learn: 0.0168531	test: 0.0173554	best: 0.0173480 (241)	total: 1.7s	remaining: 3.96s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.01734796948
bestIteration = 241

Shrink model to first 242 iterations.
VAL SCORE CATBOOST MONTH 0: 2230.7901454635717
VAL SCORE CATBOOST MONTH 1: 2395.4546648447777
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[321]	valid_0's mape: 0.0159722
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[392]	valid_0's mape: 0.0173502
VAL SCORE LIGHTGBM MONTH 0: 2228.5927130331056
VAL SCORE

  0%|          | 0/8 [00:00<?, ?it/s]

CHECK CORR COLS: True MAX CORR: 0.925757773518557
0:	learn: 0.0272367	test: 0.0268349	best: 0.0268349 (0)	total: 6.83ms	remaining: 6.82s
300:	learn: 0.0267349	test: 0.0263432	best: 0.0263432 (300)	total: 1.69s	remaining: 3.93s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.02634215864
bestIteration = 308

Shrink model to first 309 iterations.
0:	learn: 0.0271601	test: 0.0288440	best: 0.0288440 (0)	total: 6.93ms	remaining: 6.93s
300:	learn: 0.0266462	test: 0.0282009	best: 0.0282009 (300)	total: 1.5s	remaining: 3.49s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.02819994548
bestIteration = 302

Shrink model to first 303 iterations.
VAL SCORE CATBOOST MONTH 0: 3597.12825741342
VAL SCORE CATBOOST MONTH 1: 3861.932857781824
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[371]	valid_0's mape: 0.0263161
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:


  0%|          | 0/8 [00:00<?, ?it/s]

CHECK CORR COLS: True MAX CORR: 0.9254555242956002
0:	learn: 0.0339889	test: 0.0353041	best: 0.0353041 (0)	total: 6.6ms	remaining: 6.59s
300:	learn: 0.0331163	test: 0.0341692	best: 0.0341680 (297)	total: 1.51s	remaining: 3.5s
600:	learn: 0.0329528	test: 0.0340891	best: 0.0340866 (587)	total: 3.06s	remaining: 2.03s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.03408475748
bestIteration = 670

Shrink model to first 671 iterations.
0:	learn: 0.0338516	test: 0.0369101	best: 0.0369101 (0)	total: 6.7ms	remaining: 6.69s
300:	learn: 0.0329509	test: 0.0355863	best: 0.0355667 (295)	total: 1.45s	remaining: 3.37s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.03556669686
bestIteration = 295

Shrink model to first 296 iterations.
VAL SCORE CATBOOST MONTH 0: 4629.086999275065
VAL SCORE CATBOOST MONTH 1: 4916.871360292706
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[978]	valid_0's mape: 0.03

In [105]:
# Scores were appended horizon by horizon, one entry per validation month, so
# reshaping to (-1, number of validation months) gives one row per horizon.
n_val_months = len(list_val_months)
cb_scores = np.array(cbstats)
lgb_scores = np.array(lgbstats)

print('Validation Report')
print()
print('CB Mean:    ', cb_scores.mean().round(2))
print('LGB Mean:   ', lgb_scores.mean().round(2))
print()
print('CB Mean by model:    ', cb_scores.reshape(-1, n_val_months).mean(axis=1).round(2))
print('LGB Mean by model:   ', lgb_scores.reshape(-1, n_val_months).mean(axis=1).round(2))
print()
print('Max col corr:', '{:.3f}'.format(max(corrs)))
# train_cols holds the feature names of the last horizon that was trained.
print(f'Columns ({len(train_cols)}):')
print(pd.Series(train_cols))
print()
print('CatBoost')
print(cb_scores.round(2).reshape(-1, n_val_months))
print()
print('LightGBM')
print(lgb_scores.round(2).reshape(-1, n_val_months))

Validation Report

CB Mean:     3605.21
LGB Mean:    3608.79

CB Mean by model:     [2313.12 3729.53 4772.98]
LGB Mean by model:    [2311.86 3727.3  4787.19]

Max col corr: 0.926
Columns (10):
0                    date_cluster
1                     diff_rw_std
2    build_year_median_shift_rm_3
3      vc_city_quadkey_shift_rm_3
4         number_total_shift_rm_3
5           new_target_shift_rm_3
6           mean_price_shift_rm_3
7      num_builds_live_shift_rm_3
8             room_one_shift_rm_3
9       med_price_mean_shift_rm_3
dtype: object

CatBoost
[[2230.79 2395.45]
 [3597.13 3861.93]
 [4629.09 4916.87]]

LightGBM
[[2228.59 2395.14]
 [3590.25 3864.35]
 [4647.81 4926.57]]


making submission

In [106]:
# Attach the forecasts to the submission rows. Dropping a 'preds' column left by
# a previous run keeps the cell re-runnable (otherwise the merge would produce
# preds_x / preds_y and the next line would fail).
sub = sub.drop(columns=['preds'], errors='ignore').merge(
    data[['date', 'id_house', 'preds']], on=['date', 'id_house'], how='left'
)
sub['target'] = sub['preds']

# Delete CatBoost's temporary training files only. CatBoost writes them to
# 'catboost_info' in the current directory by default; the previous
# `rm -rf /kaggle/working/` wiped the entire working directory instead.
shutil.rmtree('catboost_info', ignore_errors=True)

sub[['id', 'target']].to_csv('solution.csv', index=False)

# Sanity check: the minimum must not be the -1 placeholder and there must be no nulls.
sub['target'].min(), sub['target'].isnull().sum()

(12399.999799832483, 0)