In [1]:
import polars as pl
import numpy as np
from tqdm.notebook import tqdm
import catboost
import lightgbm as lgb
import xgboost as xgb
import gc
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [2]:
# Read the training history and the submission template from the local data folder.
train, sub = (
    pl.read_csv('data/train.csv'),
    pl.read_csv('data/sample_submission.csv'),
)

In [3]:
# Order the history by house first, then by date within each house.
train = train.sort("id_house", "date")

In [4]:
# The submission id is split on "_": part 0 is parsed as a %Y-%m-%d date,
# part 1 is cast to the integer house id. Build the split expression once.
id_parts = pl.col("id").str.split("_")

sub = sub.with_columns(
    id_parts.list.get(0).str.strptime(pl.Date, format="%Y-%m-%d").alias("date"),
    id_parts.list.get(1).cast(pl.Int64).alias("id_house"),
)

# Parse the training dates with the same format so both frames share a Date dtype.
train = train.with_columns(
    pl.col("date").str.strptime(pl.Date, format="%Y-%m-%d")
)

In [5]:
# Distinct dates that have to be predicted.
sub.select("date").unique()

date
date
2022-06-01
2022-07-01
2022-08-01


In [6]:
# Calendar span of the grid: earliest training date .. latest submission date.
start_date = train["date"].min()
end_date = sub["date"].max()

# Step through the span one calendar month at a time.
dates_list = []
month_cursor = start_date
while not month_cursor > end_date:
    dates_list.append(month_cursor)
    month_cursor = month_cursor + relativedelta(months=1)

# Despite the name, this frame holds the distinct house ids present in the submission.
unique_cities = sub.select("id_house").unique()
dates_df = pl.DataFrame({"date": dates_list})

# Full house x month grid, with the known training rows attached where they exist.
data = (
    unique_cities
    .join(dates_df, how="cross")
    .join(train, on=["id_house", "date"], how="left")
)

data_other_columns = [name for name in data.columns if name not in ("id_house", "date")]

# Per house, fill gaps from the next known value first and then from the previous one.
# Each expression only reads its own column, so they can all go into one call.
data = data.with_columns(
    [
        pl.col(name)
        .fill_null(strategy="backward")
        .fill_null(strategy="forward")
        .over("id_house")
        for name in data_other_columns
    ]
)

# -1 marks rows that have not received a prediction yet.
data = data.with_columns(preds=pl.lit(-1.))

In [7]:
# One row per house; its coordinates feed the nearest-neighbour search.
# maintain_order keeps the row -> house mapping stable between runs.
unique_houses_df = data.unique(subset="id_house", maintain_order=True)
coords_data = unique_houses_df.select(["lat", "lng"])
unique_ids = unique_houses_df["id_house"].to_list()
# Row position in the coordinate matrix -> house id.
dict_ind_house = dict(enumerate(unique_ids))

coords_np = coords_data.to_numpy()
# Never request more neighbours than there are houses (kneighbors would raise).
nn = NearestNeighbors(n_neighbors=min(10, len(unique_ids)))
nn.fit(coords_np)

# (house, month) -> med_price. Null values are skipped so np.mean below
# never receives None.
grouped = data.group_by(["id_house", "date"]).agg(
    pl.col("med_price").mean().alias("med_price_mean")
)
dict_vals = {
    (house, month): value
    for house, month, value in grouped.iter_rows()
    if value is not None
}

list_month = data["date"].unique().to_list()

# (house, month) -> mean med_price over the house's nearest neighbours.
dict_mean = {}
neighbors_arr = nn.kneighbors(coords_np, return_distance=False)
for row_idx, neighbors in enumerate(neighbors_arr):
    # Key the result by the query row itself. When several houses share the
    # same coordinates, the first returned neighbour is not guaranteed to be
    # the query house, so `neighbors[0]` could write under the wrong id.
    house_id = dict_ind_house[row_idx]
    neighbor_ids = [dict_ind_house[idx] for idx in neighbors]
    for month in list_month:
        vals = [
            dict_vals[(neighbor_id, month)]
            for neighbor_id in neighbor_ids
            if (neighbor_id, month) in dict_vals
        ]
        if vals:
            dict_mean[(house_id, month)] = float(np.mean(vals))

# Explicit return_dtype avoids polars' dtype inference (and its warning) and
# keeps the column Float64 even if the first looked-up values are missing.
data = data.with_columns(
    pl.struct(["id_house", "date"]).map_elements(
        lambda row: dict_mean.get((row["id_house"], row["date"])),
        return_dtype=pl.Float64,
    ).alias("med_price_mean")
)

  data = data.with_columns(


In [None]:
def catboost_train(train, target, split_list, param):
    """Fit one CatBoost model per split.

    Parameters
    ----------
    train : indexable feature container
        Indexed with each split's train/validation indices
        (presumably a 2-D numpy array; verify against the caller).
    target : indexable label container
        Indexed with the same indices as ``train``.
    split_list : iterable of (train_index, val_index, test_index)
        Only the first two entries of each tuple are used here.
    param : dict
        CatBoost parameters passed through to ``catboost.train``.

    Returns
    -------
    list
        The fitted models, one per split, in split order.
    """
    bst_list = []

    for train_index, val_index, _ in split_list:
        tr = catboost.Pool(train[train_index], label=target[train_index])
        te = catboost.Pool(train[val_index], label=target[val_index])

        bst = catboost.train(
            tr, param, eval_set=te,
            iterations=1000, early_stopping_rounds=200, verbose=300
        )
        bst_list.append(bst)

        # Drop the references first; collecting while they are still bound
        # cannot release the pools.
        del tr, te
        gc.collect()

    return bst_list

# CatBoost settings: MAE objective, MAPE reported as the evaluation metric.
params_cat = dict(
    loss_function='MAE',
    max_depth=6,
    eval_metric='MAPE',
    learning_rate=0.04,
    l2_leaf_reg=20,
    random_state=42,
    verbose=0,
)

def lgb_train(train, target, split_list, param):
    """Fit one LightGBM model per split.

    Parameters
    ----------
    train : indexable feature container
        Indexed with each split's train/validation indices
        (presumably a 2-D numpy array; verify against the caller).
    target : indexable label container
        Indexed with the same indices as ``train``.
    split_list : iterable of (train_index, val_index, test_index)
        Only the first two entries of each tuple are used here.
    param : dict
        LightGBM parameters passed through to ``lgb.train``.

    Returns
    -------
    list
        The fitted boosters, one per split, in split order.
    """
    bst_list = []

    for train_index, val_index, _ in split_list:
        tr = lgb.Dataset(train[train_index], target[train_index])
        # The validation set references the training set.
        te = lgb.Dataset(train[val_index], target[val_index], reference=tr)

        bst = lgb.train(
            param, tr, num_boost_round=1000, valid_sets=[te],
            callbacks=[lgb.early_stopping(100), lgb.log_evaluation(5000)]
        )
        bst_list.append(bst)

        # Drop the references first; collecting while they are still bound
        # cannot release the datasets.
        del tr, te
        gc.collect()

    return bst_list

# LightGBM settings: MAE objective, MAPE reported as the validation metric.
params_lgb = dict(
    objective='mae',
    verbosity=-1,
    boosting_type='gbdt',
    metric='mape',
    lambda_l1=5,
    learning_rate=0.01,
    num_leaves=64,
)

def xgb_train(train, target, split_list, param):
    """Fit one XGBoost model per split.

    Parameters
    ----------
    train : indexable feature container
        Indexed with each split's train/validation indices
        (presumably a 2-D numpy array; verify against the caller).
    target : indexable label container
        Indexed with the same indices as ``train``.
    split_list : iterable of (train_index, val_index, test_index)
        Only the first two entries of each tuple are used here.
    param : dict
        XGBoost parameters passed through to ``xgb.train``.

    Returns
    -------
    list
        The fitted boosters, one per split, in split order.
    """
    bst_list = []

    for train_index, val_index, _ in split_list:
        dtrain = xgb.DMatrix(train[train_index], label=target[train_index])
        dval = xgb.DMatrix(train[val_index], label=target[val_index])
        # The validation set is listed last in the watchlist.
        watchlist = [(dtrain, 'train'), (dval, 'eval')]

        bst = xgb.train(
            param, dtrain, num_boost_round=1000, evals=watchlist,
            early_stopping_rounds=100, verbose_eval=300
        )
        bst_list.append(bst)

        # Drop the references first; collecting while they are still bound
        # cannot release the matrices.
        del dtrain, dval, watchlist
        gc.collect()

    return bst_list

# XGBoost settings: squared-error objective, MAE reported as the evaluation metric.
params_xgb = dict(
    objective='reg:squarederror',
    eval_metric='mae',
    eta=0.01,
    max_depth=6,
    **{'lambda': 16},  # `lambda` is a Python keyword, so it cannot be a keyword argument
    seed=42,
)

In [9]:
data = data.sort(["id_house", "date"])

# Base columns turned into lagged 3-row rolling means for every horizon.
feature_cols = [
    'build_year_median', 'vc_city_quadkey', 'number_total', 'new_target', 'mean_price',
    'num_builds_live', 'room_one', 'med_price_mean'
]

# Blend weights of the final prediction (normalised by their sum when applied).
ensemble_weights = {'catboost': 0.4995, 'lightgbm': 0.4995, 'xgboost': 0.001}

# Validation scores, appended fold by fold for each horizon.
cbstats = []
lgbstats = []
xgbstats = []
# Largest absolute pairwise feature correlation, one entry per horizon.
corrs = []

# `train` and `sub` are not modified inside the loop, so the month calendars
# are derived once instead of once per horizon.
train_pd = train.to_pandas()
sub_pd = sub.to_pandas()
train_month = sorted(train_pd['date'].unique())
test_month = sorted(sub_pd['date'].unique())

# Validation months, as offsets from the end of the training calendar.
list_val_months = [-1, -2]


def _validate_and_forecast(models, predict_fn, label, stats, features, frame, split_list, shift_col):
    """Score each fold's model on its validation month and average the test forecasts.

    The models predict a price ratio; multiplying by ``frame[shift_col]`` turns
    it back into a price. The printed/appended score is the mean absolute
    error against ``frame['mean_price']``.

    Parameters
    ----------
    models : list
        One fitted model per entry of ``split_list``.
    predict_fn : callable(model, rows) -> array
        Adapter that runs the model on a slice of ``features``.
    label : str
        Model name used in the printed score line.
    stats : list
        Receives one validation score per fold (mutated in place).
    features : array
        Feature matrix aligned with ``frame``'s rows.
    frame : pandas.DataFrame
        Must contain ``'mean_price'`` and ``shift_col``.
    split_list : list of (train_index, val_index, test_index)
    shift_col : str
        Name of the lagged-price column used to rescale ratios to prices.

    Returns
    -------
    numpy.ndarray
        Test-row price forecasts averaged over the folds.
    """
    base_price = frame[shift_col]
    fold_forecasts = []
    for fold, (model, (_, val_index, test_index)) in enumerate(zip(models, split_list)):
        val_pred = predict_fn(model, features[val_index]) * base_price[val_index]
        score = (frame['mean_price'][val_index] - val_pred).abs().mean()
        print(f'VAL SCORE {label} MONTH {fold}:', score)
        stats.append(score)
        fold_forecasts.append(predict_fn(model, features[test_index]) * base_price[test_index])
    return np.mean(fold_forecasts, axis=0)


for m_predict in [1, 2, 3]:
    shift_col = f"mean_price_shift_{m_predict}"

    # Price m_predict rows earlier within the same house.
    data = data.with_columns(
        pl.col("mean_price").shift(m_predict).over("id_house").alias(shift_col)
    )

    # Target is the price ratio against the lagged price; month is used for clustering.
    data = data.with_columns(
        (pl.col("mean_price") / pl.col(shift_col)).alias("new_target"),
        pl.col("date").dt.month().alias("month"),
    )

    # group_by does not guarantee row order; sorting keeps the KMeans input,
    # and therefore the cluster labels, reproducible between runs.
    monthstats = (
        data.group_by("month")
        .agg(pl.col("new_target").mean().alias("new_target_mean"))
        .sort("month")
    )

    kmeans = KMeans(n_clusters=2, random_state=42)
    month_cluster = kmeans.fit_predict(
        monthstats.select("new_target_mean").to_numpy().reshape(-1, 1)
    )
    month_to_cluster = {
        month: int(cluster)
        for month, cluster in zip(monthstats["month"].to_list(), month_cluster)
    }

    # Native mapping instead of a per-row Python lambda (as the polars warning recommends).
    data = data.with_columns(
        pl.col("month")
        .replace_strict(month_to_cluster, return_dtype=pl.Int64)
        .alias("date_cluster")
    )

    # Rolling std of price differences, lagged by the horizon.
    # NOTE(review): `min_samples` is the current name of the former `min_periods`
    # argument; confirm the installed polars version supports it.
    data = data.with_columns(
        pl.col("mean_price")
          .diff()
          .rolling_std(window_size=4, min_samples=1)
          .shift(m_predict)
          .over("id_house")
          .alias("diff_rw_std")
    )

    train_cols = ['date_cluster', 'diff_rw_std']

    # Each expression only reads its base column, so all lagged features can
    # be added in a single call.
    lag_exprs = []
    for col in feature_cols:
        new_col = f"{col}_shift_rm_{m_predict}"
        lag_exprs.append(
            pl.col(col)
              .rolling_mean(window_size=3, min_samples=1)
              .shift(m_predict)
              .over("id_house")
              .alias(new_col)
        )
        train_cols.append(new_col)
    data = data.with_columns(lag_exprs)

    data_pd = data.to_pandas()
    features = data_pd[train_cols].values
    labels = data_pd['new_target'].values

    # Rows of the submission month that belongs to this horizon (same for every fold).
    test_index = data_pd.index[data_pd['date'] == test_month[m_predict - 1]]

    split_list = []
    for val_month in list_val_months:
        # Train on months after train_month[5] and strictly before the validation month.
        train_mask = (data_pd['date'] > train_month[5]) & (data_pd['date'] <= train_month[val_month - 1])
        val_mask = data_pd['date'] == train_month[val_month]
        split_list.append((data_pd.index[train_mask], data_pd.index[val_mask], test_index))

    # Report the largest off-diagonal absolute correlation between features.
    vals = data_pd[train_cols].corr().abs().values
    max_corr = vals[~np.eye(vals.shape[0], dtype=bool)].max()
    print('CHECK CORR COLS:', max_corr < 0.95, 'MAX CORR:', max_corr)
    corrs.append(max_corr)

    bst_list_catboost = catboost_train(features, labels, split_list, params_cat)
    catboost_preds = _validate_and_forecast(
        bst_list_catboost, lambda bst, rows: bst.predict(rows),
        'CATBOOST', cbstats, features, data_pd, split_list, shift_col,
    )

    bst_list_lgb = lgb_train(features, labels, split_list, params_lgb)
    lgb_preds = _validate_and_forecast(
        bst_list_lgb, lambda bst, rows: bst.predict(rows),
        'LIGHTGBM', lgbstats, features, data_pd, split_list, shift_col,
    )

    bst_list_xgb = xgb_train(features, labels, split_list, params_xgb)
    xgb_preds = _validate_and_forecast(
        bst_list_xgb, lambda bst, rows: bst.predict(xgb.DMatrix(rows)),
        'XGBOOST', xgbstats, features, data_pd, split_list, shift_col,
    )

    w_cb = ensemble_weights['catboost']
    w_lgb = ensemble_weights['lightgbm']
    w_xgb = ensemble_weights['xgboost']
    data_pd.loc[test_index, 'preds'] = (
        (catboost_preds * w_cb + lgb_preds * w_lgb + xgb_preds * w_xgb) / (w_cb + w_lgb + w_xgb)
    )

    data = pl.from_pandas(data_pd)

Expr.map_elements is significantly slower than the native expressions API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - pl.col("month").map_elements(lambda m: ...)
with this one instead:
  + pl.col("month").replace_strict(month_to_cluster)

  pl.col("month").map_elements(lambda m: month_to_cluster[m]).alias("date_cluster")
  data = data.with_columns(
  .rolling_std(window_size=4, min_periods=1)
  .rolling_mean(window_size=3, min_periods=1)


CHECK CORR COLS: True MAX CORR: 0.9258194925423877
0:	learn: 0.0170445	test: 0.0161071	best: 0.0161071 (0)	total: 90.6ms	remaining: 1m 30s
300:	learn: 0.0168737	test: 0.0159861	best: 0.0159859 (162)	total: 1.83s	remaining: 4.25s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.01598482247
bestIteration = 324

Shrink model to first 325 iterations.
0:	learn: 0.0170239	test: 0.0174780	best: 0.0174780 (0)	total: 7.92ms	remaining: 7.91s
300:	learn: 0.0168509	test: 0.0173541	best: 0.0173526 (291)	total: 1.63s	remaining: 3.77s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.017352632
bestIteration = 291

Shrink model to first 292 iterations.
VAL SCORE CATBOOST MONTH 0: 2230.8501571891734
VAL SCORE CATBOOST MONTH 1: 2396.6000770342803
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[369]	valid_0's mape: 0.015975
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration i

Expr.map_elements is significantly slower than the native expressions API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - pl.col("month").map_elements(lambda m: ...)
with this one instead:
  + pl.col("month").replace_strict(month_to_cluster)

  pl.col("month").map_elements(lambda m: month_to_cluster[m]).alias("date_cluster")
  data = data.with_columns(
  .rolling_std(window_size=4, min_periods=1)
  .rolling_mean(window_size=3, min_periods=1)


CHECK CORR COLS: True MAX CORR: 0.9255215609513816
0:	learn: 0.0272357	test: 0.0268333	best: 0.0268333 (0)	total: 7.12ms	remaining: 7.11s
300:	learn: 0.0267603	test: 0.0263728	best: 0.0263659 (287)	total: 1.79s	remaining: 4.15s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.02636586239
bestIteration = 287

Shrink model to first 288 iterations.
0:	learn: 0.0271595	test: 0.0288435	best: 0.0288435 (0)	total: 6.85ms	remaining: 6.84s
300:	learn: 0.0266711	test: 0.0282086	best: 0.0282086 (300)	total: 1.57s	remaining: 3.64s
600:	learn: 0.0266262	test: 0.0282088	best: 0.0282016 (555)	total: 3.08s	remaining: 2.04s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.0282015561
bestIteration = 555

Shrink model to first 556 iterations.
VAL SCORE CATBOOST MONTH 0: 3601.1675761829597
VAL SCORE CATBOOST MONTH 1: 3863.935429934298
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[293]	valid_0's mape: 0.0263041
Trai

Expr.map_elements is significantly slower than the native expressions API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - pl.col("month").map_elements(lambda m: ...)
with this one instead:
  + pl.col("month").replace_strict(month_to_cluster)

  pl.col("month").map_elements(lambda m: month_to_cluster[m]).alias("date_cluster")
  data = data.with_columns(
  .rolling_std(window_size=4, min_periods=1)
  .rolling_mean(window_size=3, min_periods=1)


CHECK CORR COLS: True MAX CORR: 0.9252123488043936
0:	learn: 0.0339889	test: 0.0353043	best: 0.0353043 (0)	total: 6.92ms	remaining: 6.91s
300:	learn: 0.0331041	test: 0.0341710	best: 0.0341710 (300)	total: 1.58s	remaining: 3.67s
600:	learn: 0.0329678	test: 0.0341235	best: 0.0341222 (589)	total: 3.2s	remaining: 2.12s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.03412034114
bestIteration = 637

Shrink model to first 638 iterations.
0:	learn: 0.0338511	test: 0.0369089	best: 0.0369089 (0)	total: 7.42ms	remaining: 7.41s
300:	learn: 0.0329922	test: 0.0356298	best: 0.0356147 (219)	total: 1.49s	remaining: 3.47s
600:	learn: 0.0328507	test: 0.0356595	best: 0.0355896 (418)	total: 2.94s	remaining: 1.95s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.03558957083
bestIteration = 418

Shrink model to first 419 iterations.
VAL SCORE CATBOOST MONTH 0: 4630.329157343291
VAL SCORE CATBOOST MONTH 1: 4979.110390750926
Training until validation scores don't impro

In [10]:
# One row per model: (overall label, per-horizon label, section title, fold scores).
# The scores are reshaped to two columns, i.e. two validation folds per horizon.
report_rows = (
    ('CB Mean:    ', 'CB Mean by model:    ', 'CatBoost', cbstats),
    ('LGB Mean:   ', 'LGB Mean by model:   ', 'LightGBM', lgbstats),
    ('XGB Mean:   ', 'XGB Mean by model:   ', 'XGBoost', xgbstats),
)

print('Validation Report')
print()
# Mean over every fold and horizon.
for overall_label, _, _, scores in report_rows:
    print(overall_label, np.array(scores).mean().round(2))
print()
# Mean over the folds of each horizon.
for _, per_horizon_label, _, scores in report_rows:
    print(per_horizon_label, np.array(scores).reshape(-1, 2).mean(axis=1).round(2))
print()
print('Max col corr:', '{:.3f}'.format(max(corrs)))
print(f'Columns: {len(train_cols)}')
# Raw score tables: one row per horizon, one column per fold.
for _, _, title, scores in report_rows:
    print()
    print(title)
    print(np.array(scores).round(2).reshape(-1, 2))

Validation Report

CB Mean:     3617.0
LGB Mean:    3610.94
XGB Mean:    3802.09

CB Mean by model:     [2313.73 3732.55 4804.72]
LGB Mean by model:    [2312.08 3726.34 4794.41]
XGB Mean by model:    [2420.13 3894.15 5091.99]

Max col corr: 0.926
Columns: 10

CatBoost
[[2230.85 2396.6 ]
 [3601.17 3863.94]
 [4630.33 4979.11]]

LightGBM
[[2228.39 2395.77]
 [3589.79 3862.9 ]
 [4649.41 4939.41]]

XGBoost
[[2325.26 2514.99]
 [3719.78 4068.52]
 [4949.59 5234.38]]


Making the submission

In [11]:
# Align both frames on a plain Date key before joining.
data = data.with_columns(pl.col("date").cast(pl.Date))

# Start from the submission columns without any previously joined predictions,
# so re-running this cell does not pile up `preds_right`-style columns.
base_cols = [name for name in sub.columns if name != "preds"]
sub = (
    sub.select(base_cols)
    .with_columns(pl.col("date").cast(pl.Date))
    .join(data.select(["date", "id_house", "preds"]), on=["date", "id_house"], how="left")
    .with_columns(pl.col("preds").alias("target"))
)

# Sanity checks run before anything is written to disk.
min_target = sub["target"].min()
null_count = sub["target"].null_count()
unfilled_count = int((sub["target"] == -1.0).sum())
print("min target:", min_target)
print("null count in target:", null_count)

if null_count > 0:
    raise ValueError(f"{null_count} submission rows have no matching prediction")
if unfilled_count > 0:
    # -1 is the placeholder written into `preds` before any model has run.
    raise ValueError(f"{unfilled_count} submission rows still hold the -1 placeholder")

# No shell cleanup is needed: write_csv replaces an existing submission.csv.
sub.select(["id", "target"]).write_csv("submission.csv")


min target: 12400.056688078163
null count in target: 0
