In [10]:
import os

import numpy as np
import pandas as pd

# Allow up to 500 columns to be shown when a frame is displayed.
pd.set_option('display.max_columns', 500)

# Read the training table and the test table from the input directory.
train, test = (
    pd.read_csv('input/flat_train.csv'),
    pd.read_csv('input/flat_test.csv'),
)

In [11]:
# Show every training column together with its positional index.
# The already-loaded `train` frame is reused; it was read from
# 'input/flat_train.csv', so parsing the whole file again is unnecessary.
list(enumerate(train.columns))

[(0, 'id_sec'),
 (1, 'floor'),
 (2, 'spalen'),
 (3, 'stage_number'),
 (4, 'square'),
 (5, 'balcon'),
 (6, 'otdelka'),
 (7, 'plan0'),
 (8, 'bulk_id'),
 (9, 'section'),
 (10, 'date_settle'),
 (11, 'date_salestart'),
 (12, 'id_gk'),
 (13, 'id_flatwork'),
 (14, 'Класс объекта'),
 (15, 'Количество помещений'),
 (16, 'Огорожена территория'),
 (17, 'Площадь земельного участка'),
 (18, 'Входные группы'),
 (19, 'Детский сад'),
 (20, 'Школа'),
 (21, 'Поликлиника'),
 (22, 'ФОК'),
 (23, 'Спортивная площадка'),
 (24, 'Автомойка'),
 (25, 'Кладовые'),
 (26, 'Колясочные'),
 (27, 'Кондиционирование'),
 (28, 'Вентлияция'),
 (29, 'Лифт'),
 (30, 'Система мусоротведения'),
 (31, 'Видеонаблюдение'),
 (32, 'Подземная парковка'),
 (33, 'Двор без машин'),
 (34, 'Машиномест'),
 (35, 'Площадь пром. зоны в радиусе 500 м'),
 (36, 'Площадь зеленой зоны в радиусе 500 м'),
 (37, 'До Кремля'),
 (38, 'До ТТК(км)'),
 (39, 'До Садового(км)'),
 (40, 'До большой дороги на машине(км)'),
 (41, 'До удобной авторазвязки на маш

In [12]:
from sklearn.model_selection import KFold

# Number of cross-validation folds; later cells loop over range(K).
K = 3

# Shuffled K-fold splitter with a fixed seed so the fold files are reproducible.
kf = KFold(n_splits=K, random_state=1337, shuffle=True)

print(kf)

# Sort key applied to each fold before it is written to disk.
sort_columns = ['bulk_id', 'spalen', 'months_to_sale']

# NOTE(review): rows are assigned to folds individually. If input/flat.cd
# declares a group column for the QueryRMSE loss used later, rows of one group
# can end up in both the training and validation file of a fold -- confirm
# whether a group-aware splitter (e.g. GroupKFold) is wanted here.
for i, (train_index, valid_index) in enumerate(kf.split(train)):
    # KFold yields positional row numbers, so select by position (.iloc)
    # rather than by label (.loc); the two only coincide for a default
    # 0..n-1 index.
    train_fold = train.iloc[train_index]
    valid_fold = train.iloc[valid_index]

    train_fold.sort_values(sort_columns).to_csv(f"input/flat_train_{i}.csv", index=False)
    valid_fold.sort_values(sort_columns).to_csv(f"input/flat_valid_{i}.csv", index=False)

KFold(n_splits=3, random_state=1337, shuffle=True)


In [13]:
from catboost import CatBoostRegressor, Pool

# Column-description file shared by every catboost Pool in this notebook.
CD_FILE = 'input/flat.cd'
# CSV with the rows to score.
TEST_FILE = 'input/flat_test.csv'

# Pool over the test CSV: comma-separated, first line is a header.
test_pool = Pool(
    TEST_FILE,
    column_description=CD_FILE,
    delimiter=",",
    has_header=True,
)

In [14]:
# Train one QueryRMSE CatBoost regressor per fold and save it as models/flat_<fold>.

# Create the target directory before any training starts, so that saving a
# model cannot fail on a missing directory after a fold has finished fitting.
os.makedirs("models", exist_ok=True)

for i in range(K):

    # Fold files written by the K-fold split cell.
    TRAIN_FILE = f"input/flat_train_{i}.csv"
    VAL_FILE = f"input/flat_valid_{i}.csv"

    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE, has_header=True, delimiter=",")
    val_pool = Pool(VAL_FILE, column_description=CD_FILE, has_header=True, delimiter=",")

    model = CatBoostRegressor(iterations=1000,
                              learning_rate=0.05,
                              depth=10,
                              random_seed=42,
                              loss_function='QueryRMSE')

    # The validation pool is passed as eval_set; progress is logged every 10 iterations.
    model.fit(train_pool, eval_set=val_pool, verbose=10)
    model.save_model(f"models/flat_{i}")

0:	learn: 4.2876478	test: 4.2716643	best: 4.2716643 (0)	total: 520ms	remaining: 8m 39s
10:	learn: 4.1838073	test: 4.1718345	best: 4.1718345 (10)	total: 4.93s	remaining: 7m 23s
20:	learn: 4.1075501	test: 4.0977174	best: 4.0977174 (20)	total: 9.02s	remaining: 7m
30:	learn: 4.0372950	test: 4.0334687	best: 4.0334687 (30)	total: 14.8s	remaining: 7m 43s
40:	learn: 3.9938026	test: 3.9936884	best: 3.9936884 (40)	total: 19.5s	remaining: 7m 35s
50:	learn: 3.9443148	test: 3.9515003	best: 3.9515003 (50)	total: 23.9s	remaining: 7m 24s
60:	learn: 3.9043241	test: 3.9175214	best: 3.9175214 (60)	total: 28.9s	remaining: 7m 24s
70:	learn: 3.8781737	test: 3.8937953	best: 3.8937953 (70)	total: 33.1s	remaining: 7m 12s
80:	learn: 3.8426526	test: 3.8619437	best: 3.8619437 (80)	total: 37.3s	remaining: 7m 3s
90:	learn: 3.8297093	test: 3.8497012	best: 3.8497012 (90)	total: 41.1s	remaining: 6m 50s
100:	learn: 3.8038757	test: 3.8267868	best: 3.8267868 (100)	total: 46.1s	remaining: 6m 50s
110:	learn: 3.7675525	test

In [15]:
# Collect out-of-fold predictions for the training rows (`oof`) and add one
# prediction column per fold model to `test` (rank_0 .. rank_{K-1}).

# Start from NaN rather than None so the `rank` column is float instead of
# object dtype once the predictions are filled in.
oof = pd.DataFrame({'rank': np.nan}, index=train.id_flatwork)

for i in range(K):

    VAL_FILE = f"input/flat_valid_{i}.csv"

    val_pool = Pool(VAL_FILE, column_description=CD_FILE, has_header=True, delimiter=",")

    model = CatBoostRegressor().load_model(f"models/flat_{i}")

    pred = model.predict(val_pool)
    # The ids are read from the same file the pool was built from.
    # NOTE(review): assumes predict() returns one value per file row, in file
    # order, and that id_flatwork is unique in `train` -- confirm.
    val = pd.read_csv(VAL_FILE, usecols=['id_flatwork'], index_col='id_flatwork')
    oof.loc[val.index, 'rank'] = pred

    # Score the shared test pool with this fold's model.
    pred = model.predict(test_pool)
    test[f"rank_{i}"] = pred


In [16]:
# Attach the out-of-fold predictions to the training table, average the
# per-fold test predictions, and write both tables under output/.
train_predict = train.set_index('id_flatwork')

train_predict.loc[oof.index, 'rank'] = oof['rank']

# Average exactly the K per-fold prediction columns. Matching on the 'rank_'
# prefix would also pick up any unrelated column that happens to start with it.
fold_columns = [f"rank_{i}" for i in range(K)]
test['rank'] = test[fold_columns].mean(axis=1)

# Make sure the destination directory exists before writing.
os.makedirs('output', exist_ok=True)

train_predict.to_csv('output/flat_train_predict.csv')
# NOTE(review): written with the default index=True, so the row index of
# `test` is emitted as an extra leading column; left unchanged in case
# downstream readers rely on that layout.
test.to_csv('output/flat_test_predict.csv')