In [874]:
import gc
import os
import sys
sys.path.append('..')
import numpy as np
import pandas as pd
from tqdm import tqdm
import catboost as cb
from datetime import datetime
from sklearn.metrics import f1_score
from utils import date_to_timestamp
from utils import seed_everything
from metrics_f1 import calc_f1_score

DATA_ROOT = r"./"
WEIGHTS_ROOT = "weights"
os.makedirs(WEIGHTS_ROOT, exist_ok=True)

SEED = 28

seed_everything(SEED)
date = str(datetime.now()).split('.')[0].replace(':','_').replace(' ','__').replace('-','_')

gt_path = 'prediction/target_predicton_true.csv'
pred_path = 'prediction/target_predicton.csv'

baseline_score = calc_f1_score(gt_path, pred_path)
print(f"{baseline_score = }")

baseline_score = 0.23121339736861674


In [875]:
numerical_features, categorical_features, text_features = [], [], []
months_val = [pd.to_datetime('2022-12-01')]

In [876]:
target1_df = pd.read_csv('target/y_train.csv').convert_dtypes()
target2_df = pd.read_csv('../train2/target/y_test.csv').convert_dtypes()

target_df = pd.concat([target1_df, target2_df])
target_df = target_df.sort_values(by=["wagnum", "month"])
target_df = target_df.drop_duplicates(ignore_index=True)
target_df.month = pd.to_datetime(target_df.month)

print(target_df["month"].value_counts())

del target1_df, target2_df
gc.collect()

month
2022-12-01    33977
2022-09-01    33976
2022-10-01    33976
2022-11-01    33976
2022-08-01    33975
2023-01-01    33973
2023-02-01    33708
Name: count, dtype: int64


1818

In [877]:
wag_prob1_df = pd.read_parquet('wagons_probeg_ownersip.parquet').convert_dtypes()
wag_prob2_df = pd.read_parquet('../train2/wagons_probeg_ownersip.parquet').convert_dtypes()

wag_prob_df = pd.concat([wag_prob1_df, wag_prob2_df])
wag_prob_df = wag_prob_df.sort_values(by=["wagnum", "repdate"])
wag_prob_df = wag_prob_df.drop_duplicates(ignore_index=True)
wag_prob_df = wag_prob_df.drop(columns=["month"])

#Preprocessing
wag_prob_df['ost_prob'].fillna(-1, inplace=True)

del wag_prob1_df, wag_prob2_df
gc.collect()

0

In [878]:
dislok1_df = pd.read_parquet('dislok_wagons.parquet').convert_dtypes()
dislok2_df = pd.read_parquet('../train2/dislok_wagons.parquet').convert_dtypes()

dislok_df = pd.concat([dislok1_df, dislok2_df])
dislok_df = dislok_df.sort_values(by=["wagnum", "plan_date"])
dislok_df = dislok_df.drop_duplicates(ignore_index=True)
print(dislok_df.columns.to_list())
#Preprocessing
dislok_df = dislok_df[['wagnum', 'plan_date', 'date_kap', 'date_dep', 'date_pl_rem', 'ost_prob', 'last_fr_id', 'distance', 'isload']]

dislok_df['date_kap'] = dislok_df.groupby('wagnum')['date_kap'].fillna(method='ffill')
dislok_df['date_dep'] = dislok_df.groupby('wagnum')['date_kap'].fillna(method='ffill')
dislok_df['date_kap'].fillna(pd.to_datetime('1970-01-01'), inplace=True)
dislok_df['date_dep'].fillna(pd.to_datetime('1970-01-01'), inplace=True)

dislok_df['ost_prob'].fillna(-1, inplace=True)
dislok_df = dislok_df.rename(columns={'ost_prob': 'ost_prob_dislok', 'last_fr_id': 'fr_id'})

dislok_df['rem_days'] = (dislok_df['date_pl_rem'] - dislok_df['plan_date']).dt.days
dislok_df['kap_days'] = (dislok_df['plan_date'] - dislok_df['date_kap']).dt.days
dislok_df['dep_days'] = (dislok_df['plan_date'] - dislok_df['date_dep']).dt.days
dislok_df.drop(['date_kap', 'date_dep', 'date_pl_rem'], axis=1, inplace=True)

del dislok1_df, dislok2_df
gc.collect()

['plan_date', 'wagnum', 'date_kap', 'date_dep', 'kod_vrab', 'date_pl_rem', 'id_road_disl', 'st_id_dest', 'id_road_dest', 'st_id_send', 'id_road_send', 'ost_prob', 'isload', 'fr_id', 'last_fr_id', 'distance']


51

In [879]:
wag_param1_df = pd.read_parquet('wag_params.parquet').convert_dtypes()
wag_param2_df = pd.read_parquet('../train2/wag_params.parquet').convert_dtypes()

wag_param_df = pd.concat([wag_param1_df, wag_param2_df])
wag_param_df = wag_param_df.sort_values(by=["wagnum"])
wag_param_df = wag_param_df.drop_duplicates(ignore_index=True)

#Prepocessing
wag_param_df.drop(['date_iskl', 'gruz', 'rod_id'], axis=1, inplace=True)
wag_param_df['tippogl'].fillna(wag_param_df['tippogl'].value_counts().index[0], inplace=True)

del wag_param1_df, wag_param2_df
gc.collect()

0

In [880]:
wag_param_df.isna().sum()

wagnum                0
model                 0
cnsi_gruz_capacity    0
cnsi_volumek          0
tara                  0
date_build            0
srok_sl               0
zavod_build           0
cnsi_probeg_dr        0
cnsi_probeg_kr        0
kuzov                 0
telega                0
tormoz                0
tipvozd               0
tippogl               0
norma_km              0
ownertype             0
dtype: int64

In [881]:
pr_rem1_df = pd.read_parquet('pr_rems.parquet').convert_dtypes()
pr_rem2_df = pd.read_parquet('../train2/pr_rems.parquet').convert_dtypes()

pr_rem_df = pd.concat([pr_rem1_df, pr_rem2_df])
pr_rem_df = pr_rem_df.sort_values(by=["wagnum", "rem_month"])
pr_rem_df = pr_rem_df.drop_duplicates(ignore_index=True)

#Preprocess
pr_rem_df = pr_rem_df[['wagnum', 'rem_month', 'st_id_rem', 'road_id_rem']]

del pr_rem1_df, pr_rem2_df
gc.collect()

0

In [882]:
tr_rem1_df = pd.read_parquet('tr_rems.parquet').convert_dtypes()
tr_rem2_df = pd.read_parquet('../train2/tr_rems.parquet').convert_dtypes()

tr_rem_df = pd.concat([tr_rem1_df, tr_rem2_df])
tr_rem_df = tr_rem_df.sort_values(by=["wagnum", "rem_month"])
tr_rem_df = tr_rem_df.drop_duplicates(ignore_index=True)

#Preprocessing
tr_rem_df = tr_rem_df[['wagnum', 'rem_month', 'gr_probeg']]
tr_rem_df.fillna(-1, inplace=True)
tr_rem_cols = tr_rem_df.columns.to_list()
del tr_rem1_df, tr_rem2_df
gc.collect()

0

In [883]:
kti_add = pd.read_csv('../preprocessed/kti_additional_feature.csv')
kti_add = kti_add.sort_values(by=['wagnum', 'operation_date_dttm'])
kti_add = kti_add.drop_duplicates(ignore_index=True)
kti_add["operation_date_dttm"] = kti_add["operation_date_dttm"].apply(lambda x: datetime.fromtimestamp(int(str(x))).replace(hour=0, minute=0, second=0, microsecond=0))
kti_numerical = kti_add.columns.to_list()[2:]

In [884]:
kti_izm1_df = pd.read_parquet('kti_izm.parquet').convert_dtypes()
kti_izm2_df = pd.read_parquet('../train2/kti_izm.parquet').convert_dtypes()

kti_izm_df = pd.concat([kti_izm1_df, kti_izm2_df])
kti_izm_df = kti_izm_df.sort_values(by=["wagnum", "operation_date_dttm"])
kti_izm_df = kti_izm_df.drop_duplicates(ignore_index=True)
kti_izm_df["operation_date_dttm"] = kti_izm_df["operation_date_dttm"].apply(lambda x: datetime.fromtimestamp(int(str(x)[:-9])).replace(hour=0, minute=0, second=0, microsecond=0))

del kti_izm1_df, kti_izm2_df
gc.collect()

0

In [885]:
freight_info1_df = pd.read_parquet('freight_info.parquet').convert_dtypes()
freight_info2_df = pd.read_parquet('../train2/freight_info.parquet').convert_dtypes()

freight_info_df = pd.concat([freight_info1_df, freight_info2_df])
freight_info_df = freight_info_df.sort_values(by=["fr_id"])
freight_info_df = freight_info_df.drop_duplicates(ignore_index=True)

del freight_info1_df, freight_info2_df
gc.collect()

0

In [886]:
dislok_df = pd.merge(dislok_df, freight_info_df, how='left', on='fr_id')
dislok_df.fillna(-1, inplace=True)

In [887]:
dislok_df.columns.to_list()

['wagnum',
 'plan_date',
 'ost_prob_dislok',
 'fr_id',
 'distance',
 'isload',
 'rem_days',
 'kap_days',
 'dep_days',
 'fr_class',
 'skoroport',
 'naval',
 'nasip',
 'naliv',
 'openvagons',
 'soprovod',
 'smerz']

In [888]:
stations1_df = pd.read_parquet('stations.parquet').convert_dtypes()
stations2_df = pd.read_parquet('../train2/stations.parquet').convert_dtypes()

stations_df = pd.concat([stations1_df, stations2_df])

stations_df = stations_df.sort_values(by=["st_id"])
stations_df = stations_df.drop_duplicates(ignore_index=True)
stations_df = stations_df.rename(columns={'st_id': 'st_id_rem', 'road_id':'road_id_rem'})

stations_df_cols = stations_df.columns.to_list()
for col in stations_df_cols:
    stations_df[col] = stations_df[col].astype(int)
del stations1_df, stations2_df
gc.collect()

0

In [889]:
stations_df_cols

['st_id_rem',
 'road_id_rem',
 'st_border_sign',
 'st_sea_sign',
 'st_river_sign',
 'st_car_sign',
 'st_ferry_sign',
 'st_freigh_sign',
 'opor_station_sign']

In [890]:
pr_rem_df = pd.merge(pr_rem_df, stations_df, how='left', on=['st_id_rem','road_id_rem'])
pr_rem_df.fillna(-1, inplace=True)

In [891]:
pr_rem_df

Unnamed: 0,wagnum,rem_month,st_id_rem,road_id_rem,st_border_sign,st_sea_sign,st_river_sign,st_car_sign,st_ferry_sign,st_freigh_sign,opor_station_sign
0,12,2022-11-22,6408,13,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,19,2023-02-10,10150,19,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,35,2022-08-12,9546,18,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,37,2022-10-11,12016,23,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,77,2022-11-10,7117,12,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...
12027,33972,2022-11-28,8693,17,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12028,33973,2022-09-14,4420,7,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12029,33974,2022-11-25,5722,12,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12030,33975,2022-10-08,4789,13,0.0,0.0,0.0,0.0,0.0,1.0,0.0


wag_prob_ownership

In [892]:
df = pd.merge(target_df, wag_prob_df, how='left', on=["wagnum"])
df = df.drop(df[df["repdate"] > df["month"]].index)
df = df.sort_values(by=["wagnum", "month", "repdate"])
df = df.groupby(["wagnum", "month"]).last().reset_index()
df = df.fillna(-1)

df["ost_prob"] = df["ost_prob"].astype(int)
df["manage_type"] = df["manage_type"].astype(int)
df["rod_id"] = df["rod_id"].astype(int)
df["reestr_state"] = df["reestr_state"].astype(int)
numerical_features.extend(["ost_prob", "manage_type", "rod_id", "reestr_state"])

wag_param

In [893]:
df = pd.merge(df, wag_param_df, how='left', on='wagnum')

df['days_build'] = (df['month'] - df['date_build']).dt.days
df['days_srok'] = (df['srok_sl'] - df['month']).dt.days
df.drop(['srok_sl', 'date_build'], axis=1)

wag_param_numerical = ['cnsi_gruz_capacity', 'cnsi_volumek', 'tara',
       'days_build', 'days_srok', 'zavod_build', 'cnsi_probeg_dr',
       'cnsi_probeg_kr', 'kuzov', 'telega', 'tormoz', 'tipvozd', 'tippogl',
       'norma_km', 'ownertype']

for wag_param_numeric in wag_param_numerical:
    df[wag_param_numeric] = df[wag_param_numeric].astype(int)

df['model'] = df['model'].astype(str)

categorical_features.extend(['model'])
numerical_features.extend(wag_param_numerical)

dislok_wagons

In [894]:
df = pd.merge(df, dislok_df, how='left', on='wagnum')
df = df.drop(df[df["plan_date"] > df["month"]].index)
df = df.sort_values(by=["wagnum", "month", "plan_date"])
df = df.groupby(["wagnum", "month"]).last().reset_index()

df["rem_days"] = df["rem_days"].astype(int)
df["kap_days"] = df["kap_days"].astype(int)
df["dep_days"] = df["dep_days"].astype(int)
df["ost_prob_dislok"] = df["ost_prob_dislok"].astype(int)

gruz_cols = ['fr_class', 'skoroport', 'naval', 'nasip', 'naliv', 'openvagons', 'soprovod', 'smerz', 'distance', 'isload']

for gruz_col in gruz_cols:
    df[gruz_col] = df[gruz_col].astype(int)

numerical_features.extend(['rem_days', 'kap_days', 'dep_days', 'ost_prob_dislok'])
numerical_features.extend(gruz_cols)

tr_rems

In [895]:
temp_df = pd.merge(df[['wagnum', 'month']], tr_rem_df, how='left', on='wagnum')
temp_df = temp_df.drop(temp_df[temp_df["rem_month"] > temp_df["month"]].index)

count_rem = temp_df.groupby(['wagnum', 'month']).size().reset_index(name='count_rem')
mean_gr = temp_df.groupby(['wagnum', 'month'])['gr_probeg'].mean().reset_index(name='mean_gr_probeg')
df = pd.merge(df, count_rem, how='left', on=['wagnum', 'month'])
df = pd.merge(df, mean_gr, how='left', on=['wagnum', 'month'])
df['count_rem'].fillna(-1, inplace=True)
df['mean_gr_probeg'].fillna(-1, inplace=True)

df['count_rem']= df['count_rem'].astype(int)
df["mean_gr_probeg"] = df["mean_gr_probeg"].astype('double')

numerical_features.extend(['count_rem', 'mean_gr_probeg'])

In [896]:
print(len(df))

237559


pr_rems

In [897]:
tdf = pd.merge(df[['wagnum', 'month']], pr_rem_df, how='left', on='wagnum')
tdf = tdf.drop(tdf[tdf["rem_month"] > tdf["month"]].index)

tdf = tdf.sort_values(by=["wagnum", "month", "rem_month"])
tdf = tdf.groupby(["wagnum", "month"]).last().reset_index()
tdf.drop(['rem_month'], axis=1, inplace=True)

df = pd.merge(df, tdf, how='left', on=['wagnum', 'month'])

df.fillna(-1, inplace=True)
for col in stations_df_cols:
    df[col]= df[col].astype(int)

numerical_features.extend(stations_df_cols)

KTI

In [898]:
# temp_df = pd.merge(df[['wagnum','month']], kti_izm_df, how='left', on='wagnum')
# temp_df = temp_df.drop(temp_df[temp_df["operation_date_dttm"] > temp_df["month"]].index)
# temp_df = temp_df.sort_values(by=["wagnum", "month", "operation_date_dttm"])
# temp_df = temp_df.groupby(["wagnum", "month"]).last().reset_index()
# df = pd.merge(df, temp_df, how='left', on=['wagnum', 'month'])

# kti_numeric_features = kti_izm_df.columns.to_list()[2:]
# df.fillna(-1, inplace=True)

# df['mileage_all'] = df['mileage_all'].astype(int)

# for kti_numeric in kti_numeric_features[1:]:
#     df[kti_numeric] = df[kti_numeric].astype('double')

# numerical_features.extend(kti_numeric_features)

KTI(deprecated)

In [899]:
# df = pd.merge(df, kti_add, how='left', on='wagnum')
# #df = df.drop(df[df["operation_date_dttm"] > df["month"]].index)
# df.loc[df["operation_date_dttm"] > df["month"], kti_numerical] = -1
# df = df.sort_values(by=["wagnum", "month", "operation_date_dttm"])
# df = df.groupby(["wagnum", "month"]).last().reset_index()
# df.fillna(-1, inplace=True)

# df['mileage_all'] = df['mileage_all'].astype(int)

# for kti_numeric in kti_numerical[1:]:
#     df[kti_numerical] = df[kti_numerical].astype('double')

# numerical_features.extend(kti_numerical)


In [900]:
def fit_cb(df, params, months_val, target_name):

    x_train = df[~df["month"].isin(months_val)][numerical_features + categorical_features + text_features]
    x_val = df[df["month"].isin(months_val)][numerical_features + categorical_features + text_features]

    y_train = df[~df["month"].isin(months_val)][target_name]
    y_val = df[df["month"].isin(months_val)][target_name]
    print(f"Train class imbalance: {y_train.value_counts()}")
    print(f"Val class imbalance: {y_val.value_counts()}")

    train_pool = cb.Pool(
        data = x_train,
        label = y_train,
        cat_features = categorical_features,
        text_features = text_features
    )

    eval_pool = cb.Pool(
        data = x_val,
        label = y_val,
        cat_features = categorical_features,
        text_features = text_features
    )

    model = cb.CatBoostClassifier(**params)

    model.fit(
        train_pool,
        eval_set=eval_pool,
        verbose=True
    )
    print("best results (train on train):")
    print(model.get_best_score()["learn"])
    print("best results (on validation set):")
    print(model.get_best_score()["validation"])
    print(model.get_feature_importance(data=train_pool, prettified=True))

    return model

In [901]:
cb_params  = {
    'iterations': 2000,
    'loss_function': 'CrossEntropy',
    'custom_metric': ['AUC', 'Accuracy', 'F1'],
    'verbose': False,
    'random_seed': SEED,
    "task_type": "CPU",
    "has_time": True,
    "metric_period": 500,
    "save_snapshot": False,
    "use_best_model": True,
}

cb_month_model = fit_cb(df, cb_params, months_val=months_val, target_name="target_month")
cb_10days_model = fit_cb(df, cb_params, months_val=months_val, target_name="target_day")

Train class imbalance: target_month
0    193177
1     10405
Name: count, dtype: Int64
Val class imbalance: target_month
0    32393
1     1584
Name: count, dtype: Int64
0:	learn: 0.6253398	test: 0.6261832	best: 0.6261832 (0)	total: 48.9ms	remaining: 1m 37s
500:	learn: 0.0745315	test: 0.0672141	best: 0.0672141 (500)	total: 15.6s	remaining: 46.6s
1000:	learn: 0.0711597	test: 0.0671728	best: 0.0671728 (1000)	total: 30.4s	remaining: 30.3s
1500:	learn: 0.0684922	test: 0.0671772	best: 0.0671728 (1000)	total: 45.7s	remaining: 15.2s
1999:	learn: 0.0661805	test: 0.0676415	best: 0.0671728 (1000)	total: 1m 1s	remaining: 0us

bestTest = 0.06717279764
bestIteration = 1000

Shrink model to first 1001 iterations.
best results (train on train):
{'Accuracy': 0.9727333457771316, 'F1': 0.6983316124123689, 'CrossEntropy': 0.06618047603877666}
best results (on validation set):
{'Accuracy': 0.9719221826529711, 'F1': 0.6640845070422535, 'CrossEntropy': 0.06717279764174991, 'AUC': 0.9822645698799497}
         

In [902]:
x_val = df[df["month"].isin(months_val)][numerical_features + categorical_features + text_features]
print(len(x_val))
pred_month = cb_month_model.predict(x_val)
pred_10days = cb_10days_model.predict(x_val)

33977


In [903]:
val_df = pd.DataFrame({"wagnum": df[df["month"].isin(months_val)]["wagnum"]})
val_df["target_month"] = pred_month
val_df["target_day"] = pred_10days

In [904]:
def my_calc_f1_score(gt_path, pred_df):

    pred_labels = pred_df.sort_values(by=["wagnum"])

    true_labels = pd.read_csv(gt_path)
    true_labels = true_labels.sort_values(by=["wagnum"])

    # Таргет для месячного прогноза
    true_labels_month = true_labels['target_month'].values
    pred_labels_month = pred_labels['target_month'].values

    # Таргет для 10 дневного прогноза
    true_labels_day = true_labels['target_day'].values
    pred_labels_day = pred_labels['target_day'].values

    # Посчитаем метрику для месяца и 10 дней
    score_month = f1_score(true_labels_month, pred_labels_month)
    score_day = f1_score(true_labels_day, pred_labels_day)

    # Посчитаем метрику с весом для двух таргетов
    print(score_month)
    print(score_day)
    score = 0.5 * score_month + 0.5 * score_day
    return score

cv_score = my_calc_f1_score(gt_path, val_df)
print(f"{cv_score = }")

0.6536964980544748
0.4074074074074074
cv_score = 0.5305519527309411


In [905]:
test_df = pd.read_csv('../train/target/y_predict.csv').convert_dtypes()
test_df['month'] = pd.to_datetime(test_df['month'])

test_df_copy = test_df.copy()
print(len(test_df_copy))

test_df = pd.merge(test_df, wag_prob_df, how='left', on=["wagnum"])
test_df = test_df.drop(test_df[test_df["repdate"] > test_df["month"]].index)
test_df = test_df.groupby(["wagnum", "month"]).last().reset_index()
test_df = test_df.fillna(-1)
test_df = test_df_copy.merge(test_df)

test_df["ost_prob"] = test_df["ost_prob"].astype(int)
test_df["manage_type"] = test_df["manage_type"].astype(int)
test_df["rod_id"] = test_df["rod_id"].astype(int)
test_df["reestr_state"] = test_df["reestr_state"].astype(int)


33707


In [906]:
test_df = pd.merge(test_df, wag_param_df, how='left', on='wagnum')

test_df['days_build'] = (test_df['month'] - test_df['date_build']).dt.days
test_df['days_srok'] = (test_df['srok_sl'] - test_df['month']).dt.days

test_df.drop(['srok_sl', 'date_build'], axis=1)

wag_param_numerical = ['cnsi_gruz_capacity', 'cnsi_volumek', 'tara',
       'days_build', 'days_srok', 'zavod_build', 'cnsi_probeg_dr',
       'cnsi_probeg_kr', 'kuzov', 'telega', 'tormoz', 'tipvozd', 'tippogl',
       'norma_km', 'ownertype']

for wag_param_numeric in wag_param_numerical:
    test_df[wag_param_numeric] = test_df[wag_param_numeric].astype(int)

test_df['model'] = test_df['model'].astype(str)

In [907]:
test_df = pd.merge(test_df, dislok_df, how='left', on='wagnum')
test_df = test_df.drop(test_df[test_df["plan_date"] > test_df["month"]].index)
test_df = test_df.sort_values(by=["wagnum", "month", "plan_date"])
test_df = test_df.groupby(["wagnum", "month"]).last().reset_index()
test_df = test_df_copy.merge(test_df)

print(len(test_df))
test_df["rem_days"] = test_df["rem_days"].astype(int)
test_df["kap_days"] = test_df["kap_days"].astype(int)
test_df["dep_days"] = test_df["dep_days"].astype(int)
test_df["ost_prob_dislok"] = test_df["ost_prob_dislok"].astype(int)

gruz_cols = ['fr_class', 'skoroport', 'naval', 'nasip', 'naliv', 'openvagons', 'soprovod', 'smerz', 'distance', 'isload']

for gruz_col in gruz_cols:
    test_df[gruz_col] = test_df[gruz_col].astype(int)

33707


In [908]:
temp_df = pd.merge(test_df[['wagnum', 'month']], tr_rem_df, how='left', on='wagnum')
temp_df = temp_df.drop(temp_df[temp_df["rem_month"] > temp_df["month"]].index)

count_rem = temp_df.groupby(['wagnum', 'month']).size().reset_index(name='count_rem')
mean_gr = temp_df.groupby(['wagnum', 'month'])['gr_probeg'].mean().reset_index(name='mean_gr_probeg')

test_df = pd.merge(test_df, count_rem, how='left', on=['wagnum', 'month'])
test_df = pd.merge(test_df, mean_gr, how='left', on=['wagnum', 'month'])
test_df['count_rem'].fillna(-1, inplace=True)
test_df['mean_gr_probeg'].fillna(-1, inplace=True)

test_df['count_rem']= test_df['count_rem'].astype(int)
test_df["mean_gr_probeg"] = test_df["mean_gr_probeg"].astype('double')
print(len(test_df))
test_df = test_df_copy.merge(test_df)

33707


In [909]:
tdf = pd.merge(test_df[['wagnum', 'month']], pr_rem_df, how='left', on='wagnum')
tdf = tdf.drop(tdf[tdf["rem_month"] > tdf["month"]].index)

tdf = tdf.sort_values(by=["wagnum", "month", "rem_month"])
tdf = tdf.groupby(["wagnum", "month"]).last().reset_index()
tdf.drop(['rem_month'], axis=1, inplace=True)

test_df = pd.merge(test_df, tdf, how='left', on=['wagnum', 'month'])

test_df.fillna(-1, inplace=True)
for col in stations_df_cols:
    test_df[col]= test_df[col].astype(int)

test_df = test_df_copy.merge(test_df)

In [910]:
print(len(test_df))

33707


In [911]:
# test_df = pd.merge(test_df, kti_add, how='left', on='wagnum')
# test_df.loc[test_df["operation_date_dttm"] > test_df["month"], kti_numerical] = -1
# test_df = test_df.sort_values(by=["wagnum", "month", "operation_date_dttm"])
# test_df = test_df.groupby(["wagnum", "month"]).last().reset_index()
# test_df.fillna(-1, inplace=True)
# test_df = test_df_copy.merge(test_df)

# test_df['mileage_all'] = test_df['mileage_all'].astype(int)

# for kti_numeric in kti_numerical[1:]:
#     test_df[kti_numerical] = test_df[kti_numerical].astype('double')


In [912]:
x_test = test_df[numerical_features + categorical_features + text_features]

In [913]:
pred_month = cb_month_model.predict(x_test)
pred_10days = cb_10days_model.predict(x_test)

In [914]:
submit_df = test_df[["wagnum", "month"]]
submit_df["target_month"] = pred_month
submit_df["target_day"] = pred_10days

print(submit_df["target_month"].value_counts())
print(submit_df["target_day"].value_counts())

submit_df.to_csv("submission.csv", index=False)

target_month
0    32303
1     1404
Name: count, dtype: int64
target_day
0    33398
1      309
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit_df["target_month"] = pred_month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit_df["target_day"] = pred_10days
