In [157]:
import gc
import os
import sys
sys.path.append('..')
import numpy as np
import pandas as pd
from tqdm import tqdm
import catboost as cb
from datetime import datetime
from sklearn.metrics import f1_score
from utils import date_to_timestamp
from utils import seed_everything
from metrics_f1 import calc_f1_score

DATA_ROOT = r"./"
WEIGHTS_ROOT = "weights"
os.makedirs(WEIGHTS_ROOT, exist_ok=True)

SEED = 28

seed_everything(SEED)
date = str(datetime.now()).split('.')[0].replace(':','_').replace(' ','__').replace('-','_')

gt_path = 'prediction/target_predicton_true.csv'
pred_path = 'prediction/target_predicton.csv'

baseline_score = calc_f1_score(gt_path, pred_path)
print(f"{baseline_score = }")

baseline_score = 0.23121339736861674


In [158]:
numerical_features, categorical_features, text_features = [], [], []
months_val = [pd.to_datetime('2022-12-01')]

In [159]:
target1_df = pd.read_csv('target/y_train.csv').convert_dtypes()
target2_df = pd.read_csv('../train2/target/y_test.csv').convert_dtypes()

target_df = pd.concat([target1_df, target2_df])
target_df = target_df.sort_values(by=["wagnum", "month"])
target_df = target_df.drop_duplicates(ignore_index=True)
target_df.month = pd.to_datetime(target_df.month)

print(target_df["month"].value_counts())

del target1_df, target2_df
gc.collect()

month
2022-12-01    33977
2022-09-01    33976
2022-10-01    33976
2022-11-01    33976
2022-08-01    33975
2023-01-01    33973
2023-02-01    33708
Name: count, dtype: int64


0

In [160]:
wag_prob1_df = pd.read_parquet('wagons_probeg_ownersip.parquet').convert_dtypes()
wag_prob2_df = pd.read_parquet('../train2/wagons_probeg_ownersip.parquet').convert_dtypes()

wag_prob_df = pd.concat([wag_prob1_df, wag_prob2_df])
wag_prob_df = wag_prob_df.sort_values(by=["wagnum", "repdate"])
wag_prob_df = wag_prob_df.drop_duplicates(ignore_index=True)
wag_prob_df = wag_prob_df.drop(columns=["month"])

del wag_prob1_df, wag_prob2_df
gc.collect()

0

In [161]:
dislok1_df = pd.read_parquet('dislok_wagons.parquet').convert_dtypes()
dislok2_df = pd.read_parquet('../train2/dislok_wagons.parquet').convert_dtypes()

dislok_df = pd.concat([dislok1_df, dislok2_df])
dislok_df = dislok_df.sort_values(by=["wagnum", "plan_date"])
dislok_df = dislok_df.drop_duplicates(ignore_index=True)

del dislok1_df, dislok2_df
gc.collect()

0

In [162]:
wag_param1_df = pd.read_parquet('wag_params.parquet').convert_dtypes()
wag_param2_df = pd.read_parquet('../train2/wag_params.parquet').convert_dtypes()

wag_param_df = pd.concat([wag_param1_df, wag_param2_df])
wag_param_df = wag_param_df.sort_values(by=["wagnum"])
wag_param_df = wag_param_df.drop_duplicates(ignore_index=True)

#Prepocessing
wag_param_df.drop(['date_iskl', 'gruz', 'rod_id'], axis=1, inplace=True)
wag_param_df['date_build'] = wag_param_df['date_build'].apply(date_to_timestamp)
wag_param_df['srok_sl'] = wag_param_df['srok_sl'].apply(date_to_timestamp)
wag_param_df['tippogl'].fillna(-1, inplace=True)

wag_param_numerical = ['cnsi_gruz_capacity', 'cnsi_volumek', 'tara',
       'date_build', 'srok_sl', 'zavod_build', 'cnsi_probeg_dr',
       'cnsi_probeg_kr', 'kuzov', 'telega', 'tormoz', 'tipvozd', 'tippogl',
       'norma_km', 'ownertype']

del wag_param1_df, wag_param2_df
gc.collect()

0

In [163]:
pr_rem1_df = pd.read_parquet('pr_rems.parquet').convert_dtypes()
pr_rem2_df = pd.read_parquet('../train2/pr_rems.parquet').convert_dtypes()

pr_rem_df = pd.concat([pr_rem1_df, pr_rem2_df])
pr_rem_df = pr_rem_df.sort_values(by=["wagnum", "rem_month"])
pr_rem_df = pr_rem_df.drop_duplicates(ignore_index=True)

del pr_rem1_df, pr_rem2_df
gc.collect()

0

In [164]:
tr_rem1_df = pd.read_parquet('tr_rems.parquet').convert_dtypes()
tr_rem2_df = pd.read_parquet('../train2/tr_rems.parquet').convert_dtypes()

tr_rem_df = pd.concat([tr_rem1_df, tr_rem2_df])
tr_rem_df = tr_rem_df.sort_values(by=["wagnum", "rem_month"])
tr_rem_df = tr_rem_df.drop_duplicates(ignore_index=True)

del tr_rem1_df, tr_rem2_df
gc.collect()

0

In [165]:
kti_izm1_df = pd.read_parquet('kti_izm.parquet').convert_dtypes()
kti_izm2_df = pd.read_parquet('../train2/kti_izm.parquet').convert_dtypes()

kti_izm_df = pd.concat([kti_izm1_df, kti_izm2_df])
kti_izm_df = kti_izm_df.sort_values(by=["wagnum", "operation_date_dttm"])
kti_izm_df = kti_izm_df.drop_duplicates(ignore_index=True)
kti_izm_df["operation_date_dttm"] = kti_izm_df["operation_date_dttm"].apply(lambda x: datetime.fromtimestamp(int(str(x)[:-9])).replace(hour=0, minute=0, second=0, microsecond=0))

del kti_izm1_df, kti_izm2_df
gc.collect()

0

In [166]:
freight_info1_df = pd.read_parquet('freight_info.parquet').convert_dtypes()
freight_info2_df = pd.read_parquet('../train2/freight_info.parquet').convert_dtypes()

freight_info_df = pd.concat([freight_info1_df, freight_info2_df])
freight_info_df = freight_info_df.sort_values(by=["fr_id"])
freight_info_df = freight_info_df.drop_duplicates(ignore_index=True)

del freight_info1_df, freight_info2_df
gc.collect()

0

In [167]:
stations1_df = pd.read_parquet('stations.parquet').convert_dtypes()
stations2_df = pd.read_parquet('../train2/stations.parquet').convert_dtypes()

stations_df = pd.concat([stations1_df, stations2_df])

stations_df = stations_df.sort_values(by=["st_id"])
stations_df = stations_df.drop_duplicates(ignore_index=True)

del stations1_df, stations2_df
gc.collect()

0

In [168]:
df = pd.merge(target_df, wag_prob_df, how='left', on=["wagnum"])
df = df.drop(df[df["repdate"] > df["month"]].index)
df = df.sort_values(by=["wagnum", "month", "repdate"])
df = df.groupby(["wagnum", "month"]).last().reset_index()
df = df.fillna(-1)

df["ost_prob"] = df["ost_prob"].astype(int)
df["manage_type"] = df["manage_type"].astype(int)
df["rod_id"] = df["rod_id"].astype(int)
df["reestr_state"] = df["reestr_state"].astype(int)
numerical_features.extend(["ost_prob", "manage_type", "rod_id", "reestr_state"])

In [169]:
df = pd.merge(df, wag_param_df, how='left', on='wagnum')
for wag_param_numeric in wag_param_numerical:
    df[wag_param_numeric] = df[wag_param_numeric].astype(int)
df['model'] = df['model'].astype(str)

categorical_features.extend(['model'])
numerical_features.extend(wag_param_numerical)

In [171]:
def fit_cb(df, params, months_val, target_name):

    x_train = df[~df["month"].isin(months_val)][numerical_features + categorical_features + text_features]
    x_val = df[df["month"].isin(months_val)][numerical_features + categorical_features + text_features]

    y_train = df[~df["month"].isin(months_val)][target_name]
    y_val = df[df["month"].isin(months_val)][target_name]
    print(f"Train class imbalance: {y_train.value_counts()}")
    print(f"Val class imbalance: {y_val.value_counts()}")

    train_pool = cb.Pool(
        data = x_train,
        label = y_train,
        cat_features = categorical_features,
        text_features = text_features
    )

    eval_pool = cb.Pool(
        data = x_val,
        label = y_val,
        cat_features = categorical_features,
        text_features = text_features
    )

    model = cb.CatBoostClassifier(**params)

    model.fit(
        train_pool,
        eval_set=eval_pool,
        verbose=True
    )
    print("best results (train on train):")
    print(model.get_best_score()["learn"])
    print("best results (on validation set):")
    print(model.get_best_score()["validation"])
    print(model.get_feature_importance(data=train_pool, prettified=True))

    return model

In [172]:
cb_params  = {
    'iterations': 2000,
    'loss_function': 'CrossEntropy',
    'custom_metric': ['AUC', 'Accuracy', 'F1'],
    'verbose': False,
    'random_seed': SEED,
    "task_type": "CPU",
    "has_time": True,
    "metric_period": 500,
    "save_snapshot": False,
    "use_best_model": True,
}

cb_month_model = fit_cb(df, cb_params, months_val=months_val, target_name="target_month")
cb_10days_model = fit_cb(df, cb_params, months_val=months_val, target_name="target_day")

Train class imbalance: target_month
0    193178
1     10405
Name: count, dtype: Int64
Val class imbalance: target_month
0    32393
1     1584
Name: count, dtype: Int64
0:	learn: 0.6386586	test: 0.6382201	best: 0.6382201 (0)	total: 92.2ms	remaining: 3m 4s
500:	learn: 0.0959358	test: 0.0901419	best: 0.0901419 (500)	total: 13.6s	remaining: 40.7s
1000:	learn: 0.0936543	test: 0.0904819	best: 0.0901419 (500)	total: 26.5s	remaining: 26.5s
1500:	learn: 0.0917088	test: 0.0909119	best: 0.0901419 (500)	total: 39.3s	remaining: 13.1s
1999:	learn: 0.0900745	test: 0.0913453	best: 0.0901419 (500)	total: 52.2s	remaining: 0us

bestTest = 0.0901418504
bestIteration = 500

Shrink model to first 501 iterations.
best results (train on train):
{'Accuracy': 0.9635676849245762, 'F1': 0.5541328524195973, 'CrossEntropy': 0.09007451711412541}
best results (on validation set):
{'Accuracy': 0.9632398387144244, 'F1': 0.5299390462531374, 'CrossEntropy': 0.09014185040461554, 'AUC': 0.9640948915107298}
            Feat

In [173]:
x_val = df[df["month"].isin(months_val)][numerical_features + categorical_features + text_features]

pred_month = cb_month_model.predict(x_val)
pred_10days = cb_10days_model.predict(x_val)

In [174]:
val_df = pd.DataFrame({"wagnum": df[df["month"].isin(months_val)]["wagnum"]})
val_df["target_month"] = pred_month
val_df["target_day"] = pred_10days

In [175]:
def my_calc_f1_score(gt_path, pred_df):

    pred_labels = pred_df.sort_values(by=["wagnum"])

    true_labels = pd.read_csv(gt_path)
    true_labels = true_labels.sort_values(by=["wagnum"])

    # Таргет для месячного прогноза
    true_labels_month = true_labels['target_month'].values
    pred_labels_month = pred_labels['target_month'].values

    # Таргет для 10 дневного прогноза
    true_labels_day = true_labels['target_day'].values
    pred_labels_day = pred_labels['target_day'].values

    # Посчитаем метрику для месяца и 10 дней
    score_month = f1_score(true_labels_month, pred_labels_month)
    score_day = f1_score(true_labels_day, pred_labels_day)

    # Посчитаем метрику с весом для двух таргетов
    score = 0.5 * score_month + 0.5 * score_day
    return score

cv_score = my_calc_f1_score(gt_path, val_df)
print(f"{cv_score = }")

cv_score = 0.3416038592268879


In [177]:
test_df = pd.read_csv('../train/target/y_predict.csv').convert_dtypes()
test_df_copy = test_df.copy()

test_df = pd.merge(test_df, wag_prob_df, how='left', on=["wagnum"])
test_df = test_df.drop(test_df[test_df["repdate"] > test_df["month"]].index)
test_df = test_df.groupby(["wagnum", "month"]).last().reset_index()
test_df = test_df.fillna(-1)
test_df = test_df_copy.merge(test_df)

test_df["ost_prob"] = test_df["ost_prob"].astype(int)
test_df["manage_type"] = test_df["manage_type"].astype(int)
test_df["rod_id"] = test_df["rod_id"].astype(int)
test_df["reestr_state"] = test_df["reestr_state"].astype(int)


In [178]:
test_df = pd.merge(test_df, wag_param_df, how='left', on='wagnum')

for wag_param_numeric in wag_param_numerical:
    test_df[wag_param_numeric] = test_df[wag_param_numeric].astype(int)

test_df['model'] = test_df['model'].astype(str)

In [179]:
x_test = test_df[numerical_features + categorical_features + text_features]

x_test

Unnamed: 0,ost_prob,manage_type,rod_id,reestr_state,cnsi_gruz_capacity,cnsi_volumek,tara,date_build,srok_sl,zavod_build,cnsi_probeg_dr,cnsi_probeg_kr,kuzov,telega,tormoz,tipvozd,tippogl,norma_km,ownertype,model
0,140585,0,1,1,690,88,245,1297123200,1993248000,6,110,160,2,2,2,3,11,160000,0,12-1303-01
1,144191,0,1,1,690,88,245,1271808000,1933718400,6,110,160,2,2,2,3,11,160000,0,12-1303-01
2,150649,0,1,1,690,88,245,1277251200,1958256000,6,110,160,2,2,2,3,11,160000,0,12-1303-01
3,14715,0,1,1,690,88,245,1348704000,1985385600,6,110,160,2,2,2,3,11,160000,0,12-1303-01
4,139618,0,1,1,690,88,245,1321747200,2017267200,6,110,160,2,2,2,6,11,160000,0,12-1303-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33702,71590,0,1,1,695,88,240,1286841600,1983830400,0,110,160,2,9,2,7,11,160000,0,12-132
33703,136540,0,1,1,695,88,243,1324857600,2040508800,0,110,160,2,9,2,7,11,160000,0,12-132
33704,129690,0,1,1,695,88,241,1281398400,1923782400,0,110,160,2,9,2,7,11,160000,0,12-132
33705,124151,0,1,1,700,76,234,1111104000,1835740800,2,160,160,2,9,2,4,11,160000,0,12-783


In [180]:
pred_month = cb_month_model.predict(x_test)
pred_10days = cb_10days_model.predict(x_test)

In [181]:
submit_df = test_df[["wagnum", "month"]]
submit_df["target_month"] = pred_month
submit_df["target_day"] = pred_10days

print(submit_df["target_month"].value_counts())
print(submit_df["target_day"].value_counts())

submit_df.to_csv("zalupa.csv", index=False)

target_month
0    32797
1      910
Name: count, dtype: int64
target_day
0    33578
1      129
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit_df["target_month"] = pred_month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit_df["target_day"] = pred_10days
