In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import tqdm
import optuna
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_parquet("train_data.pqt")
test_df = pd.read_parquet("test_data.pqt")

In [3]:
# Отобранные фичи
features = ['start_cluster', 'okved', 'city', 'channel_code', 'balance_amt_min', 'segment', 'index_city_code',
             'ogrn_days_end_quarter', 'sum_of_paym_1y', 'min_founderpres', 'ogrn_exist_months', 'balance_amt_max',
             'ogrn_month', 'ogrn_days_end_month', 'max_founderpres', 'ogrn_year', 'balance_amt_avg', 'ft_registration_date',
             'sum_deb_e_oper_3m', 'cnt_days_deb_e_oper_3m', 'sum_of_paym_2m', 'sum_of_paym_6m', 'sum_cred_h_oper_3m',
             'cnt_days_cred_e_oper_3m', 'balance_amt_day_avg', 'sum_deb_h_oper_3m', 'cnt_cred_e_oper_1m', 'cnt_cred_e_oper_3m',
             'sum_deb_f_oper_3m', 'sum_deb_h_oper_1m', 'cnt_days_cred_h_oper_3m', 'cnt_c_oper_1m', 'sum_deb_g_oper_3m',
             'cnt_deb_e_oper_3m', 'cnt_b_oper_1m', 'sum_deb_e_oper_1m', 'sum_deb_d_oper_3m', 'cnt_days_cred_e_oper_1m',
             'cnt_deb_e_oper_1m', 'cnt_deb_h_oper_3m', 'cnt_a_oper_3m', 'cnt_deb_g_oper_3m', 'sum_cred_h_oper_1m',
             'cnt_days_deb_h_oper_3m', 'sum_a_oper_3m', 'cnt_deb_d_oper_3m', 'city_type', 'cnt_deb_f_oper_3m',
             'cnt_c_oper_3m', 'cnt_days_cred_g_oper_3m', 'end_cluster']

train_df = train_df[features]
test_df = test_df[features[:-1]]

### Добавление столбца с предыдущим start_cluster для каждого месяца

In [4]:
prev_cluster = train_df["start_cluster"].shift()
prev_cluster.iloc[0::3] = "None"
prev_cluster_test = test_df["start_cluster"].shift()

train_df["prev_cluster"] = prev_cluster
test_df["prev_cluster"] = prev_cluster_test.astype("str")

train_df.head()

Unnamed: 0,start_cluster,okved,city,channel_code,balance_amt_min,segment,index_city_code,ogrn_days_end_quarter,sum_of_paym_1y,min_founderpres,...,sum_cred_h_oper_1m,cnt_days_deb_h_oper_3m,sum_a_oper_3m,cnt_deb_d_oper_3m,city_type,cnt_deb_f_oper_3m,cnt_c_oper_3m,cnt_days_cred_g_oper_3m,end_cluster,prev_cluster
0,"{α, γ}",okved_30,city_23,channel_code_5,1.287207,segment_1,index_city_code_39,-0.135063,0.51149,2.93256,...,1.17102,0.774354,-0.207082,0.870124,city_type_0,0.286074,0.960017,0.568681,{other},
1,"{α, γ}",okved_30,city_23,channel_code_5,2.458609,segment_1,index_city_code_39,-0.135063,0.486425,2.952725,...,0.41041,0.696576,-0.207082,0.870983,city_type_0,0.286081,0.960017,0.499716,{other},"{α, γ}"
2,"{α, γ}",okved_30,city_23,channel_code_5,0.430042,segment_1,index_city_code_39,-0.135063,0.480547,2.97094,...,0.552757,0.663243,-0.207082,0.870983,city_type_0,0.286081,0.960017,0.442244,{other},"{α, γ}"
3,{other},okved_5,city_14,channel_code_2,-0.11404,segment_1,,1.258747,0.052041,,...,-0.171047,0.785465,-0.207082,0.878708,city_type_0,0.286074,0.960017,0.407762,{other},
4,{other},okved_5,city_14,channel_code_2,-0.119302,segment_1,,1.258747,0.033554,,...,-0.130732,0.696576,-0.207082,0.879566,city_type_0,0.286081,0.960017,0.43075,{other},{other}


### Модель

In [5]:
cat_cols = list(np.array(train_df.select_dtypes(exclude="float64").columns))

if "end_cluster" in cat_cols:
    cat_cols.remove("end_cluster")
    
train_df[cat_cols] = train_df[cat_cols].fillna('NA')
train_df[cat_cols] = train_df[cat_cols].astype("category")
    
X = train_df.copy()

X_train, X_val = train_test_split(X, random_state=42, shuffle=True, test_size=0.25, stratify=X['end_cluster'])
y_train = pd.Series(X_train['end_cluster'].values)
new_X_train = X_train.drop(['end_cluster'], axis=1)
y_val = pd.Series(X_val['end_cluster'].values)
new_X_val = X_val.drop(['end_cluster'], axis=1)

y = X['end_cluster']
X = X.drop('end_cluster',axis=1)

In [10]:
CatBoostModel = CatBoostClassifier(
        objective="MultiClass",
        iterations=1250,
        learning_rate=0.05,
        depth=6,
        cat_features=cat_cols,
        task_type="GPU",
        random_state=42,
            )

In [11]:
CatBoostModel.fit(
        X, y,
        plot=False,
        verbose=1000
                )

0:	learn: 2.4341098	total: 230ms	remaining: 4m 47s
1000:	learn: 0.7852500	total: 1m 4s	remaining: 16.1s
1249:	learn: 0.7753461	total: 1m 20s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1bcf6da74f0>

###  Метрика

In [12]:
cluster_weights = pd.read_excel("cluster_weights.xlsx").set_index("cluster")
weights_dict = cluster_weights["unnorm_weight"].to_dict()
sorted_classes = ['{other}' ,'{}', '{α, β}', '{α, γ}' ,'{α, δ}' ,'{α, ε, η}', '{α, ε, θ}',
 '{α, ε, ψ}', '{α, ε}' ,'{α, η}' ,'{α, θ}' ,'{α, λ}' ,'{α, μ}', '{α, π}',
 '{α, ψ}','{α}' ,'{λ}']

sorted_weights = [weights_dict[class_] for class_ in sorted_classes]
sorted_weights_dict = dict(zip(sorted_classes, sorted_weights))

In [13]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    unnorm_weights = np.array([weights_dict[label] for label in labels])
    weights = unnorm_weights / unnorm_weights.sum()
    classes_roc_auc = roc_auc_score(y_true, y_pred, labels=labels,
                                    multi_class="ovr", average=None)
    return sum(weights * classes_roc_auc)

y_pred_proba = CatBoostModel.predict_proba(new_X_val)
weighted_roc_auc(y_val, y_pred_proba, CatBoostModel.classes_, sorted_weights_dict)

0.946572324963462

### Прогноз на тестовой выборке

In [14]:
test_df = pd.read_parquet("test_data.pqt")

test_df["prev_cluster"] = prev_cluster_test.astype("str")

last_m_test_df = test_df[test_df["date"] == "month_6"]
last_m_test_df = last_m_test_df.drop(["id", "date"], axis=1)

In [15]:
#Загрузка предсказанного стартового кластера для шестого месяца
month_6 = pd.read_csv('start_cluster_6_predict.csv')
month_6

Unnamed: 0.1,Unnamed: 0,0
0,0,{α}
1,1,{α}
2,2,{other}
3,3,{α}
4,4,{α}
...,...,...
99995,99995,{α}
99996,99996,{α}
99997,99997,{α}
99998,99998,{α}


In [16]:
last_m_test_df["start_cluster"] = list(month_6.iloc[:,1])
last_m_test_df[cat_cols] = last_m_test_df[cat_cols].astype("object")
last_m_test_df[cat_cols] = last_m_test_df[cat_cols].fillna('NA')
last_m_test_df[cat_cols] = last_m_test_df[cat_cols].astype("category")
last_m_test_df = last_m_test_df[new_X_train.columns]

test_pred_proba = CatBoostModel.predict_proba(last_m_test_df)
test_pred_proba_df = pd.DataFrame(test_pred_proba, columns=CatBoostModel.classes_)
sorted_classes = sorted(test_pred_proba_df.columns.to_list())
test_pred_proba_df = test_pred_proba_df[sorted_classes]

In [17]:
sample_submission_df = pd.read_csv("sample_submission.csv")

In [18]:
test_pred_proba_df.shape

(100000, 17)

In [19]:
sample_submission_df[sorted_classes] = test_pred_proba_df
sample_submission_df.to_csv("submission_new.csv", index=False)