In [1]:
# Загрузка необходимых библиотек
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb


In [2]:
# Load the input tables from the working directory
train_df = pd.read_parquet("train_data.pqt")
test_df = pd.read_parquet("test_data.pqt")
# Per-cluster weights; the 'cluster' and 'unnorm_weight' columns are read later on
cluster_weights_df = pd.read_excel("cluster_weights.xlsx")
# Template whose column order the final submission follows
sample_submission_df = pd.read_csv("sample_submission.csv")

# Evaluation metric: class-weighted one-vs-rest ROC AUC
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    """Return the weighted sum of per-class one-vs-rest ROC AUC scores.

    Parameters
    ----------
    y_true : passed unchanged to ``roc_auc_score`` as the true labels.
    y_pred : passed unchanged to ``roc_auc_score`` as the scores
        (presumably one probability column per entry of ``labels`` - confirm
        against the caller).
    labels : sequence of class labels; defines both the class order given to
        ``roc_auc_score`` and the order in which weights are looked up.
    weights_dict : mapping from label to its un-normalised weight.

    Returns
    -------
    The per-class AUC values multiplied by the weights (normalised to sum
    to 1) and added together.

    Raises
    ------
    KeyError
        If a label is missing from ``weights_dict``.
    """
    raw_weights = np.array([weights_dict[name] for name in labels])
    normalised_weights = raw_weights / raw_weights.sum()
    per_class_auc = roc_auc_score(
        y_true,
        y_pred,
        labels=labels,
        multi_class="ovr",
        average=None,  # keep one score per class so each can be weighted
    )
    weighted_terms = normalised_weights * per_class_auc
    return sum(weighted_terms)

### Обработка датасетов

In [3]:
# Numeric month index parsed from the trailing digits of the `date` string.
# Taking only the last character would truncate two-digit months
# (e.g. "..._10" -> 0), so the whole trailing number is extracted instead.
# Rows whose date does not end in a digit still fail loudly at the int cast.
train_df['month_num'] = train_df['date'].str.extract(r'(\d+)$', expand=False).astype(int)
test_df['month_num'] = test_df['date'].str.extract(r'(\d+)$', expand=False).astype(int)

In [4]:
# Fill start_cluster for month 6 with the same client's month-5 start_cluster.
# The values are joined on `id` rather than on "row index + 1", so the result
# no longer depends on the physical row order or on the index of test_df.
m5_cluster = (
    test_df.loc[test_df['month_num'] == 5, ['id', 'start_cluster']]
    .drop_duplicates('id', keep='last')  # map() needs a unique lookup index
    .set_index('id')['start_cluster']
)

m6_ids = test_df[test_df['month_num'] == 6].index
# Clients without a month-5 row get NaN, as before
test_df.loc[m6_ids, 'start_cluster'] = test_df.loc[m6_ids, 'id'].map(m5_cluster)

In [5]:
# Cast start_cluster to one shared categorical dtype in both frames.
# The categories are the distinct start_cluster values found in train_df.
all_clusters = train_df['start_cluster'].unique()
start_cluster_dtype = pd.CategoricalDtype(categories=all_clusters)

for _frame in (train_df, test_df):
    _frame['start_cluster'] = _frame['start_cluster'].astype(start_cluster_dtype)

In [6]:
# Stack train and test into one frame for shared feature engineering.
# The is_train flag is used later to split them apart again.
train_df['is_train'] = 1
test_df['is_train'] = 0
full_df = (
    pd.concat([train_df, test_df], ignore_index=True)
    .sort_values(['id', 'month_num'])  # each client's months in chronological order
    .reset_index(drop=True)
)

### Инжиниринг переменных для обучения модели

In [7]:


# Columns treated as categorical features
cat_cols = [
    "channel_code",
    "city",
    "city_type",
    "index_city_code",
    "ogrn_month",
    "ogrn_year",
    "okved",
    "segment",
    "start_cluster",
]

# Give each column a categorical dtype built from its non-null values in the
# combined frame, so train and test rows share the same category set.
for col in cat_cols:
    observed_values = full_df[col].dropna().unique()
    full_df[col] = full_df[col].astype(pd.CategoricalDtype(categories=observed_values))

In [8]:
# Lag features: for each listed column add the client's previous-row value
# (<col>_lag1) and the change relative to it (<col>_diff1).
# Rows are already ordered by (id, month_num), so shift(1) within an id group
# yields the preceding month present for that client.
cols_for_lags = [
    'balance_amt_avg',
    'balance_amt_max',
    'balance_amt_min',
    'balance_amt_day_avg',
    'ft_registration_date',
    'max_founderpres',
    'min_founderpres',
    'ogrn_exist_months',
]

# NOTE(review): the missing lag of each client's first row is filled with 0, so
# that row's diff equals the raw value rather than "no change" - confirm intended.
for col in cols_for_lags:
    lag_col = f'{col}_lag1'
    diff_col = f'{col}_diff1'
    full_df[lag_col] = full_df.groupby('id')[col].shift(1).fillna(0)
    full_df[diff_col] = (full_df[col] - full_df[lag_col]).fillna(0)

In [9]:
# Categorical feature: the client's start_cluster in the previous row (month).
# Rows with no previous row get an explicit placeholder category instead of NaN.
placeholder = "MISSING_LAG"
prev_month_clusters = list(all_clusters) + [placeholder]
prev_start_cluster_dtype = pd.CategoricalDtype(categories=prev_month_clusters)

prev_cluster = full_df.groupby('id')['start_cluster'].shift(1)
full_df['prev_month_start_cluster'] = (
    prev_cluster.astype(prev_start_cluster_dtype).fillna(placeholder)
)

# Guarded so that re-running this cell does not register the column twice
if 'prev_month_start_cluster' not in cat_cols:
    cat_cols.append('prev_month_start_cluster')

In [10]:
# Feature engineering is done: split the combined frame back by the is_train flag
train_processed_df = full_df[full_df['is_train'] == 1].copy()
test_processed_df = full_df[full_df['is_train'] == 0].copy()

# Encode end_cluster as integer class codes.
# Codes follow the sorted order of the distinct end_cluster names and are derived
# from the data, so the cell keeps working if the number of clusters is not 17.
end_cluster_names = sorted(train_processed_df['end_cluster'].unique())
encoding_df = pd.DataFrame({'cluster': end_cluster_names,
                            'end_cluster_encoded': list(range(len(end_cluster_names)))})
# Same mapping with each cluster's un-normalised weight attached
encoding_df_w_weights = encoding_df.merge(cluster_weights_df[['cluster', 'unnorm_weight']], on='cluster')

# Attach the integer code to every training row
train_processed_df = pd.merge(train_processed_df, encoding_df.rename({'cluster': 'end_cluster'}, axis=1), on='end_cluster')

num_classes = len(encoding_df)

In [11]:
# Choose the model inputs: every column except identifiers, helper flags and the target
features_to_drop = ['id', 'date', 'month_num', 'is_train', 'end_cluster', 'end_cluster_encoded']  # not used for training

features = train_processed_df.columns.drop(features_to_drop, errors='ignore').tolist()
final_cat_features = list(cat_cols)  # independent copy of the categorical column names


### Обучение модели

In [12]:
# Assemble the training matrix, the target and the test matrix.
# Only month-6 test rows are scored.
X = train_processed_df[features]
y = train_processed_df['end_cluster_encoded']
is_month_6 = test_processed_df['month_num'] == 6
X_test = test_processed_df.loc[is_month_6, features].copy()

# Accumulators for out-of-fold and fold-averaged test class probabilities
oof_preds = np.zeros((len(X), num_classes))
test_preds = np.zeros((len(X_test), num_classes))

In [13]:
# Train a LightGBM ensemble: one model per stratified fold, test predictions averaged
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Reset the accumulators so re-running this cell does not add to earlier results
oof_preds = np.zeros((X.shape[0], num_classes))
test_preds = np.zeros((X_test.shape[0], num_classes))

# Hyper-parameters shared by every fold; only the seed changes between folds.
# NOTE(review): in the recorded run every fold reached the 1000-iteration cap
# without early stopping - a larger n_estimators may be worth trying.
lgb_params = dict(
    objective='multiclass', metric='multi_logloss', num_class=num_classes,
    n_estimators=1000, learning_rate=0.05, feature_fraction=0.8,
    bagging_fraction=0.8, bagging_freq=1, lambda_l1=0.1, lambda_l2=0.1,
    num_leaves=31, verbose=-1, n_jobs=-1,
)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold + 1}")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = lgb.LGBMClassifier(**lgb_params, seed=42 + fold, boosting_type='gbdt')

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(100, verbose=True)],
        categorical_feature=final_cat_features,
    )

    # Out-of-fold probabilities for the held-out rows
    # (NOTE(review): not scored in the visible cells)
    oof_preds[val_idx] = model.predict_proba(X_val)
    # Each fold contributes an equal share of the final test prediction
    test_preds += model.predict_proba(X_test) / skf.n_splits


Fold 1
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.646659

Fold 2
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.647406

Fold 3
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.654207

Fold 4
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.646248

Fold 5
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.645517


### Форматирование итогового файла

In [15]:
# Name the probability columns from the label encoding. Column k of
# predict_proba corresponds to class code k, and code k was assigned to the
# k-th end_cluster name in encoding_df. Using the start_cluster values here
# would mislabel columns whenever the two cluster sets differ.
class_columns = encoding_df.sort_values('end_cluster_encoded')['cluster'].tolist()
submission_df = pd.DataFrame(test_preds, columns=class_columns)
# Same month-6 row selection (and order) that produced X_test
submission_df['id'] = test_processed_df[test_processed_df['month_num'] == 6]['id'].values

# Follow the column order of the sample submission (its first column is replaced by 'id')
submission_cols_order = ['id'] + list(sample_submission_df.columns[1:])
submission_df = submission_df[submission_cols_order]

submission_df.to_csv("submission.csv", index=False)
print("Submission file succesfully created!")

Submission file succesfully created!


In [18]:
# Display the final submission table (one row per id, one probability column per cluster)
submission_df

Unnamed: 0,id,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,200000,0.005926,0.014308,0.024379,0.016736,0.000608,1.420817e-06,2.069692e-06,4.639868e-07,0.001485,0.002701,0.007095,2.637911e-06,0.000068,1.041634e-07,0.000031,0.926655,4.188624e-07
1,200001,0.002782,0.588337,0.000089,0.000698,0.000009,3.872562e-07,3.168853e-07,3.895498e-07,0.000384,0.005618,0.000150,3.832987e-08,0.000004,7.666370e-08,0.000024,0.401902,1.474097e-06
2,200002,0.863873,0.002064,0.000843,0.015447,0.001777,5.712840e-06,1.007807e-04,2.878167e-04,0.008594,0.012818,0.004731,3.595782e-05,0.000039,1.805630e-06,0.000873,0.088507,2.103447e-06
3,200003,0.036343,0.653162,0.000237,0.001018,0.000029,1.415472e-06,5.017812e-07,1.515462e-06,0.000304,0.012684,0.000602,2.423105e-07,0.000003,9.859078e-08,0.000001,0.295613,1.993572e-07
4,200004,0.073637,0.104724,0.000870,0.007909,0.000151,5.416449e-06,5.860343e-07,8.149348e-07,0.000711,0.320108,0.002384,5.845194e-06,0.000090,2.747593e-07,0.000012,0.489392,5.546757e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,299995,0.028486,0.318285,0.000249,0.014118,0.000076,1.692562e-06,8.327950e-07,2.023396e-08,0.002188,0.000462,0.000030,6.150587e-06,0.000013,4.379531e-06,0.000016,0.636056,7.258288e-06
99996,299996,0.024118,0.036529,0.004181,0.027432,0.000293,4.286767e-07,9.300151e-07,1.434047e-06,0.001446,0.000974,0.002355,5.979866e-04,0.000056,4.038551e-07,0.000237,0.901778,9.495929e-07
99997,299997,0.024204,0.005980,0.004503,0.054248,0.001142,4.195278e-06,8.762402e-07,1.016297e-07,0.002082,0.000992,0.000484,8.856725e-06,0.000017,3.082818e-07,0.002762,0.903573,5.585127e-07
99998,299998,0.026977,0.176127,0.018963,0.015800,0.001430,4.903867e-07,2.058513e-05,6.366114e-06,0.039472,0.003152,0.005219,1.962945e-04,0.000078,2.218844e-05,0.000249,0.712102,1.873724e-04
