In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler,  LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump

In [2]:
# Load the raw training table (comma is already pandas' default delimiter).
df = pd.read_csv('train.csv')

In [3]:
# Recode sentinel values in the feature columns only:
#   0 (and None) -> NaN, then -1 -> 0.
# The target column 'churn' is held out of the recoding rather than being
# round-tripped through a temporary value of 2: that round trip would also
# turn any genuine label of 2 or -1 into 0, and it only works if the
# statements are executed in exactly one order.
column_order = df.columns
churn_labels = df['churn']
features = (
    df.drop(columns='churn')
      .replace([0, None], np.nan)
      .replace([-1], 0)
)
# Re-attach the untouched target and restore the original column order.
df = features.assign(churn=churn_labels)[column_order]

# Integer-encode the 'region' column.
# NOTE(review): the fitted encoder `le` only lives in this kernel; unlike the
# scaler it is not written to disk in this notebook, so the same
# region -> integer mapping cannot be reproduced later unless it is persisted
# as well — confirm whether inference code needs it.
le = LabelEncoder()
df['region'] = le.fit_transform(df['region'])

In [4]:
# Discard the 'appl_mrkr' and 'evry_qrtr_pmnt' columns.
df = df.drop(columns=['appl_mrkr', 'evry_qrtr_pmnt'])

In [5]:
# Keep only rows that have at least 5 non-missing values, then remove rows
# that are exact duplicates of an earlier row.
df = df.dropna(thresh=5).drop_duplicates()

In [6]:
# Break each '-'-separated date string into three numeric columns named
# <source>_y, <source>_m and <source>_d, then discard the original strings.
for date_col in ('frst_pmnt_date', 'lst_pmnt_date_per_qrtr'):
    part_cols = [f'{date_col}_{suffix}' for suffix in ('y', 'm', 'd')]
    date_parts = df[date_col].str.split('-', expand=True)
    # Parts come back as strings; cast to float so missing parts become NaN.
    df[part_cols] = date_parts.astype(float)

df = df.drop(columns=['frst_pmnt_date', 'lst_pmnt_date_per_qrtr'])

In [7]:
# Hand-entered yearly indicators for 2000-2024 (25 values per series, five
# years per row below). The column names look like transliterated
# macro-economic indicators; units and sources are not recorded here.
data = {
    "year": range(2000, 2025),
    "vvp": [
        7.306, 8.944, 10.831, 13.208, 17.027,
        21.61, 26.92, 33.25, 41.28, 38.81,
        46.31, 60.11, 68.1, 72.99, 79.03,
        83.1, 86.1, 91.84, 103.86, 109.61,
        107.32, 109.6, 116.22, 171.041, 175,
    ],
    "infl": [
        20.8, 21.5, 15.8, 13.7, 10.9,
        12.7, 9.7, 9.0, 14.1, 11.6,
        6.8, 8.4, 5.0, 6.7, 7.8,
        15.5, 7.0, 3.7, 2.9, 4.5,
        3.4, 6.7, 11.9, 7.42, 4.5,
    ],
    "uroven_bezrab": [
        10.6, 8.9, 8.0, 8.2, 7.3,
        7.2, 7.0, 6.0, 6.2, 8.2,
        7.4, 6.5, 5.5, 5.5, 5.2,
        5.6, 5.5, 5.2, 4.8, 4.6,
        5.8, 5.6, 4.0, 3.0, 2.9,
    ],
    "stavka_refen": [
        30, 25, 21, 18, 14,
        12, 11, 10, 11, 12,
        8.5, 8, 8, 5, 7,
        13, 10, 9, 7, 7,
        4, 5, 15, 13, 16,
    ],
    "mrot": [
        132, 200, 450, 600, 600,
        750, 1500, 2300, 3000, 4300,
        4500, 4600, 5000, 5200, 5500,
        5900, 6200, 7500, 10000, 11200,
        12100, 12700, 14000, 16200, 19200,
    ],
    "vvp_na_dushu": [
        10511, 11346, 12126, 13333, 14727,
        16221, 18113, 20228, 21700, 20117,
        21271, 22783, 24278, 26044, 25730,
        24062, 24104, 25999, 27386, 28495,
        28181, 30875, 34750, 36000, 38000,
    ],
    # Disabled series, kept for reference. Note that "dolg" lists only 24
    # values, one short of the 25 years, so it cannot be re-enabled as is.
    # "vnesh_b": [50, 60, 70, 80, 90, 100, 110, 120, 110, 100, 110, 120, 130, 140, 130, 120, 110, 120, 130, 140, 130, 140, 150, 160, 170],
    # "dolg": [7.05, 5.09, 4.744, 7.349, 7.176, 6.376, 8.154, 8.535, 5.248, -7.821, 4.503, 5.066, 4.024, -1.755, -0.736, -1.973, 0.194, 1.826, 2.807, -2.198, -2.664, 4.749, -11.91, -11],
    "kurs": [
        28, 29, 31, 30, 28,
        28, 27, 25, 24, 31,
        30, 29, 31, 31, 38,
        60, 67, 58, 62, 64,
        72, 73, 68, 85, 91,
    ],
}

dop_df = pd.DataFrame(data)

# Attach the indicators of the matching year to every row; rows whose 'year'
# has no match in the table keep NaN in the indicator columns (left join).
df = df.merge(dop_df, on='year', how='left')

In [8]:
# Columns kept for modelling, in the order they are fed to the model.
# The target 'churn' is the last entry.
selected_columns = [
    "npo_accnts_nmbr",
    "pmnts_type",
    'clnt_cprtn_time_d',
    'lst_pmnt_rcnc_d',
    'oprtn_sum_per_year',
    'pmnts_nmbr',
    'pmnts_nmbr_per_year',
    'mgd_accum_period',
    'mgd_payment_period',
    'postal_code',
    'region',
    'fact_addrss',
    'frst_pmnt_date_y',
    'lst_pmnt_date_per_qrtr_y',
    'year',
    'vvp',
    'infl',
    'uroven_bezrab',
    'kurs',
    'stavka_refen',
    'mrot',
    'churn',
]
df = df[selected_columns]

# Every row is used for training. A year-based hold-out split is left
# commented out for reference:
# df_train = df.loc[(df['year'] <= 2021)]
# df_test = df.loc[(df['year'] > 2021)]
# df = df.drop('year', axis=1)
df_train = df.iloc[:, :]

In [9]:
# churn = df['churn']
# df = df.drop('churn', axis=1)
# df.insert(len(df.columns ), 'churn', churn)

In [10]:
# Separate the feature matrix from the target.
# The target is picked by name rather than by "last column" position, so the
# split stays correct even if the column list is reordered or extended.
X_train = df_train.drop(columns='churn')
Y_train = df_train['churn']
# X_test = df_test.drop(columns='churn')
# Y_test = df_test['churn']

In [11]:
# Learn per-feature scaling statistics from the training features, apply them,
# and persist the fitted scaler (compressed) as 'std_scaler.bin'.
# Note: X_train is rebound to the scaled array returned by the scaler.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
# X_test = sc_X.transform(X_test)
dump(sc_X, 'std_scaler.bin', compress=True)

['std_scaler.bin']

In [14]:
# Hyperparameters for the gradient-boosted classifier, gathered in one place.
catboost_params = dict(
    iterations=1000,
    depth=10,
    learning_rate=0.3,
    loss_function='Logloss',
    eval_metric='F1',
    l2_leaf_reg=9,
    nan_mode='Min',
    verbose=True,  # prints one progress line per boosting iteration
)

model = CatBoostClassifier(**catboost_params)
model.fit(X_train, Y_train)

Y_pred_train = model.predict(X_train)
# Y_pred = model.predict(X_test)

# F1 is computed on the same rows the model was fitted on, so this number
# describes the fit to the training data, not performance on unseen data.
print('train score: ', f1_score(Y_train, Y_pred_train))
# print('test score: ', f1_score(Y_test, Y_pred))

0:	learn: 0.4707968	total: 830ms	remaining: 13m 48s
1:	learn: 0.5117681	total: 1.69s	remaining: 14m 2s
2:	learn: 0.5261868	total: 2.44s	remaining: 13m 31s
3:	learn: 0.5247569	total: 3.31s	remaining: 13m 44s
4:	learn: 0.5393865	total: 4.13s	remaining: 13m 42s
5:	learn: 0.5484782	total: 4.9s	remaining: 13m 31s
6:	learn: 0.5599874	total: 5.65s	remaining: 13m 21s
7:	learn: 0.5611938	total: 6.42s	remaining: 13m 15s
8:	learn: 0.5640155	total: 7.06s	remaining: 12m 57s
9:	learn: 0.5694482	total: 7.7s	remaining: 12m 42s
10:	learn: 0.5729823	total: 8.3s	remaining: 12m 26s
11:	learn: 0.5786805	total: 8.96s	remaining: 12m 17s
12:	learn: 0.5792401	total: 9.59s	remaining: 12m 8s
13:	learn: 0.5833668	total: 10.3s	remaining: 12m 2s
14:	learn: 0.5851808	total: 10.9s	remaining: 11m 56s
15:	learn: 0.5878255	total: 11.6s	remaining: 11m 53s
16:	learn: 0.5906133	total: 12.3s	remaining: 11m 48s
17:	learn: 0.5902700	total: 12.9s	remaining: 11m 43s
18:	learn: 0.5938977	total: 13.5s	remaining: 11m 38s
19:	learn

In [16]:
# Write the fitted CatBoost model to a file named 'cat' (relative path, so it
# lands in the kernel's current working directory).
model.save_model('cat')