In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score

In [2]:
df = pd.read_csv('train.csv', sep=',')

In [3]:
def quarter_last_number(x):
   return int(x[-1])
df['quarter'] = df['quarter'].apply(quarter_last_number)

In [4]:
df = df.sort_values(['client_id', 'year','quarter','npo_account_id'])
df['churn'] = df['churn'].replace(0, 2)
df['region'] = df['region'].replace(list(set(df['region'])), [x for x in range(len(set(df['region'])))]).astype(int)
df = df.replace([0, None], np.nan)
df = df.replace([-1], 0)
df['churn'] = df['churn'].replace(2, 0)

In [5]:
df.drop(columns = ['appl_mrkr','clnt_cprtn_time_d','actv_prd_d','oprtn_sum_per_qrtr','oprtn_sum_per_year','pmnts_nmbr_per_qrtr','pmnts_nmbr_per_year','mgd_accum_period','mgd_payment_period','phone_number','assignee_npo','citizen','evry_qrtr_pmnt'],axis = 1, inplace=True)

In [6]:
df.dropna(thresh=5, inplace=True)
df = df.drop(df[df.duplicated()].index)

In [7]:
df[['frst_pmnt_date_y', 'frst_pmnt_date_m', 'frst_pmnt_date_d']] = df['frst_pmnt_date'].str.split('-', expand=True)

df['frst_pmnt_date_y']=df['frst_pmnt_date_y'].astype(float)
df['frst_pmnt_date_m']=df['frst_pmnt_date_m'].astype(float)
df['frst_pmnt_date_d']=df['frst_pmnt_date_d'].astype(float)

df[['lst_pmnt_date_per_qrtr_y', 'lst_pmnt_date_per_qrtr_m', 'lst_pmnt_date_per_qrtr_d']] = df['lst_pmnt_date_per_qrtr'].str.split('-', expand=True)

df['lst_pmnt_date_per_qrtr_y']=df['lst_pmnt_date_per_qrtr_y'].astype(float)
df['lst_pmnt_date_per_qrtr_m']=df['lst_pmnt_date_per_qrtr_m'].astype(float)
df['lst_pmnt_date_per_qrtr_d']=df['lst_pmnt_date_per_qrtr_d'].astype(float)

df.drop(columns = ['frst_pmnt_date','lst_pmnt_date_per_qrtr'],axis = 1, inplace=True)

churn = df['churn']
df = df.drop('churn', axis=1)
df.insert(len(df.columns ), 'churn', churn)

In [8]:
X = df.iloc[:, 3:-1]
Y = df.iloc[:, -1]

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split (X, Y, test_size=0.25, random_state=42)

In [13]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [14]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(iterations=300,
                           depth=10,
                           learning_rate=0.5,
                           loss_function='Logloss',
                           eval_metric='F1',
                           l2_leaf_reg = 9,
                           nan_mode='Min',
                           verbose=True)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

f1_score(Y_test, Y_pred)

0:	learn: 0.0769790	total: 720ms	remaining: 3m 35s
1:	learn: 0.1728063	total: 1.49s	remaining: 3m 42s
2:	learn: 0.2626291	total: 2.25s	remaining: 3m 42s
3:	learn: 0.3336523	total: 2.97s	remaining: 3m 39s
4:	learn: 0.3538384	total: 3.64s	remaining: 3m 34s
5:	learn: 0.3975269	total: 4.37s	remaining: 3m 34s
6:	learn: 0.4212589	total: 5.02s	remaining: 3m 30s
7:	learn: 0.4514567	total: 5.63s	remaining: 3m 25s
8:	learn: 0.4693734	total: 6.29s	remaining: 3m 23s
9:	learn: 0.4893044	total: 6.95s	remaining: 3m 21s
10:	learn: 0.5059643	total: 7.6s	remaining: 3m 19s
11:	learn: 0.5180412	total: 8.2s	remaining: 3m 16s
12:	learn: 0.5302412	total: 8.83s	remaining: 3m 14s
13:	learn: 0.5419743	total: 9.42s	remaining: 3m 12s
14:	learn: 0.5448121	total: 10s	remaining: 3m 9s
15:	learn: 0.5509795	total: 10.6s	remaining: 3m 7s
16:	learn: 0.5558013	total: 11.2s	remaining: 3m 5s
17:	learn: 0.5650439	total: 11.8s	remaining: 3m 4s
18:	learn: 0.5700720	total: 12.4s	remaining: 3m 3s
19:	learn: 0.5744813	total: 13s

0.6763422746317653

In [None]:
# model.save_model('cat')