# Imports

In [1]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Mount Google Drive at /content/drive/. The data paths used later in this
# notebook are relative ('drive/MyDrive/...'), so they presumably rely on the
# working directory being /content -- verify when running outside Colab defaults.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
import pickle
from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report

from catboost import CatBoostClassifier, Pool

# Prepare data

In [9]:
# Identifiers and the target (DEF) come from the raw CSV; only these three columns are read.
train = pd.read_csv('drive/MyDrive/spb/train.csv', usecols=['CLIENT_ID', 'RETRO_DT', 'DEF'])

# Pre-processed token / URL text features; their own 'target' column is dropped.
train_processed = pd.read_feather('drive/MyDrive/spb/train_processed.ftr').drop(columns=['target'])
urls_train_processed = pd.read_feather('drive/MyDrive/spb/urls_train_processed.ftr').drop(columns=['target'])

# Pre-computed embedding blocks (LaBSE, SVD over tokens and URLs, 128-d word2vec).
LaBSE_embeddings = pd.read_feather('drive/MyDrive/spb/LaBSE_embeddings.ftr')
svd_embeddings_tokens = pd.read_feather('drive/MyDrive/spb/svd_embeddings_tokens.ftr')
svd_embeddings_urls = pd.read_feather('drive/MyDrive/spb/svd_embeddings_urls.ftr')
w2v_128_train = pd.read_feather('drive/MyDrive/spb/w2v_128_train')

In [10]:
# Glue all feature blocks side by side. pd.concat(axis=1) aligns rows on the
# index, so every block is expected to share the same row index.
train_parts = [
    train,
    train_processed,
    urls_train_processed,
    LaBSE_embeddings,
    svd_embeddings_tokens,
    svd_embeddings_urls,
    w2v_128_train,
]
train = pd.concat(train_parts, axis=1)

In [11]:
# Inspect the combined training frame.
train

Unnamed: 0,CLIENT_ID,RETRO_DT,DEF,tokens_processed,tokens_ratio_mean_0_1,urls_processed,urls_ratio_mean_0_1,labse_0,labse_1,labse_2,...,118,119,120,121,122,123,124,125,126,127
0,5909886,20200911,0,отличаются vozrasta clothes pensionnogo bryuki...,0.247153,c93024971e7e255564556a780fa06418 088d9cd775cbb...,0.270492,0.042832,0.015710,0.005240,...,0.922403,0.711751,0.491740,0.355994,-0.439500,0.238270,1.739571,0.306569,-0.299016,-0.098736
1,1385448,20210519,1,прозрачное положения гарант вложения бизнесу s...,0.286867,efecb07a20deb7ae4d380bf387168794 5abadb0a28486...,0.360123,0.051527,0.004283,-0.022121,...,0.243908,1.916932,5.377493,-2.468982,3.080981,5.838118,4.338518,6.469142,1.421552,-3.374285
2,1866195,20210913,0,вишня известного вузах kurorte podala количест...,0.227464,e8ae4477cbde4f5ca46266606e484335 734b5a445e49b...,0.407214,0.044195,0.002743,-0.021305,...,-0.046195,-0.046615,-0.231495,-0.435888,-0.549012,-0.343074,0.783745,0.275752,0.057956,0.417989
3,1587081,20210702,0,согласовали completion sredstvo ottenok dlina ...,0.281095,3c9e2d06c5fa4e94222eca6e29bdda56 9099a1dbf1b04...,0.380863,0.039948,0.021756,0.002506,...,0.072269,2.977056,3.357500,-2.276484,0.816574,3.794444,1.874452,2.741173,0.637505,-3.673784
4,1400908,20210522,0,гимнастики турцию детская местоимения mer пани...,0.230729,5dc1b6a5cf4e94dcc7f0a9aa6c83652d c5f797ca5bc04...,0.280012,0.043054,0.007908,0.038048,...,-0.300249,-3.201139,1.964681,-2.722527,0.321271,2.280865,0.110442,2.300574,-2.271284,0.383220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303772,6220063,20201211,0,калорийность calorizator,0.229299,2c45709d8e05171bfb39b7528e97ac5f,0.234551,0.041605,-0.071626,-0.058381,...,-0.040342,0.050774,0.111814,0.093060,-0.060128,0.299196,-0.001963,-0.023159,0.032425,0.308650
303773,5543361,20200507,0,gayd,0.238273,993cb67a376c12c523584fc94e0dbf41 d56bdcec945b0...,0.261227,0.044924,-0.037827,-0.018032,...,-0.418721,-0.374338,0.034644,0.065725,-0.579452,0.084315,0.199234,-0.036328,-0.201813,-0.084700
303774,224227891,20211218,0,predlozheniya claim заполнения сумасшедшие зим...,0.236123,e0d278038adccea538ff2965a95b49a2 63783590bed0b...,0.270949,0.030484,-0.012333,0.033553,...,3.211429,4.042616,0.373840,-0.428528,-3.044943,0.694152,4.726947,0.427282,-2.332344,-1.264381
303775,5909478,20200911,0,иностранцев пожизненно zheltogo исчезновения д...,0.231896,e007319d9c6427cbf2aedabf881e18cb 88b06056e543b...,0.201815,0.019891,-0.003404,0.010515,...,0.577203,0.544745,-0.148168,-0.598592,0.389467,-0.072030,0.579196,0.872499,-0.457334,-1.098054


In [12]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

def generate_features(df, column_name, prefix='', vectorizer=None, lda_model=None,
                      return_models=False):
    """
    Add length statistics and LDA topic features computed from one text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Train or test frame. The three length-statistic columns are written
        into this frame in place; the topic columns exist only on the
        returned frame.
    column_name : str
        Name of the column whose values are strings of space-separated URLs
        or tokens. Missing values are treated as empty strings.
    prefix : str, default ''
        Prepended to every generated column name. With the default, calling
        the function for a second text column overwrites the first call's
        ``char_count`` / ``word_count`` / ``word_density`` and yields a second
        set of identically named ``lde_feature_*`` columns; pass e.g.
        ``'url_'`` to keep the outputs of separate calls apart.
    vectorizer : fitted CountVectorizer, optional
        Reused through ``transform`` when given (e.g. to apply the
        train-fitted vocabulary to the test set). Otherwise a new
        ``CountVectorizer(max_features=1000)`` is fitted on ``df``.
    lda_model : fitted LatentDirichletAllocation, optional
        Reused through ``transform`` when given. Otherwise a new 20-topic
        model is fitted on the bag-of-words matrix.
    return_models : bool, default False
        When True, return ``(df, vectorizer, lda_model)`` so the fitted
        transformers can be passed to a later call.

    Returns
    -------
    pandas.DataFrame, or tuple when ``return_models`` is True
        ``df`` with the statistic columns and one ``lde_feature_<i>`` column
        per topic appended.
    """
    # NaN is not a valid document for len()/split()/CountVectorizer, so treat
    # missing text as an empty string (the source column itself is untouched).
    text = df[column_name].fillna('')

    df[f'{prefix}char_count'] = text.apply(len)
    df[f'{prefix}word_count'] = text.str.split().str.len()
    # +1 keeps the ratio finite for empty documents.
    df[f'{prefix}word_density'] = df[f'{prefix}char_count'] / (df[f'{prefix}word_count'] + 1)

    # BoW
    print('start CountVectorizer...')
    if vectorizer is None:
        vectorizer = CountVectorizer(max_features=1000)
        cvz = vectorizer.fit_transform(text)
    else:
        cvz = vectorizer.transform(text)

    # Latent Dirichlet Allocation
    print('start Latent Dirichlet Allocation...')
    if lda_model is None:
        lda_model = LatentDirichletAllocation(n_components=20, learning_method='online', random_state=42)
        X_topics = lda_model.fit_transform(cvz)
    else:
        X_topics = lda_model.transform(cvz)

    # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows (and pad with NaN) for any frame whose
    # index is not exactly 0..n-1.
    X_topics = pd.DataFrame(
        X_topics,
        index=df.index,
        columns=[f'{prefix}lde_feature_{i}' for i in range(X_topics.shape[1])],
    )

    df = pd.concat([df, X_topics], axis=1)

    if return_models:
        return df, vectorizer, lda_model
    return df

In [13]:
# Add length statistics and 20 LDA topic columns derived from the URL strings.
train = generate_features(train, 'urls_processed')

start CountVectorizer...
start Latent Dirichlet Allocation...


In [22]:
# Same features for the token strings.
# NOTE(review): generate_features writes char_count / word_count / word_density
# under fixed names, so this call overwrites the URL-based values produced by
# the previous call, and it appends a second set of columns that are also named
# lde_feature_0..19 (duplicate column names until they are renamed).
train = generate_features(train, 'tokens_processed')

start CountVectorizer...
start Latent Dirichlet Allocation...


In [36]:
# After the two generate_features calls the frame ends with 40 topic columns
# sharing the names lde_feature_0..19: positions [-40:-20] come from the URL
# call, [-20:] from the token call. Prefix the URL block to make names unique.
#
# Build a new label list and assign it instead of writing into
# `train.columns.values`: a pandas Index is meant to be immutable, and
# mutating its backing array can leave cached lookup structures out of sync
# with the labels. The startswith guard keeps the cell idempotent on re-run.
renamed_columns = train.columns.tolist()
renamed_columns[-40:-20] = [
    name if name.startswith('url_') else 'url_' + name
    for name in renamed_columns[-40:-20]
]
train.columns = renamed_columns

In [39]:
# Preview the first rows after feature generation and the column rename.
train.head()

Unnamed: 0,CLIENT_ID,RETRO_DT,DEF,tokens_processed,tokens_ratio_mean_0_1,urls_processed,urls_ratio_mean_0_1,labse_0,labse_1,labse_2,...,lde_feature_10,lde_feature_11,lde_feature_12,lde_feature_13,lde_feature_14,lde_feature_15,lde_feature_16,lde_feature_17,lde_feature_18,lde_feature_19
0,5909886,20200911,0,отличаются vozrasta clothes pensionnogo bryuki...,0.247153,c93024971e7e255564556a780fa06418 088d9cd775cbb...,0.270492,0.042832,0.01571,0.00524,...,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625,0.00625
1,1385448,20210519,1,прозрачное положения гарант вложения бизнесу s...,0.286867,efecb07a20deb7ae4d380bf387168794 5abadb0a28486...,0.360123,0.051527,0.004283,-0.022121,...,0.00098,0.00098,0.00098,0.00098,0.329399,0.00098,0.151608,0.037243,0.00098,0.00098
2,1866195,20210913,0,вишня известного вузах kurorte podala количест...,0.227464,e8ae4477cbde4f5ca46266606e484335 734b5a445e49b...,0.407214,0.044195,0.002743,-0.021305,...,0.005556,0.226763,0.005556,0.005556,0.005556,0.005556,0.005556,0.175794,0.502998,0.005556
3,1587081,20210702,0,согласовали completion sredstvo ottenok dlina ...,0.281095,3c9e2d06c5fa4e94222eca6e29bdda56 9099a1dbf1b04...,0.380863,0.039948,0.021756,0.002506,...,0.002,0.002,0.002,0.002,0.43908,0.002,0.274991,0.002,0.002,0.002
4,1400908,20210522,0,гимнастики турцию детская местоимения mer пани...,0.230729,5dc1b6a5cf4e94dcc7f0a9aa6c83652d c5f797ca5bc04...,0.280012,0.043054,0.007908,0.038048,...,0.030604,0.055259,0.000459,0.23954,0.000459,0.042344,0.111543,0.034799,0.060964,0.000459


# Model

In [40]:
# Hold out 15% of the rows for validation, stratified on the target so both
# parts keep the same class ratio. Identifier columns and the target itself
# are excluded from the feature matrix.
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=['CLIENT_ID', 'RETRO_DT', 'DEF']),
    train['DEF'],
    test_size=0.15,
    stratify=train['DEF'],
    random_state=17,
)

In [41]:
# The two raw-text columns are declared as text features; both pools must
# declare the same set.
text_columns = ['tokens_processed', 'urls_processed']

train_pool = Pool(data=X_train, label=y_train, text_features=text_columns)
val_pool = Pool(data=X_val, label=y_val, text_features=text_columns)

In [42]:
# Class-balanced CatBoost classifier trained on GPU; a progress line is logged
# every 50 iterations.
model = CatBoostClassifier(auto_class_weights='Balanced', task_type='GPU', verbose=50)

# Passing the validation pool as eval_set makes the log report the "test"
# metric next to the training one.
model.fit(train_pool, eval_set=val_pool)

Learning rate set to 0.044413
0:	learn: 0.6902826	test: 0.6902535	best: 0.6902535 (0)	total: 55.3ms	remaining: 55.3s
50:	learn: 0.6474850	test: 0.6466049	best: 0.6466049 (50)	total: 2.13s	remaining: 39.6s
100:	learn: 0.6390793	test: 0.6394326	best: 0.6394326 (100)	total: 4.19s	remaining: 37.3s
150:	learn: 0.6333123	test: 0.6350094	best: 0.6350094 (150)	total: 6.28s	remaining: 35.3s
200:	learn: 0.6284351	test: 0.6314126	best: 0.6314126 (200)	total: 8.37s	remaining: 33.3s
250:	learn: 0.6242470	test: 0.6283724	best: 0.6283724 (250)	total: 10.5s	remaining: 31.4s
300:	learn: 0.6201208	test: 0.6254088	best: 0.6254088 (300)	total: 12.6s	remaining: 29.3s
350:	learn: 0.6164609	test: 0.6230942	best: 0.6230942 (350)	total: 14.7s	remaining: 27.3s
400:	learn: 0.6129988	test: 0.6210775	best: 0.6210775 (400)	total: 16.9s	remaining: 25.2s
450:	learn: 0.6101662	test: 0.6196752	best: 0.6196752 (450)	total: 19s	remaining: 23.1s
500:	learn: 0.6070956	test: 0.6177393	best: 0.6177393 (500)	total: 21.1s	rema

<catboost.core.CatBoostClassifier at 0x7fe0d91a7310>

In [43]:
# Score the hold-out set: probability of the last class for ROC AUC, hard
# labels for the per-class report.
val_scores = model.predict_proba(X_val)[:, -1]
val_labels = model.predict(X_val)

print(roc_auc_score(y_val, val_scores))
print(classification_report(y_val, val_labels))

0.7234612210387653
              precision    recall  f1-score   support

           0       0.86      0.61      0.72     34487
           1       0.37      0.70      0.48     11080

    accuracy                           0.63     45567
   macro avg       0.62      0.66      0.60     45567
weighted avg       0.74      0.63      0.66     45567



In [44]:
# Feature-importance table, most important feature first.
fip = pd.DataFrame({
    'feature': model.feature_names_,
    'value': model.feature_importances_,
}).sort_values('value', ascending=False)

In [45]:
# Ten features with the highest importance.
fip.head(10)

Unnamed: 0,feature,value
3,urls_ratio_mean_0_1,22.057083
1925,word_count,5.353012
2,urls_processed,4.729327
0,tokens_processed,3.660337
1924,char_count,2.865711
1901,105,0.855529
1851,55,0.790168
1811,15,0.734871
1,tokens_ratio_mean_0_1,0.619793
1943,url_lde_feature_16,0.61059


In [46]:
# Persist the hold-out model to Drive; the Inference section loads it back
# from this same path.
model.save_model('drive/MyDrive/spb/cb_model-all_data')

In [48]:
preds = model.predict_proba(X_val)[:, -1]

# Sweep the decision threshold over 0.00, 0.01, ..., 1.00 and score the
# resulting hard 0/1 predictions. Note that ROC AUC computed on binary
# predictions reduces to (TPR + TNR) / 2, i.e. balanced accuracy.
for i in range(101):
    threshold = i / 100
    preds_bin = preds.copy()
    preds_bin[preds >= threshold] = 1
    preds_bin[preds < threshold] = 0
    print(np.round(threshold, 2), roc_auc_score(y_val, preds_bin))


0.0 0.5
0.01 0.5000144982167194
0.02 0.5006814161858091
0.03 0.5020732449908661
0.04 0.5036680488299938
0.05 0.5055818134369472
0.06 0.5076695566445327
0.07 0.5096993069852408
0.08 0.5117435555426683
0.09 0.514222750601676
0.1 0.5167744367442805
0.11 0.5194984697838845
0.12 0.522601088161824
0.13 0.5253202260905302
0.14 0.528434079277924
0.15 0.5314899395984403
0.16 0.5346697570549003
0.17 0.538735597434873
0.18 0.5425372065066322
0.19 0.5464482744976158
0.2 0.5505735523321246
0.21 0.5545683462161591
0.22 0.5587644834306319
0.23 0.5622918838564084
0.24 0.5664427076011167
0.25 0.5712185863683894
0.26 0.5751662479630528
0.27 0.5791267760708032
0.28 0.5838718395850307
0.29 0.5883539292627296
0.3 0.5930391810904732
0.31 0.5978951481115837
0.32 0.6017501768311379
0.33 0.6057436609033551
0.34 0.6098304739744449
0.35 0.6141121035614425
0.36 0.6188478858616635
0.37 0.6223766785349661
0.38 0.6257766189614273
0.39 0.6306845610426741
0.4 0.6348282442847978
0.41 0.6385496604224539
0.42 0.640922181

# Stratified K-fold (SKF) cross-validation on the best features

In [55]:
# Keep only the features to which the first model assigned a strictly positive importance.
good_features = fip.loc[fip['value'] > 0.0, 'feature'].values
len(good_features)

1139

In [None]:
# 5-fold stratified cross-validation on the selected features. One model per
# fold is trained, evaluated on its held-out part and saved to Drive.
text_features = ['tokens_processed', 'urls_processed']

# The column selection is the same for every fold, so do it once instead of
# copying the full frame inside the loop.
X_all = train[good_features]
y_all = train['DEF']

fold_aucs = []
splitter = StratifiedKFold(n_splits=5)

# enumerate supplies the fold number used in the saved model's file name.
for i, (train_index, val_index) in enumerate(splitter.split(X_all, y_all)):

    X_train, X_val = X_all.iloc[train_index], X_all.iloc[val_index]
    y_train, y_val = y_all.iloc[train_index], y_all.iloc[val_index]

    train_pool = Pool(X_train, y_train, text_features=text_features)
    val_pool = Pool(X_val, y_val, text_features=text_features)

    model = CatBoostClassifier(
        auto_class_weights='Balanced',
        task_type='GPU',
        verbose=50)

    model.fit(train_pool, eval_set=val_pool)

    # ROC AUC is computed from probabilities; the report uses hard labels.
    fold_auc = roc_auc_score(y_val, model.predict_proba(X_val)[:, -1])
    fold_aucs.append(fold_auc)
    print(fold_auc)
    print(classification_report(y_val, model.predict(X_val)))

    model.save_model(f'drive/MyDrive/spb/skf{i}_cb_model-all_data')

# Aggregate score across folds so the CV result can be read at a glance.
print(f'CV ROC AUC: {np.mean(fold_aucs):.4f} +/- {np.std(fold_aucs):.4f}')

Learning rate set to 0.044642
0:	learn: 0.6901920	test: 0.6901082	best: 0.6901082 (0)	total: 38.6ms	remaining: 38.5s
50:	learn: 0.6475915	test: 0.6462987	best: 0.6462987 (50)	total: 1.81s	remaining: 33.7s
100:	learn: 0.6392054	test: 0.6391867	best: 0.6391867 (100)	total: 3.56s	remaining: 31.7s
150:	learn: 0.6332402	test: 0.6346183	best: 0.6346183 (150)	total: 5.35s	remaining: 30.1s
200:	learn: 0.6285234	test: 0.6311429	best: 0.6311429 (200)	total: 7.13s	remaining: 28.3s
250:	learn: 0.6242596	test: 0.6282901	best: 0.6282901 (250)	total: 8.93s	remaining: 26.6s
300:	learn: 0.6201206	test: 0.6255476	best: 0.6255476 (300)	total: 10.7s	remaining: 24.9s
350:	learn: 0.6164671	test: 0.6234690	best: 0.6234690 (350)	total: 12.5s	remaining: 23.2s
400:	learn: 0.6129724	test: 0.6213924	best: 0.6213924 (400)	total: 14.4s	remaining: 21.5s
450:	learn: 0.6098768	test: 0.6196205	best: 0.6196205 (450)	total: 16.2s	remaining: 19.7s
500:	learn: 0.6070766	test: 0.6182434	best: 0.6182434 (500)	total: 18s	rema

In [None]:
# ROC AUC must be computed from scores, not hard class labels: use the
# predicted probability of the last class, as in the other evaluation cells.
# (y_val, X_val and model are whatever the last executed training cell left behind.)
print(roc_auc_score(y_val, model.predict_proba(X_val)[:, -1]))

# Inference

In [65]:
# Ensemble members: the hold-out model first, then the five fold models.
model_paths = ['drive/MyDrive/spb/cb_model-all_data']
for i in range(5):
    model_paths.append(f'drive/MyDrive/spb/skf{i}_cb_model-all_data')

# load_model is chained on a fresh classifier, so each list entry is the loaded model.
models = [CatBoostClassifier().load_model(path) for path in model_paths]

In [66]:
# Load the test-set counterparts of the training inputs.
# Only the two identifier columns are kept from test.ftr.
test = pd.read_feather('drive/MyDrive/spb/test.ftr')[['CLIENT_ID', 'RETRO_DT']]
# NOTE(review): unlike the training load, no 'target' column is dropped here --
# presumably the test feature files do not contain one; verify.
test_processed = pd.read_feather('drive/MyDrive/spb/test_processed.ftr')
urls_test_processed = pd.read_feather('drive/MyDrive/spb/urls_test_processed.ftr')
# Pre-computed embedding blocks for the test rows.
test_LaBSE_embeddings = pd.read_feather('drive/MyDrive/spb/test_LaBSE_embeddings.ftr')
test_svd_embeddings_tokens = pd.read_feather('drive/MyDrive/spb/test_svd_embeddings_tokens.ftr')
test_svd_embeddings_urls = pd.read_feather('drive/MyDrive/spb/test_svd_embeddings_urls.ftr')
w2v_128_test = pd.read_feather('drive/MyDrive/spb/w2v_128_test')

In [67]:
# Assemble the test frame from the same kinds of blocks, in the same order,
# as the training frame.
test_parts = [
    test,
    test_processed,
    urls_test_processed,
    test_LaBSE_embeddings,
    test_svd_embeddings_tokens,
    test_svd_embeddings_urls,
    w2v_128_test,
]
test = pd.concat(test_parts, axis=1)

In [68]:
# Sanity check: (rows, columns) of the token SVD embeddings for the test set.
test_svd_embeddings_tokens.shape

(154804, 512)

In [69]:
# Sanity check: (rows, columns) of the fully concatenated test frame.
test.shape

(154804, 1926)

In [70]:
# Replace missing text with an empty string in both text columns.
for text_column in ('tokens_processed', 'urls_processed'):
    test[text_column] = test[text_column].fillna('')

In [71]:
test_preds = []

for model in models:

    test_preds.append(model.predict_proba(test[model.feature_names_])[:, -1])

test_preds = np.mean(test_preds, axis=0)

In [72]:
# Submission frame: the two identifier columns plus the averaged probability as DEF.
res = test[['CLIENT_ID', 'RETRO_DT']].assign(DEF=test_preds)

In [73]:
# Inspect the submission frame before writing it out.
res

Unnamed: 0,CLIENT_ID,RETRO_DT,DEF
0,5467000,20200419,0.193871
1,6093848,20201107,0.268562
2,5269916,20200305,0.375836
3,816015821,20220115,0.786068
4,223498561,20211204,0.608581
...,...,...,...
154799,820037931,20220408,0.523726
154800,820043321,20220408,0.594040
154801,820058971,20220409,0.505627
154802,820062491,20220409,0.438396


In [59]:
# Write the submission to Drive; index=False keeps the row index out of the CSV.
res.to_csv('drive/MyDrive/spb/sub1.csv', index=False)