In [1]:
import pandas as pd
from lightgbm import Booster

import torch

import os
import pickle

import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# Note that this also hides deprecation and data-quality warnings from libraries.
warnings.filterwarnings('ignore')

# Boosting

In [2]:
# Load the preprocessed feature tables. The training table carries a
# 'partition' column that is removed here; the test table is used as stored.
X_train_full = pd.read_parquet('data/X_train_full_processed.parquet')
X_train_full = X_train_full.drop(columns=['partition'])

X_test = pd.read_parquet('data/X_test_processed.parquet')

In [3]:
# 'type' is the only column handled as categorical; every other column of the
# training table is collected (in table order) as a numeric feature.
cat_cols = ['type']

num_cols = []
for col in X_train_full.columns:
    if col not in cat_cols:
        num_cols.append(col)

In [4]:
# Align the test feature scale with the training one: each numeric test column
# is multiplied by (train mean / test mean). X_test itself is left untouched.
X_test_scaled = X_test.copy()

train_means = X_train_full[num_cols].mean()
test_means = X_test[num_cols].mean()

# A zero or NaN test mean would turn the ratio into inf/NaN and wipe out the
# whole column (e.g. an all-zero flag), so such columns are left unscaled.
has_usable_mean = test_means.notna() & (test_means != 0)
scalable_cols = test_means[has_usable_mean].index.tolist()

if scalable_cols:
    X_test_scaled[scalable_cols] = (
        X_test[scalable_cols] * train_means[scalable_cols] / test_means[scalable_cols]
    )

In [5]:
# Restore the trained LightGBM booster from its saved model file.
lgbm_model_path = 'weights/lightgbm_without_validation.txt'
model_l = Booster(model_file=lgbm_model_path)

In [6]:
# Score the rescaled test features with the LightGBM booster.
preds = model_l.predict(data=X_test_scaled)

In [7]:
# Use the sample submission as a template and fill its 'score' column with the
# LightGBM predictions.
# NOTE(review): assumes the rows of X_test are in the same order as the sample
# submission -- confirm against the preprocessing step.
submit = pd.read_csv('data/submission_example.csv').assign(score=preds)
submit

Unnamed: 0,clientbankpartner_pin,score
0,6781,0.013787
1,236905,0.064307
2,125779,0.217228
3,1952,0.212962
4,4872,0.569838
...,...,...
4503,121120,0.400931
4504,60667,0.627254
4505,5065,0.657438
4506,133125,0.429514


In [8]:
# Persist the LightGBM predictions; the blending step reads this file back.
# Create the output directory first so a fresh checkout does not fail here.
os.makedirs('submissions', exist_ok=True)
submit.to_csv('submissions/lightgbm.csv', index=False)

# RNN

In [9]:
from rnn_model import ChurnPredictor
from data_generators import batches_generator
from pytorch_training import inference

In [10]:
# Load the pickled `di_features` object that is handed to the batch generator
# and to the inference routine below.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('constants/di_features.pkl', 'rb') as features_file:
    di_features = pickle.load(features_file)

# All RNN inference in this notebook runs on the CPU.
device = torch.device('cpu')

In [11]:
# Load the serialized RNN directly onto the inference device. `map_location`
# keeps the checkpoint loadable on a CPU-only machine even if it was saved
# from a GPU session.
# NOTE(review): this unpickles a whole model object, so the file must be
# trusted; recent PyTorch releases default `torch.load` to weights_only=True,
# in which case weights_only=False may have to be passed explicitly -- confirm
# against the installed version.
model_rnn = torch.load('weights/rnn.pt', map_location=device)

In [12]:
# Number of samples per inference batch (2**6).
batch_size = 64

In [13]:
# Path to the buckets
path_to_dataset = 'buckets/test'

# Collect every entry of the bucket directory whose name ends with 'pkl',
# as a sorted list of paths joined with the directory.
dir_with_datasets = os.listdir(path_to_dataset)
dataset_test = sorted(
    os.path.join(path_to_dataset, bucket_name)
    for bucket_name in dir_with_datasets
    if bucket_name.endswith('pkl')
)

In [14]:
# Build the test batch source over the bucket files: no targets, no shuffling.
dataloader_test = batches_generator(
    dataset_test,
    batch_size=batch_size,
    has_target=False,
    shuffle=False,
    di_features=di_features,
)

# Run the RNN over the test batches.
# NOTE(review): `inference` is a project helper; presumably it writes the
# predictions to `path_to_save` (the blending step reads that file) -- confirm.
inference(
    model_rnn,
    dataloader_test,
    device,
    di_features,
    path_to_sample_submission='data/submission_example.csv',
    path_to_save='submissions/rnn.csv',
)

Unnamed: 0,clientbankpartner_pin,score
0,6781,0.006357
1,236905,0.039373
2,125779,0.188521
3,1952,0.219963
4,4872,0.500983
...,...,...
4503,121120,0.404237
4504,60667,0.611074
4505,5065,0.611959
4506,133125,0.508919


# Blending

In [15]:
# Blend the two saved submissions with a weighted average of their scores.
lightgbm_preds = pd.read_csv("submissions/lightgbm.csv")
rnn_preds = pd.read_csv("submissions/rnn.csv")

preds = [lightgbm_preds, rnn_preds]
# Relative weight of each model, in the same order as `preds`.
weights = [1, 0.6]

# The average below combines scores row by row, so every submission must list
# the same clients in the same order; otherwise scores of different clients
# would be mixed (or silently become NaN through index alignment).
id_col = 'clientbankpartner_pin'
reference_ids = lightgbm_preds[id_col].to_numpy()
for pred in preds[1:]:
    if len(pred) != len(reference_ids) or (pred[id_col].to_numpy() != reference_ids).any():
        raise ValueError(
            f"Submissions are not row-aligned on '{id_col}'; cannot blend them."
        )

submit = lightgbm_preds.copy()

# Weighted mean: sum(score_i * w_i) / sum(w_i).
submit['score'] = sum(pred['score'] * weight for pred, weight in zip(preds, weights)) / sum(weights)

submit.to_csv('submissions/blended_lgbm_rnn.csv', index=False)

In [16]:
# Display the blended submission as a visual sanity check.
submit

Unnamed: 0,clientbankpartner_pin,score
0,6781,0.011001
1,236905,0.054957
2,125779,0.206463
3,1952,0.215587
4,4872,0.544017
...,...,...
4503,121120,0.402171
4504,60667,0.621187
4505,5065,0.640383
4506,133125,0.459291
