# Libs

In [1]:
# Data
import pandas as pd
import numpy as np

In [2]:
# For model
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Imputer
from sklearn.impute import KNNImputer


# Model
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [20]:
# Auxiliary
import os

# Code

## ***Imputed dataset***

In [21]:
# Load the raw tab-separated tables; "?" marks a missing value and the first
# column is used as the row index. The target column "y" is removed from the
# training table so that train and test share the same feature columns.
train_raw = (
    pd.read_csv("../data/train.tsv", sep="\t", index_col=0, na_values=["?"])
    .drop(columns="y")
)
test_raw = pd.read_csv("../data/test.tsv", sep="\t", index_col=0, na_values=["?"])

# Train rows first, test rows after them; later cells split on this order.
dataset_raw = pd.concat([train_raw, test_raw], axis=0)

In [22]:
# Tag of the cached imputed dataset used for training; it is interpolated into
# the "../tmp/datasets/<imputer>.csv" path that a later cell reads.
imputer = "knn2" # best so far: knn2

### Imputing

In [23]:
# # Pipeline that preprocesses the whole dataset (train + test)
# pipeline_preprocessing_dataset = Pipeline([
#     ('imputer', KNNImputer(n_neighbors=2)),    # Fill in missing values
# ])

In [24]:
# dataset_imputed = pd.DataFrame(
#     pipeline_preprocessing_dataset.fit_transform(dataset_raw), 
#     columns=dataset_raw.columns
# )
# dataset_imputed.to_csv(f"../tmp/datasets/{imputer}.csv", index=False)

## Imputed dataset for selection features

In [None]:
# Tag of the imputed dataset that the feature-importance ranking was computed
# on; it forms the first part of the selected-features file name.
selection_imputer = "knn1" # best so far: knn1

### Imputing

In [None]:
# # Pipeline that preprocesses the whole dataset (train + test)
# pipeline_preprocessing_dataset = Pipeline([
#     ('imputer', KNNImputer(n_neighbors=1)),    # Fill in missing values
# ])

In [None]:
# dataset_imputed = pd.DataFrame(
#     pipeline_preprocessing_dataset.fit_transform(dataset_raw), 
#     columns=dataset_raw.columns
# )
# dataset_imputed.to_csv(f"../tmp/datasets/{selection_imputer}.csv", index=False)

## *Read **dataset***

In [25]:
# Number of training rows; used to split the combined train+test table back apart.
train_size = len(train_raw)

In [26]:
# Load the cached imputed table (train rows followed by test rows).
# reset_index() keeps the default 0..n-1 row index as an extra leading column
# named "index", so that column is part of the table from here on.
dataset_imputed = (
    pd.read_csv(f"../tmp/datasets/{imputer}.csv")
    .reset_index()
)

# The target is re-read from the original training file.
y = pd.read_csv("../data/train.tsv", sep='\t', index_col=0, na_values=['?'])["y"]

# Positional split: the first train_size rows are train, the rest are test.
train_imputed = dataset_imputed.iloc[:train_size]
test_imputed = dataset_imputed.iloc[train_size:]

## **Feature** selection

In [None]:
# Tag of the feature-ranking run; it forms the second part of the
# selected-features file name.
feature_selection = "catboost-1250it" # catboost-1250it - best so far

In [28]:
# CSV holding the feature-importance ranking for the chosen
# (selection_imputer, feature_selection) combination.
feature_importance_path = f"../tmp/selected_features/{selection_imputer}_{feature_selection}.csv"

### Select

In [None]:
# selection_train = pd.read_csv(f"../tmp/datasets/{selection_imputer}.csv").reset_index()[:train_size]
# selection_model = CatBoostClassifier(1250, task_type="GPU", eval_metric='F1', verbose=250, random_state=42)
# selection_model.fit(selection_train, y);

Learning rate set to 0.023409
0:	learn: 0.7915538	total: 103ms	remaining: 2m 8s
250:	learn: 0.8901472	total: 22.7s	remaining: 1m 30s
500:	learn: 0.9096114	total: 44.7s	remaining: 1m 6s
750:	learn: 0.9194666	total: 1m 7s	remaining: 44.8s
1000:	learn: 0.9291271	total: 1m 28s	remaining: 22.1s
1249:	learn: 0.9370798	total: 1m 49s	remaining: 0us


In [None]:
# # Extract the feature importance values
# importances = selection_model.feature_importances_

# # Build a DataFrame with the feature importances
# feature_importance = pd.DataFrame({
#     'Feature': train_imputed.columns,
#     'Importance': importances
# }).sort_values(by='Importance', ascending=False)

In [None]:
# feature_importance.to_csv(feature_importance_path, index=False)

### Get **important features**

In [32]:
# How many rows to take from the top of the feature-importance file.
num_features = 230

In [33]:
# Fail fast when no ranking has been cached for this configuration
# (the message reads "No file with such parameters").
assert os.path.exists(feature_importance_path), (
    "Нет файла с такими параметрами"
)

# Keep the first num_features names from the "Feature" column, in file order.
selected_features = pd.read_csv(feature_importance_path)["Feature"].iloc[:num_features]

In [34]:
class FeatureFilter(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only a fixed set of columns."""

    def __init__(self, selected_features):
        """
        Store the columns to keep.

        Parameters:
        - selected_features: pd.Series with the names of the selected features
        """
        # Stored as given so the constructor argument and the attribute match.
        self.selected_features = selected_features

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """
        Return X indexed by the stored feature names (X[selected_features]).
        """
        wanted = self.selected_features
        return X[wanted]

## Transform **dataset**

In [35]:
# Preprocessing pipeline: keep the selected columns, then standardize them.
# The step names are written to the submission log, so they are kept as is.
pipeline_preprocessing = Pipeline(steps=[
    ('feature_filter', FeatureFilter(selected_features=selected_features)),
    ('standart_scaler', StandardScaler()),
])

In [36]:
# The pipeline is fitted on the combined train+test table, so the scaler's
# statistics come from both parts.
dataset = pipeline_preprocessing.fit_transform(dataset_imputed)

# Split the transformed rows back at the train/test boundary.
train, test = dataset[:train_size], dataset[train_size:]

## Set **model**

In [37]:
class Ensemble:
    """Weighted blend of CatBoost, LightGBM, XGBoost and RandomForest classifiers.

    Each member has a weight. Members whose weight is falsy (0) are neither
    fitted nor used for prediction. The blended output is a plain weighted sum
    of the members' ``predict_proba`` outputs; it is NOT divided by the sum of
    the weights, so it is only a probability when the weights sum to 1.
    """

    def __init__(self, *coef):
        """Create the member models and store their blending weights.

        Parameters:
        - coef: one weight per member, in the order
          CatBoost, LGBM, XGB, RandomForest.

        Raises:
        - AssertionError: if the number of weights differs from the number
          of member models.
        """
        self.labelencoding = LabelEncoder()

        self.catboost = CatBoostClassifier(
            task_type='GPU',
            eval_metric='F1',
            verbose=False,
            random_state=42,
        )

        # The parameter dicts are kept on the instance because get_params()
        # reports them verbatim.
        self.lgbm_params = dict(
            n_estimators=450,
            random_state=42,
            verbose=-1,
        )
        self.lgbm = LGBMClassifier(**self.lgbm_params)

        self.xgb_params = dict(
            n_estimators=400,
            random_state=42
        )
        self.xgb = XGBClassifier(**self.xgb_params)

        self.randomforest_params = dict(
            n_estimators=250,
            random_state=42
        )
        self.randomforest = RandomForestClassifier(**self.randomforest_params)

        self.models = [self.catboost, self.lgbm, self.xgb, self.randomforest]
        self.coef = coef
        assert len(coef) == len(self.models), (
            f"expected {len(self.models)} weights "
            f"(CatBoost, LGBM, XGB, RandomForest), got {len(coef)}"
        )

    def fit(self, X, y):
        """Label-encode ``y`` and fit every member with a non-zero weight.

        Returns:
        - self, so that calls can be chained.
        """
        y = self.labelencoding.fit_transform(y)
        for weight, member in zip(self.coef, self.models):
            if weight:
                member.fit(X, y)
        return self

    def predict_proba(self, X):
        """Return the weighted sum of the active members' ``predict_proba``.

        Raises:
        - ValueError: if every weight is zero, i.e. there is nothing to blend.
        """
        active = [
            (weight, member)
            for weight, member in zip(self.coef, self.models)
            if weight
        ]
        if not active:
            # Without this guard the method would return the scalar 0 and the
            # failure would only surface later, when the caller indexes it.
            raise ValueError("all ensemble weights are zero; nothing to predict with")

        proba = 0
        for weight, member in active:
            proba += member.predict_proba(X) * weight

        return proba

    def get_params(self):
        """Return a human-readable description of the weights and active members.

        Note: unlike scikit-learn's ``get_params``, this returns a string.
        """
        result = f'{self.coef}'

        if self.coef[0]:
            result += f'\n - CatBoost: {self.catboost.get_params()}'

        if self.coef[1]:
            result += f'\n - LGBM: {self.lgbm_params}'

        if self.coef[2]:
            result += f'\n - XGB: {self.xgb_params}'

        if self.coef[3]:
            result += f'\n - RandomForest: {self.randomforest_params}'

        return result
    

In [38]:
# Blend weights in member order: CatBoost, LGBM, XGB, RandomForest.
# NOTE: the weights sum to 1.05, and Ensemble does not normalize them, so the
# blended score is on a slightly larger scale than a true probability.
model = Ensemble(0.25, 0.6, 0.05, 0.15)

### Threshold

In [39]:
# Cut-off on the blended score: rows scoring >= threshold are labelled 'P',
# the rest 'N'.
# NOTE(review): the original note says 0.44 scored best, but the value in use
# is 0.445 - confirm which one is intended.
threshold = 0.445 # 0.44 best

### **Fit**

In [40]:
# Fit the ensemble members that have a non-zero weight on the preprocessed
# training rows; the trailing ";" suppresses the cell's output.
model.fit(train, y);

## Submission

In [41]:
# Take the second column of the blended scores - presumably the 'P' class,
# since LabelEncoder orders classes alphabetically; confirm against the labels in y.
proba = model.predict_proba(test)[:, 1]
# Scores at or above the threshold become 'P', everything else 'N'.
predict = np.where(proba >= threshold, 'P', 'N')

### **Save**

In [42]:
# One-column submission table. Its index is the default 0..n-1 range, which
# to_csv writes as the first (unnamed) column.
sub = pd.DataFrame(predict, columns=['y'])

description_path = "../subs/description.txt"
is_exist = os.path.exists(description_path)

# Read the existing log first; reading and appending are kept separate so the
# parsing below never depends on the file position.
description_log = ""
if is_exist:
    with open(description_path, "r", encoding='utf-8') as file:
        description_log = file.read()

# The next ID is the last logged ID + 0.1. Scanning for "ID: " lines, instead
# of splitting the log on blank lines, keeps this working when the file ends
# with a newline or contains extra blank lines, e.g. after the score was filled
# in by hand and the editor added a trailing newline.
# NOTE: the name `id` shadows the builtin; it is kept because it is a
# notebook-level variable that other cells may read.
id_lines = [line for line in description_log.split('\n') if line.startswith("ID: ")]
if id_lines:
    id = round(float(id_lines[-1][4:]) + 0.1, 1)
else:
    id = 1.0

# Separate entries by exactly one blank line, whatever the file currently ends with.
if description_log.strip():
    trailing_newlines = len(description_log) - len(description_log.rstrip('\n'))
    separator = "\n" * max(0, 2 - trailing_newlines)
else:
    separator = ""

# Write the submission before logging it, so the log never describes a file
# that failed to be written.
sub.to_csv(f"../subs/submission_{id}.tsv", sep="\t")

# Append mode creates the log if it does not exist yet.
with open(description_path, "a", encoding='utf-8') as file:
    file.write(separator)
    file.write(f"ID: {id}\n")
    file.write(f"Impute selection - {selection_imputer}\n")
    file.write(f"Feature Selection - {feature_selection} (num-{num_features})\n")
    file.write(f"Impute - {imputer}\n")
    file.write(f"Preprocessing - {list(pipeline_preprocessing.named_steps.keys())}\n")
    file.write(f"Model - {model.__class__.__name__}\n")
    file.write(f"Params: {model.get_params()}\n")
    file.write(f"Threshold - {threshold}\n")
    # Left open on purpose: the leaderboard score is filled in by hand later.
    file.write("LeaderBord Score: ")