# Importando Bibliotecas

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RandomizedSearchCV

from lightgbm import LGBMClassifier

from catboost import CatBoostClassifier

import skopt

import warnings
warnings.filterwarnings('ignore')

# Lendo os Dados

In [None]:
# Load the labels table; it is later joined onto the features by 'customer_ID'.
train_label = pd.read_csv('../data/train_labels.csv')

In [None]:
# Load the raw feature table.
# NOTE(review): presumably holds several rows per customer_ID, since only the
# last row of each customer is kept afterwards -- confirm against the data.
df = pd.read_csv('../data/train_data.csv')

In [None]:
# Keep only the last row (in file order) of each customer_ID group and
# renumber the index from zero.
df = df.groupby('customer_ID').tail(1).reset_index(drop=True)

In [None]:
# Left-join the labels onto the features by customer_ID, so every feature row
# is kept; the bare `df` renders the result as the cell output.
df = df.merge(train_label, on='customer_ID', how='left')
df

# Feature Engineering

### Dropping

In [None]:
# Columns that are removed from the feature table before modelling.
useless_columns = [
    'customer_ID', 'S_2', 'S_3', 'D_42', 'D_43', 'D_46', 'D_49', 'D_50',
    'D_53', 'S_7', 'D_56', 'S_9', 'B_17', 'D_66', 'D_73', 'D_76',
    'D_77', 'R_9', 'D_82', 'B_29', 'D_87', 'D_88', 'D_105', 'D_106',
    'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'S_27', 'B_42', 'D_132',
    'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142',
]
# Remove the unwanted columns in place, then show the remaining table.
df.drop(columns=useless_columns, inplace=True)
df

### Column Transformer

In [None]:
# Columns handled as categorical features.
cat_data = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117',
    'D_120', 'D_126', 'D_63', 'D_64', 'D_68',
]
# One-hot encode them; categories not seen during fit are ignored at
# transform time instead of raising.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Columns handled as numeric features (fed to the numeric preprocessing pipeline).
numeric_data = [
    'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'D_41', 'B_3', 'D_44', 'B_4', 'D_45',
    'B_5', 'R_2', 'D_47', 'D_48', 'B_6', 'B_7', 'B_8', 'D_51', 'B_9', 'R_3',
    'D_52', 'P_3', 'B_10', 'S_5', 'B_11', 'S_6', 'D_54', 'R_4', 'B_12', 'S_8',
    'D_55', 'B_13', 'R_5', 'D_58', 'B_14', 'D_59', 'D_60', 'D_61', 'B_15', 'S_11',
    'D_62', 'D_65', 'B_16', 'B_18', 'B_19', 'B_20', 'S_12', 'R_6', 'S_13', 'B_21',
    'D_69', 'B_22', 'D_70', 'D_71', 'D_72', 'S_15', 'B_23', 'P_4', 'D_74', 'D_75',
    'B_24', 'R_7', 'B_25', 'B_26', 'D_78', 'D_79', 'R_8', 'S_16', 'D_80', 'R_10',
    'R_11', 'B_27', 'D_81', 'S_17', 'R_12', 'B_28', 'R_13', 'D_83', 'R_14', 'R_15',
    'D_84', 'R_16', 'S_18', 'D_86', 'R_17', 'R_18', 'B_31', 'S_19', 'R_19', 'B_32',
    'S_20', 'R_20', 'R_21', 'B_33', 'D_89', 'R_22', 'R_23', 'D_91', 'D_92', 'D_93',
    'D_94', 'R_24', 'R_25', 'D_96', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'D_102',
    'D_103', 'D_104', 'D_107', 'B_36', 'B_37', 'R_27', 'D_109', 'D_112', 'B_40', 'D_113',
    'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_127', 'D_128',
    'D_129', 'B_41', 'D_130', 'D_131', 'D_133', 'R_28', 'D_139', 'D_140', 'D_141', 'D_143',
    'D_144', 'D_145',
]
# Numeric preprocessing: fill missing values with an iterative imputer that
# fits a LinearRegression per feature (at most 30 rounds, features visited in
# 'roman' order), then reduce the result to 20 principal components.
numeric_transformer = Pipeline(
    steps=[
        (
            'imputer',
            IterativeImputer(
                estimator=LinearRegression(),
                max_iter=30,
                imputation_order='roman',
            ),
        ),
        ('pca', PCA(n_components=20)),
    ]
)

In [None]:
# Route each column group through its own preprocessing branch: the
# categorical columns to the one-hot encoder, the numeric columns to the
# imputation + PCA pipeline.
transformer = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_data),
        ('num', numeric_transformer, numeric_data),
    ],
)

### Train Test Split

In [None]:
# Split the table into the feature matrix (everything except 'target')
# and the label vector.
y = df['target']
X = df.drop(columns=['target'])

In [None]:
# Hold out 30% of the rows for testing. The split is stratified on the target
# so both parts keep the same class proportions, consistent with the
# class weighting and StratifiedKFold used for model evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=123,
    stratify=y,
)

### Scoring

In [None]:
def amex_metric(y_true, y_pred):
    """Return the average of a normalised weighted Gini and a top-4% capture rate.

    Rows whose true label is 0 carry a weight of 20, all other rows a weight
    of 1.

    * Capture rate: rows are ranked by ``y_pred`` (largest first); the rows
      whose cumulative weight stays within 4% of the total weight are kept,
      and the sum of their true labels is divided by the sum of all true
      labels.
    * Normalised Gini: the weighted Gini obtained when ranking by ``y_pred``
      divided by the one obtained when ranking by ``y_true``.

    The three values (Gini ratio, capture rate, final score) are also printed.

    Parameters
    ----------
    y_true : array-like
        True labels, one per row.
    y_pred : array-like
        Predicted scores, same length as ``y_true``.

    Returns
    -------
    float
        ``0.5 * (gini_ratio + capture_rate)``.
    """
    # Column 0 holds the true label, column 1 the prediction.
    pairs = np.array([y_true, y_pred]).T

    def ranked_by(col):
        # Rows ordered by the chosen column, largest value first.
        return pairs[pairs[:, col].argsort()[::-1]]

    def weighted_gini(col):
        rows = ranked_by(col)
        row_weight = np.where(rows[:, 0] == 0, 20, 1)
        cum_weight_share = np.cumsum(row_weight / np.sum(row_weight))
        weighted_pos = rows[:, 0] * row_weight
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - cum_weight_share) * row_weight)

    by_pred = ranked_by(1)
    row_weight = np.where(by_pred[:, 0] == 0, 20, 1)
    cutoff = int(0.04 * np.sum(row_weight))
    captured = by_pred[np.cumsum(row_weight) <= cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_pred[:, 0])

    gini_ratio = weighted_gini(1) / weighted_gini(0)
    score = 0.5 * (gini_ratio + top_four)
    print("G: {:.6f}, D: {:.6f}, ALL: {:6f}".format(gini_ratio, top_four, score))
    return score

In [None]:
def metric(estimator, X, y):
    """Scorer for ``scoring=``: evaluate ``amex_metric`` on predicted probabilities.

    ``amex_metric`` ranks rows by the predicted score, so it needs continuous
    probabilities; scoring the hard 0/1 output of ``predict`` makes the ranking
    (and therefore the Gini and top-4% terms) degenerate. scikit-learn accepts
    any callable with the signature ``(estimator, X, y)`` as a scorer.

    Parameters
    ----------
    estimator : fitted classifier (or pipeline) exposing ``predict_proba``.
    X : features to score on.
    y : true labels for ``X``.

    Returns
    -------
    float
        The value returned by ``amex_metric`` (higher is better).
    """
    # Column 1 is the probability of the second class in ``classes_``;
    # assumes a binary 0/1 target where 1 is the positive class.
    positive_proba = estimator.predict_proba(X)[:, 1]
    return amex_metric(y, positive_proba)

### Class Weight

In [None]:
# Per-class weights computed by compute_class_weight in 'balanced' mode on
# the training labels, stored as a {class label: weight} mapping.
classes = np.unique(y_train)
weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train,
)
class_weights = {label: weight for label, weight in zip(classes, weights)}

In [None]:
# Show the {class label: weight} mapping as the cell output.
class_weights

# Pipeline

In [None]:
# Candidate classifiers, all configured to compensate for class imbalance.
# Seeds are fixed so the comparison is reproducible (the hold-out split is
# already seeded with 123). LogisticRegression gets a larger iteration budget
# because convergence warnings are silenced globally, so a non-converged fit
# at the default 100 iterations would go unnoticed.
modelos = [
    RandomForestClassifier(class_weight='balanced', random_state=123),
    LogisticRegression(class_weight='balanced', max_iter=1000),
    LGBMClassifier(class_weight='balanced', random_state=123),
    CatBoostClassifier(class_weights=class_weights, random_seed=123),
]

In [None]:
# 5-fold stratified cross-validation; folds are taken in row order
# (no shuffling, which is StratifiedKFold's default).
skf = StratifiedKFold(n_splits=5, shuffle=False)

In [None]:
# Cross-validate every candidate model behind the same preprocessing step
# and report the mean and standard deviation of the fold scores in percent.
for modelo in modelos:
    pipe = Pipeline(steps=[
        ('preprocessor', transformer),
        ('model', modelo),
    ])
    scores = cross_val_score(pipe, X_train, y_train, cv=skf, scoring=metric)
    # The original message had a stray trailing quote, no '%' after the mean
    # and unrounded floats; both numbers are now formatted the same way.
    print(
        f'Modelo: {modelo} '
        f'CV AMEX Score: {scores.mean() * 100:.2f}% +/- {scores.std() * 100:.2f}%'
    )