In [12]:
import pandas                as pd
import numpy                 as np
from sklearn.compose         import ColumnTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.impute          import SimpleImputer
from sklearn.preprocessing   import MinMaxScaler, OrdinalEncoder
from sklearn.pipeline        import Pipeline, FunctionTransformer
from category_encoders       import TargetEncoder
from sklearn.ensemble        import StackingClassifier, RandomForestClassifier
import xgboost               as xgb
import lightgbm              as lgb
from sklearn.linear_model    import LogisticRegression
from catboost                import CatBoostClassifier

In [13]:
# Single seed passed as `random_state` to every model built below.
SEED = 42

**<h2>Load Data</h2>**

In [14]:
# Read the raw competition tables from the local data/ folder.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# Independent copy of the test rows, taken before 'id_do_caso' is removed,
# so the id is still available when the submission file is written.
sub = df_test.copy()

# Encode the target as 0/1; labels other than these two would map to NaN.
df_train['status_do_caso'] = df_train['status_do_caso'].map({'Negado': 0, 'Aprovado': 1})

# The case id is removed from both modelling frames.
df_train = df_train.drop(columns=['id_do_caso'])
df_test = df_test.drop(columns=['id_do_caso'])

In [15]:
# Column groups; each group is routed to its own preprocessing branch.
cat_features = [
    'continente',
    'tem_experiencia_de_trabalho',
    'requer_treinamento_de_trabalho',
    'regiao_de_emprego',
    'unidade_de_salario',
    'posicao_em_tempo_integral',
]
numeric_features = [
    'ano_de_estabelecimento',
    'salario_prevalecente',
    'num_de_empregados',
]
ordinal_features = ['educacao_do_empregado']

In [16]:
# Predictors / target for training; the test frame is used as-is.
y_train = df_train['status_do_caso']
X_train = df_train.drop(columns=['status_do_caso'])
X_test = df_test  # same object as df_test (not a copy), so the edit below also shows up in df_test

# Employee counts are replaced by their absolute value in both frames.
# NOTE(review): presumably negative counts are data-entry errors — confirm.
X_train['num_de_empregados'] = X_train['num_de_empregados'].abs()
X_test['num_de_empregados'] = X_test['num_de_empregados'].abs()

In [17]:
# Numeric branch: median imputation followed by MinMaxScaler.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler()),
])

# Ordinal branch: most-frequent imputation, then integer codes.
# Categories not seen during fit are encoded as -1.
# NOTE(review): no explicit `categories` order is passed to OrdinalEncoder, so the
# codes may not follow the real education ranking — confirm this is acceptable.
ordinal_transformer = Pipeline([
    ('Imputer', SimpleImputer(strategy='most_frequent')),
    ("encoder", OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value")),
])

# Categorical branch: most-frequent imputation, then target encoding
# (category_encoders.TargetEncoder with its default settings).
cat_transformer = Pipeline([
    ('Imputer', SimpleImputer(strategy='most_frequent')),
    ('tencoder', TargetEncoder()),
])

# Route each column group to its branch; this one object is reused as the
# first step of every model pipeline defined below.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("ordinal_transform", ordinal_transformer, ordinal_features),
    ('targetencoder', cat_transformer, cat_features),
])

**<h2>Random Forest</h2>**

In [20]:
# Fixed hyper-parameters for the random forest.
# NOTE(review): values look like the output of a tuning run that is not part of this notebook — confirm provenance.
params_rf = dict(
    n_estimators=993,
    max_features=None,
    max_depth=8,
    min_samples_split=7,
    min_samples_leaf=9,
    bootstrap=True,
    class_weight=None,
    criterion='log_loss',
)


# Random-forest pipeline: shared preprocessing, then the seeded classifier.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=SEED, **params_rf)),
])


**<h2>CatBoost</h2>**

In [21]:
# Fixed hyper-parameters for CatBoost; 'logging_level' is set to 'Silent'.
params_cat = dict(
    iterations=412,
    learning_rate=0.121561751576995,
    random_strength=2,
    bagging_temperature=2,
    max_bin=4,
    grow_policy='Depthwise',
    min_data_in_leaf=4,
    max_depth=3,
    l2_leaf_reg=67.69236413263982,
    one_hot_max_size=5,
    logging_level='Silent',
)

# CatBoost pipeline: shared preprocessing, then the seeded classifier.
cat_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(random_state=SEED, **params_cat)),
])

**<h2>LightGBM</h2>**

In [22]:
# Fixed hyper-parameters for LightGBM (key order kept as originally written).
# NOTE(review): LightGBM documents `subsample` as an alias of `bagging_fraction`,
# `colsample_bytree` of `feature_fraction`, and `min_child_samples` of
# `min_data_in_leaf`. Each pair is given two different values here — confirm
# which value actually takes effect and drop the other.
params_lgb = dict(
    bagging_fraction=0.9623530416114077,
    bagging_freq=2,
    boosting_type='dart',
    colsample_bytree=0.5184569834685038,
    feature_fraction=0.583303193443559,
    learning_rate=0.1734069205361399,
    max_depth=11,
    min_child_samples=9,
    min_data_in_leaf=21,
    n_estimators=106,
    num_leaves=17,
    subsample=0.3500348029030599,
    random_state=SEED,
)

# LightGBM pipeline: shared preprocessing, then the classifier
# (the seed is already carried inside params_lgb).
lgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lgb.LGBMClassifier(**params_lgb)),
])

**<h2>XGBoost</h2>**

In [23]:
# Fixed hyper-parameters for XGBoost, including the shared seed.
params_xgb = dict(
    colsample_bytree=0.8303268080868073,
    learning_rate=0.05257223981653102,
    max_depth=4,
    n_estimators=100,
    subsample=0.7664349504009373,
    random_state=SEED,
)

# XGBoost pipeline: shared preprocessing, then the classifier
# (the seed is already carried inside params_xgb).
# The first step is spelled 'preprocessor', matching the RF / CatBoost / LightGBM
# pipelines, so nested parameter names (e.g. `preprocessor__...`) are uniform
# across all four base models.
xgb_pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('classifier',xgb.XGBClassifier(**params_xgb))
])

In [28]:
# Fit the four base pipelines on the full training set, in the same order as
# they were defined (RF, CatBoost, LightGBM, XGBoost). Nothing is displayed.
for base_pipeline in (rf_pipeline, cat_pipeline, lgb_pipeline, xgb_pipeline):
    base_pipeline.fit(X_train, y_train)



In [29]:
# Store each base model's probability for class 1 (column 1 of predict_proba)
# as its own column of the submission frame.
for score_column, fitted_pipeline in (
    ('status_rf', rf_pipeline),
    ('status_cat', cat_pipeline),
    ('status_lgb', lgb_pipeline),
    ('status_xgb', xgb_pipeline),
):
    sub[score_column] = fitted_pipeline.predict_proba(X_test)[:, 1]

**<h2>Stacking Ensemble</h2>**

In [30]:
# Stacking ensemble over the four base pipelines with a class-balanced,
# seeded logistic regression as the final estimator. passthrough=False.
# NOTE(review): stack_method='predict' trains the final estimator on the base
# models' predict() outputs rather than on predict_proba — confirm this is
# intended, since it discards the probability information.
clf_stack = StackingClassifier(
    estimators=[
        ('rf', rf_pipeline),
        ('catboost', cat_pipeline),
        ('lightgbm', lgb_pipeline),
        ('xgb', xgb_pipeline),
    ],
    final_estimator=LogisticRegression(random_state=SEED, class_weight='balanced'),
    passthrough=False,
    stack_method='predict',
)

In [31]:
# Fit the stacking ensemble on the full training set.
clf_stack.fit(X_train,y_train)



In [33]:
# Stacked model's probability for class 1 (column 1 of predict_proba) on the test rows.
sub['status_stack'] = clf_stack.predict_proba(X_test)[:,1]

**<h2>Weighted average</h2>**

In [164]:
# Step 1: equal-weight mix of the stacked model and the random forest.
sub['status_do_caso'] = 0.50 * sub['status_stack'] + 0.50 * sub['status_rf']

# Step 2: row-wise mean of that mix with the RF, LightGBM and XGBoost scores.
# Net weights when no score is missing: stack 0.125, rf 0.375, lgb 0.25, xgb 0.25.
# 'status_cat' is not part of the blend.
sub['status_do_caso'] = sub[
    ['status_do_caso', 'status_rf', 'status_lgb', 'status_xgb']
].mean(axis=1)

**<h4>Threshold</h4>**

In [165]:
# Cut-off = 33rd percentile (100 - 67) of the blended score, rounded to 4 decimals,
# so that roughly 67% of the test rows score above it.
ths = round(np.percentile(sub['status_do_caso'], 100 - 67), 4)

In [166]:
# Binarize the blended score: 1 when strictly above the cut-off, otherwise 0.
sub['status_do_caso'] = (sub['status_do_caso'] > ths).astype('int64')

# Show how many rows fall into each class.
sub['status_do_caso'].value_counts()

status_do_caso
1    5122
0    2522
Name: count, dtype: int64

In [167]:
# Write only the case id and the final 0/1 label, without the index.
sub.to_csv('submission.csv', columns=['id_do_caso', 'status_do_caso'], index=False)