In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn import set_config
# Ask scikit-learn to render estimators as diagrams when a cell displays them.
set_config(display='diagram')


In [53]:
# Load the Titanic data, using the first CSV column as the row index.
df = pd.read_csv('titanic.csv', index_col=0)
# Discard the columns that are not used as model inputs.
# (A bare `df.head()` used to sit here; mid-cell expressions are never
# displayed, so it was a no-op and has been removed.)
df = df.drop(columns=['Signing_date', 'Cabin', 'Ticket', 'Name'])

# Feature matrix and target.
X = df[['Pclass','Fare','Age','Parch',  'SibSp','Embarked',   'Sex']]
y = df.Survived
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123)


In [54]:
# Base learners: evaluated one by one first, then combined in the ensembles.
# Seeds are fixed wherever the estimator accepts one; SVC is built with
# probability=True so that it can take part in soft voting later on.
clf1, clf2, clf3, clf4 = (
    LogisticRegression(random_state=123),
    SVC(probability=True, random_state=123),
    DecisionTreeClassifier(random_state=123),
    KNeighborsClassifier(),
)

## Preprocesamiento

In [55]:
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from feature_engine.encoding import OrdinalEncoder, OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler

In [56]:
# Preprocessing reused by every model pipeline in this section.
prep = Pipeline(
    steps=[
        # Fill missing categorical values with the most frequent category.
        ('cat_imp', CategoricalImputer(imputation_method='frequent')),
        # Fill missing numerical values with the column mean.
        ('num_imp', MeanMedianImputer(imputation_method='mean')),
        # Map Pclass to ordered integer codes; ignore_format=True lets the
        # encoder accept a column that is not stored as object/categorical.
        ('ord', OrdinalEncoder(variables='Pclass',
                               encoding_method='ordered',
                               ignore_format=True)),
        # One-hot encode the remaining categorical columns (Embarked, Sex).
        ('ohe', OneHotEncoder()),
        # Standardise the continuous / count features.
        ('sc', SklearnTransformerWrapper(
            StandardScaler(),
            variables=['Fare', 'Age', 'Parch', 'SibSp'],
        )),
    ]
)

# Preview of the transformed training set (the target is passed along to the steps).
prep.fit_transform(X_train, y_train)

Unnamed: 0_level_0,Pclass,Fare,Age,Parch,SibSp,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
330,2,0.490420,-1.086485,0.829588,-0.460103,1,0,0,1,0
750,0,-0.472168,0.091668,-0.474312,-0.460103,0,1,0,0,1
204,0,-0.482229,1.230549,-0.474312,-0.460103,1,0,0,0,1
422,0,-0.472488,-0.693768,-0.474312,-0.460103,0,1,0,0,1
98,2,0.593505,-0.536681,0.829588,-0.460103,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
99,1,-0.179919,0.327298,0.829588,-0.460103,0,0,1,1,0
323,1,-0.384014,0.013124,-0.474312,-0.460103,0,1,0,1,0
383,0,-0.468815,0.170211,-0.474312,-0.460103,0,0,1,0,1
366,0,-0.481750,0.013124,-0.474312,-0.460103,0,0,1,0,1


In [57]:
from sklearn.metrics import classification_report

In [58]:
def train_function(pipe, X_train, X_test, y_train, y_test):
    """Fit ``pipe`` on the training split and print a report for each split.

    Parameters
    ----------
    pipe : estimator exposing ``fit`` and ``predict``.
    X_train, y_train : data used for fitting and for the 'train' report.
    X_test, y_test : held-out data used for the 'test' report.

    Returns
    -------
    The same ``pipe`` object after ``fit`` has been called on it.
    """
    pipe.fit(X_train, y_train)

    splits = (('train', X_train, y_train), ('test', X_test, y_test))
    # Predict on both splits up front so nothing is printed unless both succeed.
    predictions = [pipe.predict(features) for _, features, _ in splits]

    for (label, _, target), predicted in zip(splits, predictions):
        print(label)
        print(classification_report(target, predicted, digits=4))
    return pipe

In [59]:
# Logistic regression on top of the shared preprocessing.
# Note: `prep` is passed by reference (Pipeline does not clone its steps when
# no `memory` is set), so fitting this pipeline refits the shared `prep` object.
pipe_clf1 = Pipeline([('prep', prep), ('model', clf1)])

train_function(pipe_clf1, X_train, X_test, y_train, y_test);

train
              precision    recall  f1-score   support

           0     0.8208    0.8529    0.8365       435
           1     0.7538    0.7076    0.7300       277

    accuracy                         0.7963       712
   macro avg     0.7873    0.7802    0.7833       712
weighted avg     0.7947    0.7963    0.7951       712

test
              precision    recall  f1-score   support

           0     0.8596    0.8596    0.8596       114
           1     0.7538    0.7538    0.7538        65

    accuracy                         0.8212       179
   macro avg     0.8067    0.8067    0.8067       179
weighted avg     0.8212    0.8212    0.8212       179



In [60]:
# Support vector classifier on top of the shared preprocessing.
pipe_clf2 = Pipeline([('prep', prep), ('model', clf2)])

train_function(pipe_clf2, X_train, X_test, y_train, y_test);

train
              precision    recall  f1-score   support

           0     0.8359    0.8897    0.8619       435
           1     0.8072    0.7256    0.7643       277

    accuracy                         0.8258       712
   macro avg     0.8215    0.8076    0.8131       712
weighted avg     0.8247    0.8258    0.8239       712

test
              precision    recall  f1-score   support

           0     0.8595    0.9123    0.8851       114
           1     0.8276    0.7385    0.7805        65

    accuracy                         0.8492       179
   macro avg     0.8435    0.8254    0.8328       179
weighted avg     0.8479    0.8492    0.8471       179



In [61]:
# Decision tree on top of the shared preprocessing.
pipe_clf3 = Pipeline([('prep', prep), ('model', clf3)])

train_function(pipe_clf3, X_train, X_test, y_train, y_test);

train
              precision    recall  f1-score   support

           0     0.9752    0.9954    0.9852       435
           1     0.9925    0.9603    0.9761       277

    accuracy                         0.9817       712
   macro avg     0.9839    0.9778    0.9807       712
weighted avg     0.9820    0.9817    0.9817       712

test
              precision    recall  f1-score   support

           0     0.8727    0.8421    0.8571       114
           1     0.7391    0.7846    0.7612        65

    accuracy                         0.8212       179
   macro avg     0.8059    0.8134    0.8092       179
weighted avg     0.8242    0.8212    0.8223       179



In [62]:
# k-nearest neighbours on top of the shared preprocessing.
pipe_clf4 = Pipeline([('prep', prep), ('model', clf4)])

train_function(pipe_clf4, X_train, X_test, y_train, y_test);

train
              precision    recall  f1-score   support

           0     0.8621    0.9057    0.8834       435
           1     0.8392    0.7726    0.8045       277

    accuracy                         0.8539       712
   macro avg     0.8507    0.8392    0.8440       712
weighted avg     0.8532    0.8539    0.8527       712

test
              precision    recall  f1-score   support

           0     0.8632    0.8860    0.8745       114
           1     0.7903    0.7538    0.7717        65

    accuracy                         0.8380       179
   macro avg     0.8268    0.8199    0.8231       179
weighted avg     0.8368    0.8380    0.8371       179



## Ensamble: Voting

In [63]:
# Hard voting: every base model casts one vote (its predicted class) and the
# majority class wins.
# Rule of thumb: use an odd number of models, which avoids tied votes in a
# binary problem.
pipe_clf_vc = Pipeline(steps=[
    ('prep', prep),
    ('model', VotingClassifier(
        estimators=[('lr', clf1), ('svm', clf2), ('dt', clf3)],
        voting='hard',
        n_jobs=-1,
    )),
])
train_function(pipe_clf_vc, X_train, X_test, y_train, y_test);

train
              precision    recall  f1-score   support

           0     0.8562    0.9034    0.8792       435
           1     0.8340    0.7617    0.7962       277

    accuracy                         0.8483       712
   macro avg     0.8451    0.8326    0.8377       712
weighted avg     0.8476    0.8483    0.8469       712

test
              precision    recall  f1-score   support

           0     0.8793    0.8947    0.8870       114
           1     0.8095    0.7846    0.7969        65

    accuracy                         0.8547       179
   macro avg     0.8444    0.8397    0.8419       179
weighted avg     0.8540    0.8547    0.8542       179



In [64]:
# Soft voting: the predicted class probabilities of the base models are
# averaged and the class with the highest mean probability wins. Every base
# model must therefore expose predict_proba (hence SVC(probability=True)).
# The odd-number rule of thumb for hard voting is not needed here, because
# probabilities are averaged instead of counting votes.
pipe_clf_vc2 = Pipeline(steps=[
    ('prep', prep),
    ('model', VotingClassifier(
        estimators=[('lr', clf1), ('svm', clf2), ('dt', clf3), ('knn', clf4)],
        voting='soft',
        n_jobs=-1,
    )),
])
train_function(pipe_clf_vc2, X_train, X_test, y_train, y_test)

train
              precision    recall  f1-score   support

           0     0.8774    0.9540    0.9141       435
           1     0.9163    0.7906    0.8488       277

    accuracy                         0.8904       712
   macro avg     0.8968    0.8723    0.8815       712
weighted avg     0.8925    0.8904    0.8887       712

test
              precision    recall  f1-score   support

           0     0.8879    0.9035    0.8957       114
           1     0.8254    0.8000    0.8125        65

    accuracy                         0.8659       179
   macro avg     0.8567    0.8518    0.8541       179
weighted avg     0.8652    0.8659    0.8655       179



#### Averiguar sobre probability calibration en sklearn (está asociado con voting = 'soft')

In [65]:
from sklearn.ensemble import StackingClassifier

# Base learners: the complete SVM and decision-tree pipelines, preprocessing included.
estimator = [
    ('svm', pipe_clf2),
    ('dt', pipe_clf3),
]

# Meta-estimator: logistic regression fitted on the 5-fold cross-validated
# predictions of the base learners.
stc = StackingClassifier(
    final_estimator=clf1,
    estimators=estimator,
    cv=5,
)

train_function(stc, X_train, X_test, y_train, y_test)


train
              precision    recall  f1-score   support

           0     0.8478    0.9218    0.8833       435
           1     0.8577    0.7401    0.7946       277

    accuracy                         0.8511       712
   macro avg     0.8528    0.8310    0.8389       712
weighted avg     0.8517    0.8511    0.8488       712

test
              precision    recall  f1-score   support

           0     0.8689    0.9298    0.8983       114
           1     0.8596    0.7538    0.8033        65

    accuracy                         0.8659       179
   macro avg     0.8643    0.8418    0.8508       179
weighted avg     0.8655    0.8659    0.8638       179



In [66]:
# Second experiment: reload the raw file and keep only four columns, which are
# fed to the models below without any preprocessing step.
df = pd.read_csv('titanic.csv', index_col=0)
X = df[['Pclass', 'Fare', 'Parch', 'SibSp']]
y = df['Survived']
# Same 80/20 split and seed as in the first experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
    test_size=0.2,
)


In [67]:
# Base learners for the stacking experiments. These rebind clf1..clf4, which
# referred to a different set of models earlier in the notebook.
clf1, clf2, clf3, clf4 = (
    GaussianNB(),
    KNeighborsClassifier(),
    SVC(probability=True, random_state=123),
    GradientBoostingClassifier(random_state=123),
)

# Candidate meta-estimators (final estimators) for the stacks.
meta0, meta1, meta2 = (
    LogisticRegression(random_state=123),
    RandomForestClassifier(random_state=123),
    GradientBoostingClassifier(random_state=123),
)

In [68]:
# Baseline: Gaussian naive Bayes on its own.
train_function(clf1, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

train
              precision    recall  f1-score   support

           0     0.6866    0.8460    0.7580       435
           1     0.6193    0.3935    0.4812       277

    accuracy                         0.6699       712
   macro avg     0.6529    0.6197    0.6196       712
weighted avg     0.6604    0.6699    0.6503       712

test
              precision    recall  f1-score   support

           0     0.7419    0.8070    0.7731       114
           1     0.6000    0.5077    0.5500        65

    accuracy                         0.6983       179
   macro avg     0.6710    0.6574    0.6616       179
weighted avg     0.6904    0.6983    0.6921       179



In [69]:
# Baseline: k-nearest neighbours on its own.
train_function(clf2, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)


train
              precision    recall  f1-score   support

           0     0.7897    0.8115    0.8005       435
           1     0.6906    0.6606    0.6753       277

    accuracy                         0.7528       712
   macro avg     0.7401    0.7361    0.7379       712
weighted avg     0.7511    0.7528    0.7518       712

test
              precision    recall  f1-score   support

           0     0.7886    0.8509    0.8186       114
           1     0.6964    0.6000    0.6446        65

    accuracy                         0.7598       179
   macro avg     0.7425    0.7254    0.7316       179
weighted avg     0.7551    0.7598    0.7554       179



In [70]:
# Baseline: support vector classifier on its own.
train_function(clf3, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)


train
              precision    recall  f1-score   support

           0     0.6684    0.8989    0.7667       435
           1     0.6535    0.2996    0.4109       277

    accuracy                         0.6657       712
   macro avg     0.6610    0.5992    0.5888       712
weighted avg     0.6626    0.6657    0.6283       712

test
              precision    recall  f1-score   support

           0     0.7260    0.9298    0.8154       114
           1     0.7576    0.3846    0.5102        65

    accuracy                         0.7318       179
   macro avg     0.7418    0.6572    0.6628       179
weighted avg     0.7375    0.7318    0.7046       179



In [71]:
# Baseline: gradient boosting on its own.
train_function(clf4, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

train
              precision    recall  f1-score   support

           0     0.7914    0.8897    0.8377       435
           1     0.7848    0.6318    0.7000       277

    accuracy                         0.7893       712
   macro avg     0.7881    0.7607    0.7688       712
weighted avg     0.7888    0.7893    0.7841       712

test
              precision    recall  f1-score   support

           0     0.8276    0.8421    0.8348       114
           1     0.7143    0.6923    0.7031        65

    accuracy                         0.7877       179
   macro avg     0.7709    0.7672    0.7690       179
weighted avg     0.7864    0.7877    0.7870       179



In [72]:
# Two-model stack: naive Bayes + k-nearest neighbours.
estimators = [
    ('gnb', clf1),
    ('knn', clf2),
]

# Meta-estimator: logistic regression fitted on 5-fold cross-validated
# predictions of the base learners.
st_1 = StackingClassifier(
    final_estimator=meta0,
    estimators=estimators,
    cv=5,
)

train_function(st_1, X_train, X_test, y_train, y_test)

train
              precision    recall  f1-score   support

           0     0.7071    0.8713    0.7806       435
           1     0.6818    0.4332    0.5298       277

    accuracy                         0.7008       712
   macro avg     0.6945    0.6522    0.6552       712
weighted avg     0.6973    0.7008    0.6831       712

test
              precision    recall  f1-score   support

           0     0.7519    0.8509    0.7984       114
           1     0.6600    0.5077    0.5739        65

    accuracy                         0.7263       179
   macro avg     0.7060    0.6793    0.6861       179
weighted avg     0.7186    0.7263    0.7169       179



In [81]:
# Three-model stack: naive Bayes + k-nearest neighbours + SVM.
estimators = [
    ('gnb', clf1),
    ('knn', clf2),
    ('svm', clf3),
]

# Meta-estimator: logistic regression; base learners are fitted in parallel.
st_2 = StackingClassifier(
    final_estimator=meta0,
    estimators=estimators,
    n_jobs=-1,
    cv=5,
)

train_function(st_2, X_train, X_test, y_train, y_test)

train
              precision    recall  f1-score   support

           0     0.7071    0.8713    0.7806       435
           1     0.6818    0.4332    0.5298       277

    accuracy                         0.7008       712
   macro avg     0.6945    0.6522    0.6552       712
weighted avg     0.6973    0.7008    0.6831       712

test
              precision    recall  f1-score   support

           0     0.7519    0.8509    0.7984       114
           1     0.6600    0.5077    0.5739        65

    accuracy                         0.7263       179
   macro avg     0.7060    0.6793    0.6861       179
weighted avg     0.7186    0.7263    0.7169       179



In [80]:
# Four-model stack.
# Recommendation: the base models should be as different from one another as possible.
estimators = [
    ('gnb', clf1),
    ('knn', clf2),
    ('svm', clf3),
    ('gb', clf4),
]

# Meta-estimator: logistic regression (meta0).
st_3 = StackingClassifier(
    final_estimator=meta0,
    estimators=estimators,
    n_jobs=-1,
    cv=5,
)

train_function(st_3, X_train, X_test, y_train, y_test)

train
              precision    recall  f1-score   support

           0     0.7519    0.8989    0.8188       435
           1     0.7708    0.5343    0.6311       277

    accuracy                         0.7570       712
   macro avg     0.7614    0.7166    0.7250       712
weighted avg     0.7593    0.7570    0.7458       712

test
              precision    recall  f1-score   support

           0     0.7812    0.8772    0.8264       114
           1     0.7255    0.5692    0.6379        65

    accuracy                         0.7654       179
   macro avg     0.7534    0.7232    0.7322       179
weighted avg     0.7610    0.7654    0.7580       179



In [78]:
# Same four base models, now with a random forest as meta-estimator.
# Recommendation: the base models should be as different from one another as possible.
estimators = [
    ('gnb', clf1),
    ('knn', clf2),
    ('svm', clf3),
    ('gb', clf4),
]

# Meta-estimator: random forest (meta1). Rebinds st_3.
st_3 = StackingClassifier(
    final_estimator=meta1,
    estimators=estimators,
    n_jobs=-1,
    cv=5,
)

train_function(st_3, X_train, X_test, y_train, y_test)

train
              precision    recall  f1-score   support

           0     0.7138    0.8828    0.7893       435
           1     0.7069    0.4440    0.5455       277

    accuracy                         0.7121       712
   macro avg     0.7103    0.6634    0.6674       712
weighted avg     0.7111    0.7121    0.6944       712

test
              precision    recall  f1-score   support

           0     0.7874    0.8772    0.8299       114
           1     0.7308    0.5846    0.6496        65

    accuracy                         0.7709       179
   macro avg     0.7591    0.7309    0.7397       179
weighted avg     0.7668    0.7709    0.7644       179



In [82]:
# Same four base models, now with gradient boosting as meta-estimator.
# Recommendation: the base models should be as different from one another as possible.
estimators = [
    ('gnb', clf1),
    ('knn', clf2),
    ('svm', clf3),
    ('gb', clf4),
]

# Meta-estimator: gradient boosting (meta2). Rebinds st_3.
st_3 = StackingClassifier(
    final_estimator=meta2,
    estimators=estimators,
    n_jobs=-1,
    cv=5,
)

train_function(st_3, X_train, X_test, y_train, y_test)

train
              precision    recall  f1-score   support

           0     0.7200    0.8690    0.7875       435
           1     0.6952    0.4693    0.5603       277

    accuracy                         0.7135       712
   macro avg     0.7076    0.6691    0.6739       712
weighted avg     0.7103    0.7135    0.6991       712

test
              precision    recall  f1-score   support

           0     0.7903    0.8596    0.8235       114
           1     0.7091    0.6000    0.6500        65

    accuracy                         0.7654       179
   macro avg     0.7497    0.7298    0.7368       179
weighted avg     0.7608    0.7654    0.7605       179



In [83]:
# Same four base models and gradient-boosting meta-estimator, but stacking on
# hard class predictions.
# Recommendation: the base models should be as different from one another as possible.
estimators = [
    ('gnb', clf1),
    ('knn', clf2),
    ('svm', clf3),
    ('gb', clf4),
]

# stack_method='predict' feeds each base model's predicted class (not its
# probabilities) to the meta-estimator -- the stacking analogue of hard voting.
st_3 = StackingClassifier(
    final_estimator=meta2,
    estimators=estimators,
    stack_method='predict',
    n_jobs=-1,
    cv=5,
)

train_function(st_3, X_train, X_test, y_train, y_test)

train
              precision    recall  f1-score   support

           0     0.7816    0.8805    0.8281       435
           1     0.7658    0.6137    0.6814       277

    accuracy                         0.7767       712
   macro avg     0.7737    0.7471    0.7547       712
weighted avg     0.7755    0.7767    0.7710       712

test
              precision    recall  f1-score   support

           0     0.8205    0.8421    0.8312       114
           1     0.7097    0.6769    0.6929        65

    accuracy                         0.7821       179
   macro avg     0.7651    0.7595    0.7620       179
weighted avg     0.7803    0.7821    0.7810       179

