![img1](im1.png)

In [137]:
#SVM
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from feature_engine.encoding import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn import set_config

set_config(display='diagram')



In [138]:
df = pd.read_csv('titanic.csv', index_col=0)
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Signing_date
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1911-05-17
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1911-07-23
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1911-09-08
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1911-06-26
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1911-10-25


In [139]:
df.isnull().mean()

Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age             0.198653
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin           0.771044
Embarked        0.002245
Signing_date    0.000000
dtype: float64

In [140]:
## Primera  Aproximación
df_subset = df.select_dtypes(np.number)
df_subset

X = df_subset.drop(columns='Survived')
y = df_subset.Survived
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=123)


In [141]:
X_train.shape, X_test.shape

((623, 5), (268, 5))

## Regresión Logística

In [142]:
pipe = Pipeline(steps=[
    ('ni', MeanMedianImputer(imputation_method='mean')),
    ('sc', StandardScaler()),
    ('lr', LogisticRegression(C = 1 , random_state=123)) # LogLoss, C=parametro regularización
])

pipe.fit(X_train, y_train)
y_pred       = pipe.predict(X_test)
y_pred_train = pipe.predict(X_train)

print(classification_report(y_pred_train, y_train, digits=4))
print(classification_report(y_pred, y_test, digits = 4))


              precision    recall  f1-score   support

           0     0.8496    0.7061    0.7713       456
           1     0.4508    0.6587    0.5353       167

    accuracy                         0.6934       623
   macro avg     0.6502    0.6824    0.6533       623
weighted avg     0.7427    0.6934    0.7080       623

              precision    recall  f1-score   support

           0     0.8471    0.7500    0.7956       192
           1     0.5102    0.6579    0.5747        76

    accuracy                         0.7239       268
   macro avg     0.6786    0.7039    0.6851       268
weighted avg     0.7515    0.7239    0.7329       268



# Linear SVC

In [143]:
pipe = Pipeline(steps=[
    ('ni', MeanMedianImputer(imputation_method='mean')),
    ('sc', StandardScaler()),
    ('lsvc', LinearSVC(C = 1 , random_state=123)) # HuberLoss
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
y_pred_train = pipe.predict(X_train)

print(classification_report(y_pred_train, y_train, digits=4))
print(classification_report(y_pred, y_test, digits = 4))

              precision    recall  f1-score   support

           0     0.8470    0.7039    0.7689       456
           1     0.4467    0.6527    0.5304       167

    accuracy                         0.6902       623
   macro avg     0.6468    0.6783    0.6496       623
weighted avg     0.7397    0.6902    0.7049       623

              precision    recall  f1-score   support

           0     0.8412    0.7448    0.7901       192
           1     0.5000    0.6447    0.5632        76

    accuracy                         0.7164       268
   macro avg     0.6706    0.6948    0.6766       268
weighted avg     0.7444    0.7164    0.7257       268





SVC

In [144]:
pipe = Pipeline(steps=[
    ('ni', MeanMedianImputer(imputation_method='mean')),
    ('sc', StandardScaler()),
    ('svc', SVC(C = 1.5 , random_state=123)) # kernel: rbf, poly, linear, sigmoid, precomputed
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
y_pred_train = pipe.predict(X_train)

print(classification_report(y_pred_train, y_train, digits=4))
print(classification_report(y_pred, y_test, digits = 4))

              precision    recall  f1-score   support

           0     0.8971    0.7391    0.8105       460
           1     0.5082    0.7607    0.6093       163

    accuracy                         0.7448       623
   macro avg     0.7026    0.7499    0.7099       623
weighted avg     0.7953    0.7448    0.7579       623

              precision    recall  f1-score   support

           0     0.8647    0.7500    0.8033       196
           1     0.5000    0.6806    0.5765        72

    accuracy                         0.7313       268
   macro avg     0.6824    0.7153    0.6899       268
weighted avg     0.7667    0.7313    0.7423       268



In [145]:
# Problema completo

X = df.drop(columns = ['Ticket', 'Cabin', 'Name', 'Signing_date','Survived'])
y = df.Survived

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=123)


In [146]:
from feature_engine.wrappers import SklearnTransformerWrapper

prep = Pipeline(steps=[
    ('ci', CategoricalImputer(imputation_method='frequent')),
    ('ni', MeanMedianImputer(imputation_method='mean')),
    ('ohe', OneHotEncoder()),
    ('sc', SklearnTransformerWrapper(StandardScaler(), variables = ['Age','Fare']))
])

pipe = Pipeline(steps=[
    ('prep', prep),
    ('model', SVC(random_state = 123))
])

params = {
    'model__C': np.linspace(0.01, 2, 30),
    'model__gamma': ['auto', 'scale']   # __ dunder
    }

search = GridSearchCV(pipe, params, cv = 5, scoring = 'accuracy', n_jobs=-1, return_train_score=True, refit=True)
search

In [147]:
%%time
search.fit(X_train, y_train)

y_pred = search.predict(X_test)
y_pred_train = search.predict(X_train)

print(classification_report(y_train, y_pred_train, digits=4))
print(classification_report(y_test, y_pred, digits = 4))

              precision    recall  f1-score   support

           0     0.8383    0.8892    0.8630       379
           1     0.8100    0.7336    0.7699       244

    accuracy                         0.8283       623
   macro avg     0.8241    0.8114    0.8164       623
weighted avg     0.8272    0.8283    0.8265       623

              precision    recall  f1-score   support

           0     0.8415    0.9059    0.8725       170
           1     0.8118    0.7041    0.7541        98

    accuracy                         0.8321       268
   macro avg     0.8266    0.8050    0.8133       268
weighted avg     0.8306    0.8321    0.8292       268

CPU times: total: 719 ms
Wall time: 5.93 s


In [148]:
# search.predict usa search.best_estimator_
y_pred_train_best = search.best_estimator_.predict(X_train)
y_pred_best = search.best_estimator_.predict(X_test)

print(classification_report(y_train, y_pred_train_best, digits=4))
print(classification_report(y_test, y_pred_best, digits = 4))

              precision    recall  f1-score   support

           0     0.8383    0.8892    0.8630       379
           1     0.8100    0.7336    0.7699       244

    accuracy                         0.8283       623
   macro avg     0.8241    0.8114    0.8164       623
weighted avg     0.8272    0.8283    0.8265       623

              precision    recall  f1-score   support

           0     0.8415    0.9059    0.8725       170
           1     0.8118    0.7041    0.7541        98

    accuracy                         0.8321       268
   macro avg     0.8266    0.8050    0.8133       268
weighted avg     0.8306    0.8321    0.8292       268



In [149]:
search.best_params_

{'model__C': 0.9706896551724138, 'model__gamma': 'auto'}

In [150]:
search.best_score_

0.8251225806451614

In [151]:
pd.DataFrame(search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__C,param_model__gamma,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.066565,0.020863,0.038339,0.013282,0.01,auto,"{'model__C': 0.01, 'model__gamma': 'auto'}",0.608,0.608,0.608,...,0.608348,0.002586,59,0.608434,0.608434,0.608434,0.609218,0.607214,0.608347,0.000643
1,0.073577,0.01253,0.031783,0.007751,0.01,scale,"{'model__C': 0.01, 'model__gamma': 'scale'}",0.608,0.608,0.608,...,0.608348,0.002586,59,0.608434,0.608434,0.608434,0.609218,0.607214,0.608347,0.000643
2,0.066579,0.003052,0.036816,0.007696,0.078621,auto,"{'model__C': 0.07862068965517241, 'model__gamm...",0.808,0.752,0.8,...,0.797806,0.025635,55,0.801205,0.813253,0.803213,0.797595,0.793587,0.801771,0.006613
3,0.068111,0.009336,0.035086,0.004542,0.078621,scale,"{'model__C': 0.07862068965517241, 'model__gamm...",0.808,0.752,0.8,...,0.796194,0.025801,56,0.803213,0.817269,0.803213,0.797595,0.793587,0.802975,0.008019
4,0.063944,0.005898,0.029172,0.001925,0.147241,auto,"{'model__C': 0.14724137931034484, 'model__gamm...",0.808,0.752,0.784,...,0.794606,0.024617,57,0.797189,0.821285,0.809237,0.803607,0.799599,0.806183,0.008582
5,0.065362,0.003962,0.034589,0.0044,0.147241,scale,"{'model__C': 0.14724137931034484, 'model__gamm...",0.808,0.752,0.784,...,0.794606,0.024617,57,0.793173,0.823293,0.809237,0.803607,0.801603,0.806183,0.00999
6,0.064376,0.006353,0.03279,0.005756,0.215862,auto,"{'model__C': 0.21586206896551724, 'model__gamm...",0.824,0.768,0.784,...,0.801006,0.023633,53,0.795181,0.829317,0.809237,0.803607,0.803607,0.80819,0.011478
7,0.05899,0.003116,0.029301,0.001487,0.215862,scale,"{'model__C': 0.21586206896551724, 'model__gamm...",0.824,0.768,0.784,...,0.801006,0.023633,53,0.795181,0.829317,0.809237,0.803607,0.803607,0.80819,0.011478
8,0.059906,0.007733,0.026079,0.002969,0.284483,auto,"{'model__C': 0.28448275862068967, 'model__gamm...",0.824,0.768,0.784,...,0.802619,0.025779,49,0.799197,0.835341,0.809237,0.803607,0.805611,0.810599,0.012789
9,0.065262,0.013445,0.029784,0.008006,0.284483,scale,"{'model__C': 0.28448275862068967, 'model__gamm...",0.824,0.768,0.784,...,0.802619,0.025779,49,0.799197,0.837349,0.809237,0.803607,0.811623,0.812203,0.013301


In [152]:
search.cv_results_['mean_train_score']

array([0.60834681, 0.60834681, 0.80177061, 0.80297543, 0.80618345,
       0.80618265, 0.80818987, 0.80818987, 0.81059871, 0.81220272,
       0.81581718, 0.81621878, 0.81862601, 0.82022921, 0.82143323,
       0.82183483, 0.82223644, 0.82303804, 0.82343965, 0.82424206,
       0.82544607, 0.82584768, 0.82544688, 0.82665009, 0.8274533 ,
       0.8278541 , 0.8282549 , 0.8282549 , 0.8282549 , 0.8282549 ,
       0.8282549 , 0.8282549 , 0.8286557 , 0.82905651, 0.8286557 ,
       0.82905651, 0.82905651, 0.82945811, 0.82945811, 0.82945811,
       0.82945811, 0.82945811, 0.82945811, 0.82945811, 0.82945811,
       0.82945811, 0.82945811, 0.82945811, 0.82945811, 0.82945811,
       0.82945811, 0.82945811, 0.82985972, 0.82985972, 0.83026052,
       0.83026052, 0.83026052, 0.83026052, 0.83026052, 0.83026052])