In [1]:
#pip install feature_engine

In [2]:
#pip install joblib

In [3]:
import pandas as pd
import joblib

from feature_engine.imputation import DropMissingData
from feature_engine.encoding import OneHotEncoder
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [4]:
# Load the raw Titanic passengers table from a semicolon-delimited CSV file.
# NOTE(review): the preview of `data` further down shows fare=2113375.0 for the
# first passenger, which looks like a lost decimal point in the CSV -- confirm
# the number format of the file before trusting the `fare` column.
file_location = "titanic.csv"
data = pd.read_csv(filepath_or_buffer=file_location, sep=";")

In [5]:
# Feature engineering: build, fit and persist the preprocessing pipeline.
target = "survived"
features = ["sex", "pclass", "fare", "age"]
# Every column that is neither a model feature nor the target is dropped.
# The target is kept on purpose: it travels through the pipeline so that the
# transformed frame can later be split into X and y.
features_to_drop = [f for f in data.columns if f not in features and f != target]

print(f"Features to drop {features_to_drop}")

ohe = OneHotEncoder(variables=["sex"])
drf = DropFeatures(features_to_drop=features_to_drop)
# missing_only=False: per the feature_engine documentation, rows are dropped
# when any of the evaluated variables is missing, not only the variables that
# showed missing values while fitting.
dmd = DropMissingData(missing_only=False)
# Only the numeric features are standardized; the target and the one-hot
# columns are not listed here and are therefore left as they are.
stw = SklearnTransformerWrapper(StandardScaler(), variables=["pclass", "fare", "age"])

pipeline = Pipeline([
    ("drop_features", drf),
    ("one_hot_encoder", ohe),
    ("drop_missing_data", dmd),
    ("standard_scaler", stw),
])

# Fit the pipeline in place. `Pipeline.fit` returns the pipeline itself, not a
# DataFrame, so the result is deliberately not bound to a `df` variable.
# NOTE(review): the pipeline is fitted on a frame that still contains the
# `survived` column; applying the persisted pipeline to unlabeled data may
# therefore fail -- confirm before using it for inference.
pipeline.fit(data)

# Persist the fitted data-transformation pipeline.
joblib.dump(pipeline, "pipeline.pkl")

Features to drop ['name', 'sibsp', 'parch', 'ticket', 'cabin', 'embarked']


['pipeline.pkl']

In [6]:
# DATA TREATMENT
# Run the fitted transformation pipeline over the raw data and keep the result
# in a new variable, `df`; then preview the raw and the transformed frames.

df = pipeline.transform(data)
display(data.head(3), df.head(3))

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,2113375.0,B5,S,1
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,1
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,0


Unnamed: 0,pclass,age,fare,survived,sex_female,sex_male
0,-1.434601,-0.059228,4.40818,1,1,0
1,-1.434601,-2.011855,-0.319409,1,0,1
2,-1.434601,-1.936534,-0.319409,0,1,0


In [7]:
# Split the transformed dataframe into the feature matrix and the target.
y = df[target]
X = df.drop(columns=[target])

display(X.head(3), y.head(3))

Unnamed: 0,pclass,age,fare,sex_female,sex_male
0,-1.434601,-0.059228,4.40818,1,0
1,-1.434601,-2.011855,-0.319409,0,1
2,-1.434601,-1.936534,-0.319409,1,0


0    1
1    1
2    0
Name: survived, dtype: int64

In [8]:
# Define and train the logistic regression model, searching for the best
# hyper-parameters with GridSearchCV.

grid = {
    "fit_intercept": [True, False],
    "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
    "max_iter": [5000, 10000, 20000],
    "C": [0.01, 0.1, 1],
}

# random_state pins the data shuffling performed by the liblinear, sag and
# saga solvers, so that re-running the notebook yields the same search result.
model = GridSearchCV(LogisticRegression(random_state=42), grid, cv=25)
model.fit(X, y)
model.best_params_

{'C': 0.01, 'fit_intercept': False, 'max_iter': 5000, 'solver': 'newton-cg'}

In [9]:
# Check the model's accuracy on the training data
# (100% of the sample without nulls).
accuracy_score(y_true=y, y_pred=model.predict(X))

0.7866028708133971

In [10]:
# Persist the trained model (the fitted grid-search object) to disk.
joblib.dump(value=model, filename="model.pkl")

['model.pkl']