In [1]:
#pip install feature_engine

In [2]:
#pip install joblib

In [3]:
import pandas as pd
import joblib

from feature_engine.imputation import MeanMedianImputer, DropMissingData
from feature_engine.encoding import OneHotEncoder
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score


In [4]:
# Carga de datos
file_location = 'titanic.csv'
data = pd.read_csv(file_location, sep=";")

In [5]:
# Feature engineering
target = "survived"
features = ["sex", "pclass", "fare", "age"]

ohe = OneHotEncoder(variables=["sex"])
dmd = DropMissingData(missing_only=True, variables=["age"])
stw = SklearnTransformerWrapper(StandardScaler(), variables=["pclass", "fare", "age"])

pipeline = Pipeline([
    ("one_hot_encoder", ohe),
    ("drop_missing_data", dmd),
    ("standard_scaler", stw),
])

# Almacenamos el pipeline de transformación de datos
joblib.dump(pipeline, "pipeline.pkl")

['pipeline.pkl']

In [12]:
# Nos quedamos solamente las características que vamos a necesitar para la predicción
data = data[features+[target]]
data

Unnamed: 0,sex,pclass,fare,age,survived
0,female,1,2113375.00,29.0000,1
1,male,1,151.55,0.9167,1
2,female,1,151.55,2.0000,0
3,male,1,151.55,30.0000,0
4,female,1,151.55,25.0000,0
...,...,...,...,...,...
1303,female,3,144542.00,14.5000,0
1304,female,3,144542.00,,0
1305,male,3,7225.00,26.5000,0
1306,male,3,7225.00,27.0000,0


In [11]:
# Aplicamos el pipeline de transformación y lo almacenamos en una nueva variable
df = pipeline.fit_transform(data)
df

Unnamed: 0,pclass,fare,age,survived,sex_female,sex_male
0,-1.434601,4.408180,-0.059228,1,1,0
1,-1.434601,-0.319409,-2.011855,1,0,1
2,-1.434601,-0.319409,-1.936534,0,1,0
3,-1.434601,-0.319409,0.010302,0,0,1
4,-1.434601,-0.319409,-0.337347,0,1,0
...,...,...,...,...,...,...
1300,0.943128,-0.303584,1.088014,0,0,1
1303,0.943128,0.003614,-1.067411,0,1,0
1305,0.943128,-0.303584,-0.233052,0,0,1
1306,0.943128,-0.303584,-0.198288,0,0,1


In [7]:
# Segmentamos en datos de entrenamiento y test 
y = df[target]
X = df.drop(target, axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75, random_state=0, shuffle=True)


In [8]:
# Definimos y entrenamos el modelo de regresión logística basándonos en los mejores hiperparámetros
grid={
    "fit_intercept":[True, False], 
    "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
    "max_iter": [5000, 10000, 20000],
    "C": [0.01, 0.1, 0.5, 1]
}

model=GridSearchCV(LogisticRegression(), grid, cv=25)
model.fit(X_train, y_train)
model.best_params_

{'C': 0.01, 'fit_intercept': False, 'max_iter': 5000, 'solver': 'newton-cg'}

In [9]:
# Comprobamos precisión del modelo sobre los datos de test
accuracy_score(y_test, model.predict(X_test))

0.7959183673469388

In [10]:
# Almacenamos el modelo entrenado
joblib.dump(model, "model.pkl")

['model.pkl']

In [16]:
pip show feature_engine

Name: feature-engine
Version: 1.5.1
Summary: Feature engineering package with Scikit-learn's fit transform functionality
Home-page: http://github.com/feature-engine/feature_engine
Author: Soledad Galli
Author-email: solegalli@protonmail.com
License: BSD 3 clause
Location: /opt/anaconda3/lib/python3.8/site-packages
Requires: numpy, scipy, statsmodels, scikit-learn, pandas
Required-by: 
Note: you may need to restart the kernel to use updated packages.
