# Cargar los datos

In [26]:
# Importar las librerías necesarias
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [44]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [28]:
# Location of the Titanic passenger CSV (raw file served from GitHub)
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
# Read the remote CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

| **Nombre de la Columna** | **Descripción** |
|--------------------------|-----------------|
| `PassengerId`   | ID único para cada pasajero. |
| `Survived`      | Supervivencia (0 = No, 1 = Sí). |
| `Pclass`        | Clase del billete (1 = 1ra clase, 2 = 2da clase, 3 = 3ra clase). |
| `Name`          | Nombre del pasajero. |
| `Sex`           | Género del pasajero (male = hombre, female = mujer). |
| `Age`           | Edad del pasajero en años. |
| `SibSp`         | Número de hermanos/cónyuges a bordo del Titanic. |
| `Parch`         | Número de padres/hijos a bordo del Titanic. |
| `Ticket`        | Número del billete. |
| `Fare`          | Tarifa pagada por el billete. |
| `Cabin`         | Número de cabina. |
| `Embarked`      | Puerto de embarque (C = Cherburgo, Q = Queenstown, S = Southampton). |


In [29]:
# Preview five randomly chosen rows of the raw data
df.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
489,490,1,3,"Coutts, Master. Eden Leslie ""Neville""",male,9.0,1,1,C.A. 37671,15.9,,S
629,630,0,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q
601,602,0,3,"Slabenoff, Mr. Petco",male,,0,0,349214,7.8958,,S
266,267,0,3,"Panula, Mr. Ernesti Arvid",male,16.0,4,1,3101295,39.6875,,S
789,790,0,1,"Guggenheim, Mr. Benjamin",male,46.0,0,0,PC 17593,79.2,B82 B84,C


# Análisis Exploratorio de Datos
(Omitido aquí porque ya lo realizamos en la sesión anterior.)

# Dividir datos en entrenamiento (train) y prueba (test)

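Es necesario separar los datos en conjuntos de entrenamiento y prueba antes de realizar cualquier ingeniería de características. Así, los parámetros aprendidos durante la transformación (medianas para imputar, mínimos y máximos del escalado, categorías codificadas) provienen únicamente del conjunto de entrenamiento y no se filtra información del conjunto de prueba.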

In [30]:
# Feature matrix (X): every column of df except the target 'Survived'
X = df.drop(labels='Survived', axis='columns')
# Target vector (y): the 'Survived' column
y = df['Survived']

In [31]:
# Show one random row of the feature matrix
X.sample(n=1)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
318,319,1,"Wick, Miss. Mary Natalie",female,31.0,0,2,36928,164.8667,C7,S


In [32]:
# Show one random entry of the target vector
y.sample(n=1)

Unnamed: 0,Survived
216,1


In [33]:
# Split the data into training (80%) and test (20%) sets.
# random_state pins the shuffle so the split, and every result derived from
# it in later cells, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [34]:
# Show one random row of the training features
X_train.sample(n=1)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
849,850,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C


In [35]:
# Show one random row of the test features
X_test.sample(n=1)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
847,848,3,"Markoff, Mr. Marin",male,35.0,0,0,349213,7.8958,,C


In [36]:
# Show one random training label
y_train.sample(n=1)

Unnamed: 0,Survived
842,1


In [37]:
# Show one random test label
y_test.sample(n=1)

Unnamed: 0,Survived
442,0


In [38]:
# Report the shapes of the feature and label sets for each split
print(
    f"Tamaño del conjunto de entrenamiento: {X_train.shape}. "
    f"Tamaño de las etiquetas de entrenamiento: {y_train.shape}"
)
print(
    f"Tamaño del conjunto de prueba: {X_test.shape}. "
    f"Tamaño de las etiquetas de prueba: {y_test.shape}"
)

Tamaño del conjunto de entrenamiento: (712, 11). Tamaño de las etiquetas de entrenamiento: (712,)
Tamaño del conjunto de prueba: (179, 11). Tamaño de las etiquetas de prueba: (179,)


# Ingeniería de características

In [39]:
# First five training rows before any feature engineering
X_train.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
864,865,2,"Gill, Mr. John William",male,24.0,0,0,233866,13.0,,S
66,67,2,"Nye, Mrs. (Elizabeth Ramell)",female,29.0,0,0,C.A. 29395,10.5,F33,S
467,468,1,"Smart, Mr. John Montgomery",male,56.0,0,0,113792,26.55,,S
364,365,3,"O'Brien, Mr. Thomas",male,,1,0,370365,15.5,,Q
484,485,1,"Bishop, Mr. Dickinson H",male,25.0,1,0,11967,91.0792,B49,C


In [40]:
# One-hot encode 'Sex' and 'Embarked'; drop_first=True omits the first
# dummy level of each encoded column
X_train = pd.get_dummies(
    data=X_train,
    columns=['Sex', 'Embarked'],
    drop_first=True,
)

In [41]:
# First five training rows after one-hot encoding 'Sex' and 'Embarked'
X_train.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
864,865,2,"Gill, Mr. John William",24.0,0,0,233866,13.0,,True,False,True
66,67,2,"Nye, Mrs. (Elizabeth Ramell)",29.0,0,0,C.A. 29395,10.5,F33,False,False,True
467,468,1,"Smart, Mr. John Montgomery",56.0,0,0,113792,26.55,,True,False,True
364,365,3,"O'Brien, Mr. Thomas",,1,0,370365,15.5,,True,True,False
484,485,1,"Bishop, Mr. Dickinson H",25.0,1,0,11967,91.0792,B49,True,False,False


In [42]:
# Fill missing 'Age' values with the median age computed on X_train.
# The result is assigned back to the column: calling fillna(inplace=True) on
# X_train['Age'] is chained assignment, which emits a FutureWarning in
# pandas 2.x and leaves X_train unchanged under Copy-on-Write (pandas 3).
X_train['Age'] = X_train['Age'].fillna(X_train['Age'].median())

In [43]:
# First five training rows after imputing 'Age'
X_train.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
864,865,2,"Gill, Mr. John William",24.0,0,0,233866,13.0,,True,False,True
66,67,2,"Nye, Mrs. (Elizabeth Ramell)",29.0,0,0,C.A. 29395,10.5,F33,False,False,True
467,468,1,"Smart, Mr. John Montgomery",56.0,0,0,113792,26.55,,True,False,True
364,365,3,"O'Brien, Mr. Thomas",28.0,1,0,370365,15.5,,True,True,False
484,485,1,"Bishop, Mr. Dickinson H",25.0,1,0,11967,91.0792,B49,True,False,False


In [46]:
# Build a MinMaxScaler and fit it on the 'Age' and 'Fare' columns of X_train
scaler = MinMaxScaler()

scaler.fit(X_train.loc[:, ['Age', 'Fare']])

In [48]:
# Print the per-column minimum and maximum recorded by the fitted scaler
for label, learned_values in (
    ("Mínimos aprendidos por el scaler:", scaler.data_min_),
    ("Máximos aprendidos por el scaler:", scaler.data_max_),
):
    print(label, learned_values)

Mínimos aprendidos por el scaler: [0.42 0.  ]
Máximos aprendidos por el scaler: [ 80.     512.3292]


In [49]:
# Transform only 'Age' and 'Fare' with the fitted scaler, then write the
# scaled values back into those two columns
scaled_values = scaler.transform(X_train[['Age', 'Fare']])
X_train[['Age', 'Fare']] = scaled_values

In [50]:
# First five training rows after scaling 'Age' and 'Fare'
X_train.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
864,865,2,"Gill, Mr. John William",0.296306,0,0,233866,0.025374,,True,False,True
66,67,2,"Nye, Mrs. (Elizabeth Ramell)",0.359135,0,0,C.A. 29395,0.020495,F33,False,False,True
467,468,1,"Smart, Mr. John Montgomery",0.698417,0,0,113792,0.051822,,True,False,True
364,365,3,"O'Brien, Mr. Thomas",0.346569,1,0,370365,0.030254,,True,True,False
484,485,1,"Bishop, Mr. Dickinson H",0.308872,1,0,11967,0.177775,B49,True,False,False


In [54]:
# New column 'Letra_Cabina': the first character of each 'Cabin' value
X_train['Letra_Cabina'] = X_train['Cabin'].str.get(0)

In [59]:
# Label rows with no cabin letter as 'Unknown'.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment, which emits a FutureWarning in pandas 2.x
# and leaves X_train unchanged under Copy-on-Write (pandas 3).
X_train['Letra_Cabina'] = X_train['Letra_Cabina'].fillna('Unknown')

In [60]:
# Count how many training rows fall under each cabin letter
X_train['Letra_Cabina'].value_counts()

Unnamed: 0_level_0,count
Letra_Cabina,Unnamed: 1_level_1
Unknown,556
C,42
B,38
E,25
D,21
A,14
F,11
G,4
T,1


In [61]:
# One-hot encode 'Letra_Cabina'; drop_first=True omits its first dummy level
X_train = pd.get_dummies(
    data=X_train,
    columns=['Letra_Cabina'],
    drop_first=True,
)

In [62]:
# First five training rows after one-hot encoding 'Letra_Cabina'
X_train.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S,Letra_Cabina_B,Letra_Cabina_C,Letra_Cabina_D,Letra_Cabina_E,Letra_Cabina_F,Letra_Cabina_G,Letra_Cabina_T,Letra_Cabina_Unknown
864,865,2,"Gill, Mr. John William",0.296306,0,0,233866,0.025374,,True,False,True,False,False,False,False,False,False,False,True
66,67,2,"Nye, Mrs. (Elizabeth Ramell)",0.359135,0,0,C.A. 29395,0.020495,F33,False,False,True,False,False,False,False,True,False,False,False
467,468,1,"Smart, Mr. John Montgomery",0.698417,0,0,113792,0.051822,,True,False,True,False,False,False,False,False,False,False,True
364,365,3,"O'Brien, Mr. Thomas",0.346569,1,0,370365,0.030254,,True,True,False,False,False,False,False,False,False,False,True
484,485,1,"Bishop, Mr. Dickinson H",0.308872,1,0,11967,0.177775,B49,True,False,False,True,False,False,False,False,False,False,False


In [65]:
# Resulting columns of the training frame, to be used for training
X_train.columns

Index(['PassengerId', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Sex_male', 'Embarked_Q', 'Embarked_S',
       'Letra_Cabina_B', 'Letra_Cabina_C', 'Letra_Cabina_D', 'Letra_Cabina_E',
       'Letra_Cabina_F', 'Letra_Cabina_G', 'Letra_Cabina_T',
       'Letra_Cabina_Unknown'],
      dtype='object')

In [69]:
# Keep only the model features by dropping the identifier / free-text columns.
# With every dummy column present this yields the same columns, in the same
# order, as listing them explicitly. Dropping by name avoids a KeyError when a
# rare category (e.g. cabin letter 'T', a single row in the displayed counts)
# is missing from the training split. errors='ignore' keeps the cell safe to
# re-run after the columns are already gone.
X_train = X_train.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin'],
    errors='ignore',
)

In [70]:
# First five rows of the final training feature matrix
X_train.head(n=5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Letra_Cabina_B,Letra_Cabina_C,Letra_Cabina_D,Letra_Cabina_E,Letra_Cabina_F,Letra_Cabina_G,Letra_Cabina_T,Letra_Cabina_Unknown
864,2,0.296306,0,0,0.025374,True,False,True,False,False,False,False,False,False,False,True
66,2,0.359135,0,0,0.020495,False,False,True,False,False,False,False,True,False,False,False
467,1,0.698417,0,0,0.051822,True,False,True,False,False,False,False,False,False,False,True
364,3,0.346569,1,0,0.030254,True,True,False,False,False,False,False,False,False,False,True
484,1,0.308872,1,0,0.177775,True,False,False,True,False,False,False,False,False,False,False
