# Entrenamiento del modelo

In [1]:
import joblib
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("data/train.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [4]:
# Vamos a entrenar el primer modelo
from sklearn.linear_model import LogisticRegression

In [5]:
df = df.dropna(subset=['ShoppingMall', 'Age', 'FoodCourt', 'Spa', 'RoomService', 'VRDeck', 'VIP'])

In [6]:
# Las variables que vamos a usar.

features = list(set(df.columns) - set(['Transported', 'PassengerId', 'HomePlanet', 'CryoSleep', 
                                       'Cabin', 'Destination', 'Name']))

x_train, x_test, y_train, y_test = train_test_split(df[features], df.Transported, 
                                                    test_size=0.3, random_state=1)

In [7]:
logistic_reg = LogisticRegression()

logistic_reg.fit(x_train, y_train)

y_pred = logistic_reg.predict(x_test)

print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       0.65      0.87      0.75       843
        True       0.90      0.72      0.80      1389

    accuracy                           0.78      2232
   macro avg       0.78      0.79      0.77      2232
weighted avg       0.81      0.78      0.78      2232



Los resultados no son malos, pero vamos a ver si podemos mejorarlos

In [8]:
from sklearn.tree import DecisionTreeClassifier

In [9]:
dec_tree = DecisionTreeClassifier()

dec_tree.fit(x_train, y_train)

y_pred2 = dec_tree.predict(x_test)

print(classification_report(y_pred2,y_test))

              precision    recall  f1-score   support

       False       0.62      0.80      0.70       870
        True       0.85      0.69      0.76      1362

    accuracy                           0.73      2232
   macro avg       0.73      0.75      0.73      2232
weighted avg       0.76      0.73      0.74      2232



No mejoraron sino que empeoraron un poco. Vamos a probar con otro modelo.

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [11]:
rand_forest = RandomForestClassifier()

rand_forest.fit(x_train, y_train)

y_pred3 = rand_forest.predict(x_test)

print(classification_report(y_pred3,y_test))

              precision    recall  f1-score   support

       False       0.74      0.83      0.78      1007
        True       0.85      0.76      0.80      1225

    accuracy                           0.79      2232
   macro avg       0.79      0.80      0.79      2232
weighted avg       0.80      0.79      0.79      2232



No cambia tanto respecto a la regresión logística. Probemos un último 

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

In [13]:
grad_boosting = GradientBoostingClassifier()

grad_boosting.fit(x_train, y_train)

y_pred4 = grad_boosting.predict(x_test)

print(classification_report(y_pred4,y_test))

              precision    recall  f1-score   support

       False       0.74      0.84      0.79      1001
        True       0.85      0.76      0.81      1231

    accuracy                           0.80      2232
   macro avg       0.80      0.80      0.80      2232
weighted avg       0.80      0.80      0.80      2232



### Mejorando el modelo de Regresión logística:

En definitiva, los mejores resultados son los obtenidos con la Regresión Logística. Entonces, procederemos a tratar de mejorarlo manipulando los datos de forma distinta.

In [14]:
df2 = pd.read_csv("data/train.csv")

In [15]:
age_median = df2['Age'].mean(skipna=True)

df2['Age'] = df2['Age'].fillna(age_median)

In [16]:
roomservice_median = df2['RoomService'].mean(skipna=True)

df2['RoomService'] = df2['RoomService'].fillna(roomservice_median)

In [17]:
foodcour_median = df2['FoodCourt'].mean(skipna=True)

df2['FoodCourt'] = df2['FoodCourt'].fillna(foodcour_median)

In [18]:
shoping_median = df2['ShoppingMall'].mean(skipna=True)

df2['ShoppingMall'] = df2['ShoppingMall'].fillna(shoping_median)

In [19]:
spa_median = df2['Spa'].mean(skipna=True)

df2['Spa'] = df2['Spa'].fillna(spa_median)

In [20]:
vrdeck_median = df2['VRDeck'].mean(skipna=True)

df2['VRDeck'] = df2['VRDeck'].fillna(vrdeck_median)

In [21]:
vip_mode = df2['VIP'].mode().values[0]

df2['VIP'] = df2['VIP'].fillna(vip_mode)

In [22]:
features2 = list(set(df2.columns) - set(['Transported', 'PassengerId', 'HomePlanet', 'CryoSleep', 
                                       'Cabin', 'Destination', 'Name']))

x_train2, x_test2, y_train2, y_test2 = train_test_split(df2[features], df2.Transported, 
                                                    test_size=0.3, random_state=1)

In [23]:
logistic_reg2 = LogisticRegression()

logistic_reg2.fit(x_train2, y_train2)

y_pred2 = logistic_reg2.predict(x_test2)

print(classification_report(y_pred2,y_test2))

              precision    recall  f1-score   support

       False       0.66      0.86      0.75       999
        True       0.89      0.73      0.80      1609

    accuracy                           0.78      2608
   macro avg       0.78      0.79      0.78      2608
weighted avg       0.81      0.78      0.78      2608



La verdad es que no cambia mucho.

### Prediciendo con los datos de test ahora.

In [24]:
df_test = pd.read_csv("data/test.csv")

features_test = features

df_test[features_test].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   VIP           4184 non-null   object 
 1   VRDeck        4197 non-null   float64
 2   FoodCourt     4171 non-null   float64
 3   Spa           4176 non-null   float64
 4   Age           4186 non-null   float64
 5   RoomService   4195 non-null   float64
 6   ShoppingMall  4179 non-null   float64
dtypes: float64(6), object(1)
memory usage: 234.0+ KB


In [25]:
df_test = df_test.dropna(subset=['ShoppingMall', 'Age', 'FoodCourt', 'Spa', 'RoomService', 
                                 'VRDeck', 'VIP'])

In [26]:
y_pred_test = logistic_reg.predict(df_test[features_test])

In [27]:
df_kaggle = pd.DataFrame(y_pred_test,columns=['Transported'])
df_kaggle = pd.concat([df_kaggle, df_test['PassengerId']], axis=1)

df_kaggle.to_csv("data/predictions.csv", index=False)

### Exportar el modelo:

In [28]:
joblib.dump(logistic_reg, "logistic_regression.joblib")

['logistic_regression.joblib']