<a href="https://colab.research.google.com/github/thomaschiari/Spaceship-Titanic-Kaggle-Competition/blob/main/ST_XGBOOST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spaceship Titanic utilizando método XGBoost

In [None]:
import pandas as pd
import warnings
import os
import logging
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter hides all warning categories, including
# deprecation warnings — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Configure the root logger to emit messages at INFO level and above.
logging.basicConfig(level=logging.INFO)

In [None]:
# Load the training and test sets from the local `data` directory.
train, test = (
    pd.read_csv(os.path.join('data', csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Preview the first five rows of the raw training data.
train.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [None]:
# Preview the first five rows of the raw test data.
test.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


### Pré-processamento

In [None]:
# Drop the identifier columns that are not used as model features.
train = train.drop(columns=['PassengerId', 'Name'])

In [None]:
# Collect the labels of the int64/float64 columns and show their summary statistics.
num_cols = train.select_dtypes(include=['int64', 'float64']).columns
train.loc[:, num_cols].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [None]:
# Show dtypes and non-null counts of the numeric columns to spot missing values.
train.loc[:, num_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Age           8514 non-null   float64
 1   RoomService   8512 non-null   float64
 2   FoodCourt     8510 non-null   float64
 3   ShoppingMall  8485 non-null   float64
 4   Spa           8510 non-null   float64
 5   VRDeck        8505 non-null   float64
dtypes: float64(6)
memory usage: 407.6 KB


In [None]:
# Fill missing numeric values with the median of each column.
train_num_medians = train[num_cols].median()
train[num_cols] = train[num_cols].fillna(train_num_medians)

In [None]:
# Encode the boolean columns as 0/1 integers.
train['Transported'] = train['Transported'].astype(int)
# `astype(bool)` maps NaN to True, which would silently encode every passenger
# with a missing VIP/CryoSleep value as 1. `.eq(True)` yields False for NaN,
# so missing flags are encoded as 0 instead.
# NOTE(review): assumes False is the sensible default for missing flags — confirm.
for flag_col in ['VIP', 'CryoSleep']:
    train[flag_col] = train[flag_col].eq(True).astype(int)

In [None]:
# Split 'Cabin' (three '/'-separated parts, e.g. "B/0/P") into Deck and Side features.
cabin_parts = train['Cabin'].str.split('/', expand=True)
train['Deck'] = cabin_parts[0]
train['Side'] = cabin_parts[2]
# The middle part (cabin number) is not kept as a feature, so the raw column is dropped.
# No try/except here: a failure at this point means the data is not in the expected
# shape and must surface instead of being silently swallowed.
train = train.drop(columns=['Cabin'])

In [None]:
# Collect the labels of the object-dtype (categorical) columns and summarise them.
cat_cols = train.select_dtypes(include='object').columns
train.loc[:, cat_cols].describe()

Unnamed: 0,HomePlanet,Destination,Deck,Side
count,8492,8511,8494,8494
unique,3,3,8,2
top,Earth,TRAPPIST-1e,F,S
freq,4602,5915,2794,4288


In [None]:
# Show dtypes and non-null counts of the categorical columns to spot missing values.
train.loc[:, cat_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   HomePlanet   8492 non-null   object
 1   Destination  8511 non-null   object
 2   Deck         8494 non-null   object
 3   Side         8494 non-null   object
dtypes: object(4)
memory usage: 271.8+ KB


In [None]:
# Fill missing categorical values with the most frequent value of each column
# (first row of `mode()`).
train_cat_modes = train[cat_cols].mode().iloc[0]
train[cat_cols] = train[cat_cols].fillna(train_cat_modes)

In [None]:
# Preview the training data after imputation and cabin splitting.
train.head(n=5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0,B,P
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,F,S
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,A,S
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,A,S
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1,F,S


In [None]:
# One-hot encode the categorical columns; `drop_first=True` omits the first
# level of each variable from the generated indicator columns.
train = pd.get_dummies(data=train, columns=cat_cols, drop_first=True)
train.head(n=5)

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Europa,...,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_S
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,True,...,False,True,True,False,False,False,False,False,False,False
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,False,...,False,True,False,False,False,False,True,False,False,True
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,True,...,False,True,False,False,False,False,False,False,False,True
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,True,...,False,True,False,False,False,False,False,False,False,True
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,False,...,False,True,False,False,False,False,True,False,False,True


### Criando o modelo

In [None]:
from xgboost import XGBClassifier

In [None]:
# Split the training frame into the target vector and the feature matrix.
y = train['Transported']
X = train.drop(columns=['Transported'])

In [None]:
# Fit an XGBoost classifier with default hyperparameters on the full training set.
model = XGBClassifier()
model.fit(X=X, y=y)

In [None]:
# Sanity check: predict on the training data and report the training accuracy.
# The result gets its own name so it is not confused with the test-set
# predictions (`y_pred`) that are computed later for the submission.
# NOTE: this is measured on the data the model was fitted on, so it is an
# optimistic figure and not an estimate of generalisation performance.
y_train_pred = model.predict(X)
train_accuracy = (y_train_pred == y).mean()
train_accuracy

### Preparando a submissão

In [None]:
# Preview the test data before preprocessing it.
test.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [None]:
# Preprocess the test data: drop the identifier columns that are not model features.
test = test.drop(columns=['PassengerId', 'Name'])

In [None]:
# Labels of the int64/float64 columns of the test set.
# Rebinds `num_cols`, which was previously computed from the training frame.
num_cols = test.select_dtypes(include=['int64', 'float64']).columns

In [None]:
# Fill missing numeric values with the medians of the TRAINING data, so the test
# set goes through the same transformation the model was fitted with (instead
# of statistics computed from the test set itself). `train` was already
# median-imputed, which leaves its column medians unchanged.
test[num_cols] = test[num_cols].fillna(train[num_cols].median())

In [None]:
# Encode the boolean columns as 0/1 integers.
# `astype(bool)` maps NaN to True, which would silently encode every passenger
# with a missing VIP/CryoSleep value as 1. `.eq(True)` yields False for NaN,
# so missing flags are encoded as 0 instead.
# NOTE(review): assumes False is the sensible default for missing flags — confirm.
for flag_col in ['VIP', 'CryoSleep']:
    test[flag_col] = test[flag_col].eq(True).astype(int)

In [None]:
# Split 'Cabin' (three '/'-separated parts, e.g. "G/3/S") into Deck and Side features.
cabin_parts = test['Cabin'].str.split('/', expand=True)
test['Deck'] = cabin_parts[0]
test['Side'] = cabin_parts[2]
# The middle part (cabin number) is not kept as a feature, so the raw column is dropped.
# No try/except here: a failure at this point means the data is not in the expected
# shape and must surface instead of being silently swallowed.
test = test.drop(columns=['Cabin'])

In [None]:
# Collect the object-dtype (categorical) columns of the test set and inspect
# their non-null counts. Rebinds `cat_cols`.
cat_cols = test.select_dtypes(include='object').columns
test.loc[:, cat_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   HomePlanet   4190 non-null   object
 1   Destination  4185 non-null   object
 2   Deck         4177 non-null   object
 3   Side         4177 non-null   object
dtypes: object(4)
memory usage: 133.8+ KB


In [None]:
# Fill missing categorical values with the most frequent value of each column
# (first row of `mode()`).
# NOTE(review): the modes are computed from the test set itself, not from the
# training data — confirm this is intended.
test_cat_modes = test[cat_cols].mode().iloc[0]
test[cat_cols] = test[cat_cols].fillna(test_cat_modes)

In [None]:
# One-hot encode the categorical columns of the test set; `drop_first=True`
# omits the first level of each variable from the generated indicator columns.
test = pd.get_dummies(data=test, columns=cat_cols, drop_first=True)

In [None]:
# Preview the fully preprocessed test features.
test.head(n=5)

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_S
0,1,27.0,0,0.0,0.0,0.0,0.0,0.0,False,False,False,True,False,False,False,False,False,True,False,True
1,0,19.0,0,0.0,9.0,0.0,2823.0,0.0,False,False,False,True,False,False,False,False,True,False,False,True
2,1,31.0,0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,True,False,False,False,False,False,True
3,0,38.0,0,0.0,6652.0,0.0,181.0,585.0,True,False,False,True,False,True,False,False,False,False,False,True
4,0,20.0,0,10.0,0.0,635.0,0.0,0.0,False,False,False,True,False,False,False,False,True,False,False,True


In [None]:
# Predict on the test set.
# The test frame was one-hot encoded separately from the training frame, so its
# columns are aligned to the training feature matrix first: same columns, same
# order. Any indicator column absent from the test data is added filled with 0,
# and any column the model was not trained on is discarded.
test = test.reindex(columns=X.columns, fill_value=0)
y_pred = model.predict(test)

### Submissão

In [None]:
# Criando o arquivo de submissão
submission = pd.DataFrame()
submission['PassengerId'] = pd.read_csv(os.path.join('data', 'test.csv')).PassengerId
submission['Transported'] = y_pred.astype(bool)

In [None]:
# Preview the first five rows of the submission table.
submission.head(n=5)

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [None]:
# Save the submission file.
# Create the output directory first so `to_csv` does not fail on a fresh checkout
# where it does not exist yet.
submission_dir = 'submission'
os.makedirs(submission_dir, exist_ok=True)
submission.to_csv(os.path.join(submission_dir, 'submission5.csv'), index=False)