In [1]:
import pandas            as pd
import numpy             as np
import matplotlib.pyplot as plt
import seaborn           as sns

In [2]:
from os import path

# Directory and file name of the input CSV, joined into a single path.
data_dir, data_file = 'data/spaceship-titanic', 'train_clean.csv'
data_path = path.join(data_dir, data_file)

# Load the CSV into the working DataFrame and print its schema
# (column names, non-null counts and dtypes).
df = pd.read_csv(data_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6606 entries, 0 to 6605
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       6606 non-null   int64  
 1   HomePlanet       6606 non-null   object 
 2   CryoSleep        6606 non-null   bool   
 3   Cabin            6606 non-null   object 
 4   Destination      6606 non-null   object 
 5   Age              6606 non-null   float64
 6   VIP              6606 non-null   bool   
 7   RoomService      6606 non-null   float64
 8   FoodCourt        6606 non-null   float64
 9   ShoppingMall     6606 non-null   float64
 10  Spa              6606 non-null   float64
 11  VRDeck           6606 non-null   float64
 12  Name             6606 non-null   object 
 13  Transported      6606 non-null   bool   
 14  PassengerGroup   6606 non-null   int64  
 15  PassengerNumber  6606 non-null   int64  
 16  CabinDeck        6606 non-null   object 
 17  CabinNumber   

#### Droppando atributos que não usaremos para o agrupamento

In [3]:
# Columns that are not used for the clustering step.
# 'Unnamed: 0' is the unnamed first column of the CSV (presumably an index
# written by an earlier to_csv call -- confirm against the notebook that
# produced the file).
drop_attribs = [
    'Unnamed: 0', 'Cabin',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'Name', 'Transported',
    'PassengerGroup', 'PassengerNumber',
]

# Remove the listed columns and show the schema of what is left.
df = df.drop(drop_attribs, axis='columns')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6606 entries, 0 to 6605
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   HomePlanet   6606 non-null   object 
 1   CryoSleep    6606 non-null   bool   
 2   Destination  6606 non-null   object 
 3   Age          6606 non-null   float64
 4   VIP          6606 non-null   bool   
 5   CabinDeck    6606 non-null   object 
 6   CabinNumber  6606 non-null   int64  
 7   CabinSide    6606 non-null   object 
 8   TotalSpent   6606 non-null   float64
dtypes: bool(2), float64(2), int64(1), object(4)
memory usage: 374.3+ KB


In [4]:
# Display the DataFrame as it stands after the column drop; as the last
# expression of the cell it is rendered as the cell's output.
df

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinDeck,CabinNumber,CabinSide,TotalSpent
0,Europa,False,TRAPPIST-1e,39.0,False,B,0,P,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,F,0,S,736.0
2,Europa,False,TRAPPIST-1e,58.0,True,A,0,S,10383.0
3,Europa,False,TRAPPIST-1e,33.0,False,A,0,S,5176.0
4,Earth,False,TRAPPIST-1e,16.0,False,F,1,S,1091.0
...,...,...,...,...,...,...,...,...,...
6601,Europa,False,55 Cancri e,41.0,True,A,98,P,8536.0
6602,Earth,True,PSO J318.5-22,18.0,False,G,1499,S,0.0
6603,Earth,False,TRAPPIST-1e,26.0,False,G,1500,S,1873.0
6604,Europa,False,55 Cancri e,32.0,False,E,608,S,4637.0


#### Convertendo atributos binários para numéricos

In [5]:
# Binary attributes, each encoded as a 0.0 / 1.0 float column:
#   CryoSleep  (bool)    -> 1.0 when True
#   VIP        (bool)    -> 1.0 when True
#   CabinSide  (string)  -> 1.0 when the value is 'P', 0.0 for anything else
#
# Vectorized casts are used instead of a per-element `apply(lambda ...)`;
# the resulting values and the float64 dtype are the same.
#
# NOTE: this cell overwrites its own inputs. Running it a second time would
# compare the already-numeric CabinSide against 'P' and set every row to 0.0,
# so re-run the notebook from the load cell instead of re-running this cell.

df['CryoSleep'] = df['CryoSleep'].astype(float)
df['VIP'] = df['VIP'].astype(float)
df['CabinSide'] = df['CabinSide'].eq('P').astype(float)

In [6]:
# Show the last five rows to check the 0.0 / 1.0 encoding of the binary columns.
df.tail(n=5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinDeck,CabinNumber,CabinSide,TotalSpent
6601,Europa,0.0,55 Cancri e,41.0,1.0,A,98,1.0,8536.0
6602,Earth,1.0,PSO J318.5-22,18.0,0.0,G,1499,0.0,0.0
6603,Earth,0.0,TRAPPIST-1e,26.0,0.0,G,1500,0.0,1873.0
6604,Europa,0.0,55 Cancri e,32.0,0.0,E,608,0.0,4637.0
6605,Europa,0.0,TRAPPIST-1e,44.0,0.0,E,608,0.0,4826.0


#### Convertendo atributos categóricos para numéricos (ordinais)

In [7]:
# Categorical attributes replaced by ordinal (float) codes:
#    HomePlanet
#    Destination
#    CabinDeck

from sklearn.preprocessing import OrdinalEncoder

categorical_attribs = ['HomePlanet', 'Destination', 'CabinDeck']

# A single encoder is fitted on all three columns at once. OrdinalEncoder
# encodes every input column independently, so the codes are the same as
# with one encoder per column. The fitted encoder is kept so the
# category -> code mapping can be read back from `ordinal_encoder.categories_`
# (one array per column, in the order of `categorical_attribs`).
# NOTE(review): the codes impose an order on what look like nominal
# categories; confirm that this is acceptable for the clustering step.
ordinal_encoder = OrdinalEncoder()
encoded = ordinal_encoder.fit_transform(df[categorical_attribs])

# Write the codes back one column at a time so that each column is replaced
# by a float64 column regardless of its previous (object) dtype.
for position, attrib in enumerate(categorical_attribs):
    df[attrib] = encoded[:, position]

In [8]:
# Show the last five rows to check the ordinal codes of the categorical columns.
df.tail(n=5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinDeck,CabinNumber,CabinSide,TotalSpent
6601,1.0,0.0,0.0,41.0,1.0,0.0,98,1.0,8536.0
6602,0.0,1.0,1.0,18.0,0.0,6.0,1499,0.0,0.0
6603,0.0,0.0,2.0,26.0,0.0,6.0,1500,0.0,1873.0
6604,1.0,0.0,0.0,32.0,0.0,4.0,608,0.0,4637.0
6605,1.0,0.0,2.0,44.0,0.0,4.0,608,0.0,4826.0


#### E só para padronização...

In [9]:
# CabinNumber is the only remaining integer column; cast it so that every
# column of the frame is float64 (Python's `float` maps to float64 in pandas).
df['CabinNumber'] = df['CabinNumber'].astype(float)

In [10]:
# Print the schema again to confirm that every column is now float64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6606 entries, 0 to 6605
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   HomePlanet   6606 non-null   float64
 1   CryoSleep    6606 non-null   float64
 2   Destination  6606 non-null   float64
 3   Age          6606 non-null   float64
 4   VIP          6606 non-null   float64
 5   CabinDeck    6606 non-null   float64
 6   CabinNumber  6606 non-null   float64
 7   CabinSide    6606 non-null   float64
 8   TotalSpent   6606 non-null   float64
dtypes: float64(9)
memory usage: 464.6 KB


#### Salvando o novo DataFrame em .csv

In [12]:
# Output file, written next to the input file in `data_dir`.
new_file = 'train_cluster.csv'
new_path = path.join(data_dir, new_file)

# index=False: the frame has a plain RangeIndex that carries no information.
# Writing it would add an unnamed first column to the CSV, which shows up as
# 'Unnamed: 0' when the file is read back with pd.read_csv.
# NOTE(review): any downstream code that expects (or drops) an 'Unnamed: 0'
# column in train_cluster.csv must be adjusted accordingly.
df.to_csv(new_path, index=False)