<a href="https://colab.research.google.com/github/yegyu-han/SpaceshipTitanic/blob/main/SpaceshipTitanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Locations of the competition CSVs, relative to the notebook's working directory.
train_path = 'train.csv'
test_path = 'test.csv'

# Read both splits in one pass; order matches the (train, test) unpacking.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
def fill_missing(df):
  """Impute missing values in ``df`` in place and return the same object.

  - ``Age``: filled with the column mean.
  - ``HomePlanet``, ``CryoSleep``, ``Destination``, ``VIP``: filled with the
    column's most frequent non-null value.
  - ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa``, ``VRDeck``:
    filled with the median of the column within the row's ``VIP`` group.

  Args:
    df: DataFrame containing all of the columns listed above.

  Returns:
    The same DataFrame object, with the columns above reassigned.

  Raises:
    ValueError: if a categorical column has no non-null value at all
      (``idxmax`` on an empty ``value_counts``).
  """
  lux = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
  categorical = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

  df['Age'] = df['Age'].fillna(df['Age'].mean())

  # VIP is imputed by the loop below together with the other categoricals.
  # A separate ``fillna(df['VIP'].mode())`` must not be used: ``mode()``
  # returns a Series and ``fillna`` aligns it on index labels, so it would
  # only touch the row labelled 0 instead of every missing value.
  for col in categorical:
    most_freq = df[col].value_counts(dropna=True).idxmax()
    df[col] = df[col].fillna(most_freq)

  # VIP has no missing values at this point, so every row belongs to a group.
  # A group whose values are all missing has a NaN median and stays unfilled.
  for col in lux:
    df[col] = df[col].fillna(df.groupby('VIP')[col].transform('median'))

  return df

def extract_deck(cabin):
  """Return the text before the first '/' of ``cabin``.

  Missing values (anything ``pd.isna`` reports as null) are returned
  unchanged so they can be imputed later.
  """
  # presumably cabins look like 'deck/num/side' — only the first field is used here
  return cabin if pd.isna(cabin) else cabin.split('/')[0]

def extract_side(cabin):
  """Return the third '/'-separated field of ``cabin``.

  Missing values (anything ``pd.isna`` reports as null) are returned
  unchanged. A non-null cabin with fewer than three fields raises
  ``IndexError``.
  """
  if not pd.isna(cabin):
    fields = cabin.split('/')
    return fields[2]
  return cabin

def extract_group(id):
  """Return the part of the id string that precedes the first '_'.

  If the string contains no '_', the whole string is returned.
  """
  group, _sep, _rest = id.partition('_')
  return group

def find_cabin(df):
  """Fill missing ``Cabin`` values from other rows of the same group.

  Adds a ``Group`` column (the ``PassengerId`` prefix returned by
  ``extract_group``), finds the most frequently occurring cabin for each
  group, and writes that cabin into rows whose ``Cabin`` is null. Rows whose
  group has no known cabin stay null. ``df`` is modified in place and also
  returned.
  """
  df['Group'] = df['PassengerId'].apply(extract_group)

  # One row per (Group, Cabin) pair with its occurrence count; null cabins
  # are excluded by groupby.
  pair_counts = df.groupby(['Group', 'Cabin']).size().to_frame('counts').reset_index()

  # After sorting by count, the first row kept per group is its top cabin.
  ranked = pair_counts.sort_values('counts', ascending=False)
  top_cabin = ranked.drop_duplicates(subset='Group').drop(columns='counts')
  group_to_cabin = top_cabin.set_index('Group').Cabin

  # The mapped Series covers every row; .loc only writes where Cabin is null.
  no_cabin = df.Cabin.isnull()
  df.loc[no_cabin, 'Cabin'] = df.Group.map(group_to_cabin)
  return df

def fill_cabin(df):
  """Fill nulls in ``Deck`` and ``Side`` with each column's most frequent value.

  ``df`` is modified in place and also returned.
  """
  for col in ('Deck', 'Side'):
    frequencies = df[col].value_counts(dropna=True)
    df[col] = df[col].fillna(frequencies.idxmax())
  return df

def drop_cols(df):
  """Return a new DataFrame without the identifier and raw spending columns.

  The listed columns are removed in a single ``drop`` call, so the frame is
  copied once rather than once per column. The input frame is left untouched.

  Args:
    df: DataFrame containing every column listed in ``cols``.

  Returns:
    A new DataFrame with the remaining columns in their original order.

  Raises:
    KeyError: if any of the listed columns is absent from ``df``.
  """
  cols = ['PassengerId', 'FoodCourt', 'Cabin', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck', 'Name','Group']
  return df.drop(columns=cols)
  
def to_int(df):
  """Return a copy of ``df`` with ``Age`` and ``Luxury`` cast to ``int``.

  Fractional parts are truncated by the cast. The input frame is not modified.
  """
  integer_columns = {'Age': 'int', 'Luxury': 'int'}
  return df.astype(integer_columns)


def data_preproc(df):
  """Run the full preprocessing pipeline and return the model-ready frame.

  Steps, in order: impute missing values, recover cabins from group members,
  add ``Luxury`` (sum of the five spending columns), split ``Cabin`` into
  ``Deck`` and ``Side``, impute those two, cast ``Age``/``Luxury`` to int, and
  drop the columns that are no longer needed.

  Note: the steps up to and including ``fill_cabin`` assign columns on the
  passed-in frame, so the caller's ``df`` gains the imputed values and the
  ``Group``, ``Luxury``, ``Deck`` and ``Side`` columns. The returned frame is
  a separate object produced by ``to_int`` / ``drop_cols``.
  """
  df = find_cabin(fill_missing(df))

  total_spend = df['FoodCourt'] + df['RoomService'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']
  df['Luxury'] = total_spend

  cabins = df['Cabin']
  df['Deck'] = cabins.apply(extract_deck)
  df['Side'] = cabins.apply(extract_side)

  return drop_cols(to_int(fill_cabin(df)))

In [None]:
# Keep the ids aside: preprocessing removes PassengerId from the feature frames.
train_id = train['PassengerId']
test_id = test['PassengerId']

# `train` / `test` are passed as-is (no copy): data_preproc also writes the
# imputed values and derived columns back onto these frames.
train_data = data_preproc(train).drop(columns='Transported')
train_target = train['Transported']
test_data = data_preproc(test)

In [None]:
# Show dtypes and non-null counts of `train`. It was passed to data_preproc
# above, so the summary includes whatever that call changed on it in place.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   bool   
 3   Cabin         8594 non-null   object 
 4   Destination   8693 non-null   object 
 5   Age           8693 non-null   float64
 6   VIP           8693 non-null   bool   
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
 14  Group         8693 non-null   object 
 15  Luxury        8693 non-null   float64
 16  Deck          8693 non-null   object 
 17  Side          8693 non-null   object 
dtypes: bool(3), float64(7), obje

In [None]:
# Label-encode the categorical feature columns.
# The encoder is fitted on the training features and the same mapping is then
# applied to both feature frames. Read the values from train_data / test_data
# themselves rather than from the raw `train` / `test` frames: the raw frames
# only carry Deck/Side and imputed values as a side effect of preprocessing,
# which makes the cell fragile and fails when it is re-run.
categorical = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']
for col in categorical:
  # Fresh encoder per column so no state leaks from one column to the next.
  le = LabelEncoder().fit(train_data[col])
  train_data[col] = le.transform(train_data[col])
  # NOTE(review): transform is expected to raise if the test split contains a
  # label never seen in training — confirm against the LabelEncoder docs.
  test_data[col] = le.transform(test_data[col])

In [None]:
# Summarize dtypes and non-null counts of both encoded feature frames.
train_data.info()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   HomePlanet   8693 non-null   int64
 1   CryoSleep    8693 non-null   int64
 2   Destination  8693 non-null   int64
 3   Age          8693 non-null   int64
 4   VIP          8693 non-null   int64
 5   Luxury       8693 non-null   int64
 6   Deck         8693 non-null   int64
 7   Side         8693 non-null   int64
dtypes: int64(8)
memory usage: 543.4 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   HomePlanet   4277 non-null   int64
 1   CryoSleep    4277 non-null   int64
 2   Destination  4277 non-null   int64
 3   Age          4277 non-null   int64
 4   VIP          4277 non-null   int64
 5   Luxury       4277 non-null   int64
 6   Deck         4277 non-n

In [None]:
# Wrap the target Series in a one-column frame to print its info() summary.
train_target.to_frame().info()

In [None]:
# Fix the seed so that Restart & Run All rebuilds the same forest and hence
# the same submission; without random_state every run grows different trees.
SEED = 42
clf = RandomForestClassifier(n_estimators=10, random_state=SEED)
clf = clf.fit(train_data, train_target)

# Give the predictions the feature frame's index so they line up row-for-row
# with test_id when the submission frame is assembled.
pred = pd.Series(clf.predict(test_data), index=test_data.index)

In [None]:
# Display the predicted labels as the cell output.
pred

0        True
1       False
2        True
3        True
4       False
        ...  
4272     True
4273    False
4274     True
4275    False
4276    False
Length: 4277, dtype: bool

In [None]:
# Assemble the two submission columns (aligned on index) and write them out
# without the index column.
submission_columns = {'PassengerId': test_id, 'Transported': pred}
result = pd.DataFrame(submission_columns)
result.to_csv('1810052.csv', index=False)