In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
# input dataset
train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

# print the shape of dataset
print("shape of train:", train.shape)
print("shape of test:", test.shape,)

shape of train: (8693, 14)
shape of test: (4277, 13)


In [3]:
# check the contents of train set
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [4]:
# check the missing values
print("\n----Missing Values----")
print(train.isnull().sum())

# check the unique values
print("\n----Unique Values----")
categorical_columns = train.select_dtypes(include='object').columns
for col in categorical_columns:
    print(f"{col}: {train[col].nunique()}")


----Missing Values----
PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

----Unique Values----
PassengerId: 8693
HomePlanet: 3
CryoSleep: 2
Cabin: 6560
Destination: 3
VIP: 2
Name: 8473


In [5]:
#Fill in the missing values(1)

# 1. numerical columns([Age]): fill with median
train['Age'].fillna(train['Age'].median(), inplace=True)
test['Age'].fillna(test['Age'].median(), inplace=True)


# 2. numerical columns(except [Age]): fill with 0
for col in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    train[col].fillna(0, inplace=True)
    test[col].fillna(0, inplace=True)


# 3. categorical columns(except [Name]and [Cabin]): fill with mode
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']:
    train[col].fillna(train[col].mode()[0], inplace=True)
    test[col].fillna(test[col].mode()[0], inplace=True)


# check the missing values
print("\n----Missing Values----")
print(train.isnull().sum())


----Missing Values----
PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin           199
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Age'].fillna(train['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Age'].fillna(test['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

In [6]:
#Fill in the missing values(2)

# 4. categorical columns([Cabin])
  # splite the [Cabin] column
train[['Deck', 'CabinNum', 'Side']] = train['Cabin'].str.split('/', expand=True)
test[['Deck', 'CabinNum', 'Side']] = test['Cabin'].str.split('/', expand=True)

  # delete the original [Cabin] column
train.drop(columns=['Cabin'], inplace=True)
test.drop(columns=['Cabin'], inplace=True)

  # transfer [CabinNum] into numerical form
train['CabinNum'] = pd.to_numeric(train['CabinNum'], errors='coerce')
test['CabinNum'] = pd.to_numeric(test['CabinNum'], errors='coerce')

  # [CabinNum]:fill with median
cabin_median = train['CabinNum'].median()
train['CabinNum'].fillna(cabin_median, inplace=True)
test['CabinNum'].fillna(cabin_median, inplace=True)

  # [Deck] and [Side]:fill with mode
for col in ['Deck', 'Side']:
    train[col].fillna(train[col].mode()[0], inplace=True)
    test[col].fillna(test[col].mode()[0], inplace=True)


# check the missing values
print("\n----Missing Values----")
print(train.isnull().sum())


----Missing Values----
PassengerId       0
HomePlanet        0
CryoSleep         0
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
Deck              0
CabinNum          0
Side              0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['CabinNum'].fillna(cabin_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['CabinNum'].fillna(cabin_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are set

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# save the [PassengerId] for submission
test_ids = test['PassengerId'].copy()

# delete unnecessary columns
train_model = train.drop(columns=['Name', 'PassengerId'], errors='ignore')
test_model = test.drop(columns=['Name', 'PassengerId'], errors='ignore')

# split the train set and test set
X = train_model.drop(columns='Transported')
y = train_model['Transported']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# split categorical and numerical columns
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'CabinNum']
cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']

# preprocess
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# Pipeline construction
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# model training
clf.fit(X_train, y_train)

# check the accuracy
val_score = clf.score(X_val, y_val)
print(f"Accuracy：{val_score:.4f}")

# prediction
preds = clf.predict(test_model)

# generate the submission file
submission = pd.DataFrame({
    'PassengerId': test_ids,
    'Transported': preds.astype(bool)
})

submission.head()

submission.to_csv("submission.csv", index=False)


Accuracy：0.8068
