In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


# Import Libraries

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.model_selection import train_test_split

# Load the data

In [3]:
# Read the training split from the Kaggle input directory and preview its first rows
train_data = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# (rows, columns) of the training frame
train_data.shape

(8693, 14)

In [5]:
# Read the test split from the Kaggle input directory and preview its first rows
test_data = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
test_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [6]:
# (rows, columns) of the test frame; it has no Transported column, so one column fewer than train
test_data.shape

(4277, 13)

In [7]:
# Count rows of the training frame that are exact duplicates of an earlier row
train_data.duplicated().sum()

0

In [8]:
# Count training rows whose target label (Transported) is missing
train_data['Transported'].isna().sum()

0

# Data Analysis
Check the following for both the training and the test set:

* data types of each feature
* missing values in each feature

In [9]:
# Data type of every column in the training frame
train_data.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [10]:
# Data type of every column in the test frame
test_data.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
dtype: object

In [11]:
# Number of missing values per column in the training frame
train_data.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [12]:
# Number of missing values per column in the test frame
test_data.isna().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [13]:
# Number of distinct values per column in the training frame
train_data.nunique()

PassengerId     8693
HomePlanet         3
CryoSleep          2
Cabin           6560
Destination        3
Age               80
VIP                2
RoomService     1273
FoodCourt       1507
ShoppingMall    1115
Spa             1327
VRDeck          1306
Name            8473
Transported        2
dtype: int64

In [14]:
# Distinct-value counts restricted to the object-typed training columns (candidates for encoding)
train_data.select_dtypes(include='object').nunique()

PassengerId    8693
HomePlanet        3
CryoSleep         2
Cabin          6560
Destination       3
VIP               2
Name           8473
dtype: int64

In [15]:
# Number of distinct values per column in the test frame
test_data.nunique()

PassengerId     4277
HomePlanet         3
CryoSleep          2
Cabin           3265
Destination        3
Age               79
VIP                2
RoomService      842
FoodCourt        902
ShoppingMall     715
Spa              833
VRDeck           796
Name            4176
dtype: int64

In [16]:
# Distinct-value counts restricted to the object-typed test columns (candidates for encoding)
test_data.select_dtypes(include='object').nunique()

PassengerId    4277
HomePlanet        3
CryoSleep         2
Cabin          3265
Destination       3
VIP               2
Name           4176
dtype: int64

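* Drop `Name`, `Cabin` and `PassengerId`: each has thousands of distinct values, so they are not suitable for one-hot encoding.
* Use a one-hot encoder for the remaining object columns (`HomePlanet`, `CryoSleep`, `Destination`, `VIP`).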

# Data Cleaning

In [18]:
# Target: bool -> 1/0 int
train_data['Transported'] = train_data['Transported'].astype(int)

# CryoSleep and VIP hold a mix of booleans and missing values; cast both to plain
# strings in the training and test frames alike.
# NOTE: astype(str) also turns missing entries into the literal string 'nan', so
# these two columns report no nulls afterwards.
for frame in (train_data, test_data):
    for column in ('CryoSleep', 'VIP'):
        frame[column] = frame[column].astype(str)

In [19]:
# Summary of the training frame after the dtype conversions (non-null counts and dtypes)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8693 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8693 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   int64  
dtypes: float64(6), int64(1), object(7)
memory usage: 950.9+ KB


# Modeling
### X and y (features and target)

In [20]:
# Split the test-frame columns into two feature groups:
#   * categorical: object dtype with fewer than 10 distinct values
#     (a convenient but arbitrary cardinality cut-off)
#   * numerical: int64 / float64 dtype
categorical_cols = []
numerical_cols = []
for column in test_data.columns:
    series = test_data[column]
    if series.dtype == "object" and series.nunique() < 10:
        categorical_cols.append(column)
    if series.dtype in ['int64', 'float64']:
        numerical_cols.append(column)

# Columns kept for modelling: categorical first, then numerical
my_cols = [*categorical_cols, *numerical_cols]

In [52]:
# Independent copies of the selected feature columns and of the target,
# so later steps never touch the source frames.
X = train_data.loc[:, my_cols].copy()
y = train_data['Transported'].copy()
X_test = test_data.loc[:, my_cols].copy()
X_test

Unnamed: 0,HomePlanet,CryoSleep,Destination,VIP,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Earth,True,TRAPPIST-1e,False,27.0,0.0,0.0,0.0,0.0,0.0
1,Earth,False,TRAPPIST-1e,False,19.0,0.0,9.0,0.0,2823.0,0.0
2,Europa,True,55 Cancri e,False,31.0,0.0,0.0,0.0,0.0,0.0
3,Europa,False,TRAPPIST-1e,False,38.0,0.0,6652.0,0.0,181.0,585.0
4,Earth,False,TRAPPIST-1e,False,20.0,10.0,0.0,635.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
4272,Earth,True,TRAPPIST-1e,False,34.0,0.0,0.0,0.0,0.0,0.0
4273,Earth,False,TRAPPIST-1e,False,42.0,0.0,847.0,17.0,10.0,144.0
4274,Mars,True,55 Cancri e,False,,0.0,0.0,0.0,0.0,0.0
4275,Europa,False,,False,,0.0,2680.0,0.0,0.0,523.0


In [22]:
# Hold out 20% of the labelled rows for validation; the seed is fixed so the split is repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [23]:
# Numerical features: standardise, then fill missing values with KNNImputer
numerical_transformer = Pipeline([
    ('scalar', StandardScaler()),
    ('knnimputer', KNNImputer()),
])

# Categorical features: fill missing values with SimpleImputer's constant strategy,
# then one-hot encode. handle_unknown='ignore' lets the encoder accept categories
# it did not see while fitting instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [24]:
# Baseline classifier with default hyperparameters; random_state=0 fixes its seed
model = RandomForestClassifier(random_state=0)

In [25]:
# Chain preprocessing and the classifier so both are fitted on the training rows only
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])
my_pipeline.fit(X_train, y_train)

# Predicted labels for the hold-out rows
preds = my_pipeline.predict(X_valid)

# Score of the baseline pipeline on the hold-out rows (shown as the cell result)
my_pipeline.score(X_valid, y_valid)

0.7814836112708453

# Hyperparameter Tuning

In [26]:
# Show the hyperparameters of a default-constructed RandomForestClassifier (reference for tuning)
RandomForestClassifier().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### First, using RandomizedSearchCV

In [27]:
# Search space for the randomised search. The `model__` prefix addresses the
# 'model' step of the pipeline.
rscv_hp = dict(
    model__n_estimators=np.arange(100, 600, 100),
    model__max_depth=[None, 3, 5, 6, 10],
    model__min_samples_split=np.arange(2, 20, 2),
    model__min_samples_leaf=np.arange(1, 20, 2),
)

In [28]:
# Randomised search: 20 candidates drawn from `rscv_hp`, each scored with 5-fold CV.
# random_state pins which candidates are drawn; without it, best_params_ and the
# validation score below change on every run of the notebook.
rscv_model = RandomizedSearchCV(my_pipeline,
                        param_distributions = rscv_hp,
                        cv = 5,
                        n_iter = 20,
                        random_state = 0,
                        verbose = True)
# Fit on the training rows, then score the refitted best pipeline on the hold-out rows
rscv_model.fit(X_train,y_train)
rscv_model.score(X_valid,y_valid)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


0.7901092581943646

In [30]:
# Let's see the best parameter combination found by the randomised search
rscv_model.best_params_

{'model__n_estimators': 200,
 'model__min_samples_split': 2,
 'model__min_samples_leaf': 1,
 'model__max_depth': 10}

### Now using GridSearchCV

In [32]:
# Exhaustive grid for the grid search (3 * 3 * 2 * 2 = 36 combinations).
# The `model__` prefix addresses the 'model' step of the pipeline.
gscv_hp = dict(
    model__n_estimators=[100, 500, 600],
    model__max_depth=[None, 1, 5],
    model__min_samples_split=[5, 6],
    model__min_samples_leaf=[6, 7],
)

In [33]:
# Grid search over every combination in `gscv_hp`, each scored with 5-fold CV
gscv_model = GridSearchCV(
    estimator=my_pipeline,
    param_grid=gscv_hp,
    cv=5,
    verbose=True,
)

# Fit on the training rows, then score the refitted best pipeline on the hold-out rows
gscv_model.fit(X_train, y_train)
gscv_model.score(X_valid, y_valid)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


0.7918343875790684

In [34]:
# Let's see the best parameter combination found by the grid search
# (display best_params_, not the whole search object, to match the randomised-search cell)
gscv_model.best_params_

# Predictions

In [50]:
# Predict the 0/1 label for every test row, then turn it into the True/False
# form used in the submission file (1 -> True, 0 -> False)
predicted_labels = gscv_model.predict(X_test)
predictions = predicted_labels == 1
predictions

array([ True, False,  True, ...,  True,  True,  True])

In [36]:
# Pair every test PassengerId with its predicted label and write the submission CSV
# (index=False keeps the row index out of the file)
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': predictions,
})
output.to_csv('submission01.csv', index=False)

In [37]:
# Re-display the training frame summary (same call as in the Data Cleaning section)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8693 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8693 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   int64  
dtypes: float64(6), int64(1), object(7)
memory usage: 950.9+ KB


In [40]:
import pickle

# Serialise the fitted grid-search object to model.pkl in the working directory
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(gscv_model, model_file)

In [41]:
import pickle

# Reload the pickled search object. The context manager closes the file handle,
# which the bare pickle.load(open(...)) form never did.
# NOTE(security): unpickling can execute arbitrary code; only load files you
# created yourself.
with open("model.pkl", "rb") as file:
    loaded_model = pickle.load(file)
loaded_model

In [84]:

# One hand-written passenger, in the column order used below.
# CryoSleep and VIP must be spelled exactly 'True' / 'False': the training columns
# were produced with astype(str) on booleans, and the one-hot encoder runs with
# handle_unknown='ignore', so any other spelling (e.g. 'FALSE') is silently
# encoded as all zeros instead of raising an error.
data = ['Earth', 'False', 'TRAPPIST-1e', 'False', 19, 0, 9, 0, 2823, 0]

print(data)

# Wrap the single record in a one-row frame with the feature names the pipeline expects
data_df = pd.DataFrame(
    [data],
    columns=['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Age',
             'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
)

prediction = loaded_model.predict(data_df)
result = prediction[0]
print(prediction)

# 0 means "not transported"; anything else is reported as the positive class
if result == 0:
    print('No')
else:
    print('Yanawa')

['Earth', 'FALSE', 'TRAPPIST-1s', 'TRUE', 25, 1, 0, 1, 0, 0]
[0]
No
