<a href="https://colab.research.google.com/github/zdanielshi/data_science_LFZ/blob/main/Kaggle_Space_Titanic_Competition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler, MinMaxScaler, MaxAbsScaler, StandardScaler, PowerTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

pd.options.display.float_format = '{:,.3f}'.format

In [None]:
# mounting Google Drive and loading files
from google.colab import drive
drive.mount('/content/drive')

train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Colab data/train space titanic.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Colab data/test space titanic.csv')

Mounted at /content/drive


In [None]:
train.Transported.value_counts(normalized=True)

TypeError: ignored

# Analysis

In [None]:
train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.000,False,0.000,0.000,0.000,0.000,0.000,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.000,False,109.000,9.000,25.000,549.000,44.000,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.000,True,43.000,3576.000,0.000,6715.000,49.000,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.000,False,0.000,1283.000,371.000,3329.000,193.000,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.000,False,303.000,70.000,151.000,565.000,2.000,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.000,True,0.000,6819.000,0.000,1643.000,74.000,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.000,False,0.000,0.000,0.000,0.000,0.000,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.000,False,0.000,0.000,1872.000,1.000,0.000,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.000,False,0.000,1049.000,0.000,353.000,3235.000,Celeon Hontichre,False


In [None]:
train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

#Stacking and separating the testing and training data

In [None]:
train['train_data'] = True
test['train_data'] = False

In [None]:
x = train.loc[:,train.columns != 'Transported'].copy()
transported = train.loc[:,train.columns == 'Transported'].copy()

In [None]:
x = pd.concat([x, test], axis = 0)

In [None]:
x.isnull().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Name            294
train_data        0
dtype: int64

## Ad hoc feature engineering: splitting up passenger ID and cabin columns

In [None]:
# Splitting up Passenger Id into Group and GroupSubId within group
x[['GroupId', 'GroupSubId']] = x['PassengerId'].str.split('_', expand = True)

In [None]:
x[['Deck', 'Number', 'Side']] = x['Cabin'].str.split('/', expand = True)

In [None]:
x

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,train_data,GroupId,GroupSubId,Deck,Number,Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.000,False,0.000,0.000,0.000,0.000,0.000,Maham Ofracculy,True,0001,01,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.000,False,109.000,9.000,25.000,549.000,44.000,Juanna Vines,True,0002,01,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.000,True,43.000,3576.000,0.000,6715.000,49.000,Altark Susent,True,0003,01,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.000,False,0.000,1283.000,371.000,3329.000,193.000,Solam Susent,True,0003,02,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.000,False,303.000,70.000,151.000,565.000,2.000,Willy Santantines,True,0004,01,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.000,False,0.000,0.000,0.000,0.000,0.000,Jeron Peter,False,9266,02,G,1496,S
4273,9269_01,Earth,False,,TRAPPIST-1e,42.000,False,0.000,847.000,17.000,10.000,144.000,Matty Scheron,False,9269,01,,,
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.000,0.000,0.000,0.000,0.000,Jayrin Pore,False,9271,01,D,296,P
4275,9273_01,Europa,False,D/297/P,,,False,0.000,2680.000,0.000,0.000,523.000,Kitakan Conale,False,9273,01,D,297,P


# Clean up

## Filling in nulls w/ rules
Ideas
* Same ID group, would have the same Cabin and home planet

In [None]:
x.groupby('Destination')['HomePlanet'].value_counts(normalize=True)

Destination    HomePlanet
55 Cancri e    Europa       0.506
               Earth        0.380
               Mars         0.114
PSO J318.5-22  Earth        0.917
               Mars         0.058
               Europa       0.025
TRAPPIST-1e    Earth        0.539
               Mars         0.260
               Europa       0.200
Name: HomePlanet, dtype: float64

In [None]:
#Fill Home Planet nulls: 55 Chancri is Europa, Earth for others
# x.HomePlanet.fillna(x.Destination, inplace=True)

In [None]:
# x.HomePlanet.value_counts()

In [None]:
# x.HomePlanet.isnull().sum()

In [None]:
# x.HomePlanet = x['HomePlanet'].apply(lambda x: 'Europa' if x == '55 Cancri e' else ('Earth' if x == 'PSO J318.5-22' else ('Earth' if x == 'TRAPPIST-1e' else x)))

In [None]:
# x.HomePlanet.value_counts()

In [None]:
# x.HomePlanet.fillna('Earth',inplace=True)

In [None]:
#x.CryoSleep.value_counts()

In [None]:
#x.CryoSleep.isna().sum()

In [None]:
#x.CryoSleep.fillna(x.CryoSleep.mode()[0], inplace=True)

In [None]:
# # Drop cabin, and then fill in the modes for Deck, Number and Side
x.drop(columns = 'Cabin', inplace=True)
# x.Deck.fillna(x.Deck.mode()[0], inplace=True)
# x.Number.fillna(x.Number.mode()[0], inplace=True)
# x.Side.fillna(x.Side.mode()[0], inplace=True)

In [None]:
# Destination, fill with mode
# x.Destination.fillna(x.Destination.mode()[0], inplace=True)

In [None]:
# VIP fill with mode
# x.VIP.fillna(x.VIP.mode()[0], inplace=True)

In [None]:
# x.Age.hist()

In [None]:
# x.Age.fillna(x.Age.median(), inplace=True)

In [None]:
# x.fillna(method = 'bfill', inplace=True)

In [None]:
x.isnull().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Name            294
train_data        0
GroupId           0
GroupSubId        0
Deck            299
Number          299
Side            299
dtype: int64

In [None]:
x.isnull().sum().sum()

4039

# Feature Engineering
* Break up the cabin into deck, number, and side
* ID can be broken up into group and ID

In [None]:
# Create a total spend colum
x['TotalSpend'] = x['RoomService'] + x['FoodCourt'] + x['ShoppingMall'] + x['ShoppingMall'] + x['Spa'] + x['VRDeck']

In [None]:
# Boolean for if they spent ANY money
x['SpentMoney'] = x['TotalSpend']
x['SpentMoney'] = x['SpentMoney'].apply(lambda x: 1 if x > 0 else 0)
x['SpentMoney'] = x['SpentMoney'].astype('bool')

In [None]:
# Create a column for the travel corridor
x['TravelCorridor'] = x['HomePlanet']+" / " + x['Destination']

In [None]:
# Create a column showing if someone is an adult at age 18
cutoff = 18
x['Adult'] = x['Age']
x['Adult'] = x['Adult'].apply(lambda x: 1 if x >= 18 else 0)
x['Adult'] = x['Adult'].astype('bool')

In [None]:
#group_size = x.groupby('GroupId')['GroupSubId'].max()

In [None]:
#group_size

GroupId
0001    01
0002    01
0003    02
0004    01
0005    01
        ..
9276    01
9277    01
9278    01
9279    01
9280    02
Name: GroupSubId, Length: 9280, dtype: object

## Dropping some columns

In [None]:
# drop GroupId and GroupSubId until I figure out how to utilize them
x.drop(columns = ['GroupId', 'GroupSubId'], inplace=True)

In [None]:
# Drop Passenger Id
x.drop(columns = ['PassengerId'], inplace=True)

In [None]:
# Drop name
x.drop(columns = ['Name'], inplace=True)

In [None]:
# drop Number since that's the cabin number
x.drop(columns = ['Number'], inplace=True)

In [None]:
# Drop the VIP column
x.drop(columns = ['VIP'], inplace=True)

# Preprocessing

## Transforming

In [None]:
x.skew()

  """Entry point for launching an IPython kernel.


CryoSleep       0.575
Age             0.440
RoomService     6.140
FoodCourt       7.052
ShoppingMall   11.009
Spa             7.653
VRDeck          8.060
train_data     -0.724
TotalSpend      4.343
SpentMoney     -0.081
Adult          -1.537
dtype: float64

In [None]:
# Transform RoomService, FoodCout, ShoppingMall, Spa, VRDeck, and TotalSpend, use YJ

x['RoomService'] = list(stats.yeojohnson(x['RoomService'])[0])
x['FoodCourt'] = list(stats.yeojohnson(x['FoodCourt'])[0])
x['ShoppingMall'] = list(stats.yeojohnson(x['ShoppingMall'])[0])
x['Spa'] = list(stats.yeojohnson(x['Spa'])[0])
x['Spa'] = list(stats.yeojohnson(x['Spa'])[0])
x['VRDeck'] = list(stats.yeojohnson(x['VRDeck'])[0])
x['TotalSpend'] = list(stats.yeojohnson(x['TotalSpend'])[0])

## Scaling

In [None]:
num_cols_to_scale = x.select_dtypes(exclude = ['object', 'bool']).columns.values.tolist()
num_cols_to_scale

['Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'TotalSpend']

### Robust Scaler

In [None]:
robust_scaler = RobustScaler().fit(x[num_cols_to_scale])
x[num_cols_to_scale] = robust_scaler.transform(x[num_cols_to_scale])

### Standard Scaler

In [None]:
# standard_scaler = StandardScaler().fit(x[num_cols_to_scale])
# x[num_cols_to_scale] = standard_scaler.transform(x[num_cols_to_scale])

### MinMax Scaler

In [None]:
# minmax_scaler = MinMaxScaler().fit(x[num_cols_to_scale])
# x[num_cols_to_scale] = minmax_scaler.transform(x[num_cols_to_scale])

## Encoding

In [None]:
x = pd.get_dummies(x)

In [None]:
x

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,train_data,TotalSpend,SpentMoney,Adult,...,Side_S,TravelCorridor_Earth / 55 Cancri e,TravelCorridor_Earth / PSO J318.5-22,TravelCorridor_Earth / TRAPPIST-1e,TravelCorridor_Europa / 55 Cancri e,TravelCorridor_Europa / PSO J318.5-22,TravelCorridor_Europa / TRAPPIST-1e,TravelCorridor_Mars / 55 Cancri e,TravelCorridor_Mars / PSO J318.5-22,TravelCorridor_Mars / TRAPPIST-1e
0,0.632,0.000,0.000,0.000,0.000,0.000,True,-0.001,False,True,...,0,0,0,0,0,0,1,0,0,0
1,-0.158,796.254,0.000,0.297,"13,228,936,039,108,817,403,580,030,574,435,272,...",1.470,True,-0.000,True,True,...,1,0,0,1,0,0,0,0,0,0
2,1.632,0.339,119072816904517.562,0.000,"13,339,162,749,497,127,126,375,259,670,229,686,...",3.589,True,2753650.953,True,True,...,1,0,0,0,0,0,1,0,0,0
3,0.316,0.000,20234972476.998,1834919500.686,"1,807,417,754,824,374,840,538,648,288,832,806,3...",349623.299,True,13600.484,True,True,...,1,0,0,0,0,0,1,0,0,0
4,-0.579,4378623.332,0.451,934336.504,"103,613,941,685,341,521,385,603,775,198,428,853...",0.000,True,0.042,True,False,...,1,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,0.368,0.000,0.000,0.000,0.000,0.000,False,-0.001,False,True,...,1,0,0,1,0,0,0,0,0,0
4273,0.789,0.000,602122599.876,0.013,0.000,29678.138,False,0.008,True,True,...,0,0,0,1,0,0,0,0,0,0
4274,,0.000,0.000,0.000,0.000,0.000,False,-0.001,False,False,...,0,0,0,0,0,0,0,1,0,0
4275,,0.000,10349366536624.928,0.000,0.000,1583351655.759,False,129.844,True,False,...,0,0,0,0,0,0,0,0,0,0


In [None]:
imp_neighbors = 5
imputer = KNNImputer(n_neighbors=imp_neighbors)

In [None]:
x.isnull().sum().sum()

3043

In [None]:
x = pd.DataFrame(imputer.fit_transform(x), columns = x.columns)

  ret = a @ b
  distances = -2 * safe_sparse_dot(X, Y.T, dense_output=True)
  distances += XX
  distances += YY
  XX = X * X
  YY = Y * Y
  distances -= np.dot(XX, missing_Y.T)
  distances -= np.dot(missing_X, YY.T)


In [None]:
x.isnull().sum().sum()

0

In [None]:
x

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,train_data,TotalSpend,SpentMoney,Adult,...,Side_S,TravelCorridor_Earth / 55 Cancri e,TravelCorridor_Earth / PSO J318.5-22,TravelCorridor_Earth / TRAPPIST-1e,TravelCorridor_Europa / 55 Cancri e,TravelCorridor_Europa / PSO J318.5-22,TravelCorridor_Europa / TRAPPIST-1e,TravelCorridor_Mars / 55 Cancri e,TravelCorridor_Mars / PSO J318.5-22,TravelCorridor_Mars / TRAPPIST-1e
0,0.632,0.000,0.000,0.000,0.000,0.000,1.000,-0.001,0.000,1.000,...,0.000,0.000,0.000,0.000,0.000,0.000,1.000,0.000,0.000,0.000
1,-0.158,796.254,0.000,0.297,"13,228,936,039,108,817,403,580,030,574,435,272,...",1.470,1.000,-0.000,1.000,1.000,...,1.000,0.000,0.000,1.000,0.000,0.000,0.000,0.000,0.000,0.000
2,1.632,0.339,119072816904517.562,0.000,"13,339,162,749,497,127,126,375,259,670,229,686,...",3.589,1.000,2753650.953,1.000,1.000,...,1.000,0.000,0.000,0.000,0.000,0.000,1.000,0.000,0.000,0.000
3,0.316,0.000,20234972476.998,1834919500.686,"1,807,417,754,824,374,840,538,648,288,832,806,3...",349623.299,1.000,13600.484,1.000,1.000,...,1.000,0.000,0.000,0.000,0.000,0.000,1.000,0.000,0.000,0.000
4,-0.579,4378623.332,0.451,934336.504,"103,613,941,685,341,521,385,603,775,198,428,853...",0.000,1.000,0.042,1.000,0.000,...,1.000,0.000,0.000,1.000,0.000,0.000,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,0.368,0.000,0.000,0.000,0.000,0.000,0.000,-0.001,0.000,1.000,...,1.000,0.000,0.000,1.000,0.000,0.000,0.000,0.000,0.000,0.000
12966,0.789,0.000,602122599.876,0.013,0.000,29678.138,0.000,0.008,1.000,1.000,...,0.000,0.000,0.000,1.000,0.000,0.000,0.000,0.000,0.000,0.000
12967,0.211,0.000,0.000,0.000,0.000,0.000,0.000,-0.001,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.000,0.000,0.000
12968,0.284,0.000,10349366536624.928,0.000,0.000,1583351655.759,0.000,129.844,1.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000


#Training and Predicting

## Prepping the data and splitting it

In [None]:
# Unstacking the training and test data
train = x[x.train_data == True]
test = x[x.train_data == False]

In [None]:
train.drop(columns = 'train_data', inplace=True)
test.drop(columns = 'train_data', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
train.shape, test.shape

((8693, 36), (4277, 36))

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(train, transported, test_size = .25, random_state = 7)

In [None]:
xtrain.shape, xtest.shape, ytrain.shape, ytest.shape

((6519, 36), (2174, 36), (6519, 1), (2174, 1))

## Logistic Regression

In [None]:
lr = LogisticRegression(max_iter = 100000).fit(xtrain, ytrain)

  y = column_or_1d(y, warn=True)


ValueError: ignored

### Analyzing logistic regression results

In [None]:
lr_score = lr.score(xtest, ytest)

## KNN Classification

In [None]:
n = 75
knn = KNeighborsClassifier(n_neighbors = n).fit(xtrain, ytrain)

  return self._fit(X, y)


### Analyzing KNN Classification results

In [None]:
knn_score = knn.score(xtest, ytest)
knn_score

  ret = a @ b
  distances = -2 * safe_sparse_dot(X, Y.T, dense_output=True)
  distances += XX
  distances += YY


0.765409383624655

In [None]:
# Summarized Logistic Regression and KNN scores
# print('{:.5f}'.format(lr_score), '- LR score')
print('{:.5f}'.format(knn_score), '- KNN score with', n, 'neighbors')

0.76541 - KNN score with 75 neighbors


## LGBM

In [None]:
lgbm = LGBMClassifier()
lgbm.fit(xtrain, ytrain)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LGBMClassifier()

In [None]:
lgbm.score(xtest, ytest)

0.8100275988960441

## XG Boost Classifier

In [None]:
xgbc = XGBClassifier(gamma = 1.5, subsample = 1.0, max_depth = 5, colsample_bytree=1.0, n_estimators=100)

In [None]:
xgbc.fit(xtrain, ytrain)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(colsample_bytree=1.0, gamma=1.5, max_depth=5, subsample=1.0)

In [None]:
xgbc.score(xtest, ytest)

0.8095676172953082

In [None]:
# xgbc_params = {
#         "gamma": [0.5, 1, 1.5],
#         "subsample": [0.6, 0.8, 1.0],
#         "colsample_bytree": [0.6, 0.8, 1.0],
#         "max_depth": [3, 4, 5],
#         "n_estimators": [100, 130, 150]
# }

# xgbc_cv_model = GridSearchCV(xgbc, xgbc_params, cv = 5, n_jobs = 3)
# # xgbc_cv_model.fit(xtrain, ytrain)

# print("Best hyperparametres of the model: \n", xgbc_cv_model.best_params_)


### Analyzing XGB classification results

# Predicting and uploading results

In [None]:
to_submit = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Colab data/test space titanic.csv')

In [None]:
lr_prediction = lr.predict(test)
submission = pd.DataFrame()
submission['PassengerId'] = to_submit['PassengerId']
submission['Transported'] = lr_prediction
submission.to_csv('space titanic prediction (logistic).csv', index=False)

NameError: ignored

In [None]:
knn_prediction = knn.predict(test)
submission = pd.DataFrame()
submission['PassengerId'] = to_submit['PassengerId']
submission['Transported'] = knn_prediction
submission.to_csv('space titanic prediction (KNN).csv', index=False)

  ret = a @ b
  distances = -2 * safe_sparse_dot(X, Y.T, dense_output=True)
  distances += XX
  distances += YY


In [None]:
lgbm_prediction = lgbm.predict(test)
submission = pd.DataFrame()
submission['PassengerId'] = to_submit['PassengerId']
submission['Transported'] = lgbm_prediction
submission.to_csv('space titanic prediction (LGBM).csv', index=False)

In [None]:
xgbc_prediction = xgbc.predict(test)
submission = pd.DataFrame()
submission['PassengerId'] = to_submit['PassengerId']
submission['Transported'] = xgbc_prediction
submission.to_csv('space titanic prediction (XGB).csv', index=False)