In [248]:
import pandas as pd
import numpy as np

Loading data and removing useless columns

In [249]:
only_train_data = pd.read_csv('train.csv').drop(['PassengerId','Name'],axis=1)
only_test_data = pd.read_csv('test.csv').drop(['PassengerId','Name'],axis=1)

In [250]:
flag = [0] * len(only_train_data) + [1]*len(only_test_data)

Separating Data to Features and Labels

In [251]:
train_label  = only_train_data['Transported'].map({True:1,False:0})
only_train_data.drop('Transported',axis=1,inplace=True) 

In [252]:
train_data = pd.concat([only_train_data,only_test_data])

In [253]:
train_data= train_data.reset_index()

In [254]:
train_data['is_test'] = flag

All columns besides the Cabin look prety standart, let's try and make the Cabin column Useful by extracting some featues from it

In [255]:
train_data['Cabin'].sample(10)

11220    F/1142/P
12040     E/473/P
6459     G/1108/P
1129      F/238/P
6724      E/454/P
9009      F/124/S
11746    F/1391/P
4510      E/310/S
4893      G/843/P
12837    F/1845/P
Name: Cabin, dtype: object

Each cabin consists of 3 values, let's separate them into 3 different columns

In [256]:
train_data['Cabin'].isna().value_counts()

Cabin
False    12671
True       299
Name: count, dtype: int64

We keep in mind that we have 199 missing values

In [257]:
train_data['Cabin'] = train_data['Cabin'].apply(lambda x : x.split("/") if type(x) == str else [np.nan]*3)  

In [258]:
train_data['Cabin_1'] = [i[0] for i in train_data['Cabin']]
train_data['Cabin_2'] = [i[1] for i in train_data['Cabin']]
train_data['Cabin_3'] = [i[2] for i in train_data['Cabin']]

In [259]:
train_data.drop('Cabin',axis=1,inplace=True)

In [260]:
train_data.head()

Unnamed: 0,index,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,is_test,Cabin_1,Cabin_2,Cabin_3
0,0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0,B,0,P
1,1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,0,F,0,S
2,2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0,A,0,S
3,3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0,A,0,S
4,4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,0,F,1,S


Now we try to deal with the categorical and boolean columns 
Lets check which columns are categorical

In [261]:
train_data

Unnamed: 0,index,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,is_test,Cabin_1,Cabin_2,Cabin_3
0,0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0,B,0,P
1,1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,0,F,0,S
2,2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0,A,0,S
3,3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0,A,0,S
4,4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,0,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,4272,Earth,True,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,1,G,1496,S
12966,4273,Earth,False,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,1,,,
12967,4274,Mars,True,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,1,D,296,P
12968,4275,Europa,False,,,False,0.0,2680.0,0.0,0.0,523.0,1,D,297,P


In [262]:
train_nunique = train_data.nunique()

In [263]:
train_nunique

index           8693
HomePlanet         3
CryoSleep          2
Destination        3
Age               80
VIP                2
RoomService     1578
FoodCourt       1953
ShoppingMall    1367
Spa             1679
VRDeck          1642
is_test            2
Cabin_1            8
Cabin_2         1894
Cabin_3            2
dtype: int64

In [264]:
boolean_columns = ['VIP','CryoSleep']

In [265]:
categorical_columns = ['HomePlanet', 'Destination', 'Cabin_1', 'Cabin_3']

Lets check what's up with Cabin_2 column

In [266]:
train_data['Cabin_2'].sample(10)

8567      350
11128     860
11239     901
3243      237
1279      262
9747      357
6872     1183
9693       71
5928     1013
11714     NaN
Name: Cabin_2, dtype: object

For now let's treat it as a continuous value and hope it benefits the model

In [267]:
continuous_columns = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Cabin_2']

Map boolean columns to 1 and 0

In [268]:
for col in boolean_columns:
    train_data[col] = train_data[col].map({True:1,False:0}) 

In [None]:
new_categorical_columns = []
for col in categorical_columns:
    train_temp = pd.get_dummies(train_data[col]).replace({True:1,False:0})
    train_temp.columns = [col + "_" +str(i) for i in range(len(train_temp.columns))]
    train_data = pd.concat([train_data,train_temp],axis=1)
    train_data.drop(col,axis=1,inplace=True)
    new_categorical_columns += list(train_temp.columns)
categorical_columns = new_categorical_columns

In [270]:
categorical_columns

['HomePlanet_0',
 'HomePlanet_1',
 'HomePlanet_2',
 'Destination_0',
 'Destination_1',
 'Destination_2',
 'Cabin_1_0',
 'Cabin_1_1',
 'Cabin_1_2',
 'Cabin_1_3',
 'Cabin_1_4',
 'Cabin_1_5',
 'Cabin_1_6',
 'Cabin_1_7',
 'Cabin_3_0',
 'Cabin_3_1']

In [271]:
train_data.head()

Unnamed: 0,index,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,is_test,...,Cabin_1_0,Cabin_1_1,Cabin_1_2,Cabin_1_3,Cabin_1_4,Cabin_1_5,Cabin_1_6,Cabin_1_7,Cabin_3_0,Cabin_3_1
0,0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,1,0,0,0,0,0,0,1,0
1,1,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,0,...,0,0,0,0,0,1,0,0,0,1
2,2,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0,...,1,0,0,0,0,0,0,0,0,1
3,3,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0,...,1,0,0,0,0,0,0,0,0,1
4,4,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,0,...,0,0,0,0,0,1,0,0,0,1


In [272]:
train_data.columns

Index(['index', 'CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt',
       'ShoppingMall', 'Spa', 'VRDeck', 'is_test', 'Cabin_2', 'HomePlanet_0',
       'HomePlanet_1', 'HomePlanet_2', 'Destination_0', 'Destination_1',
       'Destination_2', 'Cabin_1_0', 'Cabin_1_1', 'Cabin_1_2', 'Cabin_1_3',
       'Cabin_1_4', 'Cabin_1_5', 'Cabin_1_6', 'Cabin_1_7', 'Cabin_3_0',
       'Cabin_3_1'],
      dtype='object')

Now we want to normalize the continuous data, But first we want to fill the missing values

Let's try to fill the test missing values using an Imputer

In [273]:
categorical_missing = list(set(missing_list.index).intersection(set(categorical_columns).union(set(boolean_columns))))

In [274]:
categorical_missing

['CryoSleep', 'VIP']

And a regression imputer for the continuous columns

In [275]:
continuous_missing = list(set(missing_list.index).intersection(set(continuous_columns)))

In [276]:
continuous_missing

['ShoppingMall', 'FoodCourt', 'RoomService', 'VRDeck', 'Spa', 'Cabin_2', 'Age']

In [277]:
# from sklearn.impute import KNNImputer
# categorical_imputer = KNNImputer(n_neighbors=5)
# categorical_imputer.fit(train_data)
# transformed_data = pd.DataFrame(categorical_imputer.transform(train_data),columns=train_data.columns)
# #Replace the values only in the categorical columns
# train_data[categorical_missing] = transformed_data[categorical_missing]

# from sklearn.experimental import enable_iterative_imputer  # For sklearn < 0.24
# from sklearn.impute import IterativeImputer
# from sklearn.impute import KNNImputer
# continuous_imputer = IterativeImputer()
# continuous_imputer.fit(train_data)
# transformed_data = pd.DataFrame(continuous_imputer.transform(train_data),columns=train_data.columns)
# #Replace the values only in the continuous columns
# train_data[continuous_missing] = transformed_data[continuous_missing]

In [278]:
# from sklearn.impute import SimpleImputer
# categorical_imputer = SimpleImputer(strategy='most_frequent')
# categorical_imputer.fit(train_data)
# transformed_data = pd.DataFrame(categorical_imputer.transform(train_data),columns=train_data.columns)
# train_data[categorical_missing] = transformed_data[categorical_missing]

# continuous_imputer = SimpleImputer(strategy='mean')
# continuous_imputer.fit(train_data)
# transformed_data = pd.DataFrame(continuous_imputer.transform(train_data),columns=train_data.columns)
# #Replace the values only in the continuous columns
# train_data[continuous_missing] = transformed_data[continuous_missing]

We will drop the missing training data and impute simply the missing test data

In [279]:
only_train_data = train_data[train_data['is_test'] == 0].copy()
only_test_data = train_data[train_data['is_test'] == 1].copy()

In [280]:
only_train_data.dropna(inplace=True)
train_label = train_label[only_train_data.index]

In [281]:
from sklearn.impute import SimpleImputer
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_imputer.fit(train_data)
transformed_data = pd.DataFrame(categorical_imputer.transform(train_data),columns=train_data.columns)
only_test_data[categorical_missing] = transformed_data[transformed_data['is_test'] == 1][categorical_missing]

continuous_imputer = SimpleImputer(strategy='mean')
continuous_imputer.fit(train_data)
transformed_data = pd.DataFrame(continuous_imputer.transform(train_data),columns=train_data.columns)
#Replace the values only in the continuous columns
only_test_data[continuous_missing] = transformed_data[transformed_data['is_test'] == 1][continuous_missing]

In [282]:
train_data = pd.concat([only_train_data,only_test_data])

Scaling the continuous values

In [289]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(train_data[continuous_columns])
train_data[continuous_columns] = scaler.transform(train_data[continuous_columns])

Now let's continue to the training phase, We will test out different types of models and then choose the best one

In [290]:
from sklearn.model_selection import train_test_split

In [291]:
only_train_data = train_data[train_data['is_test'] == 0].copy().drop('is_test',axis=1)
only_test_data = train_data[train_data['is_test'] == 1].copy().drop('is_test',axis=1)

In [292]:
X_train, X_test, y_train, y_test = train_test_split(only_train_data, train_label, test_size=0.3, random_state=42)

Let's start with simple linear regression as a base line

In [293]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
lr_model = LogisticRegression()
lr_model.fit(X_train,y_train)

In [295]:
pred = lr_model.predict(X_test)
accuracy_score(pred,y_test)

0.7257761053621825

In [296]:
#Results with smart imputation : 0.7553680981595092
#Results with simple imputation : 0.7561349693251533
#Results with droping na : 0.7634054562558796

In [297]:
from sklearn.ensemble import GradientBoostingClassifier
GB_model = GradientBoostingClassifier(random_state=42)
GB_model.fit(X_train, y_train)

In [298]:
y_pred = GB_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8160865475070554

In [299]:
#Results with smart imputation : 0.799079754601227
#Results with simple imputation : 0.799079754601227
#Results with droping na : 0.8151458137347131

In [310]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

GradientBoosting provided the best base line result yets so let's tweak it's hyperparameters to get the best version

In [None]:
# Define the GradientBoostingClassifier
gb_model = GradientBoostingClassifier(random_state=42)

param_grid = {
    'learning_rate': [0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=gb_model, 
                           param_grid=param_grid, 
                           scoring='accuracy', 
                           cv=3, 
                           verbose=1, 
                           n_jobs=-1)

grid_search.fit(X_train, y_train)

In [None]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:\n", best_params)
print("Best Cross-Validation Score:", best_score)

The results are worse than the base line so let's go back to that 
and tweak values closer to the default ones

In [None]:
# Define the GradientBoostingClassifier
gb_model = GradientBoostingClassifier(random_state=42)

param_grid = {
    'learning_rate': [0.18, 0.1, 0.12],
    'n_estimators': [50,100,150],
    'max_depth': [2,3,4],
    'min_samples_split': [10,15,20],
    'min_samples_leaf': [1,2,3]
}

grid_search = GridSearchCV(estimator=gb_model, 
                           param_grid=param_grid, 
                           scoring='accuracy', 
                           cv=3, 
                           verbose=1, 
                           n_jobs=-1)

grid_search.fit(X_train, y_train)

In [None]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:\n", best_params)
print("Best Cross-Validation Score:", best_score)

In [None]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", accuracy)

Some improvement, let's keep digging

In [313]:
# Define the GradientBoostingClassifier
gb_model = GradientBoostingClassifier(random_state=42)

param_grid = {
    'learning_rate': [0.12,0.16,0.2,0.3],
    'n_estimators': [80,100,120],
    'max_depth': [4,5,6],
    'min_samples_split': [12,15,17],
    'min_samples_leaf': [2]
}

grid_search = GridSearchCV(estimator=gb_model, 
                           param_grid=param_grid, 
                           scoring='accuracy', 
                           cv=3, 
                           verbose=1, 
                           n_jobs=-1)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [314]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:\n", best_params)
print("Best Cross-Validation Score:", best_score)

Best Parameters:
 {'learning_rate': 0.12, 'max_depth': 4, 'min_samples_leaf': 2, 'min_samples_split': 17, 'n_estimators': 100}
Best Cross-Validation Score: 0.8069794591680838


In [315]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", accuracy)

Test Set Accuracy: 0.8156161806208843


No improvement

So the best model is GradientBoosting with the parameters we just found

In [316]:
grid_search.best_params_

{'learning_rate': 0.12,
 'max_depth': 4,
 'min_samples_leaf': 2,
 'min_samples_split': 17,
 'n_estimators': 100}

In [317]:
gb_model = GradientBoostingClassifier(learning_rate = 0.12,max_depth=4,min_samples_leaf= 2,min_samples_split= 17,n_estimators=100)

In [318]:
gb_model.fit(only_train_data,train_label)

In [None]:
submission = gb_model.predict(only_test_data)
pid = pd.read_csv('test.csv')['PassengerId']
submission = pd.DataFrame({'PassengerId':pid,'Transported':submission})
submission['Transported'] = submission['Transported'].map({1:True,0:False}) 
submission.to_csv('submission.csv', index=False)