In [1]:
import gc
import pandas as pd
import seaborn as sns
import numpy as np
import plotly.express as px
from tqdm import tqdm
gc.collect()

##disable deprecation warnings
import warnings
def fxn():
    warnings.warn("deprecated", DeprecationWarning)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()

basepath = 'C:/Users/avasquez/Desktop/kaggle/spaceship_titanic/'

train_path = basepath + 'train.csv'
train_df = pd.read_csv(train_path, index_col="PassengerId")
y_train = train_df['Transported'].values
train_df = train_df.drop('Transported', axis=1)
train_df = train_df.drop(['Name', 'Cabin'], axis=1)

test_path = basepath + 'test.csv'
test_df = pd.read_csv(test_path, index_col="PassengerId")
test_df = test_df.drop(['Name', 'Cabin'], axis=1)

In [2]:
print('----------Summary----------')
print('Initial Sample Size: ', y_train.shape)
print('Initial Num Features: ', len(list(train_df)))
print('\nInitial Features: ', list(train_df))
print('\n')
print('Statistics:\n', train_df.describe())
print('\nTrain shape: ', train_df.values.shape)
print('\nTest shape: ', test_df.values.shape)
train_df.head()

----------Summary----------
Initial Sample Size:  (8693,)
Initial Num Features:  10

Initial Features:  ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']


Statistics:
                Age   RoomService     FoodCourt  ShoppingMall           Spa  \
count  8514.000000   8512.000000   8510.000000   8485.000000   8510.000000   
mean     28.827930    224.687617    458.077203    173.729169    311.138778   
std      14.489021    666.717663   1611.489240    604.696458   1136.705535   
min       0.000000      0.000000      0.000000      0.000000      0.000000   
25%      19.000000      0.000000      0.000000      0.000000      0.000000   
50%      27.000000      0.000000      0.000000      0.000000      0.000000   
75%      38.000000     47.000000     76.000000     27.000000     59.000000   
max      79.000000  14327.000000  29813.000000  23492.000000  22408.000000   

             VRDeck  
count   8505.000000  
mean     304.854

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0


In [3]:
%%time

# ##combine X_train and X_test for encoding
data_tmp = pd.concat([train_df, test_df])
data_df = pd.DataFrame(data_tmp, columns = list(train_df))

##insert average
for col in tqdm(['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], 
                desc='Filling Empty Values with Mean of Column.'):
    mean_avg = data_df[col].mean()
    data_df.loc[data_df[col] == '', col] = mean_avg

def one_hot_encode(col_name, dataframe):
    one_hot = pd.get_dummies(dataframe[col_name])
    dataframe = dataframe.drop(col_name, axis=1)
    dataframe = dataframe.join(one_hot, rsuffix='_right')
    print('Deleted Column: ', col_name)
    return dataframe

##encode data
for col in tqdm(['HomePlanet', 'Destination'], 
                desc='Encoding Categorical Columns.'):
    data_df = one_hot_encode(col, data_df)
    
# ##change train/test sets to float
data_df = data_df.astype(np.float64)

##separate back into train and test sets
X_test = data_df.values[8693:]
X_train = data_df.values[:8693]
print('\nX_train Shape: ', X_train.shape)
print('X_test Shape: ', X_test.shape)

##fill X_train nan values with mean using numpy
col_mean = np.nanmean(data_df.values, axis=0)
inds = np.where(np.isnan(X_train))
X_train[inds] = np.take(col_mean, inds[1])

##fill X_test nan values with mean using numpy
col_mean = np.nanmean(data_df.values, axis=0)
inds = np.where(np.isnan(X_test))
X_test[inds] = np.take(col_mean, inds[1])
    
##debug
# data_df.to_csv('debug.csv')

Filling Empty Values with Mean of Column.: 100%|███████████████████████████████████████| 5/5 [00:00<00:00, 2495.72it/s]
Encoding Categorical Columns.: 100%|████████████████████████████████████████████████████| 2/2 [00:00<00:00, 333.12it/s]

Deleted Column:  HomePlanet
Deleted Column:  Destination

X_train Shape:  (8693, 14)
X_test Shape:  (4277, 14)
CPU times: total: 15.6 ms
Wall time: 19 ms





In [4]:
%%time

from sklearn.preprocessing import StandardScaler, MinMaxScaler

##scale X_train
scaler = MinMaxScaler()
data= scaler.fit_transform(data_df.values)

##separate back into train and test sets
X_test = data[8693:]
X_train = data[:8693]
print('\nX_train Shape: ', X_train.shape)
print('X_test Shape: ', X_test.shape)

print(X_train)
print('\n')


X_train Shape:  (8693, 14)
X_test Shape:  (4277, 14)
[[0.         0.49367089 0.         ... 0.         0.         1.        ]
 [0.         0.30379747 0.         ... 0.         0.         1.        ]
 [0.         0.73417722 1.         ... 0.         0.         1.        ]
 ...
 [0.         0.32911392 0.         ... 0.         0.         1.        ]
 [0.         0.40506329 0.         ... 1.         0.         0.        ]
 [0.         0.55696203 0.         ... 0.         0.         1.        ]]


CPU times: total: 31.2 ms
Wall time: 34 ms


In [5]:
##Direct correlation between each column of X and the target y
corrs = np.array([np.correlate(X_train[:,j], y_train)[0] for j in range(X_train.shape[1])])

##Reverse sort, numpy array negation reverses the order
ranks = np.argsort((-corrs))

##Display top-9 and bot-5
rankings = [(f'{corrs[j]:.1f}', list(data_df)[j]) for j in ranks]
print('\nMost correlated feature to target classes: ')
print('----------------------------------------------')
display(rankings[:])
print('\nLeast correlated feature to target classes: ')
print('----------------------------------------------')
display(rankings[-10:])


Most correlated feature to target classes: 
----------------------------------------------


[('2787.0', 'TRAPPIST-1e'),
 ('2521.4', 'CryoSleep'),
 ('1951.0', 'Earth'),
 ('1538.9', 'Age'),
 ('1404.0', 'Europa'),
 ('1098.0', '55 Cancri e'),
 ('920.0', 'Mars'),
 ('401.0', 'PSO J318.5-22'),
 ('78.2', 'VIP'),
 ('78.0', 'FoodCourt'),
 ('33.5', 'ShoppingMall'),
 ('20.2', 'RoomService'),
 ('13.5', 'VRDeck'),
 ('13.1', 'Spa')]


Least correlated feature to target classes: 
----------------------------------------------


[('1404.0', 'Europa'),
 ('1098.0', '55 Cancri e'),
 ('920.0', 'Mars'),
 ('401.0', 'PSO J318.5-22'),
 ('78.2', 'VIP'),
 ('78.0', 'FoodCourt'),
 ('33.5', 'ShoppingMall'),
 ('20.2', 'RoomService'),
 ('13.5', 'VRDeck'),
 ('13.1', 'Spa')]

In [6]:
##remove least correlated features
data_df = data_df.drop(['Spa'], axis=1)

##scale X_train
scaler = MinMaxScaler()
data = scaler.fit_transform(data_df.values)

##separate back into train and test sets
X_test = data[8693:]
X_train = data[:8693]
print('\nX_train Shape: ', X_train.shape)
print('X_test Shape: ', X_test.shape)

print(X_train)
print('\n')


X_train Shape:  (8693, 13)
X_test Shape:  (4277, 13)
[[0.         0.49367089 0.         ... 0.         0.         1.        ]
 [0.         0.30379747 0.         ... 0.         0.         1.        ]
 [0.         0.73417722 1.         ... 0.         0.         1.        ]
 ...
 [0.         0.32911392 0.         ... 0.         0.         1.        ]
 [0.         0.40506329 0.         ... 1.         0.         0.        ]
 [0.         0.55696203 0.         ... 0.         0.         1.        ]]




In [7]:
%%time

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(criterion='gini')
dtr_scores = cross_val_score(dt_clf, X_train, y_train, cv=10, verbose=1, n_jobs=-1)

print('Cross Validation Scores: ', dtr_scores)
print('Cross Validation Mean Score: ', sum(dtr_scores)/len(dtr_scores))
print('Cross Validation Stdev Score: ', np.std(dtr_scores))
print('\n')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Cross Validation Scores:  [0.73678161 0.68505747 0.73333333 0.72382048 0.69044879 0.72842348
 0.72151899 0.69620253 0.73647871 0.71921749]
Cross Validation Mean Score:  0.7171282885599777
Cross Validation Stdev Score:  0.018432377048894423


CPU times: total: 703 ms
Wall time: 884 ms


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.7s finished


In [8]:
# %%time

from sklearn.neural_network import MLPClassifier

mlp_clf = MLPClassifier(hidden_layer_sizes=(50,), max_iter=300, verbose=1, solver='adam', activation='tanh', 
                        learning_rate='adaptive', early_stopping=True, alpha=0.001)
mlp_scores = cross_val_score(mlp_clf, X_train, y_train, cv=10, verbose=1, n_jobs=-1)

print('Cross Validation Scores: ', mlp_scores)
print('Cross Validation Mean Score: ', sum(mlp_scores)/len(mlp_scores))
print('Cross Validation Stdev Score: ', np.std(mlp_scores))
print('\n')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Cross Validation Scores:  [0.76436782 0.72298851 0.73908046 0.77445339 0.76294591 0.75373993
 0.7410817  0.76064442 0.76869965 0.76179517]
Cross Validation Mean Score:  0.7549796965728873
Cross Validation Stdev Score:  0.015061582155308473




[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.1s finished


In [9]:
%%time

from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(max_depth=5, n_estimators=500, n_jobs=-1, warm_start=True, 
                                ccp_alpha=0.0011, max_features='log2', min_samples_leaf=2)

# rf_clf = RandomForestClassifier(verbose=1, n_estimators=500)

rf_scores = cross_val_score(rf_clf, X_train, y_train, cv=10, verbose=1, n_jobs=-1)

print('Cross Validation Scores: ', rf_scores)
print('Cross Validation Mean Score: ', sum(rf_scores)/len(rf_scores))
print('Cross Validation Stdev Score: ', np.std(rf_scores))
print('\n')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Cross Validation Scores:  [0.76896552 0.73448276 0.74827586 0.77100115 0.72612198 0.76294591
 0.74453395 0.77560414 0.78365938 0.77100115]
Cross Validation Mean Score:  0.7586591801912623
Cross Validation Stdev Score:  0.018143569480779085


CPU times: total: 31.2 ms
Wall time: 1.73 s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.6s finished


In [10]:
%%time

##split a small validation set for sanity
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.01, random_state=42)

rf_clf = RandomForestClassifier(verbose=1, 
                                max_depth=25, 
                                n_estimators=500, 
                                n_jobs=-1, 
                                warm_start=True, 
                                ccp_alpha=0.0011, 
                                max_features='log2', 
                                min_samples_leaf=2).fit(X_tr, y_tr)

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
preds = rf_clf.predict(X_val)
accuracy = accuracy_score(y_val, preds)
f1 = f1_score(y_val, preds)

print('Accuracy Score: ', accuracy)
print('F1 Score: ', f1)
print('\n')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    0.1s


Accuracy Score:  0.7701149425287356
F1 Score:  0.7777777777777777


CPU times: total: 3.72 s
Wall time: 714 ms


[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    0.5s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:    0.0s finished


In [11]:
%%time

##get test predictions
preds = rf_clf.predict(X_test)
print('Prediction Shape: ', preds.shape)
print('Last prediction: ', preds[-1])

##write to csv
preds_df = pd.DataFrame(test_df.index.values, columns=['PassengerId'])
preds_df['Transported'] = preds 
preds_df.to_csv(basepath + 'vasquez_submission.csv', index = False)

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.0s


Prediction Shape:  (4277,)
Last prediction:  True
CPU times: total: 219 ms
Wall time: 95.6 ms


[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:    0.0s finished
