In [15]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mutual_info_score, roc_auc_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [16]:
df_train = pd.read_csv('train.csv')
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [17]:
del df_train['Name']

In [18]:
df_cabin = df_train['Cabin'].str.split('/')

In [19]:
df_cabin = df_cabin.fillna(0)

In [20]:
deck = []
num = []
side  = []
for x in df_cabin:
    if x!=0:
        deck.append(x[0])
        num.append(x[1])
        side.append(x[2])
    else:
        deck.append(0)
        num.append(0)
        side.append(0)

In [21]:
df_train['deck'] = deck
df_train['num'] = num
df_train['side'] = side

In [22]:
del df_train['Cabin']

In [23]:
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,deck,num,side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S


In [26]:
group = []
number  = []
for i in df_train.PassengerId.str.split('_'):
    group.append(i[0])
    number.append(i[1])

In [33]:
df_train['group'] = group
df_train['number'] = number

In [34]:
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,deck,num,side,group,number
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P,1,1
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S,2,1
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S,3,1
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S,3,2
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S,4,1


In [35]:
df_train.nunique()

PassengerId     8693
HomePlanet         3
CryoSleep          2
Destination        3
Age               80
VIP                2
RoomService     1273
FoodCourt       1507
ShoppingMall    1115
Spa             1327
VRDeck          1306
Transported        2
deck               9
num             1818
side               3
group           6217
number             8
dtype: int64

In [36]:
df = df_train.copy()

In [38]:
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
deck              0
num               0
side              0
group             0
number            0
dtype: int64

In [39]:
df.Transported = df.Transported.astype(int)
df.group = df.group.astype(int)
df.number = df.number.astype(int)

In [41]:
df['side'] = (df.side == 'P').astype(int) # P is True S is False

In [42]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,deck,num,side,group,number
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0,B,0,1,1,1
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,1,F,0,0,2,1
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0,A,0,0,3,1
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0,A,0,0,3,2
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,1,F,1,0,4,1


In [45]:
num = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'group', 'number']
cat = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'side']

In [47]:
df_fill_zero = df.fillna(0)

In [48]:
for n in num:
    score = roc_auc_score(df_fill_zero.Transported, df_fill_zero[n])
    if score < 0.5:
        print(n, roc_auc_score(df_fill_zero.Transported, -df_fill_zero[n]))
    else:
        print(n, score)

Age 0.5395503801531625
RoomService 0.6769596957716001
FoodCourt 0.591758592816606
ShoppingMall 0.6099563179851644
Spa 0.6825356901435439
VRDeck 0.6700734791623767
group 0.5125714689533203
number 0.5403735733338556


In [49]:
df_fill_zero[num].corrwith(df_fill_zero.Transported)

Age            -0.071330
RoomService    -0.241124
FoodCourt       0.045583
ShoppingMall    0.009391
Spa            -0.218545
VRDeck         -0.204874
group           0.021491
number          0.066390
dtype: float64

In [51]:
df_fill_zero.CryoSleep = df_fill_zero.CryoSleep.astype(int)
df_fill_zero.VIP = df_fill_zero.VIP.astype(int)

In [55]:
round(mutual_info_score(df_fill_zero['side'], df_fill_zero.Transported), 3)

0.005

In [59]:
for v in ['europa', 'earth', 'mars']:
    df_fill_zero[v] = (df_fill_zero.HomePlanet == v).astype(int)

In [60]:
df_fill_zero.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,deck,num,side,group,number,europa,earth,mars
0,0001_01,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0,B,0,1,1,1,0,0,0
1,0002_01,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,F,0,0,2,1,0,0,0
2,0003_01,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,A,0,0,3,1,0,0,0
3,0003_02,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,A,0,0,3,2,0,0,0
4,0004_01,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1,F,1,0,4,1,0,0,0


In [61]:
num = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'group', 'number']
cat = ['CryoSleep', 'VIP', 'side', 'europa', 'mars', 'earth']

In [63]:
for c in cat:
    print(c, round(mutual_info_score(df_fill_zero[c], df_fill_zero.Transported), 3))

CryoSleep 0.112
VIP 0.001
side 0.005
europa 0.0
mars 0.0
earth 0.0


In [62]:
df_train, df_val = train_test_split(df_fill_zero, test_size = 0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

y_train = df_train.Transported.values
y_val = df_val.Transported.values

del df_train['Transported']
del df_val['Transported']

In [64]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[cat + num].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[cat + num].to_dict(orient='records')
X_val = dv.fit_transform(val_dict)

In [65]:
rf = RandomForestClassifier(max_depth=10, min_samples_leaf=3, n_estimators=200)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, min_samples_leaf=3, n_estimators=200)

In [66]:
y_pred = rf.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

0.8629760622260165

In [67]:
rf_parameters = {'max_depth' : [None,5,10,15,25],
                'min_samples_leaf' : [1,3,5,10,50],
                'n_estimators': np.arange(100,501,50)
                }
scores = []

for max_depth in rf_parameters['max_depth']:
    for min_samples_leaf in rf_parameters['min_samples_leaf']:
        for n_estimators in rf_parameters['n_estimators']:
            rf = RandomForestClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf, n_estimators=n_estimators)
            rf.fit(X_train, y_train)

            y_pred = rf.predict_proba(X_val)[:, 1]
            score = roc_auc_score(y_val, y_pred)
            scores.append([max_depth, min_samples_leaf, n_estimators, score])

In [68]:
scores = pd.DataFrame(scores, columns=['max_depth', 'min_samples_leaf', 'n_estimators', 'score'])
scores.sort_values('score', ascending=False)

Unnamed: 0,max_depth,min_samples_leaf,n_estimators,score
156,15.0,5,250,0.865852
205,25.0,5,450,0.865798
16,,3,450,0.865661
151,15.0,3,450,0.865588
149,15.0,3,350,0.865563
...,...,...,...,...
73,5.0,10,150,0.848462
86,5.0,50,350,0.848267
88,5.0,50,450,0.848049
89,5.0,50,500,0.847919


In [69]:
from catboost import CatBoostClassifier, Pool

In [103]:
model = CatBoostClassifier(iterations=5,
                           depth=10,
                           learning_rate=1,
                           loss_function='Logloss',
                           verbose=True)
model.fit(X_train, y_train)

0:	learn: 0.5081738	total: 15ms	remaining: 60.2ms
1:	learn: 0.4727607	total: 30.3ms	remaining: 45.4ms
2:	learn: 0.4548077	total: 45.7ms	remaining: 30.4ms
3:	learn: 0.4444712	total: 61ms	remaining: 15.2ms
4:	learn: 0.4378506	total: 76.6ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1781bbde9a0>

In [104]:
y_pred = model.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

0.862109647607239

In [105]:
preds_class = model.predict(X_val)
roc_auc_score(y_val, preds_class)

0.797451233476296

In [109]:
import xgboost as xgb
from xgboost import XGBClassifier

In [152]:
model = XGBClassifier(silent=False, 
                      scale_pos_weight=1,
                      learning_rate=0.1,  
                      colsample_bytree = 0.6,
                      subsample = 0.6,
                      objective='binary:logistic', 
                      n_estimators=1000, 
                      reg_alpha = 0.3,
                      max_depth=4, 
                      gamma=10)
model.fit(X_train, y_train)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.6,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=10, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=1000,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0.3, reg_lambda=1, ...)

In [153]:
y_pred = model.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

0.8720535127480674

In [154]:
preds_class = model.predict(X_val)
roc_auc_score(y_val, preds_class)

0.8093701512964883

In [155]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [157]:
del df_test['Name']

In [158]:
df_cabin = df_test['Cabin'].str.split('/')
df_cabin = df_cabin.fillna(0)

deck = []
num = []
side  = []
for x in df_cabin:
    if x!=0:
        deck.append(x[0])
        num.append(x[1])
        side.append(x[2])
    else:
        deck.append(0)
        num.append(0)
        side.append(0)

df_test['deck'] = deck
df_test['num'] = num
df_test['side'] = side

del df_test['Cabin']

In [159]:
group = []
number  = []
for i in df_test.PassengerId.str.split('_'):
    group.append(i[0])
    number.append(i[1])
    
df_test['group'] = group
df_test['number'] = number

In [166]:
df_test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,deck,num,side,group,number
0,0013_01,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,3,0,13,1
1,0018_01,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,4,0,18,1
2,0019_01,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,0,0,19,1
3,0021_01,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,1,0,21,1
4,0023_01,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,5,0,23,1


In [165]:
df_test.group = df_test.group.astype(int)
df_test.number = df_test.number.astype(int)
df_test['side'] = (df_test.side == 'P').astype(int) # P is True S is False

In [169]:
df_test = df_test.fillna(0)

In [170]:
for v in ['europa', 'earth', 'mars']:
    df_test[v] = (df_test.HomePlanet == v).astype(int)

In [167]:
num = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'group', 'number']
cat = ['CryoSleep', 'VIP', 'side', 'europa', 'mars', 'earth']

In [173]:
test_dict = df_test[cat + num].to_dict(orient='records')
X_test = dv.fit_transform(test_dict)

In [174]:
preds_class = model.predict(X_test)

In [183]:
df_test['Transported'] = preds_class==1

In [184]:
df_test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,deck,num,side,group,number,europa,earth,mars,Transported
0,0013_01,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,3,0,13,1,0,0,0,True
1,0018_01,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,4,0,18,1,0,0,0,False
2,0019_01,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,0,0,19,1,0,0,0,True
3,0021_01,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,1,0,21,1,0,0,0,True
4,0023_01,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,5,0,23,1,0,0,0,True


In [185]:
df_test_final = df_test[['PassengerId', 'Transported']]
df_test_final = df_test_final.set_index('PassengerId')
df_test_final.to_csv('XGboost.csv')

In [186]:
df_test_final

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,True
