In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from kaggle_titanic_helpers import *
import missingno as msmn

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
# These people are not in the same cabin, but in same passenger group
# Need to look at capacity of cabin sizes
def check_pass_group(group_num):
    return train[train['PassengerId'].apply(lambda x: str(x[:4]))==group_num]

check_pass_group('0138')

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
127,0138_01,Earth,True,G/18/P,TRAPPIST-1e,13.0,False,0.0,0.0,0.0,0.0,0.0,Fayene Gambs,True
128,0138_02,Earth,False,E/5/P,,34.0,False,0.0,22.0,0.0,564.0,207.0,Monah Gambs,False


In [4]:
def basic_prep(df):
    df[['deck', 'number', 'side']] = pd.DataFrame(
        df.Cabin.apply(lambda x: str(x).split('/')).tolist(),
        index= df.index,
        columns=['deck', 'number', 'side'],
    )
    # PassengerId follows XXXX_XX structure for all
    df['passenger_group'] = df['PassengerId'].apply(lambda x: str(x[:4]))
    df['num_group'] = df['PassengerId'].apply(lambda x: float((x[-2:])))
    
    df['total_spend'] = df[spend_cols].sum(axis=1)
    df = df.join(df.groupby('passenger_group')\
                   .agg({'total_spend':'sum','num_group':'max'}),on='passenger_group',rsuffix='_max')
    df['num_group_max'] = df['num_group_max'].astype('int')
    
    df['surname'] = df[df['Name'].notna()].Name.apply(lambda x: str(x).split(" ")[-1])
    
    df.replace({'nan':np.nan},inplace=True) # Needed, since split above creates "nan"
    df.number = df.number.astype(float)


    # Setting up error handling as test_df doesn't have "Transported"
    try:
        df["Transported"] = df["Transported"].replace({True: 1, False: 0})
    except:
        pass
    return df

In [5]:
def fill_missing_basic(df, float_cols, object_cols):
    df[spend_cols] = df[spend_cols].fillna(df[spend_cols].mean())
    df[float_cols] = df[float_cols].fillna(df[float_cols].mean().to_dict())
    object_dict = {k:v[0] for k, v in df[object_cols].mode().to_dict().items()}
    df[object_cols] = df[object_cols].fillna(object_dict)
    return df

In [6]:
# Getting spend cols
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

train_df = basic_prep(train)
test_df = basic_prep(test)

In [7]:
train_df = fill_cryosleep(train_df)

217 Initially
98 After Step 1: Spending Money = No CryoSleep
61 After Step 2: 1 Passenger / Zero Spend = CryoSleep
43 After Step 3: Total Group Spend Zero = CryoSleep
0 After Step 4: Total Group Spend > Zero = No CryoSleep


In [8]:
train_df = fill_homeplanet(train_df)

201 Initially
111 after filling passenger groups
104 after filling Europa decks
63 after filling Earth decks
0 after filling the rest with Earth


In [9]:
float_cols = list(train_df.dtypes[train_df.dtypes==float].index)
object_cols = list(train_df.dtypes[train_df.dtypes==object].index)

float_cols.remove('number')

object_cols.remove('PassengerId')
object_cols.remove('Name')
object_cols.remove('surname')
object_cols.remove('Cabin')
object_cols.remove('passenger_group')

In [10]:
object_cols,float_cols

(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'deck', 'side'],
 ['Age',
  'RoomService',
  'FoodCourt',
  'ShoppingMall',
  'Spa',
  'VRDeck',
  'num_group',
  'total_spend',
  'total_spend_max'])

In [11]:
# Feature encoding - Given the lack of signal for most, I am only going to use CryoSleep, HomePlanet, Destination
# Scaling numerical variables
from sklearn.preprocessing import MinMaxScaler

def feature_transformation(df,discrete_variables,numeric_variables):

    cat_df = pd.get_dummies(df[discrete_variables], drop_first=False)
    
    # define min max scaler
    scaler = MinMaxScaler()
    # transform data
    num_df = pd.DataFrame(scaler.fit_transform(df[numeric_variables]),columns=numeric_variables)
    
    final_df = pd.merge(cat_df,num_df,left_index=True,right_index=True)
    
    return final_df

In [12]:
train_df_trans = feature_transformation(train_df,object_cols,float_cols)

In [13]:
train_df_trans[train_df_trans.isna()==False]

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True,...,side_S,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,num_group,total_spend,total_spend_max
0,0,1,0,1,0,0,0,1,1,0,...,0,0.493671,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1,0,0,1,0,0,0,1,1,0,...,1,0.303797,0.007608,0.000302,0.001064,0.024500,0.001823,0.000000,0.020452,0.013974
2,0,1,0,1,0,0,0,1,0,1,...,1,0.734177,0.003001,0.119948,0.000000,0.299670,0.002030,0.000000,0.288521,0.295417
3,0,1,0,1,0,0,0,1,1,0,...,1,0.417722,0.000000,0.043035,0.015793,0.148563,0.007997,0.142857,0.143830,0.295417
4,1,0,0,1,0,0,0,1,1,0,...,1,0.202532,0.021149,0.002348,0.006428,0.025214,0.000083,0.000000,0.030317,0.020715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0,1,0,1,0,1,0,0,0,1,...,0,0.518987,0.000000,0.228726,0.000000,0.073322,0.003066,0.000000,0.237197,0.162072
8689,1,0,0,0,1,0,1,0,1,0,...,1,0.227848,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8690,1,0,0,1,0,0,0,1,1,0,...,1,0.329114,0.000000,0.000000,0.079687,0.000045,0.000000,0.000000,0.052047,0.035562
8691,0,1,0,1,0,1,0,0,1,0,...,1,0.405063,0.000000,0.035186,0.000000,0.015753,0.134049,0.000000,0.128852,0.179673


In [14]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
train_df_trans2 = pd.DataFrame(imputer.fit_transform(train_df_trans),columns = train_df_trans.columns)

In [15]:
train_df.isna().sum()

PassengerId          0
HomePlanet           0
CryoSleep            0
Cabin              199
Destination        182
Age                179
VIP                203
RoomService        181
FoodCourt          183
ShoppingMall       208
Spa                183
VRDeck             188
Name               200
Transported          0
deck               199
number             199
side               199
passenger_group      0
num_group            0
total_spend          0
total_spend_max      0
num_group_max        0
surname            200
dtype: int64

In [16]:
train_df_trans2.isna().any()

HomePlanet_Earth             False
HomePlanet_Europa            False
HomePlanet_Mars              False
CryoSleep_False              False
CryoSleep_True               False
Destination_55 Cancri e      False
Destination_PSO J318.5-22    False
Destination_TRAPPIST-1e      False
VIP_False                    False
VIP_True                     False
deck_A                       False
deck_B                       False
deck_C                       False
deck_D                       False
deck_E                       False
deck_F                       False
deck_G                       False
deck_T                       False
side_P                       False
side_S                       False
Age                          False
RoomService                  False
FoodCourt                    False
ShoppingMall                 False
Spa                          False
VRDeck                       False
num_group                    False
total_spend                  False
total_spend_max     

In [17]:
import xgboost as xgb

In [18]:
clf = xgb.XGBClassifier()

In [19]:
Y_train = train_df.Transported

clf.fit(train_df_trans2, Y_train)

In [20]:
test_df = fill_cryosleep(test_df)
test_df = fill_homeplanet(test_df)

test_df_trans = feature_transformation(test_df,object_cols,float_cols)

test_df_trans2 = pd.DataFrame(imputer.fit_transform(test_df_trans),columns = test_df_trans.columns)

93 Initially
38 After Step 1: Spending Money = No CryoSleep
23 After Step 2: 1 Passenger / Zero Spend = CryoSleep
16 After Step 3: Total Group Spend Zero = CryoSleep
0 After Step 4: Total Group Spend > Zero = No CryoSleep
87 Initially
46 after filling passenger groups
40 after filling Europa decks
31 after filling Earth decks
0 after filling the rest with Earth


In [22]:
y_sub = clf.predict(test_df_trans2)
y_sub = pd.Series(y_sub).replace({1:'True',0:'False'}).values

pd.merge(test_df['PassengerId'],pd.DataFrame(y_sub,columns=['Transported']),left_index=True,right_index=True)\
            .to_csv('xgb_submission.csv',index=False)

## Grid Search

In [46]:
from sklearn.model_selection import GridSearchCV

parameters = {
     "eta"    : [0.05, 0.10, 0.15, 0.20] ,
     "max_depth"        : [ 5, 8, 10, 12],
     "min_child_weight" : [ 3, 5, 10],
     "gamma"            : [ 0.1, 0.2, 0.3],
     "colsample_bytree" : [ 0.4, 0.5, 0.7],
     "scoring" : ['accuracy','roc_auc','precision']
     }

grid = GridSearchCV(clf,
                    parameters, n_jobs=4,
                    scoring="neg_log_loss",
                    cv=3,
                       verbose=3)

grid.fit(train_df_trans2, Y_train)

Fitting 3 folds for each of 1296 candidates, totalling 3888 fits
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=3, scoring=accuracy;, score=-0.435 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=3, scoring=roc_auc;, score=-0.435 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=5, scoring=accuracy;, score=-0.436 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=5, scoring=precision;, score=-0.447 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=10, scoring=accuracy;, score=-0.437 total time=   0.5s
Parameters: { "scoring" } are

Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=3, scoring=roc_auc;, score=-0.446 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=3, scoring=precision;, score=-0.435 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=3, scoring=precision;, score=-0.410 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=5, scoring=roc_auc;, score=-0.410 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=5, scoring=precision;, score=-0.436 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.4, eta=0.05, gamma=0.

Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=3, scoring=accuracy;, score=-0.410 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=3, scoring=precision;, score=-0.446 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=5, scoring=accuracy;, score=-0.410 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=5, scoring=roc_auc;, score=-0.447 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=5, scoring=precision;, score=-0.410 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.4, eta=0.05, gamma=0.

Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=3, scoring=accuracy;, score=-0.446 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=3, scoring=roc_auc;, score=-0.410 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=5, scoring=accuracy;, score=-0.447 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=5, scoring=roc_auc;, score=-0.436 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1, max_depth=5, min_child_weight=10, scoring=accuracy;, score=-0.446 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.4, eta=0.05, gamma=0.1,

Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=8, min_child_weight=5, scoring=roc_auc;, score=-0.403 total time=   0.7s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=8, min_child_weight=10, scoring=accuracy;, score=-0.443 total time=   0.7s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=8, min_child_weight=10, scoring=roc_auc;, score=-0.426 total time=   0.8s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=8, min_child_weight=10, scoring=precision;, score=-0.405 total time=   0.8s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=10, min_child_weight=3, scoring=roc_auc;, score=-0.450 total time=   0.9s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.4, eta=0.15, gamma=0

Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=8, min_child_weight=5, scoring=precision;, score=-0.444 total time=   0.7s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=8, min_child_weight=10, scoring=accuracy;, score=-0.426 total time=   0.7s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=8, min_child_weight=10, scoring=roc_auc;, score=-0.405 total time=   0.8s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=10, min_child_weight=3, scoring=accuracy;, score=-0.450 total time=   0.9s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=10, min_child_weight=3, scoring=roc_auc;, score=-0.437 total time=   0.9s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.4, eta=0.15, gamma=

Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=8, min_child_weight=10, scoring=roc_auc;, score=-0.443 total time=   0.8s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=8, min_child_weight=10, scoring=precision;, score=-0.426 total time=   0.7s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=10, min_child_weight=3, scoring=accuracy;, score=-0.411 total time=   0.9s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=10, min_child_weight=3, scoring=precision;, score=-0.450 total time=   0.9s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=10, min_child_weight=5, scoring=accuracy;, score=-0.434 total time=   0.9s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.4, eta=0.15, gam

Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=8, min_child_weight=10, scoring=precision;, score=-0.443 total time=   0.8s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=10, min_child_weight=3, scoring=accuracy;, score=-0.437 total time=   0.9s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=10, min_child_weight=3, scoring=roc_auc;, score=-0.411 total time=   0.9s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=10, min_child_weight=5, scoring=accuracy;, score=-0.454 total time=   0.9s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.4, eta=0.15, gamma=0.2, max_depth=10, min_child_weight=5, scoring=roc_auc;, score=-0.434 total time=   0.9s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.4, eta=0.15, gamma

Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=12, min_child_weight=3, scoring=roc_auc;, score=-0.445 total time=   1.4s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=12, min_child_weight=3, scoring=precision;, score=-0.445 total time=   1.4s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=12, min_child_weight=5, scoring=accuracy;, score=-0.403 total time=   1.3s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=12, min_child_weight=5, scoring=roc_auc;, score=-0.403 total time=   1.3s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=12, min_child_weight=10, scoring=accuracy;, score=-0.441 total time=   1.3s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.5, eta=0.05, gamm

Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=12, min_child_weight=3, scoring=accuracy;, score=-0.427 total time=   1.4s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=12, min_child_weight=3, scoring=roc_auc;, score=-0.404 total time=   1.4s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=12, min_child_weight=5, scoring=accuracy;, score=-0.441 total time=   1.3s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=12, min_child_weight=5, scoring=roc_auc;, score=-0.426 total time=   1.3s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=12, min_child_weight=5, scoring=precision;, score=-0.403 total time=   1.4s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.5, eta=0.05, gamma

Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=12, min_child_weight=10, scoring=accuracy;, score=-0.426 total time=   1.3s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=12, min_child_weight=10, scoring=roc_auc;, score=-0.403 total time=   1.2s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.5, eta=0.1, gamma=0.1, max_depth=5, min_child_weight=3, scoring=accuracy;, score=-0.437 total time=   0.6s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.5, eta=0.1, gamma=0.1, max_depth=5, min_child_weight=3, scoring=accuracy;, score=-0.395 total time=   0.6s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.5, eta=0.1, gamma=0.1, max_depth=5, min_child_weight=3, scoring=roc_auc;, score=-0.395 total time=   0.6s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.5, eta=0.1, gamma=0.1, 

Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=12, min_child_weight=5, scoring=roc_auc;, score=-0.441 total time=   1.4s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=12, min_child_weight=5, scoring=precision;, score=-0.426 total time=   1.3s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=12, min_child_weight=10, scoring=accuracy;, score=-0.403 total time=   1.3s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.5, eta=0.05, gamma=0.3, max_depth=12, min_child_weight=10, scoring=precision;, score=-0.441 total time=   1.2s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.5, eta=0.1, gamma=0.1, max_depth=5, min_child_weight=3, scoring=accuracy;, score=-0.423 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.5, eta=0.1, gamm

Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max_depth=5, min_child_weight=10, scoring=accuracy;, score=-0.398 total time=   0.7s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max_depth=5, min_child_weight=10, scoring=precision;, score=-0.439 total time=   0.5s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max_depth=8, min_child_weight=3, scoring=accuracy;, score=-0.438 total time=   0.7s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max_depth=8, min_child_weight=3, scoring=roc_auc;, score=-0.409 total time=   0.8s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max_depth=8, min_child_weight=5, scoring=accuracy;, score=-0.449 total time=   0.8s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, ma

Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max_depth=8, min_child_weight=3, scoring=precision;, score=-0.438 total time=   0.8s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max_depth=8, min_child_weight=5, scoring=accuracy;, score=-0.409 total time=   0.7s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max_depth=8, min_child_weight=5, scoring=precision;, score=-0.449 total time=   0.7s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max_depth=8, min_child_weight=10, scoring=accuracy;, score=-0.434 total time=   0.8s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max_depth=8, min_child_weight=10, scoring=roc_auc;, score=-0.405 total time=   0.8s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, m

Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=10, min_child_weight=3, scoring=precision;, score=-0.450 total time=   1.4s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=10, min_child_weight=5, scoring=accuracy;, score=-0.429 total time=   1.3s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=10, min_child_weight=5, scoring=roc_auc;, score=-0.429 total time=   1.2s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=10, min_child_weight=5, scoring=precision;, score=-0.401 total time=   1.3s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max_depth=8, min_child_weight=3, scoring=accuracy;, score=-0.409 total time=   0.8s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2,

Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max_depth=5, min_child_weight=10, scoring=accuracy;, score=-0.430 total time=   0.6s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max_depth=5, min_child_weight=10, scoring=roc_auc;, score=-0.398 total time=   0.7s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max_depth=8, min_child_weight=3, scoring=accuracy;, score=-0.449 total time=   0.7s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max_depth=8, min_child_weight=3, scoring=roc_auc;, score=-0.438 total time=   0.8s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max_depth=8, min_child_weight=3, scoring=precision;, score=-0.409 total time=   0.8s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.5, eta=0.2, gamma=0.2, max

Parameters: { "scoring" } are not used.



Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=10, min_child_weight=10, scoring=accuracy;, score=-0.441 total time=   1.2s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=10, min_child_weight=10, scoring=roc_auc;, score=-0.441 total time=   1.2s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=10, min_child_weight=10, scoring=precision;, score=-0.402 total time=   1.2s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=12, min_child_weight=3, scoring=roc_auc;, score=-0.453 total time=   1.5s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=12, min_child_weight=3, scoring=precision;, score=-0.435 total time=   1.8s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.7, eta=0.1, gamma=0

Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=10, min_child_weight=10, scoring=accuracy;, score=-0.402 total time=   1.2s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=10, min_child_weight=10, scoring=precision;, score=-0.441 total time=   1.2s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=12, min_child_weight=3, scoring=accuracy;, score=-0.435 total time=   1.5s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=12, min_child_weight=3, scoring=roc_auc;, score=-0.410 total time=   1.7s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=12, min_child_weight=5, scoring=accuracy;, score=-0.453 total time=   1.7s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.7, eta=0.1, gamma=0.

Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=10, min_child_weight=3, scoring=roc_auc;, score=-0.450 total time=   1.6s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=10, min_child_weight=3, scoring=precision;, score=-0.427 total time=   1.3s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=10, min_child_weight=5, scoring=accuracy;, score=-0.401 total time=   1.3s
Parameters: { "scoring" } are not used.

[CV 1/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=10, min_child_weight=5, scoring=precision;, score=-0.445 total time=   1.2s
Parameters: { "scoring" } are not used.

[CV 2/3] END colsample_bytree=0.7, eta=0.1, gamma=0.3, max_depth=10, min_child_weight=10, scoring=accuracy;, score=-0.430 total time=   1.2s
Parameters: { "scoring" } are not used.

[CV 3/3] END colsample_bytree=0.7, eta=0.1, gamma=0.

In [31]:
grid.best_score_

-0.4167815128549101

In [32]:
grid.best_params_

{'colsample_bytree': 0.5,
 'eta': 0.15,
 'gamma': 0.2,
 'max_depth': 5,
 'min_child_weight': 5}

In [42]:
clf = xgb.XGBClassifier(colsample_bytree= 0.5,
 eta= 0.15,
 gamma= 0.2,
 max_depth= 5,
 min_child_weight= 5)

In [43]:
clf.fit(train_df_trans2, Y_train)

In [45]:
y_sub = clf.predict(test_df_trans2)
y_sub = pd.Series(y_sub).replace({1:'True',0:'False'}).values

pd.merge(test_df['PassengerId'],pd.DataFrame(y_sub,columns=['Transported']),left_index=True,right_index=True)\
            .to_csv('xgb_submission_gs.csv',index=False)