In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
average_age = train_data['Age'].mean()
std_age = train_data['Age'].std()
count_age = train_data['Age'].isnull().sum()

In [4]:
random_1 = np.random.randint(average_age - std_age, average_age + std_age,size = count_age)

In [5]:
train_data[np.isnan(train_data['Age'])]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [6]:
train_data[np.isnan(train_data['Age'])]['Age']

5     NaN
17    NaN
19    NaN
26    NaN
28    NaN
       ..
859   NaN
863   NaN
868   NaN
878   NaN
888   NaN
Name: Age, Length: 177, dtype: float64

In [7]:
train_data[np.isnan(train_data['Age'])]['Age']=random_1

In [8]:
train_data['Age'][np.isnan(train_data['Age'])]

5     NaN
17    NaN
19    NaN
26    NaN
28    NaN
       ..
859   NaN
863   NaN
868   NaN
878   NaN
888   NaN
Name: Age, Length: 177, dtype: float64

In [9]:
train_data['Age'][np.isnan(train_data['Age'])]=random_1

In [10]:
train_data['Age'][np.isnan(train_data['Age'])]

Series([], Name: Age, dtype: float64)

In [11]:
train_data['Age'] = train_data['Age'].round().astype(int)

In [12]:
average_age = test_data['Age'].mean()
std_age = test_data['Age'].std()
count_age = test_data['Age'].isnull().sum()

In [13]:
random_1 = np.random.randint(average_age - std_age, average_age + std_age,size = count_age)
test_data['Age'][np.isnan(test_data['Age'])] = random_1
test_data['Age'] = test_data['Age'].astype(int)

In [14]:
train_data['AgeBucket'] = train_data['Age']//15 * 15
train_data[['AgeBucket','Survived']].groupby(['AgeBucket']).mean()

Unnamed: 0_level_0,Survived
AgeBucket,Unnamed: 1_level_1
0,0.576923
15,0.352645
30,0.378738
45,0.404494
60,0.24
75,1.0


In [15]:
train_data['RelativesOnboard'] = train_data['SibSp'] + train_data['Parch']
train_data[['RelativesOnboard','Survived']].groupby('RelativesOnboard').mean()

Unnamed: 0_level_0,Survived
RelativesOnboard,Unnamed: 1_level_1
0,0.303538
1,0.552795
2,0.578431
3,0.724138
4,0.2
5,0.136364
6,0.333333
7,0.0
10,0.0


In [16]:
test_data['RelativesOnboard'] = test_data['SibSp'] + test_data['Parch']

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

class CategoricalEncoder(BaseEstimator, TransformerMixin):

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64, handle_unknown='error'):

        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):

        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
            "or 'ordinal', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
            "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
            " encoding='ordinal'")

        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):

        X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=np.int)
        X_mask = np.ones_like(X, dtype=np.bool)

        for i in range(n_features):
            valid_mask = np.in1d(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:

                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

In [19]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent = pd.Series([X[c].value_counts().index[0] for c in X],index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent)

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(["SibSp", "Parch", "Fare",'RelativesOnboard'])),
        ("imputer", SimpleImputer(strategy="median")),
        ('Scaler', StandardScaler())
    ])

cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked","AgeBucket"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", CategoricalEncoder(encoding='onehot-dense')),
    ])

In [21]:
from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [22]:
X_train = preprocess_pipeline.fit_transform(train_data)

In [23]:
X_train

array([[ 0.43279337, -0.47367361, -0.50244517, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.43279337, -0.47367361,  0.78684529, ...,  0.        ,
         0.        ,  0.        ],
       [-0.4745452 , -0.47367361, -0.48885426, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.43279337,  2.00893337, -0.17626324, ...,  0.        ,
         0.        ,  0.        ],
       [-0.4745452 , -0.47367361, -0.04438104, ...,  0.        ,
         0.        ,  0.        ],
       [-0.4745452 , -0.47367361, -0.49237783, ...,  0.        ,
         0.        ,  0.        ]])

In [24]:
y_train = train_data['Survived']

In [25]:
from sklearn.model_selection import train_test_split

validation_size = 0.2
seed = 7

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = validation_size, random_state=seed)

In [26]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression()

In [27]:
some_data = train_data[:5]
some_label = y_train.iloc[:5]
some_data_prepared = preprocess_pipeline.transform(some_data)

In [28]:
print("예측 :", lin_reg.predict(some_data_prepared))
print("레이블 :", list(some_label))

예측 : [0.01965332 0.96105957 0.63879395 0.87902832 0.10559082]
레이블 : [0, 0, 1, 0, 0]


In [29]:
from sklearn.metrics import mean_squared_error
predictions = lin_reg.predict(X_train)
lin_mse = mean_squared_error(y_train, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.3650349728957788

In [30]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)

prediction = tree_reg.predict(X_train)
tree_mse = mean_squared_error(y_train, prediction)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.1764345736033211

In [31]:
X_train = preprocess_pipeline.fit_transform(train_data)
y_train = train_data['Survived']

from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, X_train, y_train, scoring="neg_mean_squared_error", cv=10)

tree_rmse_scores = np.sqrt(-scores)

In [32]:
def display_scores(scores) :
    print("Scores :", scores)
    print("Mean :", scores.mean())
    print("Standard deviation :", scores.std())

display_scores(tree_rmse_scores)

Scores : [0.45988684 0.42986883 0.4148682  0.46350363 0.39188172 0.41463984
 0.40501459 0.44311621 0.35211957 0.40628388]
Mean : 0.41811832994779746
Standard deviation : 0.03153731686525303


In [33]:
lin_scores = cross_val_score(lin_reg, X_train, y_train, scoring="neg_mean_squared_error", cv=10)

lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores : [3.91699353e-01 3.79426244e-01 3.99970331e-01 3.89946580e-01
 3.87538767e-01 3.65243770e-01 3.99030395e-01 3.47126504e+10
 3.22681005e-01 3.74301803e-01]
Mean : 3471265041.921869
Standard deviation : 10413795124.628996


In [34]:
X_train_df = pd.DataFrame(X_train)

In [35]:
X_train_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,4.3860660000000004e-17,5.3829000000000005e-17,3.9873330000000004e-18,-3.9873330000000004e-18,0.242424,0.20651,0.551066,0.352413,0.647587,0.188552,0.08642,0.725028,0.087542,0.445567,0.337823,0.099888,0.028058,0.001122
std,1.000562,1.000562,1.000562,1.000562,0.42879,0.405028,0.497665,0.47799,0.47799,0.391372,0.281141,0.446751,0.282787,0.497307,0.473233,0.300019,0.165232,0.033501
min,-0.4745452,-0.4736736,-0.6484217,-0.5609748,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.4745452,-0.4736736,-0.4891482,-0.5609748,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.4745452,-0.4736736,-0.3573909,-0.5609748,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.4327934,-0.4736736,-0.02424635,0.05915988,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
max,6.784163,6.974147,9.667167,5.640372,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [36]:
y_train.describe()

count    891.000000
mean       0.383838
std        0.486592
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: Survived, dtype: float64

In [37]:
lin_scores = cross_val_score(lin_reg, X_train, y_train, scoring="neg_mean_squared_error", cv=10)

lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores : [3.91699353e-01 3.79426244e-01 3.99970331e-01 3.89946580e-01
 3.87538767e-01 3.65243770e-01 3.99030395e-01 3.47126504e+10
 3.22681005e-01 3.74301803e-01]
Mean : 3471265041.921869
Standard deviation : 10413795124.628996


In [38]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)

RandomForestRegressor()

In [39]:
forest_scores = cross_val_score(forest_reg, X_train, y_train, scoring="neg_mean_squared_error", cv=10)

forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores : [0.41020181 0.33887808 0.41646988 0.38412049 0.3256238  0.35100442
 0.38046169 0.38370143 0.34281315 0.35781793]
Mean : 0.36910926746840983
Standard deviation : 0.029069650531524254


In [40]:
type(lin_rmse_scores)

numpy.ndarray

In [41]:
lin_rmse_scores_2 = np.delete(lin_rmse_scores,7)

In [42]:
lin_rmse_scores_2.mean()

0.3788709164645445

In [43]:
lin_rmse_scores_2.std()

0.02255401197796532

In [44]:
from sklearn.model_selection import GridSearchCV

param_grid = [{'n_estimators':[3,10,30], 'max_features':[2,4,6,8]},{'bootstrap':[False], 'n_estimators':[3,10], 'max_features':[2,3,4]}]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [45]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [46]:
grid_search.best_estimator_

RandomForestRegressor(max_features=6, n_estimators=30)

In [47]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]) :
    print(np.sqrt(-mean_score), params)

0.4146080944964394 {'max_features': 2, 'n_estimators': 3}
0.3920898349146473 {'max_features': 2, 'n_estimators': 10}
0.380789487250327 {'max_features': 2, 'n_estimators': 30}
0.40769984286971805 {'max_features': 4, 'n_estimators': 3}
0.38073887104766924 {'max_features': 4, 'n_estimators': 10}
0.37546539007922713 {'max_features': 4, 'n_estimators': 30}
0.4081039586109844 {'max_features': 6, 'n_estimators': 3}
0.3819686050053024 {'max_features': 6, 'n_estimators': 10}
0.3736953281440946 {'max_features': 6, 'n_estimators': 30}
0.3978203145777167 {'max_features': 8, 'n_estimators': 3}
0.3751050587668157 {'max_features': 8, 'n_estimators': 10}
0.37420304745948063 {'max_features': 8, 'n_estimators': 30}
0.4235486021004512 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
0.407480559922158 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
0.4151138622368785 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
0.4092441507368678 {'bootstrap': False, 'max_features': 3, 

In [48]:
feature_importants = grid_search.best_estimator_.feature_importances_
feature_importants

array([0.03637989, 0.03083053, 0.31434367, 0.05931552, 0.03024487,
       0.01210875, 0.06910713, 0.13585434, 0.16816037, 0.01480488,
       0.00813185, 0.0164965 , 0.04051322, 0.02124998, 0.01880505,
       0.01158226, 0.00978007, 0.00229113])

In [49]:
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeBucket,RelativesOnboard
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.2500,,S,15,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,30,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.9250,,S,15,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1000,C123,S,30,1
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.0500,,S,30,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13.0000,,S,15,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30.0000,B42,S,15,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,31,1,2,W./C. 6607,23.4500,,S,30,3
889,890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30.0000,C148,C,15,0


In [50]:
num_attribs = ["SibSp", "Parch", "Fare",'RelativesOnboard']
cat_attribs = ["Pclass", "Sex", "Embarked","AgeBucket"]

In [51]:
data_cat = train_data[cat_attribs]

In [52]:
data_cat_reshaped = data_cat.values.reshape(-1,1)

In [53]:
cat_encoder = CategoricalEncoder(encoding='onehot-dense')

In [54]:
train_data.isnull().sum()

PassengerId           0
Survived              0
Pclass                0
Name                  0
Sex                   0
Age                   0
SibSp                 0
Parch                 0
Ticket                0
Fare                  0
Cabin               687
Embarked              2
AgeBucket             0
RelativesOnboard      0
dtype: int64

In [55]:
most_freq_imputer = MostFrequentImputer()

In [56]:
train_data["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [57]:
train_data["Embarked"].fillna("S")

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [58]:
train_data["Embarked"] = train_data["Embarked"].fillna("S")

In [59]:
data_cat = train_data[cat_attribs]
data_cat_reshaped = data_cat.values.reshape(-1,1)

In [62]:
data_cat_reshaped = data_cat.values.reshape(-1,1)

In [65]:
data_cat_reshaped.astype('str')

array([['3'],
       ['male'],
       ['S'],
       ...,
       ['male'],
       ['Q'],
       ['30']], dtype='<U6')

In [67]:
data_cat_reshaped = data_cat_reshaped.astype('str')

In [68]:
data_cat_1hot = cat_encoder.fit_transform(data_cat_reshaped)

In [69]:
cat_encoder.categories_

[array(['0', '1', '15', '2', '3', '30', '45', '60', '75', 'C', 'Q', 'S',
        'female', 'male'], dtype=object)]

In [72]:
attributes = num_attribs + ["AgeBucket-0", "Pclass-1", "AgeBucket-15", "Pclass-2", "Pclass-3", "AgeBucket-30"
, "AgeBucket-45", "AgeBucket-60", "AgeBucket-75", "Embarked-C", "Embarked-Q", "Embarked-S",'female', 'male']

In [74]:
sorted(zip(feature_importants,attributes), reverse=True)

[(0.3143436700285912, 'Fare'),
 (0.16816037212641172, 'Pclass-3'),
 (0.13585434235148305, 'Pclass-2'),
 (0.06910713010943811, 'AgeBucket-15'),
 (0.05931551985763974, 'RelativesOnboard'),
 (0.040513223403629985, 'AgeBucket-75'),
 (0.03637988664250617, 'SibSp'),
 (0.03083052765363843, 'Parch'),
 (0.03024487338057231, 'AgeBucket-0'),
 (0.021249976041074113, 'Embarked-C'),
 (0.018805045664740283, 'Embarked-Q'),
 (0.016496496170565873, 'AgeBucket-60'),
 (0.014804878638085776, 'AgeBucket-30'),
 (0.012108748457058942, 'Pclass-1'),
 (0.011582260253499501, 'Embarked-S'),
 (0.009780070698154264, 'female'),
 (0.008131847720114802, 'AgeBucket-45'),
 (0.0022911308027956595, 'male')]

In [75]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,RelativesOnboard
0,892,3,"Kelly, Mr. James",male,34,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22,1,1,3101298,12.2875,,S,2


In [77]:
final_model = grid_search.best_estimator_

In [79]:
test_data['AgeBucket'] = test_data['Age']//15 * 15

In [80]:
X_test = test_data
X_test_prepared = preprocess_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

In [81]:
gender_submission = pd.read_csv("gender_submission.csv")

In [82]:
gender_submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [84]:
y_test = gender_submission["Survived"]

In [85]:
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [86]:
final_rmse

0.33361558358514326