In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler,LabelEncoder,MinMaxScaler
from sklearn.neural_network import MLPClassifier

from sklearn.svm import SVC

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

## Get a sense from raw data

In [3]:
print(df_train.isnull().sum())
print(df_test.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


* 1, Since Cabin isn't a well collected data, we might want to drop it
* 2, Fill in Age with mean value
* 3, Fill in missing Embarked with 'C'
* 4, Fill in missing fare with 8.05
>credit to https://www.kaggle.com/mrisdal/titanic/exploring-survival-on-the-titanic section3.1

In [4]:
#It's always good to copy
df_train2 = df_train.copy()
df_test2 = df_test.copy()

In [5]:
df_train = df_train.drop(['Ticket','Cabin'],axis = 1)
df_test = df_test.drop(['Ticket','Cabin'],axis = 1)

def fill_in_age(df):
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    
fill_in_age(df_train)
fill_in_age(df_test)
df_train['Embarked'] = df_train['Embarked'].fillna('C')
df_test['Fare'] = df_test['Fare'].fillna(8.05)

## Prepare Labels and scaling

In [6]:
def label_encode(df):
    df['Sex'] = LabelEncoder().fit(df['Sex']).transform(df['Sex'])
    df['Embarked'] = LabelEncoder().fit(df['Embarked']).transform(df['Embarked'])

label_encode(df_train)
label_encode(df_test)

In [7]:
df_train['Name'] = df_train['Name'].str.replace('(.*, )|(\\..*)', '', case=False)
df_test['Name'] = df_test['Name'].str.replace('(.*, )|(\\..*)', '', case=False)

title = LabelEncoder().fit(df_test['Name'].unique().tolist()+df_train['Name'].unique().tolist())

df_train['Name'] = title.transform(df_train['Name'])
df_test['Name'] = title.transform(df_test['Name'])

In [8]:
def new_feature(df):
    df['Age_Sex'] = df['Age']*df['Sex']
    df['Embarked_Fare'] = df['Embarked']*df['Fare']
    df['Pclass_Sex'] = df['Pclass']*df['Sex']
    df['Sib_parch1'] = df['SibSp']*df['Parch']
    df['Sib_parch2'] = df['SibSp']+df['Parch']
    
new_feature(df_train)
new_feature(df_test)

In [9]:
#df_train

## Training

In [10]:
data = df_train.values
data = np.random.permutation(data)

In [11]:
cv = 700

In [12]:
x_train = data[:cv,2:]
scaler = MinMaxScaler().fit(x_train)
x_train = scaler.transform(x_train)
y_train = data[:cv,1]

In [13]:
x_cv = data[cv:,2:]
x_cv = scaler.transform(x_cv)
y_cv = data[cv:,1]

In [14]:
'''model = MLPClassifier(hidden_layer_sizes=(5, ), activation='relu', solver='lbfgs', alpha=0.00001, batch_size='auto', 
                       learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=2000, 
                       shuffle=True, random_state=1, tol=1e-8, verbose=False, warm_start=True, 
                       momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, 
                       beta_1=0.9, beta_2=0.999, epsilon=1e-08)'''

"model = MLPClassifier(hidden_layer_sizes=(5, ), activation='relu', solver='lbfgs', alpha=0.00001, batch_size='auto', \n                       learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=2000, \n                       shuffle=True, random_state=1, tol=1e-8, verbose=False, warm_start=True, \n                       momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, \n                       beta_1=0.9, beta_2=0.999, epsilon=1e-08)"

In [15]:
model = SVC(C=5, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=True, 
            tol=0.00001, cache_size=200, 
            class_weight=None, verbose=False, max_iter=-1, decision_function_shape=None, random_state=1)

In [16]:
model.fit(x_train,y_train)

SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=1, shrinking=True, tol=1e-05,
  verbose=False)

In [17]:
model.score(x_train,y_train)

0.79714285714285715

In [18]:
model.score(x_cv,y_cv)

0.84816753926701571

## Predict

In [19]:
x_test = df_test.values[:,1:]
x_test = scaler.transform(x_test)

In [20]:
pred = model.predict(x_test)
pred = [int(x) for x in pred]
#pred

In [21]:
submission = pd.DataFrame({
        "PassengerId": df_test["PassengerId"],
        "Survived": pred
    })
submission.to_csv('submission_2.csv', index=False)

## Submit

In [22]:
df1 = pd.read_csv('gptest.csv')
df2 = pd.read_csv('submission_2.csv')

In [23]:
(df1 != df2).sum()

PassengerId     0
Survived       74
dtype: int64

In [24]:
df1-df2

Unnamed: 0,PassengerId,Survived
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,-1
7,0,1
8,0,0
9,0,0
