In [402]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import neighbors

In [617]:
# Just read training and testing dataset

train = pd.read_csv("train.csv", index_col=["PassengerId"])
test = pd.read_csv("test.csv", index_col=["PassengerId"])

In [618]:
# Function to preprocess both train and test
# Drops certain nan rows
# Creates dummy variables for categorical features
# Normalizes numerical data

def preprocess(df, dup):
    drop = ["Name", "Ticket", "Cabin"]
    categorical = ["Embarked", "Sex"]
    normalize = ["Fare", "Age","SibSp", "Parch"]
    
    df.drop(columns=drop, inplace=True)

    # Handle nan data
    df["Age_known"] = np.where(df["Age"].isnull(), "No", "Yes")
    df["Age"].fillna(df["Age"].mean(), inplace=True)
    categorical.append("Age_known")

    # There are two nans in training and non in test
    df.dropna(subset=["Embarked"], inplace=True)
    
    df.fillna(0, inplace=True)
    
    # Dup if train. Duplicate only Age known columns (Just trying)
    if dup == 1:
        df_sub = df[df["Age_known"] == "Yes"]
        df = df.append([df_sub], ignore_index=True)
    
    # Take care of categorical features
    df = pd.get_dummies(data=df, columns=categorical)
    # Normalize numerical columns
    df[normalize] = df[normalize].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
    
    return df

In [619]:
# Cleanup train and test

train = preprocess(train, 1)
test = preprocess(test, 0)
train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,Age_known_No,Age_known_Yes
0,0,3,0.271174,0.125,0.0,0.014151,0,0,1,0,1,0,1
1,1,1,0.472229,0.125,0.0,0.139136,1,0,0,1,0,0,1
2,1,3,0.321438,0.0,0.0,0.015469,0,0,1,1,0,0,1
3,1,1,0.434531,0.125,0.0,0.103644,0,0,1,1,0,0,1
4,0,3,0.434531,0.0,0.0,0.015713,0,0,1,0,1,0,1


In [620]:
# Split the training dataset into test and validation. 
# Should mostly cross validate as data is scarce

X, Y = train.iloc[:, 1:], train.iloc[:,0]
train_x, valid_x, train_y, valid_y = train_test_split(X, Y, test_size=0.2)

In [621]:
# Trying out different classifiers. Comment out those not in use

classifier = SGDClassifier(loss="log", tol=1, max_iter= 10000, penalty=None)
#classifier = RandomForestClassifier(max_features=4)
#classifier = svm.SVC(tol=0.3)
classifier.fit(train_x, train_y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=10000, n_iter=None,
       n_jobs=1, penalty=None, power_t=0.5, random_state=None,
       shuffle=True, tol=1, verbose=0, warm_start=False)

In [622]:
# Training accuracy
classifier.score(train_x, train_y)

0.7875

In [623]:
# Validation accuracy
classifier.score(valid_x, valid_y)

0.8037383177570093

In [615]:
# Our predictions
test["Survived"] = classifier.predict(test)

In [616]:
# Create submission file
test.to_csv('submit.csv', columns = ["Survived"])
test["Survived"]

PassengerId
892     0
893     0
894     0
895     0
896     1
897     0
898     0
899     0
900     1
901     0
902     0
903     0
904     1
905     0
906     1
907     1
908     0
909     0
910     1
911     0
912     0
913     0
914     1
915     1
916     1
917     0
918     1
919     0
920     0
921     0
       ..
1280    0
1281    0
1282    1
1283    1
1284    0
1285    0
1286    0
1287    1
1288    0
1289    1
1290    0
1291    0
1292    1
1293    0
1294    1
1295    1
1296    0
1297    0
1298    0
1299    0
1300    0
1301    1
1302    0
1303    1
1304    1
1305    0
1306    1
1307    0
1308    0
1309    0
Name: Survived, Length: 418, dtype: int64