In [1]:
import pandas as pd
import sklearn
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import math
import gzip, csv

# Apply the "ggplot" stylesheet to matplotlib figures created from here on.
matplotlib.style.use('ggplot')

# IPython line magic: select the inline backend so figures render in the cell output.
%matplotlib inline

In [2]:
# Read the training CSV (it includes the Survived column) from the working
# directory and preview its first two rows.
train_sets = pd.read_csv("./train.csv")
train_sets.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [3]:
# Read the test CSV from the working directory and preview its first two rows.
test_sets = pd.read_csv("./test.csv")
test_sets.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [48]:
# Number of test rows for each distinct Parch value.
test_sets.groupby("Parch").size()

Parch
0    324
1     52
2     33
3      3
4      2
5      1
6      1
9      2
dtype: int64

In [74]:
def logloss(act, pred, score, classes=None, eps=1e-15):
    """Return the mean negative log-likelihood of the true labels.

    For every sample, the probability that ``score`` assigns to the *actual*
    class is looked up, clipped, and its logarithm is averaged. Unlike the
    previous implementation, misclassified samples are penalised instead of
    contributing zero to the loss.

    Parameters
    ----------
    act : sequence
        True class label of each sample.
    pred : sequence
        Predicted label of each sample. Kept for backward compatibility; only
        its length is validated, because the loss depends on the probability
        given to the true class rather than on the hard prediction.
    score : sequence of sequences
        Per-sample class probabilities, one column per class.
    classes : sequence, optional
        The label that belongs to each column of ``score``. When omitted, the
        labels in ``act`` are used directly as column indices (0..k-1).
    eps : float, optional
        Probabilities are clipped to ``[eps, 1 - eps]`` so that a probability
        of exactly 0 does not make ``math.log`` raise.

    Returns
    -------
    float
        The average of ``-log(p_true)`` over all samples.

    Raises
    ------
    ValueError
        If there are no samples or the inputs differ in length.
    KeyError
        If ``classes`` is given and a label in ``act`` is not part of it.
    """
    sample_size = len(act)
    if sample_size == 0:
        raise ValueError("logloss() requires at least one sample")
    if len(pred) != sample_size or len(score) != sample_size:
        raise ValueError("act, pred and score must have the same length")

    # Optional label -> column lookup; without it labels double as indices.
    column_of = None
    if classes is not None:
        column_of = dict((label, col) for col, label in enumerate(classes))

    total = 0.0
    for label, probs in zip(act, score):
        col = label if column_of is None else column_of[label]
        p_true = min(max(probs[col], eps), 1.0 - eps)
        total += math.log(p_true)
    return -total / sample_size

def accuracy(act, pred):
    """Return the percentage (0-100) of samples whose prediction is correct.

    The two sequences are compared element by element in iteration order, so
    inputs that are not addressable by 0..n-1 (for example a pandas Series
    with a shuffled index) are handled correctly as well.

    Parameters
    ----------
    act : sequence
        True labels.
    pred : sequence
        Predicted labels, same length as ``act``.

    Returns
    -------
    float
        ``100 * (number of matching positions) / len(act)``.

    Raises
    ------
    ValueError
        If ``act`` is empty or the two inputs differ in length.
    """
    size = len(act)
    if size == 0:
        raise ValueError("accuracy() requires at least one sample")
    if len(pred) != size:
        raise ValueError("act and pred must have the same length")

    hits = sum(1 for actual, guess in zip(act, pred) if actual == guess)
    # 100.0 forces float division on Python 2 as well.
    return 100.0 * hits / size

In [75]:
from sklearn.preprocessing import OneHotEncoder

# Target vector: the Survived column of the training frame, taken via .values.
Y_train = train_sets.Survived.values

def get_id_map(data_sets, name):
    """Map every distinct value of column ``name`` to a small integer id.

    Ids are handed out as 0, 1, 2, ... in order of first appearance in the
    column. The previous implementation took the order from iterating a
    ``set``, which is arbitrary, so the encoding could differ between runs.

    Parameters
    ----------
    data_sets : mapping-like (e.g. a DataFrame)
        Object for which ``data_sets[name]`` yields the column's values.
    name : hashable
        Key of the column to encode.

    Returns
    -------
    dict
        ``{value: id}`` with one entry per distinct value. Values are matched
        using normal dict key rules.
    """
    id_map = {}
    for value in data_sets[name]:
        # len(id_map) is the next unused id; setdefault keeps an existing one.
        id_map.setdefault(value, len(id_map))
    return id_map

# Learn an integer id for every distinct value of each categorical column.
sex_map, cabin_map, embark_map, parch_map = [
    get_id_map(train_sets, column) for column in ("Sex", "Cabin", "Embarked", "Parch")
]


def _encode_train_row(pclass, sex, age, sibsp, parch, cabin, embarked, fare):
    """Turn one training passenger's column values into a numeric feature row.

    Categorical values are replaced by their ids, a NaN age becomes 0, and
    known ages and fares are truncated with ``int``.
    """
    return [pclass, sex_map[sex], (0 if math.isnan(age) else int(age)), sibsp,
            parch_map[parch], cabin_map[cabin], embark_map[embarked], int(fare)]


train_columns = zip(train_sets.Pclass, train_sets.Sex, train_sets.Age, train_sets.SibSp,
                    train_sets.Parch, train_sets.Cabin, train_sets.Embarked, train_sets.Fare)
X_train_raw = [_encode_train_row(*values) for values in train_columns]

# One-hot encode the integer feature rows and report the encoded shape.
enc = OneHotEncoder()
X_train = enc.fit_transform(X_train_raw)
print(X_train.shape)

(891, 333)


In [76]:
from sklearn import linear_model
from sklearn.cross_validation import train_test_split

# Toggle: True  -> hold out 30% of the rows and report validation accuracy;
#         False -> fit on every row and report accuracy on the training data.
use_holdout = True

# The model configuration is the same on both paths, so build it once.
logreg = linear_model.LogisticRegression(C=1e5, solver="lbfgs", multi_class="multinomial")
if use_holdout:
    X_train_s, X_val, Y_train_s, Y_val = train_test_split(X_train, Y_train, test_size=0.3)
    logreg.fit(X_train_s, Y_train_s)
    print(accuracy(Y_val, logreg.predict(X_val)))
else:
    logreg.fit(X_train, Y_train)
    # No validation split is created on this path, so X_val / Y_val would be
    # undefined or stale here. Score on the training data instead, as the
    # random-forest and gradient-boosting cells do in their else branches.
    print(accuracy(Y_train, logreg.predict(X_train)))

71.2686567164


In [73]:
# Encode the test passengers with the ids learned from the training data.
# Parch and Cabin values that were not seen in training fall back to id 0;
# a NaN Age or Fare becomes 0, other ages and fares are truncated with int().
X_test_raw = [
    [pclass, sex_map[sex], 0 if math.isnan(age) else int(age), sibsp,
     parch_map.get(parch, 0), cabin_map.get(cabin, 0), embark_map[embarked],
     0 if math.isnan(fare) else int(fare)]
    for pclass, sex, age, sibsp, parch, cabin, embarked, fare in zip(
        test_sets.Pclass, test_sets.Sex, test_sets.Age, test_sets.SibSp,
        test_sets.Parch, test_sets.Cabin, test_sets.Embarked, test_sets.Fare)
]

# One-hot encode with the already-fitted encoder, then predict with the
# logistic-regression model.
predicted = np.array(logreg.predict(enc.transform(X_test_raw)))
labels = ['PassengerId', 'Survived']
with gzip.open('./titanic_lr.csv.gz', 'wt') as outf:
    fo = csv.writer(outf, lineterminator='\n')
    fo.writerow(labels)
    # Pair ids with predictions by position. The former PassengerId[i] was a
    # label lookup and only lined up while the frame kept its default index.
    fo.writerows(zip(test_sets.PassengerId, predicted))

In [103]:
from sklearn.ensemble import RandomForestClassifier

# Toggle: True  -> hold out 30% of the rows and report validation accuracy;
#         False -> fit on every row and report accuracy on the training data.
use_holdout = True

# Random forest with 200 trees, trained on the integer-encoded (not one-hot) rows.
rf = RandomForestClassifier(n_estimators=200)
if use_holdout:
    X_train_s, X_val, Y_train_s, Y_val = train_test_split(X_train_raw, Y_train, test_size=0.3)
    rf.fit(X_train_s, Y_train_s)
    rf_accuracy = accuracy(Y_val, rf.predict(X_val))
else:
    rf.fit(X_train_raw, Y_train)
    rf_accuracy = accuracy(Y_train, rf.predict(X_train_raw))
print(rf_accuracy)

80.5970149254


In [117]:
from sklearn.ensemble import GradientBoostingClassifier

# Toggle: True  -> hold out 30% of the rows and report validation accuracy;
#         False -> fit on every row and report accuracy on the training data.
use_holdout = False

# Gradient boosting with 200 stages, trained on the integer-encoded (not one-hot) rows.
gbdt = GradientBoostingClassifier(n_estimators=200)
if use_holdout:
    X_train_s, X_val, Y_train_s, Y_val = train_test_split(X_train_raw, Y_train, test_size=0.3)
    gbdt.fit(X_train_s, Y_train_s)
    gbdt_accuracy = accuracy(Y_val, gbdt.predict(X_val))
else:
    gbdt.fit(X_train_raw, Y_train)
    gbdt_accuracy = accuracy(Y_train, gbdt.predict(X_train_raw))
print(gbdt_accuracy)

92.7048260382


In [118]:
# Encode the test passengers with the ids learned from the training data.
# Parch and Cabin values that were not seen in training fall back to id 0;
# a NaN Age or Fare becomes 0, other ages and fares are truncated with int().
X_test_raw = [
    [pclass, sex_map[sex], 0 if math.isnan(age) else int(age), sibsp,
     parch_map.get(parch, 0), cabin_map.get(cabin, 0), embark_map[embarked],
     0 if math.isnan(fare) else int(fare)]
    for pclass, sex, age, sibsp, parch, cabin, embarked, fare in zip(
        test_sets.Pclass, test_sets.Sex, test_sets.Age, test_sets.SibSp,
        test_sets.Parch, test_sets.Cabin, test_sets.Embarked, test_sets.Fare)
]

# The gradient-boosting model consumes the integer rows directly (no one-hot step).
predicted = np.array(gbdt.predict(X_test_raw))
labels = ['PassengerId', 'Survived']
with gzip.open('./titanic_gbdt.csv.gz', 'wt') as outf:
    fo = csv.writer(outf, lineterminator='\n')
    fo.writerow(labels)
    # Pair ids with predictions by position. The former PassengerId[i] was a
    # label lookup and only lined up while the frame kept its default index.
    fo.writerows(zip(test_sets.PassengerId, predicted))