In [1]:
%matplotlib inline
import os

import numpy as np
import pandas as ps
from matplotlib import pyplot as plt
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
# Directory holding train.csv and test.csv. Set the TITANIC_DATA_DIR environment
# variable to run the notebook on another machine; the fallback is the path the
# notebook was originally written against.
DATA_DIR = os.environ.get("TITANIC_DATA_DIR", "/home/sergio/Data/titanic")
TRAIN_FILE = os.path.join(DATA_DIR, "train.csv")
TEST_FILE = os.path.join(DATA_DIR, "test.csv")

# Построение примитивной модели с One Hot Encoding

## Подготовка данных

In [13]:
class Transformer:
    """Turn raw passenger rows into the numeric feature table fed to the models.

    Median ages per (Sex, Pclass) are computed once from the frame given to the
    constructor. One-hot column sets and label codes are learned from the first
    frame that is transformed and reused afterwards, so every frame produced by
    the same instance has the same feature columns.
    """

    # Ticket prefixes that isDanger() flags.
    _DANGER_PREFIXES = ("A", "CA", "S", "SOTON", "W")
    # Raw columns removed from the result of apply().
    _DROPPED_COLUMNS = ["PassengerId", "Name", "Ticket", "Cabin"]

    def __init__(self, fulldata):
        """Compute the median Age for each (Sex, Pclass) pair found in `fulldata`.

        Args:
            fulldata: DataFrame with at least the columns Sex, Pclass and Age.
        """
        self._age_mapping = {
            (sex, cls): fulldata[(fulldata.Sex == sex) & (fulldata.Pclass == cls)].Age.median()
            for sex in ["male", "female"] for cls in range(1, 4)
        }
        # column name -> {raw value: integer code}, filled by encode()
        self._codes = {}
        # column name -> sorted category values, filled by oneHot()
        self._categories = {}

    def apply(self, data):
        """Return the feature table for `data`; the input frame is left untouched.

        Steps: fill missing Age with the (Sex, Pclass) median, fill missing
        Embarked with "S" and missing Fare with 0, one-hot encode Embarked and
        Pclass, label-encode Sex, add the lowFare / isBaby / dangerTicket /
        hasCabin indicator columns, and drop PassengerId, Name, Ticket and Cabin.

        Args:
            data: DataFrame with the columns PassengerId, Pclass, Name, Sex, Age,
                Ticket, Fare, Cabin and Embarked.

        Returns:
            A new DataFrame with the engineered features.

        Raises:
            KeyError: a row with missing Age has a (Sex, Pclass) pair for which
                no median was computed.
        """
        # Work on a copy: callers keep their raw frame and the method can be
        # re-run on the same input.
        data = data.copy()

        age_missing = data["Age"].isnull()
        data["Age"] = [
            self._age_mapping[(sex, cls)] if missing else age
            for sex, cls, age, missing in zip(data["Sex"], data["Pclass"], data["Age"], age_missing)
        ]
        # Assign the filled columns back instead of calling fillna(inplace=True)
        # on an attribute: the chained in-place form does not update the frame
        # when pandas Copy-on-Write is active.
        data["Embarked"] = data["Embarked"].fillna("S")
        # Fill Fare before deriving lowFare so that a missing fare (stored as 0)
        # is also flagged as a low fare.
        data["Fare"] = data["Fare"].fillna(0)

        data = self.oneHot(data, "Embarked")
        data = self.oneHot(data, "Pclass")
        data = self.encode(data, "Sex")

        data["lowFare"] = data["Fare"] <= 10
        data["isBaby"] = data["Age"] <= 2
        data["dangerTicket"] = [int(self.isDanger(ticket)) for ticket in data["Ticket"]]
        # Missing cabins are read as NaN (a float), so "is a string" means "present".
        data["hasCabin"] = [int(isinstance(cabin, str)) for cabin in data["Cabin"]]
        return data.drop(self._DROPPED_COLUMNS, axis=1)

    def encode(self, data, column):
        """Replace the values of `column` with integer codes.

        The first call for a column numbers its sorted distinct values from 0.
        Later calls reuse those codes and append new codes for unseen values.

        Args:
            data: DataFrame containing `column`.
            column: name of the column to encode.

        Returns:
            A copy of `data` with `column` replaced by its integer codes.
        """
        values = data[column].drop_duplicates()
        if column in self._codes:
            mapping = self._codes[column]
            for val in values:
                # len(mapping) is evaluated before the insert, so codes stay dense.
                mapping.setdefault(val, len(mapping))
        else:
            mapping = {v: i for i, v in enumerate(sorted(values))}
            self._codes[column] = mapping
        encoded = data.copy()
        encoded[column] = [mapping[v] for v in data[column]]
        return encoded

    def oneHot(self, data, column):
        """Replace `column` with one 0/1 indicator column per category.

        The category list is taken from the first frame encoded for `column`
        and reused for later frames, so all frames get identical indicator
        columns. A value not seen in that first frame yields all-zero indicators.

        Args:
            data: DataFrame containing `column`.
            column: name of the column to expand.

        Returns:
            A copy of `data` without `column` and with the indicator columns
            "<column>_<value>" appended in sorted category order.
        """
        if column not in self._categories:
            self._categories[column] = sorted(data[column].drop_duplicates())
        expanded = data.copy()
        for val in self._categories[column]:
            expanded["{}_{}".format(column, val)] = (data[column] == val).astype(int)
        return expanded.drop([column], axis=1)

    @staticmethod
    def isDanger(ticket):
        """Tell whether the ticket's leading prefix is one of the flagged prefixes.

        The prefix is the first whitespace-separated token of the ticket, cut at
        its first "." and then at its first "/". Purely numeric tickets, empty
        strings and non-string values (for example NaN) are never flagged.

        Args:
            ticket: raw ticket value.

        Returns:
            True if the prefix is one of "A", "CA", "S", "SOTON", "W".
        """
        if not isinstance(ticket, str):
            return False
        try:
            int(ticket)
        except ValueError:
            pass
        else:
            # A ticket that is just a number has no prefix.
            return False
        tokens = ticket.split()
        if not tokens:
            return False
        prefix = tokens[0].split(".")[0].split("/")[0]
        return prefix in Transformer._DANGER_PREFIXES

In [14]:
def get_data(train_file=None, test_file=None):
    """Load both CSV files and return the model-ready tables.

    Args:
        train_file: path of the labelled CSV (must contain a Survived column);
            defaults to TRAIN_FILE.
        test_file: path of the unlabelled CSV; defaults to TEST_FILE.

    Returns:
        A tuple (train_features, target, test_features): the transformed
        training rows, the Survived column of the training file, and the
        transformed test rows.
    """
    # Resolve the defaults at call time so that changing TRAIN_FILE / TEST_FILE
    # and re-running this call is enough to switch data sets.
    train_file = TRAIN_FILE if train_file is None else train_file
    test_file = TEST_FILE if test_file is None else test_file

    labelled = ps.read_csv(train_file)
    test = ps.read_csv(test_file)
    target = labelled["Survived"]
    train = labelled.drop(["Survived"], axis=1)

    # The transformer takes its age medians from train and test rows together.
    fulldata = ps.concat([train, test], ignore_index=True)
    trans = Transformer(fulldata)
    return trans.apply(train), target, trans.apply(test)

# Build the model inputs once; the cells below reuse `train`, `target` and `test`.
train, target, test = get_data()

TypeError: Not implemented for this type

## Построение и оценка моделей

In [12]:
# cross_val_score lives in sklearn.model_selection on current scikit-learn; the
# sklearn.cross_validation module imported at the top only exists on old
# releases, so it is used purely as a fallback.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    cross_val_score = cross_validation.cross_val_score

# Fixed seed so the fold scores and the exported predictions are reproducible.
RANDOM_STATE = 0

models = [
    # max_features='sqrt' is what 'auto' meant for classifiers; 'auto' is no
    # longer accepted by recent scikit-learn.
    RandomForestClassifier(n_estimators=80, max_features='sqrt', criterion='entropy',
                           max_depth=4, random_state=RANDOM_STATE),
    # The L1 penalty needs a solver that supports it; liblinear used to be the
    # implicit default and must now be requested explicitly.
    LogisticRegression(penalty='l1', solver='liblinear', tol=0.01,
                       random_state=RANDOM_STATE),
]
for mdl in models:
    scores = cross_val_score(mdl, train, target, cv=3)
    # Report the mean accuracy and two standard deviations across the folds.
    print("{}: {:.2f} (+/- {:.2f})".format(mdl.__class__.__name__, scores.mean(), scores.std() * 2))

RandomForestClassifier: 0.81 (+/- 0.04)
LogisticRegression: 0.80 (+/- 0.01)


Экспортируем результаты:

In [10]:
# Write one submission file per model: PassengerId plus the predicted Survived.
OUTPUT_DIR = "results"
if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Take the ids from the test file itself instead of hard-coding their range;
# the transformed `test` table no longer carries PassengerId.
# NOTE(review): relies on `test` keeping the row order of TEST_FILE, which
# holds for get_data() as written in this notebook.
passenger_ids = ps.read_csv(TEST_FILE, usecols=["PassengerId"])["PassengerId"]

for mdl in models:
    # Refit on the full training set; cross-validation above only scored clones.
    mdl.fit(train, target)
    result = ps.DataFrame(
        {"PassengerId": passenger_ids, "Survived": mdl.predict(test)},
        columns=["PassengerId", "Survived"],
    )
    result.to_csv(os.path.join(OUTPUT_DIR, "thresholds-{}.csv".format(mdl.__class__.__name__)),
                  index=False)

Данная модель даёт результат в 0.76077 с логистической регрессией и 0.77512 методом Random Forest.