In [1]:
%matplotlib inline
import os
from math import log

import numpy as np
import pandas as ps
from matplotlib import pyplot as plt
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
# Directory holding train.csv / test.csv. It can be overridden with the
# TITANIC_DATA_DIR environment variable so the notebook is not tied to one
# machine; the default keeps the original location.
DATA_DIR = os.environ.get('TITANIC_DATA_DIR', '/home/sergio/Data/titanic')
TRAIN_FILE = os.path.join(DATA_DIR, 'train.csv')
TEST_FILE = os.path.join(DATA_DIR, 'test.csv')

# Построение модели с максимальным набором признаков

## Подготовка данных

In [10]:
class Transformer:
    """Turn raw passenger rows into an all-numeric feature table.

    Statistics that need the complete passenger list (median age per
    sex/class, passengers per ticket, passengers per surname) are computed
    once in ``__init__``. ``apply`` then derives the features for any subset
    of those rows without modifying the frame it is given.
    """

    # Maps the honorific found in the ``Name`` column to one of four coarse groups.
    _title_mapping = {
        "mister": {"Capt", "Col", "Don", "Dr", "Jonkheer", "Major", "Mr", "Rev", "Sir"},  # adult men
        "missis": {"Mme", "Mrs", "Dona", "the Countess"},  # adult women
        "miss": {"Lady", "Miss", "Mlle", "Ms"},  # girls and young women
        "master": {"Master"}  # boys and young men
    }

    def __init__(self, fulldata):
        """Collect the lookup tables used by ``apply``.

        Args:
            fulldata: DataFrame with at least the columns ``Sex``, ``Pclass``,
                ``Age``, ``Ticket``, ``PassengerId`` and ``Name``. Every row
                later passed to ``apply`` must have its ticket and surname
                present here, otherwise the lookups raise ``KeyError``.
        """
        # Median age for each (sex, class) pair; used to fill missing ages.
        self._age_maping = {
            (sex, cls): fulldata[(fulldata.Sex == sex) & (fulldata.Pclass == cls)].Age.median()
            for sex in ["male", "female"] for cls in range(1, 4)
        }
        self._tickets_count = fulldata.groupby(["Ticket"])["PassengerId"].count()
        # Work on a copy so the caller's frame does not gain a "Surname" column.
        with_surname = fulldata.copy()
        with_surname["Surname"] = [self._surname(name) for name in fulldata.Name]
        self._surnames_count = with_surname.groupby(["Surname"])["PassengerId"].count()
        # value -> integer code, per column, filled lazily by ``encode``.
        self._codes = {}
        # sorted category list, per column, fixed by the first ``oneHot`` call.
        self._categories = {}

    def apply(self, data):
        """Return the feature table for ``data``; ``data`` itself is left untouched.

        Missing ``Age`` is replaced by the median for the row's sex and class,
        missing ``Fare`` by 0 and missing ``Embarked`` by "S". The derived
        columns ``HasCabin``, ``SurnameCount``, ``FarRelatives``,
        ``Cotravellers`` and ``FarePerTraveller`` are appended, both fare
        columns are replaced by their base-3 logarithm (0 stays 0),
        ``Embarked``/``Pclass``/``Title`` are one-hot encoded and ``Sex`` is
        integer-encoded. ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin``
        are dropped from the result.

        Raises:
            KeyError: if a ticket or surname was not present in the frame
                given to ``__init__``.
            ValueError: if a name carries a title missing from ``_title_mapping``.
        """
        # Copy first: the original implementation modified the caller's frame,
        # so a second call on the same frame failed with "already exists".
        data = data.copy()
        data["Age"] = [self._age_maping[(sex, cls)] if np.isnan(age) else age
                       for sex, cls, age in zip(data["Sex"], data["Pclass"], data["Age"])]
        # Assign the filled column back instead of fillna(inplace=True) on an
        # attribute access, which is a no-op under pandas Copy-on-Write.
        data["Fare"] = data["Fare"].fillna(0)
        data["Embarked"] = data["Embarked"].fillna("S")
        # A missing cabin is read as NaN (a float), a known one as a string.
        data["HasCabin"] = [int(isinstance(cabin, str)) for cabin in data["Cabin"]]
        data["Title"] = [self.title(name) for name in data["Name"]]
        data["SurnameCount"] = [self.surnameCount(name) for name in data["Name"]]
        data["FarRelatives"] = data["SurnameCount"] - data["SibSp"] - data["Parch"]
        data["Cotravellers"] = [self._tickets_count[ticket] for ticket in data["Ticket"]]
        data["FarePerTraveller"] = data["Fare"] / data["Cotravellers"]
        data["Fare"] = [log(fare, 3) if fare != 0 else 0 for fare in data["Fare"]]
        data["FarePerTraveller"] = [log(fare, 3) if fare != 0 else 0 for fare in data["FarePerTraveller"]]
        data = self.oneHot(data, "Embarked")
        data = self.oneHot(data, "Pclass")
        data = self.encode(data, "Sex")
        data = self.oneHot(data, "Title")
        return data.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)

    def encode(self, data, column):
        """Return a copy of ``data`` with ``column`` replaced by integer codes.

        On the first call for a column the codes follow the sorted order of
        the distinct values. On later calls the stored mapping is reused and
        previously unseen values receive the next free code.
        """
        values = data[column].drop_duplicates()
        mapping = self._codes.get(column)
        if mapping is None:
            mapping = {v: i for i, v in enumerate(sorted(values))}
            self._codes[column] = mapping
        else:
            for val in values:
                if val not in mapping:
                    mapping[val] = len(mapping)
        result = data.copy()
        result[column] = [mapping[v] for v in data[column]]
        return result

    def oneHot(self, data, column):
        """Return a copy of ``data`` with ``column`` replaced by 0/1 indicator columns.

        The category list is taken (sorted) from the first frame encoded for
        ``column`` and reused afterwards, so every frame produced by the same
        transformer has identical indicator columns. A value that was not seen
        on the first call gets zeros in all indicators.
        """
        if column not in self._categories:
            self._categories[column] = sorted(data[column].drop_duplicates())
        result = data.drop([column], axis=1)
        for val in self._categories[column]:
            result["{}_{}".format(column, val)] = (data[column] == val).astype("int64").values
        return result

    @staticmethod
    def _surname(name):
        """Return the part of ``name`` before the first comma, stripped."""
        return name.split(",")[0].strip()

    @classmethod
    def title(cls, name):
        """Return the title group for a name formatted as "Surname, Title. Rest".

        Raises:
            ValueError: if the title is not listed in ``_title_mapping``. The
                original raised a bare ``StopIteration``, which can silently
                end an enclosing ``map``/generator instead of failing.
        """
        title = name.split(",")[1].split(".")[0].strip()
        for group, titles in cls._title_mapping.items():
            if title in titles:
                return group
        raise ValueError("Unknown title {!r} in name {!r}".format(title, name))

    def surnameCount(self, name):
        """Return how many passengers of the full data set share this name's surname."""
        return self._surnames_count[self._surname(name)]

In [11]:
def get_data(train_file=None, test_file=None):
    """Load both CSV files and return ``(train_features, target, test_features)``.

    Args:
        train_file: path of the labelled CSV; defaults to ``TRAIN_FILE``.
        test_file: path of the unlabelled CSV; defaults to ``TEST_FILE``.

    Returns:
        A tuple of the transformed training frame, the ``Survived`` column of
        the training file, and the transformed test frame. Both frames are
        produced by the same ``Transformer``, which is built from the
        concatenation of the two files (without ``Survived``).
    """
    # Resolve the defaults at call time so the function follows the constants.
    train_file = TRAIN_FILE if train_file is None else train_file
    test_file = TEST_FILE if test_file is None else test_file
    labelled = ps.read_csv(train_file)
    test = ps.read_csv(test_file)
    train = labelled.drop(['Survived'], axis=1)
    target = labelled.Survived
    # ignore_index gives the combined frame a unique index instead of two
    # overlapping 0..n ranges.
    fulldata = ps.concat([train, test], ignore_index=True)
    trans = Transformer(fulldata)
    return trans.apply(train), target, trans.apply(test)

# Build the feature tables once, then preview the first five training rows.
train, target, test = get_data()
train.head(5)

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,HasCabin,SurnameCount,FarRelatives,Cotravellers,FarePerTraveller,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Title_master,Title_miss,Title_missis,Title_mister
0,1,22,1,0,1.803185,0,2,1,1,1.803185,0,0,1,0,0,1,0,0,0,1
1,0,38,1,0,3.883683,1,2,1,2,3.252753,1,0,0,1,0,0,0,0,1,0
2,0,26,0,0,1.884216,0,1,1,1,1.884216,0,0,1,0,0,1,0,1,0,0
3,0,35,1,0,3.615631,1,2,1,2,2.984702,0,0,1,1,0,0,0,0,1,0
4,1,35,0,0,1.898461,0,2,2,1,1.898461,0,0,1,0,0,1,0,0,0,1


## Построение и оценка моделей

In [13]:
# Fixed seed: without it the random forest is re-randomised on every run, so
# neither the scores printed below nor the exported predictions could be
# reproduced after a kernel restart.
SEED = 42
models = [
    RandomForestClassifier(n_estimators=80, max_features='auto', criterion='entropy', max_depth=4,
                           random_state=SEED),
    LogisticRegression(penalty='l1', tol=0.01, random_state=SEED)
]
# 3-fold cross-validation; report the mean score and twice its standard deviation.
for mdl in models:
    scores = cross_validation.cross_val_score(mdl, train, target, cv=3)
    print("{}: {:.4f} (+/- {:.4f})".format(mdl.__class__.__name__, scores.mean(), scores.std() * 2))

RandomForestClassifier: 0.8316 (+/- 0.0145)
LogisticRegression: 0.8215 (+/- 0.0145)


Экспортируем результаты:

In [14]:
# Write one CSV per model with the columns PassengerId and Survived.
RESULTS_DIR = "results"
# to_csv does not create missing directories, so make sure the target exists.
if not os.path.isdir(RESULTS_DIR):
    os.makedirs(RESULTS_DIR)
# Take the ids from the test file instead of the hard-coded range(892, 1310):
# `test` keeps the row order of the file but no longer has the PassengerId column.
passenger_ids = ps.read_csv(TEST_FILE).PassengerId.values
for mdl in models:
    mdl.fit(train, target)
    result = ps.DataFrame({"PassengerId": passenger_ids, "Survived": mdl.predict(test)},
                          columns=["PassengerId", "Survived"])
    result.to_csv(os.path.join(RESULTS_DIR, "allfeatures-v3-{}.csv".format(mdl.__class__.__name__)),
                  index=False)

С использованием Random Forest данная модель даёт результат 0.79904.