In [1]:
%matplotlib inline
import pandas as ps
import numpy as np
from math import log
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import os

# Directory holding the Titanic CSV files. Set the TITANIC_DATA_DIR environment
# variable to point somewhere else; the fallback is the original location.
DATA_DIR = os.environ.get("TITANIC_DATA_DIR", '/home/sergio/Data/titanic')
TRAIN_FILE = os.path.join(DATA_DIR, 'train.csv')
TEST_FILE = os.path.join(DATA_DIR, 'test.csv')

# Построение модели с максимальным набором признаков

## Подготовка данных

In [2]:
class Transformer:
    """Turns raw passenger rows into a purely numeric feature table.

    Lookup tables that need more than one row (median ages, ticket and surname
    frequencies, per-ticket survival balance) are collected once in
    ``__init__``. ``apply`` then derives the features for any frame that has
    the raw columns ``PassengerId``, ``Pclass``, ``Name``, ``Sex``, ``Age``,
    ``SibSp``, ``Parch``, ``Ticket``, ``Fare``, ``Cabin`` and ``Embarked``.
    """

    # Honorific found in the ``Name`` column -> coarse demographic group.
    _title_mapping = {
        "mister": {"Capt", "Col", "Don", "Dr", "Jonkheer", "Major", "Mr", "Rev", "Sir"},  # Adult men
        "missis": {"Mme", "Mrs", "Dona", "the Countess"},  # Adult women
        "miss": {"Lady", "Miss", "Mlle", "Ms"},  # Girls and young women
        "master": {"Master"}  # Boys and young men
    }

    def __init__(self, train, target, fulldata):
        """Collect the lookup tables used by ``apply``.

        Args:
            train: labelled passengers; only the ``Ticket`` column is read.
            target: survival flags, indexable by the index labels of ``train``.
            fulldata: all passengers; ``Sex``, ``Pclass``, ``Age``, ``Ticket``,
                ``PassengerId`` and ``Name`` are read.
        """
        # (sex, class) -> median age. The attribute keeps its historical spelling.
        self._age_maping = {
            (sex, cls): fulldata[(fulldata.Sex == sex) & (fulldata.Pclass == cls)].Age.median()
            for sex in ["male", "female"] for cls in range(1, 4)
        }
        # ticket -> number of passengers travelling on it
        self._tickets_count = fulldata.groupby(["Ticket"])["PassengerId"].count()

        # ticket -> how many labelled holders survived / perished
        self._tickets_survival = {ticket: {"survived": 0, "perished": 0}
                                  for ticket in train.Ticket.drop_duplicates()}
        for ind, ticket in train.Ticket.items():
            outcome = "survived" if target[ind] else "perished"
            self._tickets_survival[ticket][outcome] += 1

        # surname -> number of passengers sharing it
        with_surnames = fulldata.assign(Surname=[self._surname(name) for name in fulldata.Name])
        self._surnames_count = with_surnames.groupby(["Surname"])["PassengerId"].count()

        # column name -> {raw value: integer code}, filled lazily by ``encode``
        self._codes = {}

    def apply(self, data, target=None):
        """Build the feature table for ``data``.

        The input frame is left untouched: all work happens on a copy, so the
        same frame can be transformed more than once.

        Args:
            data: frame with the raw passenger columns.
            target: optional survival flags indexable by the index labels of
                ``data``. When given, each passenger's own outcome is excluded
                from the ticket survival balance.

        Returns:
            A new frame with engineered and one-hot encoded columns, without
            ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin``.
        """
        # Work on a private copy; also avoids chained in-place assignment,
        # which does nothing under pandas copy-on-write.
        data = data.copy()

        data["Age"] = [self._age_maping[(sex, cls)] if np.isnan(age) else age
                       for sex, cls, age in zip(data["Sex"], data["Pclass"], data["Age"])]
        data["Fare"] = data["Fare"].fillna(0)
        data["Embarked"] = data["Embarked"].fillna("S")

        data["HasCabin"] = [int(isinstance(cabin, str)) for cabin in data["Cabin"]]
        data["Title"] = [self.title(name) for name in data["Name"]]
        data["SurnameCount"] = [self.surnameCount(name) for name in data["Name"]]
        data["FarRelatives"] = data["SurnameCount"] - data["SibSp"] - data["Parch"]
        data["Cotravellers"] = [self._tickets_count[ticket] for ticket in data["Ticket"]]
        # Must be computed from the raw fare, i.e. before the log transform below.
        data["FarePerTraveller"] = data["Fare"] / data["Cotravellers"]
        data["TicketSurvival"] = [
            self.ticketSurvival(ticket, target=(target[ind] if target is not None else None))
            for ind, ticket in data["Ticket"].items()
        ]

        data["Fare"] = [self._log3(fare) for fare in data["Fare"]]
        data["FarePerTraveller"] = [self._log3(fare) for fare in data["FarePerTraveller"]]

        data = self.oneHot(data, "Embarked")
        data = self.oneHot(data, "Pclass")
        data = self.encode(data, "Sex")
        data = self.oneHot(data, "Title")
        data = self.oneHot(data, "TicketSurvival")
        return data.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)

    def encode(self, data, column):
        """Replace the values of ``column`` with integer codes.

        Codes are remembered per column, so later calls reuse the same mapping
        and only append codes for values not seen before. ``data`` is modified
        in place and returned.
        """
        values = data[column].drop_duplicates()
        if column in self._codes:
            mapping = self._codes[column]
            for val in values:
                if val not in mapping:
                    mapping[val] = len(mapping)
        else:
            mapping = {v: i for i, v in enumerate(sorted(values))}
            self._codes[column] = mapping
        data[column] = [mapping[v] for v in data[column]]
        return data

    def oneHot(self, data, column):
        """Return a copy of ``data`` with ``column`` replaced by 0/1 indicator columns.

        One ``"<column>_<value>"`` column is appended per distinct value found
        in this particular frame, in sorted order. The categories are not
        remembered between calls, so two frames may yield different columns.
        """
        cells = data[column].tolist()
        encoded = data.drop([column], axis=1)
        for val in sorted(data[column].drop_duplicates()):
            encoded["{}_{}".format(column, val)] = [1 if cell == val else 0 for cell in cells]
        return encoded

    @classmethod
    def title(cls, name):
        """Map a ``"Surname, Title. Given names"`` string to its title group.

        Raises:
            ValueError: if the honorific is not listed in ``_title_mapping``.
        """
        title = name.split(",")[1].split(".")[0].strip()
        for group, members in cls._title_mapping.items():
            if title in members:
                return group
        raise ValueError("Unknown title {!r} in name {!r}".format(title, name))

    def surnameCount(self, name):
        """Number of passengers in the full data set sharing the surname of ``name``."""
        return self._surnames_count[self._surname(name)]

    def ticketSurvival(self, ticket, target=None):
        """Survivors minus casualties among labelled holders of ``ticket``.

        Args:
            ticket: ticket identifier; tickets not seen in training give 0.
            target: the passenger's own label (1 survived, 0 perished) or
                ``None``. When given, that passenger is left out of the count.

        Raises:
            ValueError: if ``target`` is neither ``None``, 0 nor 1.
        """
        info = self._tickets_survival.get(ticket)
        if info is None:
            return 0
        balance = info["survived"] - info["perished"]
        if target is None:
            return balance
        if target == 1:
            return balance - 1  # one survivor fewer
        if target == 0:
            return balance + 1  # one casualty fewer
        raise ValueError("target must be None, 0 or 1, got {!r}".format(target))

    @staticmethod
    def _surname(name):
        """Part of ``name`` before the first comma, stripped of whitespace."""
        return name.split(",")[0].strip()

    @staticmethod
    def _log3(value):
        """Base-3 logarithm of ``value``, with 0 mapped to 0."""
        return log(value, 3) if value != 0 else 0

In [26]:
def get_data(train_file=None, test_file=None):
    """Load both CSV files and return the transformed feature tables.

    Args:
        train_file: path of the labelled CSV; defaults to ``TRAIN_FILE``.
        test_file: path of the unlabelled CSV; defaults to ``TEST_FILE``.

    Returns:
        A ``(train_features, target, test_features)`` tuple, where ``target``
        is the ``Survived`` column of the labelled file.
    """
    labelled = ps.read_csv(TRAIN_FILE if train_file is None else train_file)
    test = ps.read_csv(TEST_FILE if test_file is None else test_file)
    train = labelled.drop(['Survived'], axis=1)
    target = labelled.Survived
    # The transformer's lookup tables are built over labelled and unlabelled
    # passengers together.
    fulldata = ps.concat([train, test])
    trans = Transformer(train, target, fulldata)
    return trans.apply(train, target), target, trans.apply(test)

train, target, test = get_data()

# One-hot columns are derived separately for each frame, so the test frame does
# not have the same TicketSurvival_* columns as the training frame.
# The "-7" bucket has no counterpart in train; fold it into "-6".
test["TicketSurvival_-6"] = test["TicketSurvival_-7"] | test["TicketSurvival_-6"]
test = test.drop(["TicketSurvival_-7"], axis=1)

# Any other column unknown to the training frame would silently be dropped by
# the reindex below, so fail loudly instead.
unexpected_columns = [col for col in test.columns if col not in train.columns]
if unexpected_columns:
    raise ValueError("test has columns missing from train: {}".format(unexpected_columns))

# Give test exactly the training layout: same columns, same order; indicator
# columns that never occur in test are added as all-zero columns.
test = test.reindex(columns=train.columns, fill_value=0)

In [27]:
# Preview of the first five training rows; the listed columns are hidden so
# that the title and ticket-survival indicators fit on screen.
train.head(5).drop(
    [
        "Embarked_C", "Embarked_Q", "Embarked_S",
        "HasCabin", "SurnameCount", "FarRelatives",
        "Pclass_1", "Pclass_2", "Pclass_3",
        "Fare", "Cotravellers", "FarePerTraveller",
    ],
    axis=1,
)

Unnamed: 0,Sex,Age,SibSp,Parch,Title_master,Title_miss,Title_missis,Title_mister,TicketSurvival_-6,TicketSurvival_-5,TicketSurvival_-4,TicketSurvival_-3,TicketSurvival_-2,TicketSurvival_-1,TicketSurvival_0,TicketSurvival_1,TicketSurvival_2,TicketSurvival_3,TicketSurvival_4
0,1,22,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
1,0,38,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2,0,26,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,35,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
4,1,35,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0


In [28]:
# Preview of the first five test rows; the listed columns are hidden so that
# the title and ticket-survival indicators fit on screen.
test.head(5).drop(
    [
        "Embarked_C", "Embarked_Q", "Embarked_S",
        "HasCabin", "SurnameCount", "FarRelatives",
        "Pclass_1", "Pclass_2", "Pclass_3",
        "Fare", "Cotravellers", "FarePerTraveller",
    ],
    axis=1,
)

Unnamed: 0,Sex,Age,SibSp,Parch,Title_master,Title_miss,Title_missis,Title_mister,TicketSurvival_-6,TicketSurvival_-5,TicketSurvival_-4,TicketSurvival_-3,TicketSurvival_-2,TicketSurvival_-1,TicketSurvival_0,TicketSurvival_1,TicketSurvival_2,TicketSurvival_3,TicketSurvival_4
0,1,34.5,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
1,0,47.0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2,1,62.0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
3,1,27.0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,22.0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0


## Построение и оценка моделей

In [29]:
# Fixed seed so that cross-validation scores and the exported predictions are
# repeatable from run to run.
RANDOM_STATE = 42

models = [
    # 'sqrt' is what 'auto' meant for classifiers; newer scikit-learn releases
    # no longer accept 'auto'.
    RandomForestClassifier(n_estimators=80, max_features='sqrt', criterion='entropy', max_depth=4,
                           random_state=RANDOM_STATE),
    # The L1 penalty needs a solver that supports it; name it explicitly rather
    # than relying on the library default.
    LogisticRegression(penalty='l1', solver='liblinear', tol=0.01, random_state=RANDOM_STATE)
]
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20; switch
# to sklearn.model_selection.cross_val_score when the import cell is updated.
for mdl in models:
    scores = cross_validation.cross_val_score(mdl, train, target, cv=3)
    # Mean accuracy over the folds, with two standard deviations as the spread.
    print("{}: {:.4f} (+/- {:.4f})".format(mdl.__class__.__name__, scores.mean(), scores.std() * 2))

RandomForestClassifier: 0.8238 (+/- 0.0277)
LogisticRegression: 0.8418 (+/- 0.0095)


Экспортируем результаты:

In [30]:
import os

# PassengerId is dropped from the feature table, so the ids are regenerated:
# consecutive numbers starting at 892, one per test row.
FIRST_TEST_PASSENGER_ID = 892

for mdl in models:
    mdl.fit(train, target)
    result = ps.DataFrame(
        {
            "PassengerId": np.arange(FIRST_TEST_PASSENGER_ID, FIRST_TEST_PASSENGER_ID + len(test)),
            "Survived": mdl.predict(test),
        },
        columns=["PassengerId", "Survived"],
    )
    out_path = "results/linkfeatures-onehot-{}.csv".format(mdl.__class__.__name__)
    # Create the output directory on first use instead of failing in to_csv.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    result.to_csv(out_path, index=False)

С этим набором признаков логистическая регрессия (Logistic Regression) даёт результат 0.77512.