In [2]:
%matplotlib inline
import pandas as ps
import numpy as np
from math import log
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
from sklearn.cluster import DBSCAN
TRAIN_FILE = '/home/sergio/Data/titanic/train.csv'
TEST_FILE = '/home/sergio/Data/titanic/test.csv'

# Построение модели с максимальным набором признаков

## Подготовка данных

In [1]:
class Transformer:
    _title_mapping = {
        "mister": {"Capt", "Col", "Don", "Dr", "Jonkheer", "Major", "Mr", "Rev", "Sir"}, # Взрослые мужчины
        "missis": {"Mme", "Mrs", "Dona", "the Countess"}, # Взрослые женщины
        "miss": {"Lady", "Miss", "Mlle", "Ms"}, # Девочки и девушки
        "master": {"Master"} # Мальчики и юноши
    }
    
    def __init__(self, train, target, fulldata):
        self._age_maping = {(sex, cls): fulldata[(fulldata.Sex == sex) & (fulldata.Pclass == cls)].Age.median()
                                      for sex in ["male", "female"] for cls in range(1,4)}
        self._tickets_count = fulldata.groupby(["Ticket"])["PassengerId"].count()
        
        # Ticket survival:
        self._tickets_survival = {ticket: {"survived": 0, "perished": 0} for ticket in train.Ticket.drop_duplicates()}
        for ind, row in train.iterrows():
            if target[ind]:
                self._tickets_survival[row.Ticket]["survived"] += 1
            else:
                self._tickets_survival[row.Ticket]["perished"] += 1
        
        # Ticket cluster survival:
        self._cluster_survival, self._cluster_count = self.getClusters(train, target, fulldata)
        
        # Surname survival:
        tmp = train.copy()
        tmp.insert(len(tmp.columns), "Surname", [name.split(",")[0].strip() for name in tmp.Name])
        self._surnames_survival = {surname: {"survived": 0, "perished": 0} for surname in tmp.Surname.drop_duplicates()}
        for ind, row in tmp.iterrows():
            if target[ind]:
                self._surnames_survival[row.Surname]["survived"] += 1
            else:
                self._surnames_survival[row.Surname]["perished"] += 1
        
        tmp = fulldata.copy()
        tmp.insert(len(tmp.columns), "Surname", [name.split(",")[0].strip() for name in tmp.Name])
        self._surnames_count = tmp.groupby(["Surname"])["PassengerId"].count()
        
        self._codes = {}
    
    def getClusters(self, train, target, fulldata):
        '''
        def is_num(x):
            try: int(x)
            except ValueError: return False
            else: return True
        tickets = ps.Series([int(ticket) for ticket in fulldata.Ticket.drop_duplicates() if is_num(ticket)])
        '''
        tickets = ps.Series([self.ticketIndex(ticket) for ticket in fulldata.Ticket.drop_duplicates()])
        df = ps.DataFrame()
        df.insert(0, "Ticket", tickets)
        db = DBSCAN(eps=30, min_samples=3).fit(df)
        labels = set(db.labels_)
        ticket_cluster = {ticket: db.labels_[i] for i, ticket in enumerate(df.Ticket)}
        for ticket in fulldata.Ticket:
            if ticket not in ticket_cluster:
                ticket_cluster[ticket] = -1
        
        listed = list(db.labels_)
        cluster_count = {label: listed.count(label) for label in labels}
        cluster_count[-1] = 1
        ticket_cluster_count = {ticket: cluster_count[cluster] for ticket, cluster in ticket_cluster.items()}
        
        cluster_survival = {cluster: {"survived": 0, "perished": 0} for cluster in labels if cluster != -1}
        for ind, row in train.iterrows():
            if ticket_cluster[row.Ticket] == -1: continue
            if target[ind]:
                cluster_survival[ticket_cluster[row.Ticket]]["survived"] += 1
            else:
                cluster_survival[ticket_cluster[row.Ticket]]["perished"] += 1
        cluster_survival[-1] = {"survived": 0, "perished": 0}
        
        return cluster_survival, ticket_cluster_count
    
    def apply(self, data, target=None):
        data.Age = [self._age_maping[(row.Sex,row.Pclass)] if np.isnan(row.Age) else row.Age
                    for ind, row in data.iterrows()]
        data.Fare.fillna(0, inplace=True)
        data.Embarked.fillna("S", inplace=True)
        data.insert(len(data.columns), "NumCabins",
                    [len(cabin.split()) if type(cabin) == str else 0 for cabin in data.Cabin])
        data.insert(len(data.columns), "Title", [self.title(name) for name in data.Name])
        data.insert(len(data.columns), "SurnameCount", [self.surnameCount(name) for name in data.Name])
        data.insert(len(data.columns), "FarRelatives", [row.SurnameCount-row.SibSp-row.Parch for ind, row in data.iterrows()])
        data.insert(len(data.columns), "Cotravellers", [self._tickets_count[ticket] for ticket in data.Ticket])
        data.insert(len(data.columns), "FarePerTraveller", [row.Fare/row.Cotravellers for ind, row in data.iterrows()])
        data.insert(len(data.columns), "TicketClusterCount", [self._cluster_count[self.ticketIndex(ticket)]
                                                              for ticket in data.Ticket])
        data.insert(len(data.columns), "TicketClusterSurvival",
                    [self.clusterSurvival(self.ticketIndex(row.Ticket), target=(target[ind] if target is not None else None))
                     for ind, row in data.iterrows()])
        data.insert(len(data.columns), "TicketSurvival",
                    [self.ticketSurvival(row.Ticket, target=(target[ind] if target is not None else None))
                     for ind, row in data.iterrows()])
        data.insert(len(data.columns), "SurnameSurvival",
                    [self.surnameSurvival(row.Name, target=(target[ind] if target is not None else None))
                     for ind, row in data.iterrows()])
        data.Fare = [log(fare, 3) if fare != 0 else 0 for fare in data.Fare]
        data.FarePerTraveller = [log(fare, 3) if fare != 0 else 0 for fare in data.FarePerTraveller]
        data = self.oneHot(data, "Embarked")
        data = self.oneHot(data, "Pclass")
        data = self.encode(data, "Sex")
        data = self.oneHot(data, "Title")
        data = self.oneHot(data, "NumCabins")
        return data.drop(["PassengerId", "Name", "Ticket", "Cabin", "Fare"], axis=1)

    def encode(self, data, column):
        values = data[column].drop_duplicates()
        if column in self._codes:
            mapping = self._codes[column]
            for val in values:
                if val not in mapping:
                    mapping[val] = len(mapping)
        else:
            mapping = {v: i for i, v in enumerate(sorted(values))}
            self._codes[column] = mapping
        data[column] = [mapping[v] for v in data[column]]
        return data
    
    def oneHot(self, data, column):
        values = data[column].drop_duplicates()
        for val in sorted(values):
            data.insert(len(data.columns), "{}_{}".format(column, val),
                        [1 if rowval == val else 0 for rowval in data[column]])
        return data.drop([column], axis=1)
    
    @classmethod
    def title(cls, name):
        title = name.split(",")[1].split(".")[0].strip()
        return next(key for key, value in cls._title_mapping.items() if title in value)
    
    @staticmethod
    def ticketIndex(ticket):
        if ticket == "LINE":
            return -500
        return int(ticket.split()[-1])

    def surnameCount(self, name):
        return self._surnames_count[name.split(",")[0].strip()]
    
    def ticketSurvival(self, ticket, target=None):
        index = lambda surv, per: surv - per
        try:
            info = self._tickets_survival[ticket]
        except KeyError:
            return 0
        if target is None:
            return index(info["survived"], info["perished"])
        elif target == 1:
            return index(info["survived"]-1, info["perished"])
        elif target == 0:
            return index(info["survived"], info["perished"]-1)
    
    def clusterSurvival(self, ticket, target=None):
        index = lambda surv, per: surv - per
        try:
            info = self._tickets_survival[ticket]
        except KeyError:
            return 0
        if target is None:
            return index(info["survived"], info["perished"])
        elif target == 1:
            return index(info["survived"]-1, info["perished"])
        elif target == 0:
            return index(info["survived"], info["perished"]-1)
    
    def surnameSurvival(self, name, target=None):
        surname = name.split(",")[0].strip()
        try:
            info = self._surnames_survival[surname]
        except KeyError:
            return 0
        index = lambda surv, per: surv - per
        if target is None:
            return index(info["survived"], info["perished"])
        elif target == 1:
            return index(info["survived"]-1, info["perished"])
        elif target == 0:
            return index(info["survived"], info["perished"]-1)

In [3]:
def get_data():
    data = ps.read_csv(TRAIN_FILE)
    test = ps.read_csv(TEST_FILE)
    train = data.drop(['Survived'], axis=1)
    target = data.Survived
    fulldata = ps.concat([train, test])
    trans = Transformer(train, target, fulldata)
    return trans.apply(train, target), target, trans.apply(test)

train, target, test = get_data()
train[:5].drop(["Embarked_C", "Embarked_Q", "Embarked_S"], axis=1)

Unnamed: 0,Sex,Age,SibSp,Parch,SurnameCount,FarRelatives,Cotravellers,FarePerTraveller,TicketClusterCount,TicketClusterSurvival,...,Pclass_3,Title_master,Title_miss,Title_missis,Title_mister,NumCabins_0,NumCabins_1,NumCabins_2,NumCabins_3,NumCabins_4
0,1,22,1,0,2,1,1,1.803185,5,0,...,1,0,0,0,1,1,0,0,0,0
1,0,38,1,0,2,1,2,3.252753,31,0,...,0,0,0,1,0,0,1,0,0,0
2,0,26,0,0,1,1,1,1.884216,49,0,...,1,0,1,0,0,1,0,0,0,0
3,0,35,1,0,2,1,2,2.984702,26,0,...,0,0,0,1,0,0,1,0,0,0
4,1,35,0,0,2,2,1,1.898461,1,0,...,1,0,0,0,1,1,0,0,0,0


In [4]:
test[:5].drop(["Embarked_C", "Embarked_Q", "Embarked_S"], axis=1)

Unnamed: 0,Sex,Age,SibSp,Parch,SurnameCount,FarRelatives,Cotravellers,FarePerTraveller,TicketClusterCount,TicketClusterSurvival,...,Pclass_3,Title_master,Title_miss,Title_missis,Title_mister,NumCabins_0,NumCabins_1,NumCabins_2,NumCabins_3,NumCabins_4
0,1,34.5,0,0,5,5,1,1.873145,18,0,...,1,0,0,0,1,1,0,0,0,0
1,0,47.0,1,0,1,0,1,1.771244,3,0,...,1,0,0,1,0,1,0,0,0,0
2,1,62.0,0,0,1,1,1,2.067004,1,0,...,0,0,0,0,1,1,0,0,0,0
3,1,27.0,0,0,1,1,1,1.96521,4,0,...,1,0,0,0,1,1,0,0,0,0
4,0,22.0,1,1,2,0,2,1.65248,49,0,...,1,0,0,1,0,1,0,0,0,0


## Построение и оценка моделей

In [10]:
models = [
    RandomForestClassifier(n_estimators = 80, max_features='auto', criterion='entropy', max_depth=4),
    LogisticRegression(penalty='l1', tol=0.01)
]
for mdl in models:
    scores = cross_validation.cross_val_score(mdl, train, target, cv=3)
    print("{}: {:.4f} (+/- {:.4f})".format(mdl.__class__.__name__, scores.mean(), scores.std() * 2))

RandomForestClassifier: 0.8418 (+/- 0.0055)
LogisticRegression: 0.8384 (+/- 0.0291)


Экспортируем результаты:

In [68]:
for mdl in models:
    mdl.fit(train, target)
    result = ps.DataFrame()
    result.insert(0, "PassengerId", range(892, 1310))
    result.insert(1, "Survived", mdl.predict(test))
    result.to_csv("results/linkfeatures+cluster-m2-{}.csv".format(mdl.__class__.__name__), index=False)

Данная модель даёт результат в 0.81340 методом Random Forest.