In [1]:
%matplotlib inline
import pandas as ps
import numpy as np
from math import log
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
from sklearn.cluster import DBSCAN
from sklearn.grid_search import GridSearchCV
# Locations of the Titanic train/test CSV files read by get_data().
# NOTE(review): these are absolute, machine-specific paths — adjust them
# before running the notebook on another machine.
TRAIN_FILE = '/home/sergio/Data/titanic/train.csv'
TEST_FILE = '/home/sergio/Data/titanic/test.csv'

# Построение модели с максимальным набором признаков

## Подготовка данных

In [2]:
class Transformer:
    """Turn raw Titanic passenger rows into a purely numeric feature table.

    The constructor collects lookup tables (median ages, ticket and surname
    group sizes, survival tallies, ticket-number clusters); ``apply`` uses
    them to derive the features and to encode the categorical columns.
    """

    # Honorific parsed from the Name column -> coarse title group.
    _title_mapping = {
        "mister": {"Capt", "Col", "Don", "Dr", "Jonkheer", "Major", "Mr", "Rev", "Sir"},  # Adult men
        "missis": {"Mme", "Mrs", "Dona", "the Countess"},  # Adult women
        "miss": {"Lady", "Miss", "Mlle", "Ms"},  # Girls and young women
        "master": {"Master"}  # Boys and young men
    }

    def __init__(self, train, target, fulldata):
        """Collect the lookup tables used by ``apply``.

        Args:
            train: labelled passenger rows (needs the Name and Ticket columns).
            target: survival flags, indexable by the index labels of ``train``.
            fulldata: all passenger rows over which medians and group sizes
                are computed (needs Sex, Pclass, Age, Ticket, Name and
                PassengerId columns).
        """
        # Median age per (sex, passenger class); used to impute missing ages.
        self._age_maping = {(sex, cls): fulldata[(fulldata.Sex == sex) & (fulldata.Pclass == cls)].Age.median()
                            for sex in ["male", "female"] for cls in range(1, 4)}
        # Number of passengers sharing each ticket string.
        self._tickets_count = fulldata.groupby(["Ticket"])["PassengerId"].count()

        outcomes = [target[ind] for ind in train.index]

        # Survived / perished tallies per ticket string.
        self._tickets_survival = self._tallySurvival(train.Ticket, outcomes)

        # Ticket cluster statistics, all keyed by numeric ticket index.
        self._cluster_survival, self._cluster_count, self._cluster_people = self.getClusters(train, target, fulldata)

        # Survived / perished tallies per surname.
        self._surnames_survival = self._tallySurvival([self._surname(name) for name in train.Name], outcomes)

        # Number of passengers sharing each surname.
        with_surnames = fulldata.copy()
        with_surnames.insert(len(with_surnames.columns), "Surname",
                             [self._surname(name) for name in with_surnames.Name])
        self._surnames_count = with_surnames.groupby(["Surname"])["PassengerId"].count()

        # column -> {value: integer code}, filled lazily by ``encode``.
        self._codes = {}
        # column -> sorted category list, fixed by the first ``oneHot`` call.
        self._onehot_values = {}

    @staticmethod
    def _surname(name):
        """Return the part of ``name`` before the first comma, stripped."""
        return name.split(",")[0].strip()

    @staticmethod
    def _tallySurvival(keys, outcomes):
        """Count truthy/falsy ``outcomes`` per key as survived/perished."""
        tallies = {}
        for key, outcome in zip(keys, outcomes):
            info = tallies.setdefault(key, {"survived": 0, "perished": 0})
            info["survived" if outcome else "perished"] += 1
        return tallies

    @staticmethod
    def _survivalIndex(info, target=None):
        """Return survived minus perished, optionally leaving one row out.

        Args:
            info: dict with integer "survived" and "perished" entries.
            target: ``None`` to use the tallies as they are, or the 0/1
                outcome of the current row, which is removed from the tallies
                so a row never sees its own label.

        Raises:
            ValueError: if ``target`` is neither ``None``, 0 nor 1.
        """
        survived, perished = info["survived"], info["perished"]
        if target is None:
            pass
        elif target == 1:
            survived -= 1
        elif target == 0:
            perished -= 1
        else:
            raise ValueError("target must be None, 0 or 1, got {!r}".format(target))
        return survived - perished

    def getClusters(self, train, target, fulldata):
        """Cluster ticket numbers with DBSCAN and summarise every cluster.

        Each distinct ticket string contributes one point (its numeric
        ticket index); DBSCAN runs with ``eps=1, min_samples=2``. Tickets
        labelled -1 by DBSCAN are treated as clusters of their own.

        Returns:
            A tuple of three dicts keyed by numeric ticket index:
            the survived/perished tallies of the ticket's cluster (from
            ``train``/``target``), the number of ticket points in the
            cluster, and the number of ``fulldata`` rows in the cluster.
        """
        ticket_ids = [self.ticketIndex(ticket) for ticket in fulldata.Ticket.drop_duplicates()]
        df = ps.DataFrame()
        df.insert(0, "Ticket", ps.Series(ticket_ids))
        db = DBSCAN(eps=1, min_samples=2).fit(df)
        labels = list(db.labels_)
        # Distinct ticket strings can share a number; the last label wins.
        ticket_cluster = dict(zip(ticket_ids, labels))

        # Number of ticket points per cluster label.
        cluster_count = {}
        for label in labels:
            cluster_count[label] = cluster_count.get(label, 0) + 1
        cluster_count[-1] = 1  # an unclustered ticket counts as a cluster of one
        ticket_cluster_count = {ticket: cluster_count[cluster] for ticket, cluster in ticket_cluster.items()}

        # Survival tallies per cluster; label -1 keeps an all-zero tally.
        cluster_survival = {label: {"survived": 0, "perished": 0} for label in cluster_count}
        for ind, raw_ticket in zip(train.index, train.Ticket):
            cluster = ticket_cluster[self.ticketIndex(raw_ticket)]
            if cluster == -1:
                continue
            cluster_survival[cluster]["survived" if target[ind] else "perished"] += 1
        ticket_cluster_survival = {ticket: cluster_survival[cluster] for ticket, cluster in ticket_cluster.items()}

        # People per ticket index, counted in a single pass over fulldata.
        people_per_ticket = {}
        for raw_ticket in fulldata.Ticket:
            ticket = self.ticketIndex(raw_ticket)
            people_per_ticket[ticket] = people_per_ticket.get(ticket, 0) + 1
        people_on_cluster = {}
        for ticket, cluster in ticket_cluster.items():
            if cluster != -1:
                people_on_cluster[cluster] = people_on_cluster.get(cluster, 0) + people_per_ticket[ticket]
        # Unclustered tickets fall back to the head count of the ticket itself.
        ticket_cluster_people = {
            ticket: people_per_ticket[ticket] if cluster == -1 else people_on_cluster[cluster]
            for ticket, cluster in ticket_cluster.items()}

        return ticket_cluster_survival, ticket_cluster_count, ticket_cluster_people

    def apply(self, data, target=None):
        """Build the feature table for ``data``.

        The input frame is left untouched; all work happens on a copy, so the
        method can be called repeatedly on the same frame.

        Args:
            data: raw passenger rows.
            target: optional survival flags indexable by the index labels of
                ``data``. When given, each row's own outcome is removed from
                the ticket, cluster and surname survival tallies.

        Returns:
            A new DataFrame with engineered and encoded columns, without
            PassengerId, Name, Ticket and Cabin.
        """
        data = data.copy()
        data["Age"] = [self._age_maping[(sex, cls)] if np.isnan(age) else age
                       for sex, cls, age in zip(data.Sex, data.Pclass, data.Age)]
        # Assign the filled columns back: in-place fillna on an attribute
        # access is a chained assignment and is a no-op under Copy-on-Write.
        data["Fare"] = data["Fare"].fillna(0)
        data["Embarked"] = data["Embarked"].fillna("S")
        data["NumCabins"] = [len(cabin.split()) if isinstance(cabin, str) else 0 for cabin in data.Cabin]
        data["Title"] = [self.title(name) for name in data.Name]
        data["SurnameCount"] = [self.surnameCount(name) for name in data.Name]
        data["FarRelatives"] = data.SurnameCount - data.SibSp - data.Parch
        data["Cotravellers"] = [self._tickets_count[ticket] for ticket in data.Ticket]
        data["FarePerTraveller"] = data.Fare / data.Cotravellers

        ticket_ids = [self.ticketIndex(ticket) for ticket in data.Ticket]
        data["TicketClusterCount"] = [self._cluster_count[ticket] for ticket in ticket_ids]
        data["TicketClusterPeople"] = [self._cluster_people[ticket] for ticket in ticket_ids]

        if target is None:
            outcomes = [None] * len(data)
        else:
            outcomes = [target[ind] for ind in data.index]
        data["TicketClusterSurvival"] = [self.clusterSurvival(ticket, target=outcome)
                                         for ticket, outcome in zip(ticket_ids, outcomes)]
        data["TicketSurvival"] = [self.ticketSurvival(ticket, target=outcome)
                                  for ticket, outcome in zip(data.Ticket, outcomes)]
        data["SurnameSurvival"] = [self.surnameSurvival(name, target=outcome)
                                   for name, outcome in zip(data.Name, outcomes)]

        # Base-3 logarithm of the fares; zero fares stay zero.
        data["Fare"] = [log(fare, 3) if fare != 0 else 0 for fare in data.Fare]
        data["FarePerTraveller"] = [log(fare, 3) if fare != 0 else 0 for fare in data.FarePerTraveller]

        data = self.oneHot(data, "Embarked")
        data = self.oneHot(data, "Pclass")
        data = self.encode(data, "Sex")
        data = self.oneHot(data, "Title")
        data = self.oneHot(data, "NumCabins")
        return data.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)

    def encode(self, data, column):
        """Replace ``column`` with integer codes, in place, and return ``data``.

        The first call for a column numbers its sorted distinct values from 0;
        later calls reuse that mapping and append codes for unseen values.
        """
        values = data[column].drop_duplicates()
        if column in self._codes:
            mapping = self._codes[column]
            for val in values:
                if val not in mapping:
                    mapping[val] = len(mapping)
        else:
            mapping = {v: i for i, v in enumerate(sorted(values))}
            self._codes[column] = mapping
        data[column] = [mapping[v] for v in data[column]]
        return data

    def oneHot(self, data, column):
        """Return a copy of ``data`` with ``column`` replaced by 0/1 indicators.

        The categories are fixed by the first call for a column and reused
        afterwards, so every frame transformed by the same instance gets the
        same indicator columns (named ``"<column>_<value>"``). Values not seen
        in the first call produce all-zero indicators. ``data`` itself is not
        modified.
        """
        if column not in self._onehot_values:
            self._onehot_values[column] = sorted(data[column].drop_duplicates())
        source = data[column]
        result = data.drop([column], axis=1)
        for val in self._onehot_values[column]:
            result.insert(len(result.columns), "{}_{}".format(column, val),
                          [1 if rowval == val else 0 for rowval in source])
        return result

    @classmethod
    def title(cls, name):
        """Map the honorific in ``name`` to its title group.

        The honorific is the text between the first comma and the following
        period.

        Raises:
            ValueError: if the honorific is not listed in ``_title_mapping``.
        """
        title = name.split(",")[1].split(".")[0].strip()
        for group, honorifics in cls._title_mapping.items():
            if title in honorifics:
                return group
        raise ValueError("Unknown title {!r} in name {!r}".format(title, name))

    @staticmethod
    def ticketIndex(ticket):
        """Return the numeric part of a ticket string.

        The last whitespace-separated token is parsed as an integer; the
        special ticket "LINE" maps to -500.
        """
        if ticket == "LINE":
            return -500
        return int(ticket.split()[-1])

    def surnameCount(self, name):
        """Return how many passengers share the surname found in ``name``."""
        return self._surnames_count[self._surname(name)]

    def ticketSurvival(self, ticket, target=None):
        """Survived minus perished among training rows with this ticket string.

        Returns 0 for tickets without training rows. See ``_survivalIndex``
        for the meaning of ``target``.
        """
        info = self._tickets_survival.get(ticket)
        if info is None:
            return 0
        return self._survivalIndex(info, target)

    def clusterSurvival(self, ticket, target=None):
        """Survived minus perished in the cluster of a numeric ticket index.

        Raises ``KeyError`` for an unknown ticket index. See
        ``_survivalIndex`` for the meaning of ``target``.
        """
        return self._survivalIndex(self._cluster_survival[ticket], target)

    def surnameSurvival(self, name, target=None):
        """Survived minus perished among training rows sharing the surname.

        Returns 0 for surnames without training rows. See ``_survivalIndex``
        for the meaning of ``target``.
        """
        info = self._surnames_survival.get(self._surname(name))
        if info is None:
            return 0
        return self._survivalIndex(info, target)

In [3]:
def get_data(train_file=None, test_file=None):
    """Load the Titanic CSV files and build the model-ready feature tables.

    Args:
        train_file: path of the labelled CSV (must contain a ``Survived``
            column); defaults to ``TRAIN_FILE``.
        test_file: path of the unlabelled CSV; defaults to ``TEST_FILE``.

    Returns:
        A ``(train, target, test)`` tuple: the transformed training
        features, the ``Survived`` column, and the transformed test features.
    """
    # Resolve the defaults at call time so the module constants stay the
    # single source of truth for the notebook's usual inputs.
    train_file = TRAIN_FILE if train_file is None else train_file
    test_file = TEST_FILE if test_file is None else test_file

    data = ps.read_csv(train_file)
    test = ps.read_csv(test_file)
    train = data.drop(['Survived'], axis=1)
    target = data.Survived
    # Group statistics (ticket / surname sizes, age medians) use every
    # passenger, labelled or not.
    fulldata = ps.concat([train, test])
    trans = Transformer(train, target, fulldata)
    # Passing the target for the training rows lets the transformer exclude
    # each row's own label from its survival features.
    return trans.apply(train, target), target, trans.apply(test)

# Build the feature tables once: transformed training features, the Survived
# labels, and transformed test features.
train, target, test = get_data()
# Show the first ten rows of the engineered training features.
train.head(10)
#train[:5].drop(["Embarked_C", "Embarked_Q", "Embarked_S", "Pclass_1"], axis=1)

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,SurnameCount,FarRelatives,Cotravellers,FarePerTraveller,TicketClusterCount,...,Pclass_3,Title_master,Title_miss,Title_missis,Title_mister,NumCabins_0,NumCabins_1,NumCabins_2,NumCabins_3,NumCabins_4
0,1,22,1,0,1.803185,2,1,1,1.803185,5,...,1,0,0,0,1,1,0,0,0,0
1,0,38,1,0,3.883683,2,1,2,3.252753,12,...,0,0,0,1,0,0,1,0,0,0
2,0,26,0,0,1.884216,1,1,1,1.884216,37,...,1,0,1,0,0,1,0,0,0,0
3,0,35,1,0,3.615631,2,1,2,2.984702,2,...,0,0,0,1,0,0,1,0,0,0
4,1,35,0,0,1.898461,2,2,1,1.898461,1,...,1,0,0,0,1,1,0,0,0,0
5,1,25,0,0,1.943496,3,3,1,1.943496,1,...,1,0,0,0,1,1,0,0,0,0
6,1,54,0,0,3.594167,2,2,2,2.963237,4,...,0,0,0,0,1,0,1,0,0,0
7,1,2,3,1,2.774489,5,1,5,1.309515,4,...,1,1,0,0,0,1,0,0,0,0
8,0,27,0,2,2.193622,6,4,3,1.193622,2,...,1,0,0,1,0,1,0,0,0,0
9,0,14,1,0,3.098049,2,1,2,2.467119,3,...,0,0,0,1,0,1,0,0,0,0


In [4]:
# Show the first ten rows of the engineered test features for comparison
# with the training preview.
test.head(10)
#test[:5].drop(["Embarked_C", "Embarked_Q", "Embarked_S"], axis=1)

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,SurnameCount,FarRelatives,Cotravellers,FarePerTraveller,TicketClusterCount,...,Pclass_3,Title_master,Title_miss,Title_missis,Title_mister,NumCabins_0,NumCabins_1,NumCabins_2,NumCabins_3,NumCabins_4
0,1,34.5,0,0,1.873145,5,5,1,1.873145,3,...,1,0,0,0,1,1,0,0,0,0
1,0,47.0,1,0,1.771244,1,0,1,1.771244,1,...,1,0,0,1,0,1,0,0,0,0
2,1,62.0,0,0,2.067004,1,1,1,2.067004,1,...,0,0,0,0,1,1,0,0,0,0
3,1,27.0,0,0,1.96521,1,1,1,1.96521,4,...,1,0,0,0,1,1,0,0,0,0
4,0,22.0,1,1,2.28341,2,0,2,1.65248,37,...,1,0,0,1,0,1,0,0,0,0
5,1,14.0,0,0,2.022476,3,3,1,2.022476,1,...,1,0,0,0,1,1,0,0,0,0
6,0,30.0,0,0,1.849591,2,2,1,1.849591,2,...,1,0,1,0,0,1,0,0,0,0
7,1,26.0,1,1,3.065045,3,1,3,2.065045,1,...,0,0,0,0,1,1,0,0,0,0
8,0,18.0,0,0,1.80057,1,1,1,1.80057,55,...,1,0,0,1,0,1,0,0,0,0
9,1,21.0,2,0,2.898461,7,5,3,1.898461,1,...,1,0,0,0,1,1,0,0,0,0


## Построение и оценка отдельных моделей

In [5]:
# Candidate classifiers, each scored with 3-fold cross-validation on the
# engineered training features. random_state is pinned on every estimator so
# that re-running the cell reproduces the same scores.
models = [
    RandomForestClassifier(n_estimators = 1000, max_features=0.3, criterion='entropy', max_depth=4,
                           random_state=42),
    LogisticRegression(penalty='l1', tol=0.01, random_state=42),
    AdaBoostClassifier(n_estimators=1000, random_state=42),
    GradientBoostingClassifier(n_estimators=1000, max_features=None, max_depth=5, learning_rate=0.5,
                               random_state=42),
]
for mdl in models:
    scores = cross_validation.cross_val_score(mdl, train, target, cv=3)
    # Report the mean fold score and twice the standard deviation across folds.
    print("{}: {:.4f} (+/- {:.4f})".format(mdl.__class__.__name__, scores.mean(), scores.std() * 2))

RandomForestClassifier: 0.8541 (+/- 0.0222)
LogisticRegression: 0.8373 (+/- 0.0193)
AdaBoostClassifier: 0.8698 (+/- 0.0358)
GradientBoostingClassifier: 0.9181 (+/- 0.0063)


## Важность признаков

In [6]:
# Fit the first model in `models` on the full training set and list its
# feature importances from most to least important, as percentages.
rfc = models[0]
rfc.fit(train, target)
features_importances = sorted(
    zip(train.columns, rfc.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
for feature, importance in features_importances:
    line = "{}: {}%".format(feature, importance*100)
    print(line)

Title_mister: 23.128356276184096%
Sex: 15.87278733935035%
TicketSurvival: 7.883014419222369%
TicketClusterSurvival: 6.857273981417113%
FarePerTraveller: 6.594037616561376%
SurnameSurvival: 5.0523753465427195%
Pclass_3: 4.883627926611128%
Fare: 4.471366344033605%
Title_miss: 4.007915854412392%
Title_missis: 3.361452201462222%
TicketClusterCount: 2.5249543492692283%
NumCabins_0: 2.4841558476888443%
Cotravellers: 2.095387395048278%
Age: 2.074472201634253%
TicketClusterPeople: 1.8886767865934042%
NumCabins_1: 1.7502224631369536%
Pclass_1: 1.1944531231076023%
SibSp: 0.7914369332743878%
SurnameCount: 0.7622144568779111%
Title_master: 0.5784132547953645%
FarRelatives: 0.386611305851097%
Pclass_2: 0.3608262200018977%
Parch: 0.3368753709238954%
Embarked_S: 0.23700164005430335%
Embarked_C: 0.18356394640728943%
NumCabins_2: 0.18164880426847016%
Embarked_Q: 0.05341048118161365%
NumCabins_3: 0.0034681140878587645%
NumCabins_4: 0.0%


## Подбор лучших параметров

### Random Forest

In [46]:
# Hyper-parameter grid for the random forest: share/number of features tried
# per split and the maximum tree depth (None = unlimited).
parameter_grid = {
    'max_features': [0.3, 0.5, 0.8, 1],
    'max_depth': [4,5,6,None]
}

# random_state is pinned so that the ranking of parameter combinations is
# reproducible between runs instead of changing with the forest's sampling.
grid_search = GridSearchCV(RandomForestClassifier(n_estimators = 100, random_state=42), parameter_grid, cv=3)
grid_search.fit(train, target)
# Best mean validation score first.
sorted(grid_search.grid_scores_, key=lambda x: -x.mean_validation_score)

[mean: 0.87767, std: 0.01611, params: {'max_depth': None, 'max_features': 0.8},
 mean: 0.87318, std: 0.01611, params: {'max_depth': None, 'max_features': 0.5},
 mean: 0.87205, std: 0.01803, params: {'max_depth': 6, 'max_features': 0.8},
 mean: 0.86981, std: 0.01111, params: {'max_depth': 6, 'max_features': 0.5},
 mean: 0.86869, std: 0.00991, params: {'max_depth': None, 'max_features': 0.3},
 mean: 0.86756, std: 0.01871, params: {'max_depth': 5, 'max_features': 0.8},
 mean: 0.86308, std: 0.01240, params: {'max_depth': 4, 'max_features': 0.8},
 mean: 0.86308, std: 0.01514, params: {'max_depth': 5, 'max_features': 0.3},
 mean: 0.86308, std: 0.01240, params: {'max_depth': 6, 'max_features': 0.3},
 mean: 0.86195, std: 0.02076, params: {'max_depth': 5, 'max_features': 0.5},
 mean: 0.85859, std: 0.02076, params: {'max_depth': None, 'max_features': 1},
 mean: 0.85410, std: 0.01830, params: {'max_depth': 4, 'max_features': 0.5},
 mean: 0.84063, std: 0.01111, params: {'max_depth': 4, 'max_featur

### Gradient Boosting

In [48]:
# Hyper-parameter grid for gradient boosting: learning rate, tree depth and
# the number of features considered per split.
parameter_grid = {
    'loss': ['deviance'],
    'learning_rate': [0.05, 0.1, 0.2, 0.3, 0.5],
    'max_depth': [3,4,5,None],
    'max_features': ["sqrt", None]
}

# random_state is pinned so that the ranking of parameter combinations is
# reproducible between runs (feature sub-sampling is otherwise random).
grid_search = GridSearchCV(GradientBoostingClassifier(n_estimators = 100, random_state=42), parameter_grid, cv=3)
grid_search.fit(train, target)
# Best mean validation score first.
sorted(grid_search.grid_scores_, key=lambda x: -x.mean_validation_score)

[mean: 0.91134, std: 0.00572, params: {'loss': 'deviance', 'max_depth': 5, 'max_features': None, 'learning_rate': 0.5},
 mean: 0.90797, std: 0.01041, params: {'loss': 'deviance', 'max_depth': 4, 'max_features': None, 'learning_rate': 0.5},
 mean: 0.90685, std: 0.00965, params: {'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'learning_rate': 0.5},
 mean: 0.90572, std: 0.01198, params: {'loss': 'deviance', 'max_depth': 4, 'max_features': None, 'learning_rate': 0.2},
 mean: 0.90572, std: 0.01198, params: {'loss': 'deviance', 'max_depth': 4, 'max_features': None, 'learning_rate': 0.3},
 mean: 0.90460, std: 0.01611, params: {'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'learning_rate': 0.2},
 mean: 0.90348, std: 0.01270, params: {'loss': 'deviance', 'max_depth': 4, 'max_features': None, 'learning_rate': 0.1},
 mean: 0.90348, std: 0.01145, params: {'loss': 'deviance', 'max_depth': 5, 'max_features': None, 'learning_rate': 0.3},
 mean: 0.90236, std: 0.01375, params: {'

## Построение и оценка Boosting-моделей

In [52]:
# AdaBoost over small random forests, scored with 3-fold cross-validation.
# random_state is pinned on the ensemble so the score is reproducible.
boosts = [
        AdaBoostClassifier(base_estimator=
                           RandomForestClassifier(n_estimators = 10, max_features=4, criterion='entropy', max_depth=3),
                           n_estimators=100,
                           random_state=42),
]
for mdl in boosts:
    scores = cross_validation.cross_val_score(mdl, train, target, cv=3)
    # Report the mean fold score and twice the standard deviation across folds.
    print("{}: {:.4f} (+/- {:.4f})".format(mdl.__class__.__name__, scores.mean(), scores.std() * 2))

AdaBoostClassifier: 0.8754 (+/- 0.0198)


## Экспорт результатов

In [7]:
# The transformed `test` table no longer carries PassengerId (the feature
# transformer drops it), so the ids are read back from the raw test file
# instead of being hard-coded as a fixed numeric range.
passenger_ids = ps.read_csv(TEST_FILE, usecols=["PassengerId"]).PassengerId.values

# Refit every model on the full training set and write one prediction file
# per model. NOTE: the "results" directory must already exist.
for mdl in models:
    mdl.fit(train, target)
    result = ps.DataFrame()
    result.insert(0, "PassengerId", passenger_ids)
    result.insert(1, "Survived", mdl.predict(test))
    result.to_csv("results/boosting-{}.csv".format(mdl.__class__.__name__), index=False)