In [198]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score as f1

In [138]:
def preclear(df):
    """Encode the raw Titanic columns as small integer codes.

    The input frame is left untouched; a transformed copy is returned, so
    the function can safely be re-run on the same raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``Sex``, ``Age``, ``Fare``,
        ``Embarked`` and ``Cabin``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which
        * ``Sex`` is 1 for 'female' and 0 for 'male',
        * ``Age`` is the decade (``int(age / 10)``), missing ages being
          replaced by the column mean first,
        * ``Fare`` is ``int(log(fare))`` (0 stays 0), missing fares being
          replaced by the column mean first,
        * ``Embarked`` is C=0, Q=1, S=2, missing values counted as 'C',
        * ``Cabin`` is a code for the first letter of the cabin string,
          missing cabins being coded as 'X' (7).

    Values absent from the ``Sex``, ``Embarked`` and ``Cabin`` lookup
    tables come out as NaN.
    """
    # Work on a copy: mutating the caller's frame made a second call map the
    # already-encoded 'Sex' column to NaN.
    out = df.copy()

    out['Sex'] = out['Sex'].map({'female': 1, 'male': 0})

    # Assign the result of fillna instead of using chained `inplace=True`,
    # which does not write back to the frame under pandas Copy-on-Write.
    out['Age'] = out['Age'].fillna(out['Age'].mean()).map(lambda a: int(a / 10))

    # log(0) is -inf, hence the explicit zero guard.
    out['Fare'] = out['Fare'].fillna(out['Fare'].mean()).map(
        lambda f: 0 if f == 0 else int(np.log(f))
    )

    out['Embarked'] = out['Embarked'].fillna('C').map({'C': 0, 'Q': 1, 'S': 2})

    cabin_codes = {'D': 0, 'E': 1, 'B': 2, 'F': 3, 'C': 4, 'G': 5, 'A': 6, 'X': 7, 'T': 8}
    out['Cabin'] = out['Cabin'].fillna('X').map(lambda c: c[0]).map(cabin_codes)

    return out

In [144]:
# Read the three competition CSV files (read order: train, test, sample submission).
train, test, submission = (
    pd.read_csv('data/train.csv'),
    pd.read_csv('data/test.csv'),
    pd.read_csv('data/gender_submission.csv'),
)

# Encode both feature frames with the same preprocessing, then inspect the result.
train, test = preclear(train), preclear(test)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null int64
Age            891 non-null int64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null int64
Cabin          891 non-null int64
Embarked       891 non-null int64
dtypes: int64(10), object(2)
memory usage: 83.6+ KB


## Evaluate the effect of each selected feature on the outcome (survival)

In [16]:
# Mean of Survived (i.e. survival rate) per encoded Sex value.
train.groupby('Sex')[['Survived']].mean()

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
0,0.188908
1,0.742038


In [12]:
# Survival rate per passenger class.
train.groupby('Pclass')[['Survived']].mean()

Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


In [73]:
# Survival rate per age bucket (Age was reduced to its decade by preclear).
train.groupby('Age')[['Survived']].mean()

Unnamed: 0_level_0,Survived
Age,Unnamed: 1_level_1
0,0.612903
1,0.401961
2,0.324937
3,0.437126
4,0.382022
5,0.416667
6,0.315789
7,0.0
8,1.0


In [41]:
# Survival rate per SibSp value.
train.groupby('SibSp')[['Survived']].mean()

Unnamed: 0_level_0,Survived
SibSp,Unnamed: 1_level_1
0,0.345395
1,0.535885
2,0.464286
3,0.25
4,0.166667
5,0.0
8,0.0


In [74]:
# Survival rate per Parch value.
train.groupby('Parch')[['Survived']].mean()

Unnamed: 0_level_0,Survived
Parch,Unnamed: 1_level_1
0,0.343658
1,0.550847
2,0.5
3,0.6
4,0.0
5,0.2
6,0.0


In [86]:
# Survival rate per fare bucket (Fare was reduced to int(log(fare)) by preclear).
train.groupby('Fare')[['Survived']].mean()

Unnamed: 0_level_0,Survived
Fare,Unnamed: 1_level_1
0,0.066667
1,0.144928
2,0.306265
3,0.429787
4,0.696429
5,0.653846
6,1.0


In [146]:
# Survival rate per encoded port of embarkation (C=0, Q=1, S=2).
train.groupby('Embarked')[['Survived']].mean()

Unnamed: 0_level_0,Survived
Embarked,Unnamed: 1_level_1
0,0.558824
1,0.38961
2,0.336957


In [110]:
# Survival rate per encoded cabin letter, highest rate first.
train.groupby('Cabin')[['Survived']].mean().sort_values('Survived', ascending=False)

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
0,0.757576
1,0.75
2,0.744681
3,0.615385
4,0.59322
5,0.5
6,0.466667
7,0.299854
8,0.0


In [112]:
# Survival rate per ticket string, highest rate first.
train.groupby('Ticket')[['Survived']].mean().sort_values('Survived', ascending=False)

Unnamed: 0_level_0,Survived
Ticket,Unnamed: 1_level_1
110152,1.0
26360,1.0
386525,1.0
382651,1.0
244373,1.0
248698,1.0
248706,1.0
370375,1.0
248733,1.0
248738,1.0


## Summarize the currently valuable features

In [111]:
# Summary statistics of the encoded training frame; by default describe()
# reports only the numeric columns, so Name and Ticket are absent.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,889.0
mean,446.0,0.383838,2.308642,0.352413,2.420875,0.523008,0.381594,2.505051,5.979798,1.535433
std,257.353842,0.486592,0.836071,0.47799,1.356289,1.102743,0.806057,0.986839,2.062952,0.792088
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,0.0,2.0,0.0,0.0,2.0,7.0,1.0
50%,446.0,0.0,3.0,0.0,2.0,0.0,0.0,2.0,7.0,2.0
75%,668.5,1.0,3.0,1.0,3.0,1.0,0.0,3.0,7.0,2.0
max,891.0,1.0,3.0,1.0,8.0,8.0,6.0,6.0,8.0,2.0


## Select useful features for further processing

In [236]:
# Encoded columns that are fed to the classifiers.
valid_labels = ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked']

# Target and feature matrix for training, plus the matching test features.
y = train['Survived']
X = train.loc[:, valid_labels]
X_test = test.loc[:, valid_labels]

X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null int64
Fare        891 non-null int64
Cabin       891 non-null int64
Embarked    891 non-null int64
dtypes: int64(6)
memory usage: 41.8 KB


## Split cross-validation set from training set 

In [190]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the labelled rows as a validation set (fixed seed).
X_train, X_cv, y_train, y_cv = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


## Parameter selection for multi-layer perceptron classifier

In [243]:
from sklearn.neural_network import MLPClassifier

# Regularisation strengths to try: 3**0 .. 3**9, scaled by 1e-6.
alphas = np.logspace(0, 9, 10, base=3) * 1e-6

# One row per fitted model: [alpha, activation, seed, accuracy, f1].
scores = []
for ran in np.logspace(0, 9, 10, base=2):
    # The same seed drives both the train/validation split and the network's
    # random_state.
    seed = int(ran)

    # The split depends only on the seed, so compute it once per seed rather
    # than once per (activation, alpha) combination; the resulting split is
    # identical.
    X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.2, random_state=seed)

    for act in ['logistic', 'tanh', 'relu']:
        for a in alphas:
            clf = MLPClassifier(
                solver='lbfgs',
                alpha=a,
                activation=act,
                hidden_layer_sizes=(20, 10),
                random_state=seed,
                max_iter=250
            )
            clf.fit(X_train, y_train)
            scores.append([a, act, seed, clf.score(X_cv, y_cv), f1(y_cv, clf.predict(X_cv))])


## Inspect the parameters with the best scores

In [244]:
# Build the table straight from the list of rows. Going through
# np.array(scores) first coerces the mixed float/str rows to one string
# dtype, so 'score' and 'f1' would be compared as text when sorting.
df = pd.DataFrame(scores, columns=['alpha', 'activation', 'random_state', 'score', 'f1'])

# Best accuracy first; ties broken by the higher F1.
df.sort_values(by=['score', 'f1'], ascending=[False, False])

Unnamed: 0,alpha,activation,random_state,score,f1
87,0.002187,relu,4,0.865921787709,0.769230769231
202,9e-06,relu,64,0.860335195531,0.822695035461
211,3e-06,logistic,128,0.860335195531,0.770642201835
85,0.000243,relu,4,0.860335195531,0.761904761905
182,9e-06,logistic,64,0.854748603352,0.808823529412
89,0.019683,relu,4,0.854748603352,0.745098039216
186,0.000729,logistic,64,0.849162011173,0.802919708029
200,1e-06,relu,64,0.849162011173,0.802919708029
237,0.002187,relu,128,0.849162011173,0.765217391304
76,0.000729,tanh,4,0.849162011173,0.756756756757


## Train the final model with the selected parameters

In [245]:
# Rebuild the split and the network for the chosen grid row
# (alpha=9e-06, relu, seed 64) and report accuracy and F1 on the hold-out set.
final_seed = 64
X_train, X_cv, y_train, y_cv = train_test_split(
    X, y, test_size=0.2, random_state=final_seed
)

clf = MLPClassifier(
    hidden_layer_sizes=(20, 10),
    activation='relu',
    solver='lbfgs',
    alpha=9e-06,
    max_iter=250,
    random_state=final_seed,
).fit(X_train, y_train)

cv_pred = clf.predict(X_cv)
print(clf.score(X_cv, y_cv))
print(f1(y_cv, cv_pred))

0.860335195531
0.822695035461


## Output the prediction to file

In [246]:
# Predict survival for the test passengers and save it next to their ids.
s_pred = clf.predict(X_test)

submission = pd.DataFrame({"PassengerId": test["PassengerId"]})
submission["Survived"] = s_pred

submission.to_csv('sub.csv', index=False)

## Try an SVM classifier and other classification models

In [191]:
from sklearn.svm import SVC

# Sweep C over 0.03 * 2**0 .. 0.03 * 2**9 for two kernels and record the
# hold-out accuracy of each fit as [C, kernel, accuracy].
c_grid = np.logspace(0, 9, 10, base=2) * 0.03
scores = []
for c in c_grid:
    for k in ('rbf', 'sigmoid'):
        clf = SVC(C=c, kernel=k, decision_function_shape='ovo').fit(X_train, y_train)
        scores.append([c, k, clf.score(X_cv, y_cv)])

# Ascending order: the best-scoring combinations end up at the bottom.
pd.DataFrame(scores, columns=['c', 'k', 's']).sort_values(by='s')

Unnamed: 0,c,k,s
9,0.48,sigmoid,0.600897
1,0.03,sigmoid,0.600897
17,7.68,sigmoid,0.600897
3,0.06,sigmoid,0.600897
15,3.84,sigmoid,0.600897
5,0.12,sigmoid,0.600897
13,1.92,sigmoid,0.600897
7,0.24,sigmoid,0.600897
11,0.96,sigmoid,0.600897
19,15.36,sigmoid,0.636771


In [14]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with C=0.15, fitted on the current training split.
lr = LogisticRegression(C=0.15, random_state=1).fit(X_train, y_train)

# Hold-out predictions, kept in y_pred.
y_pred = lr.predict(X_cv)

In [57]:
# Random forest baseline on the current split. The forest is seeded so that
# the hold-out accuracy below is the same on every run.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_cv)
rfc.score(X_cv, y_cv)

0.80808080808080807

In [33]:
from sklearn.metrics import accuracy_score

# Accuracy of the most recently computed y_pred against the hold-out labels.
accuracy_score(y_true=y_cv, y_pred=y_pred)


0.81818181818181823

123.0
[[[1 2 3]
  [2 3 4]]]
[[123 246 369]
 [246 369 492]]
