In [117]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 

In [118]:
train = pd.read_csv('titanic_train.csv')
test = pd.read_csv('titanic_test.csv')

In [119]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [120]:
df = pd.concat([train, test], axis=0)
df.drop('PassengerId', axis=1, inplace=True)

In [121]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    1309 non-null   int64  
 2   Name      1309 non-null   object 
 3   Sex       1309 non-null   object 
 4   Age       1046 non-null   float64
 5   SibSp     1309 non-null   int64  
 6   Parch     1309 non-null   int64  
 7   Ticket    1309 non-null   object 
 8   Fare      1308 non-null   float64
 9   Cabin     295 non-null    object 
 10  Embarked  1307 non-null   object 
dtypes: float64(3), int64(3), object(5)
memory usage: 122.7+ KB


In [122]:
train.corr()['Survived'].sort_values(ascending=False)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64

In [123]:
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Age'] = df['Age'].apply(np.log1p)

In [124]:
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
df['Fare'] = df['Fare'].apply(np.log1p)

In [125]:
df['Cabin'] = df['Cabin'].fillna('None')
df['cabin_captial'] = df['Cabin'].apply(lambda x: x[:1])
df.drop('Cabin', axis=1, inplace=True)

In [126]:
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [127]:
df.drop(['Name', 'Ticket'], axis=1, inplace=True)

In [128]:
df['Pclass'] = df['Pclass'].apply(str)

In [129]:
df['family_size'] = df['SibSp']+df['Parch']

In [130]:
df['pc'] = df['Pclass']+df['cabin_captial']

In [111]:
# 不同性别，船舱等级的死亡率
train.groupby(['Sex', 'Pclass', 'cabin_cap'])['Survived'].mean()

KeyError: 'cabin_cap'

In [112]:
train.groupby(['Sex', 'Pclass', 'cabin_cap'])['Survived'].count()

KeyError: 'cabin_cap'

In [134]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,cabin_captial,family_size,pc
0,0.0,3,male,3.135494,1,0,2.110213,S,N,1,3N
1,1.0,1,female,3.663562,1,0,4.280593,C,C,1,1C
2,1.0,3,female,3.295837,0,0,2.188856,S,N,0,3N
3,1.0,1,female,3.583519,1,0,3.990834,S,C,1,1C
4,0.0,3,male,3.583519,0,0,2.202765,S,N,0,3N
...,...,...,...,...,...,...,...,...,...,...,...
413,,3,male,3.430146,0,0,2.202765,S,N,0,3N
414,,1,female,3.688879,0,0,4.699571,C,C,0,1C
415,,3,male,3.676301,0,0,2.110213,S,N,0,3N
416,,3,male,3.430146,0,0,2.202765,S,N,0,3N


In [149]:
df[:len(train)].iloc[df[(df['Pclass']!='1') & (df['Sex']=='male')].index, 0].sum()

0.0

In [150]:
train.fillna('None', inplace=True)

In [151]:
train['cabin_cap'] = train['Cabin'].apply(lambda x: x[:1])

In [152]:
train['cabin_cap']

0      N
1      C
2      N
3      C
4      N
      ..
886    N
887    B
888    N
889    C
890    N
Name: cabin_cap, Length: 891, dtype: object

In [153]:
df_ = pd.get_dummies(df)

train_ = df_[:len(train)]
test_ = df_[len(train):]

train_y = train_['Survived']
train_x = train_.drop('Survived', axis=1)
test_x = test_.drop('Survived', axis=1)

## modeling

In [154]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import Lasso, Ridge, LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
import lightgbm

kfold = StratifiedKFold(n_splits=5)

In [155]:
# 随机森林
rfc = RandomForestClassifier()

rf_param_grid = {
    'max_depth': [None],
    'max_features': [5],
    'min_samples_split': [5],
    'min_samples_leaf': [3],
    'n_estimators': [100],
    'criterion': ['gini']
}

gs_rfc = GridSearchCV(rfc, 
                      param_grid=rf_param_grid, 
                      cv=kfold,
                      scoring='accuracy',
                      n_jobs=-1,
                      verbose=-1
                     )
gs_rfc.fit(train_x, train_y)

print('scoring: ', gs_rfc.best_score_)
print('params: ', gs_rfc.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.7s finished


scoring:  0.8249450756386917
params:  {'criterion': 'gini', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 100}


In [156]:
def evaluate(clf):
    score = cross_val_score(clf, train_x, train_y, scoring='accuracy', cv=kfold, n_jobs=-1)
    print('scoring', score)
    print('score', score.mean(), score.std())

In [157]:
rfc_ = RandomForestClassifier(max_features=5,
                             min_samples_split=5,
                             min_samples_leaf=3,
                             n_estimators=200,
                             criterion='gini')
evaluate(rfc_)

scoring [0.77094972 0.76966292 0.8258427  0.84269663 0.88764045]
score 0.8193584834599209 0.044860223258249614


In [28]:
# 逻辑回归
lr = LogisticRegression()

lr_params = {
    'C': [0.03],
}

lr_gs = GridSearchCV(lr,
                   param_grid=lr_params,
                   cv = kfold,
                   scoring='accuracy',
                   n_jobs=-1,
                  )

lr_gs.fit(train_x, train_y)

print('score: ', lr_gs.best_score_)
print('params: ', lr_gs.best_params_)

score:  0.8002699140041429
params:  {'C': 0.03}


In [29]:
lr_ = LogisticRegression(C=0.03)
evaluate(lr_)

scoring [0.75977654 0.80337079 0.80898876 0.79213483 0.83707865]
score 0.8002699140041429 0.025085494337963445


In [30]:
# Adaboost
ada = AdaBoostClassifier()

ada_params = {
    'learning_rate': [0.3, 1],
    'n_estimators': [50, 100]
}
ada_gs = GridSearchCV(ada,
                      param_grid=ada_params,
                      cv=kfold,
                      scoring='accuracy',
                      n_jobs=-1
                     )

ada_gs.fit(train_x, train_y)

print('score: ', ada_gs.best_score_)
print('params: ', ada_gs.best_params_)

score:  0.8047768501663424
params:  {'learning_rate': 1, 'n_estimators': 50}


In [31]:
ada_preds = ada_gs.best_estimator_.predict(test_x)

In [32]:
ada_ = AdaBoostClassifier(learning_rate=0.3,
                         n_estimators=1000)
evaluate(ada_)

scoring [0.76536313 0.81460674 0.82022472 0.82022472 0.83707865]
score 0.8114995919904588 0.02426838117295715


In [33]:
# lightgbm 
lgbc = lightgbm.LGBMClassifier()
lgb_param = {
    'objective':['binary'],
    'max_depth':[5],
    'num_leaves':[7],
    'learning_rate':[0.1],
    'feature_feaction':[0.8],
    'bagging_fraction':[0.8],
    'cat_smooth':[0],
}
lgbm_gs = GridSearchCV(lgbc,
                       param_grid=lgb_param,
                       cv=kfold,
                       scoring='accuracy',
                       n_jobs=-1
                      )
lgbm_gs.fit(train_x, train_y)

print(lgbm_gs.best_score_)
print(lgbm_gs.best_params_)

0.8350574351892537
{'bagging_fraction': 0.8, 'cat_smooth': 0, 'feature_feaction': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'num_leaves': 7, 'objective': 'binary'}


In [34]:
lgbc_ = lightgbm.LGBMClassifier(objective='binary',
                                max_depth=4,
                                num_leaves=6,
                                learning_rate=0.3,
                                bagging_fraction=0.5,
                                feature_fraction=0.6,
                                cat_smooth=0
                               )
evaluate(lgbc_)

scoring [0.79329609 0.8258427  0.84831461 0.81460674 0.86516854]
score 0.8294457347310275 0.025187804823704026


In [160]:
lr_preds = lr_gs.predict(test_x)

rf_preds = gs_rfc.best_estimator_.predict(test_x)

lgbm_preds = lgbm_gs.best_estimator_.predict(test_x)

sub = pd.DataFrame()
sub['PassengerId'] = test.PassengerId
sub['Survived'] = rf_preds

In [36]:
test['Survived'] = rf_preds

In [37]:
test['Cabin'].fillna('None', inplace=True)
test['cabin_cap'] = test['Cabin'].apply(lambda x: x[0])
male_3_N = test[(test['Pclass']!=1) & (test['Sex']=='male') & (test['cabin_cap']=='N')].index
female_1_2 = test[(test['Pclass']!=3) & (test['Sex']=='female')].index

test.iloc[male_3_N, 11] = 0

In [164]:
sub['Survived'] = rf_preds
sub['Survived'] = sub['Survived'].apply(np.int)
sub.to_csv('submission_2020_03_17.csv', index=False)

In [165]:
sub

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
