In [2]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 

In [3]:
# Read the Titanic training and test CSV files from the current working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('titanic_train.csv', 'titanic_test.csv')
)

In [4]:
# Summary statistics (count, mean, std, quartiles) of the training columns.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Stack the training rows on top of the test rows so the feature engineering
# below is applied to both at once, and leave PassengerId out of the feature
# table. Row labels are not reset, so the test rows reuse labels starting at 0.
df = pd.concat([train, test], axis=0).drop(columns=['PassengerId'])

In [7]:
# Column dtypes and non-null counts of the combined frame (shows which columns
# need imputation).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    1309 non-null   int64  
 2   Name      1309 non-null   object 
 3   Sex       1309 non-null   object 
 4   Age       1046 non-null   float64
 5   SibSp     1309 non-null   int64  
 6   Parch     1309 non-null   int64  
 7   Ticket    1309 non-null   object 
 8   Fare      1308 non-null   float64
 9   Cabin     295 non-null    object 
 10  Embarked  1307 non-null   object 
dtypes: float64(3), int64(3), object(5)
memory usage: 122.7+ KB


In [8]:
# Correlation of every numeric training column with the target, strongest
# positive first.
# DataFrame.corr() no longer silently skips non-numeric columns as of
# pandas 2.0, and `train` still holds string columns at this point, so the
# numeric columns are selected explicitly. This works on old and new pandas.
train.select_dtypes(include='number').corr()['Survived'].sort_values(ascending=False)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64

In [9]:
# Replace missing ages with the mean age of the combined frame, then apply
# log(1 + x) to the whole column.
# NOTE: the column is overwritten with its own transform, so running this cell
# a second time applies the log again.
df['Age'] = np.log1p(df['Age'].fillna(df['Age'].mean()))

In [10]:
# Replace missing fares with the mean fare of the combined frame, then apply
# log(1 + x) to the whole column.
# NOTE: the column is overwritten with its own transform, so running this cell
# a second time applies the log again.
df['Fare'] = np.log1p(df['Fare'].fillna(df['Fare'].mean()))

In [11]:
# Keep only the first character of Cabin as a new feature. Missing cabins are
# filled with the string 'None' first, so they end up as 'N'. The raw Cabin
# column is then removed.
df['cabin_captial'] = df['Cabin'].fillna('None').str[:1]
df = df.drop(columns=['Cabin'])

In [12]:
# Fill the missing Embarked values with the most frequent value in the column
# (first entry of mode() when there is a tie).
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode().iloc[0])

In [13]:
# Name and Ticket are not used as features; remove them from the frame.
df = df.drop(columns=['Name', 'Ticket'])

In [14]:
# Turn the integer class codes into strings so Pclass is treated as a
# categorical column (and can be concatenated with other strings).
df['Pclass'] = df['Pclass'].map(str)

In [15]:
# Row-wise sum of the SibSp and Parch counts (the passenger is not counted).
df['family_size'] = df[['SibSp', 'Parch']].sum(axis=1)

In [16]:
# Interaction feature: the Pclass string followed by the cabin initial,
# e.g. '3' and 'N' give '3N'.
df['pc'] = df['Pclass'].add(df['cabin_captial'])

In [17]:
# Peek at the first rows to check the engineered columns.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,cabin_captial,family_size,pc
0,0.0,3,male,3.135494,1,0,2.110213,S,N,1,3N
1,1.0,1,female,3.663562,1,0,4.280593,C,C,1,1C
2,1.0,3,female,3.295837,0,0,2.188856,S,N,0,3N
3,1.0,1,female,3.583519,1,0,3.990834,S,C,1,1C
4,0.0,3,male,3.583519,0,0,2.202765,S,N,0,3N


In [18]:
# One-hot encode the string-valued columns; numeric columns pass through.
df_ = pd.get_dummies(df)

# The combined frame was built as train followed by test, so the first
# len(train) rows are the training rows and the remainder are the test rows.
train_ = df_.iloc[:len(train)]
test_ = df_.iloc[len(train):]

# Survived is the target for the training rows and is removed from both
# feature matrices.
train_y = train_['Survived']
train_x = train_.drop(columns=['Survived'])
test_x = test_.drop(columns=['Survived'])

## Modeling

In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import Lasso, Ridge, LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

# Shared 5-fold stratified splitter, passed as `cv` to each grid search.
kfold = StratifiedKFold(n_splits=5)

In [21]:
# Random forest
# random_state is fixed so the forest's random choices, and therefore the
# cross-validated score and the fitted model, are the same on every run.
rfc = RandomForestClassifier(random_state=42)

# Single-candidate grid: the search only cross-validates this one setting.
rf_param_grid = {
    'max_depth': [None],
    'max_features': [5],
    'min_samples_split': [5],
    'min_samples_leaf': [3],
    'n_estimators': [100],
    'criterion': ['gini']
}

gs_rfc = GridSearchCV(rfc,
                      param_grid=rf_param_grid,
                      cv=kfold,
                      scoring='accuracy',
                      n_jobs=-1,
                      # Was -1, which is not a documented verbosity level
                      # (recent scikit-learn releases validate this argument
                      # and reject negative values). 1 keeps the progress
                      # messages.
                      verbose=1
                     )
gs_rfc.fit(train_x, train_y)

print('scoring: ', gs_rfc.best_score_)
print('params: ', gs_rfc.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.3s finished


scoring:  0.8170987383089573
params:  {'criterion': 'gini', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 100}


In [23]:
# Logistic regression
lr = LogisticRegression()

# Single-candidate grid: only C=0.03 is cross-validated.
lr_params = dict(C=[0.03])

lr_gs = GridSearchCV(
    estimator=lr,
    param_grid=lr_params,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
)
lr_gs.fit(train_x, train_y)

# Mean cross-validated accuracy of the best candidate, and its parameters.
print('score: ', lr_gs.best_score_)
print('params: ', lr_gs.best_params_)

score:  0.8002699140041429
params:  {'C': 0.03}


In [31]:
# AdaBoost
# random_state is fixed so the boosted ensemble, and therefore the
# cross-validated score and the predictions written to the submission file,
# are the same on every run.
ada = AdaBoostClassifier(random_state=42)

# 2 x 2 grid over the shrinkage applied to each weak learner and the number of
# boosting rounds.
ada_params = {
    'learning_rate': [0.3, 1],
    'n_estimators': [50, 100]
}
ada_gs = GridSearchCV(ada,
                      param_grid=ada_params,
                      cv=kfold,
                      scoring='accuracy',
                      n_jobs=-1
                     )

ada_gs.fit(train_x, train_y)

# Mean cross-validated accuracy of the best candidate, and its parameters.
print('score: ', ada_gs.best_score_)
print('params: ', ada_gs.best_params_)

score:  0.8047768501663424
params:  {'learning_rate': 1, 'n_estimators': 50}


In [33]:
# Predict the test rows with the tuned AdaBoost model. The search was created
# with the default refit behaviour, so calling predict on it delegates to the
# best estimator refitted on the full training data.
ada_preds = ada_gs.predict(test_x)

In [34]:
# NOTE(review): interactive inspection left over from development; it prints
# the full AdaBoostClassifier help text and produces nothing used later.
help(ada)

Help on AdaBoostClassifier in module sklearn.ensemble._weight_boosting object:

class AdaBoostClassifier(sklearn.base.ClassifierMixin, BaseWeightBoosting)
 |  An AdaBoost classifier.
 |  
 |  An AdaBoost [1] classifier is a meta-estimator that begins by fitting a
 |  classifier on the original dataset and then fits additional copies of the
 |  classifier on the same dataset but where the weights of incorrectly
 |  classified instances are adjusted such that subsequent classifiers focus
 |  more on difficult cases.
 |  
 |  This class implements the algorithm known as AdaBoost-SAMME [2].
 |  
 |  Read more in the :ref:`User Guide <adaboost>`.
 |  
 |  .. versionadded:: 0.14
 |  
 |  Parameters
 |  ----------
 |  base_estimator : object, optional (default=None)
 |      The base estimator from which the boosted ensemble is built.
 |      Support for sample weighting is required, as well as proper
 |      ``classes_`` and ``n_classes_`` attributes. If ``None``, then
 |      the base estima

In [35]:
# Test-set predictions from the other two tuned models. They are computed here
# but not used by the submission written below.
lr_preds = lr_gs.best_estimator_.predict(test_x)
rf_preds = gs_rfc.predict(test_x)

# Submission table: the PassengerId column of the test file next to the
# AdaBoost predictions, with the predicted labels cast to integers.
sub = pd.DataFrame({
    'PassengerId': test.PassengerId,
    'Survived': ada_preds,
})
sub['Survived'] = sub['Survived'].astype('int64')
sub.to_csv('submission_2020_03_17.csv', index=False)