In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

titanic = pd.read_csv('https://raw.githubusercontent.com/dsindy/kaggle-titanic/master/data/train.csv')
titanic
df = titanic.copy()

In [2]:
def process_family():
    global df
    # introducing a new feature : the size of families (including the passenger)
    df['family'] = df['Parch'] + df['SibSp'] + 1

    # introducing other features based on the family size
    df['Singleton'] = df['family'].map(lambda s: 1 if s == 1 else 0)
    df['SmallFamily'] = df['family'].map(lambda s: 1 if 2 <= s <= 4 else 0)
    df['LargeFamily'] = df['family'].map(lambda s: 1 if 5 <= s else 0)
    return df
    
df = process_family()

In [3]:
df['title'] = df['Name'].str.split(', ')
df['title'] = df['title'].apply(lambda name : name[1].split('.')[0])

In [4]:
Title_Dict = {
    'Mr':'Mr',
    'Mrs':'Mrs',
    'Miss':'Miss',
    'Master':'Master',
    'Don':'Royalty',
    'Rev':'Officer',
    'Dr':'Officer',
    'Mme':'Mrs',
    'Ms':'Mrs',
    'Major':'Officer',
    'Lady':'Royalty',
    'Sir':'Royalty',
    'Mlle':'Miss',
    'Col':'Officer',
    'Capt':'Officer',
    'the Countess':'Royalty',
    'Jonkheer':'Royalty'
}
df['title'] = df['title'].map(Title_Dict)

In [5]:
# Age填補 missing values by group sex、title、pclass median
age_median = df.pivot_table(values = 'Age' , index = ['Sex' , 'title' , 'Pclass'] , aggfunc='median')
age_median

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Age
Sex,title,Pclass,Unnamed: 3_level_1
female,Miss,1,30.0
female,Miss,2,24.0
female,Miss,3,18.0
female,Mrs,1,40.0
female,Mrs,2,31.5
female,Mrs,3,31.0
female,Officer,1,49.0
female,Royalty,1,40.5
male,Master,1,4.0
male,Master,2,1.0


In [6]:
age_median = df.groupby(['Sex' , 'title' , 'Pclass'])['Age'].transform('median')
df['Age'] = df['Age'].fillna(age_median)

In [7]:
df = pd.get_dummies(df, columns=['Sex','Embarked','title'], dtype='int')

In [8]:
#標準化年齡、價錢
from sklearn.preprocessing import StandardScaler
sz = StandardScaler()
df['Age'] = sz.fit_transform(df[['Age']])
df['Fare'] = sz.fit_transform(df[['Fare']])

In [9]:
df_train = df[['Survived', 'Pclass', 'Age', 'family', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
            'title_Master', 'title_Miss', 'title_Mr', 'title_Mrs', 'title_Officer', 'title_Royalty', 'Singleton', 'SmallFamily', 'LargeFamily']]

### 不同模型

In [10]:
from sklearn.linear_model import LogisticRegression , LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [11]:
log = LogisticRegression(random_state = 0 , max_iter = 1000)
logcv = LogisticRegressionCV(random_state=0 , max_iter = 3000)
svc = SVC(random_state = 0)
decisiontree = DecisionTreeClassifier()
knn = KNeighborsClassifier(n_neighbors=3)
gaussian = GaussianNB()
rf = RandomForestClassifier(n_estimators=100)
models = [log , logcv , svc , decisiontree , knn , gaussian , rf]

In [13]:
model_names = ['Logistic Regression' , 'Logistic Regression CV' , 'SVC' , 'Decision Tree' , 'KNN' , 'Gaussian Navive' , 'Random Forest']
res = []
from sklearn.model_selection import cross_val_score

def score(model , x , y):
  return cross_val_score(model , x , y , cv = 5 , scoring = 'accuracy').mean()

In [14]:
columns_X = list(set(df_train.columns) - {'Survived'})
columns_y = ['Survived']

train_X = df_train[columns_X]
train_y = df_train[columns_y]

for num in range(len(models)):
    res.append({'Model':model_names[num] , 'Average Score': score(models[num] , train_X , train_y)})
    
res = pd.DataFrame(res)
res.sort_values('Average Score', ascending=False)

Unnamed: 0,Model,Average Score
2,SVC,0.833877
1,Logistic Regression CV,0.828278
0,Logistic Regression,0.827155
5,Gaussian Navive,0.818172
6,Random Forest,0.806961
4,KNN,0.80472
3,Decision Tree,0.775538


###  利用「sklearn.model_selection 下的 GridSearchCV(...)」進行參數的調整
1. SVC
2. Log

In [15]:
df_gs = df[['Survived', 'Pclass', 'Age', 'family', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
            'title_Master', 'title_Miss', 'title_Mr', 'title_Mrs', 'title_Officer', 'title_Royalty', 'Singleton', 'SmallFamily', 'LargeFamily']]

In [16]:
from sklearn.model_selection import GridSearchCV

columns_X = list(set(df_gs.columns) - {'Survived'})
columns_y = ['Survived']

train_X = df_gs[columns_X]
train_y = df_gs[columns_y]

In [17]:
# SVC
svc = SVC()

# 定义超参数范围
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'sigmoid']
}

grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(train_X, train_y)

print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)


Best Parameters:  {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
Best Score:  0.8294051627384961


In [18]:
# Logistic Regression
log_reg = LogisticRegression()

# 定義超參數範圍
param_grid = {
    'penalty' : ['l1', 'l2'], # 正則化參數
    'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000], # C參數
    'solver' : ['liblinear', 'sag'] # 優化算法
}

# 創建 GridSearchCV 物件
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(train_X, train_y)

print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

Best Parameters:  {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score:  0.8260381593714928
