## 타이타닉 생존자 예측

In [2]:
import numpy as np 
import pandas as pd 

In [3]:
# Load the Titanic training set and preview its first rows.
titanic_df = pd.read_csv('../00.data/titanic/train.csv')
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Summarize dtypes and non-null counts per column to spot missing values.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### 결손치(NaN/Null) 처리

In [5]:
# Replace missing ages with the mean age. The result is assigned back to the
# column rather than calling fillna(inplace=True) on the selected Series:
# that chained in-place form acts on an intermediate object, triggers a
# FutureWarning in recent pandas 2.x releases, and leaves titanic_df
# unchanged once Copy-on-Write is enabled.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())
titanic_df['Age']

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64

In [6]:
titanic_df['Embarked'].value_counts()       # value_counts() lists each distinct value together with how often it occurs

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [7]:
# Mark missing embarkation ports with the placeholder 'N'. The filled column
# is assigned back instead of using fillna(inplace=True) on the selected
# Series, because the chained in-place form does not update titanic_df under
# pandas Copy-on-Write (and warns in recent pandas 2.x releases).
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('N')

### 불필요한 데이터 제거

In [16]:
# Remove the columns that are not used as features, in place.
# (A single column could also be removed with `del titanic_df['PassengerId']`.)
titanic_df.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin'],
    inplace=True,
)
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


### 문자열 처리

In [9]:
from sklearn.preprocessing import LabelEncoder
# Encode the string columns as integer labels. The single encoder is refit on
# each column in turn, so afterwards `le` only holds the 'Embarked' fit.
le = LabelEncoder()
titanic_df['Sex'] = le.fit_transform(titanic_df['Sex'])
titanic_df['Embarked'] = le.fit_transform(titanic_df['Embarked'])
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,3
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,3
3,1,1,0,35.0,1,0,53.1,3
4,0,3,1,35.0,0,0,8.05,3


### Survived 속성을 y로, 나머지 피처를 X로 만듦


In [20]:
# Target is the Survived column; the feature columns are selected by name.
y = titanic_df['Survived']
X = titanic_df[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,3,1,35.0,0,0,8.05,2


In [21]:
# Alternative: select the features by position, i.e. every column after the
# first one (Survived is the first column at this point).
X1 = titanic_df.iloc[:, 1:]
X1.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,3,1,35.0,0,0,8.05,2


In [22]:
# Alternative: build the feature matrix by dropping the target column.
X2 = titanic_df.drop(columns=['Survived'])
X2.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,3,1,35.0,0,0,8.05,2


## 데이터 전처리에 필요한 함수

In [24]:
def proc_nan(df):
    """Fill the missing values in the 'Age' and 'Embarked' columns of ``df``.

    Missing ages are replaced by the mean of the non-missing ages, and
    missing embarkation codes by the placeholder string 'N'.

    The filled columns are assigned back to ``df`` instead of calling
    ``fillna(inplace=True)`` on ``df[col]``. That chained in-place form
    operates on an intermediate Series: recent pandas 2.x releases warn
    about it, and under Copy-on-Write it leaves ``df`` unchanged.

    Args:
        df: DataFrame with 'Age' and 'Embarked' columns. Modified in place.

    Returns:
        The same DataFrame object, with both columns filled.
    """
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Embarked'] = df['Embarked'].fillna('N')
    return df

def drop_features(df):
    """Drop the columns that are not used as features from ``df`` in place.

    Removes 'PassengerId', 'Name', 'Ticket' and 'Cabin'. All four columns
    must be present; pandas raises ``KeyError`` otherwise.

    Args:
        df: DataFrame to prune. Modified in place.

    Returns:
        The same DataFrame object, without the four columns.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
    df.drop(columns=unused_columns, inplace=True)
    return df

def transform_feature(df):
    """Replace the string columns 'Sex' and 'Embarked' with integer labels.

    Each column is label-encoded independently, with the encoder fitted on
    the values present in ``df`` itself, so the integer assigned to a
    category depends on which categories occur in that frame.

    Args:
        df: DataFrame with 'Sex' and 'Embarked' columns. Modified in place.

    Returns:
        The same DataFrame object, with both columns encoded.
    """
    for column in ('Sex', 'Embarked'):
        # A fresh encoder per column; nothing is reused between columns.
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

def pre_process(df):
    """Run the preprocessing steps on ``df`` in order and return the result.

    The steps are: fill missing values (``proc_nan``), drop unused columns
    (``drop_features``), then encode the string columns
    (``transform_feature``).

    Args:
        df: Raw feature DataFrame; passed through each step in turn.

    Returns:
        The DataFrame returned by the last step.
    """
    for step in (proc_nan, drop_features, transform_feature):
        df = step(df)
    return df

# 실습

- 타이타닉 생존자 예측

In [25]:
# Reload the raw Titanic training data, split off the target column, and
# preprocess the remaining columns into the feature matrix.
titanic_df = pd.read_csv('../00.data/titanic/train.csv')
y = titanic_df['Survived']
X = pre_process(titanic_df.drop(columns=['Survived']))

### 학습/테스트 데이터 세트 분리

In [26]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state is fixed at 2021.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2021
)

### 적용할 알고리즘 선정:

In [27]:
# Decision tree model
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(random_state=2021)

In [28]:
# Hyperparameter grid: max_depth, min_samples_leaf and min_samples_split
parameters = {
    'max_depth':[2,5,8,11], 
    'min_samples_leaf':[2,4,6],
    'min_samples_split':[2,4,6]
}

In [30]:
# Train with GridSearchCV
from sklearn.model_selection import GridSearchCV

grid_dtree = GridSearchCV(
    estimator=dt_clf,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,       # refit=True may be omitted
)
grid_dtree.fit(X_train, y_train)        # this call runs the training

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 5, 8, 11],
                         'min_samples_leaf': [2, 4, 6],
                         'min_samples_split': [2, 4, 6]},
             scoring='accuracy')

In [31]:
# Best hyperparameters and the best cross-validated accuracy
grid_dtree.best_params_, grid_dtree.best_score_

({'max_depth': 8, 'min_samples_leaf': 4, 'min_samples_split': 2},
 0.8454742440657934)

In [34]:
# Fine-tune the hyperparameters with a narrower grid
parameters = dict(
    max_depth=[7, 8, 9],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[2, 3],
)
grid_dtree = GridSearchCV(
    estimator=dt_clf,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
grid_dtree.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [7, 8, 9], 'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [2, 3]},
             scoring='accuracy')

In [35]:
# Best hyperparameters and the best cross-validated accuracy
grid_dtree.best_params_, grid_dtree.best_score_

({'max_depth': 8, 'min_samples_leaf': 4, 'min_samples_split': 2},
 0.8454742440657934)

In [36]:
# Predict on the test data with the best estimator found by the grid search
# (the accuracy itself is computed in the next cell).
estimator = grid_dtree.best_estimator_
pred = estimator.predict(X_test)

In [37]:
# Evaluation: accuracy of the predictions against the test labels
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.7430167597765364