In [2]:
# 필수 라이브러리
import numpy as np
import pandas as pd

# 시각화 라이브러리
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# 머신러닝 라이브러리 - sklearn
from sklearn.model_selection import train_test_split

# 필요 라이브러리 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# 평가지표 - Accuracy
from sklearn.metrics import accuracy_score

In [6]:
# Load the training data and preview the first three rows
titan_df = pd.read_csv('./titanic/train.csv')
titan_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [10]:
### Dataset overview: column dtypes and non-null counts
print('### Data Information ### \n')
titan_df.info()

### Data Information ### 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [14]:
## Count the missing values per column before replacing them with suitable values (imputation)
titan_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [21]:
### Missing-value imputation for the training data, wrapped in a function
def imputation_na(df):
    """Fill the missing values of the Titanic columns and return the same frame.

    ``Age`` is filled with the mean of its non-missing values, ``Cabin`` and
    ``Embarked`` with the placeholder ``'N'``, and ``Fare`` with ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``Age``, ``Cabin``, ``Embarked`` and ``Fare``
        columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so the call can be chained.

    Raises
    ------
    KeyError
        If any of the four columns is absent.
    """
    # Assign each filled column back to the frame. Calling
    # ``df[col].fillna(..., inplace=True)`` acts on an intermediate Series:
    # it raises a FutureWarning on pandas 2.x and leaves ``df`` unchanged
    # once Copy-on-Write is enabled.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df


In [24]:
# Impute the training frame and preview the first three rows.
# imputation_na returns the frame it was given, so titan_df itself holds the filled values afterwards.
imputation_na(titan_df).head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,N,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,N,S


In [25]:
### Re-check the missing-value counts after imputation
titan_df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [29]:
### Convert string values to numbers (encoding)
### The target (y) is a nominal variable (0: died, 1: survived), so label encoding is acceptable here

# List the object-dtype columns, i.e. the ones that need encoding
titan_df.select_dtypes(include='object').columns

Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')

In [30]:
from sklearn.preprocessing import LabelEncoder

In [44]:
# Apply a fresh LabelEncoder to each string column in a for loop
# NOTE(review): this repeats the body of Label_Encode_ftrs on titan_df, and the
# execution counts (In [44] here, In [43] for the function) suggest the cells were
# edited out of order — confirm whether this cell is still needed.

ftrs = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
for ftr in ftrs:
    le = LabelEncoder()
    titan_df[ftr] = le.fit_transform(titan_df[ftr])

In [43]:
def Label_Encode_ftrs(df):
    """Replace the string columns of ``df`` with integer label codes.

    A new ``LabelEncoder`` is fitted independently on each of ``Name``,
    ``Sex``, ``Ticket``, ``Cabin`` and ``Embarked``; the encoded values
    overwrite the original column. The frame is modified in place and the
    same object is returned.
    """
    for column in ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']:
        # One encoder per column: the codes are derived only from the values
        # present in this frame's column.
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

In [46]:
# Run label encoding so the string columns of titan_df become numbers
# NOTE(review): as the last expression of the cell, the returned frame is displayed
# in full — consider previewing only a few rows.

Label_Encode_ftrs(titan_df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,108,1,22.000000,1,0,523,7.2500,146,3
1,2,1,1,190,0,38.000000,1,0,596,71.2833,81,0
2,3,1,3,353,0,26.000000,0,0,669,7.9250,146,3
3,4,1,1,272,0,35.000000,1,0,49,53.1000,55,3
4,5,0,3,15,1,35.000000,0,0,472,8.0500,146,3
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,548,1,27.000000,0,0,101,13.0000,146,3
887,888,1,1,303,0,19.000000,0,0,14,30.0000,30,3
888,889,0,3,413,0,29.699118,1,2,675,23.4500,146,3
889,890,1,1,81,1,26.000000,0,0,8,30.0000,60,0


In [47]:
### Remove the columns that are not used as features
def drop_ftrs(df):
    """Drop ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` from ``df``.

    The columns are removed in place and the same frame object is returned.
    A ``KeyError`` is raised if any of them is missing.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
    df.drop(columns=unused_columns, inplace=True)
    return df

In [48]:
### Chain the helper functions defined above into one preprocessing step
def preprocessing_ftrs(df):
    """Run the full feature preprocessing on ``df`` and return the result.

    The steps are applied in this order: ``imputation_na`` (fill missing
    values), ``Label_Encode_ftrs`` (encode string columns), ``drop_ftrs``
    (remove unused columns).
    """
    for step in (imputation_na, Label_Encode_ftrs, drop_ftrs):
        df = step(df)
    return df

In [51]:
### Only train.csv is used at this stage, so the held-out part serves as a validation set

## Reload the raw data, then extract
# the features (X) and
# the label (y)

titan_df = pd.read_csv('./titanic/train.csv')
y_titan_df = titan_df['Survived']
X_titan_df = titan_df.drop(['Survived'], axis=1)

In [54]:
### Feature matrix after preprocessing
# preprocessing_ftrs returns the frame it receives, so X1_titan_df and X_titan_df refer to the same object.
X1_titan_df = preprocessing_ftrs(X_titan_df)

In [76]:
### Split the data before training
## train_test_split with no explicit split size (library default) and a fixed random_state

X_train, X_val, y_train, y_val = train_test_split(X1_titan_df, y_titan_df,
                                                 random_state=11)

ML 알고리즘인 결정 트리, 랜덤 포레스트, 로지스틱 회귀를 이용해 타이타닉 생존자를 예측해보자.
이 알고리즘들에 대한 상세 설명은 보강 시 다루겠다. (로지스틱 회귀는 이름은 회귀지만 매우 강력한 분류 알고리즘이다.) 아쉽지만 현재는 사이킷런 기반의 머신러닝 코드에 익숙해지는 데 집중해보자. 사이킷런은 결정 트리를 위해 DecisionTreeClassifier, 랜덤 포레스트를 위해 RandomForestClassifier, 로지스틱 회귀를 위해 LogisticRegression 클래스를 제공한다.

In [77]:
# 필요 라이브러리 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

### Instantiate the three candidate classifiers, each with the same random_state
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression(random_state=11)

In [79]:
# Fit every candidate on the training split and score it on the validation split,
# so the algorithm with the highest accuracy can be chosen

## Decision tree: fit, predict, score
dt_clf.fit(X_train, y_train)
pred_dt = dt_clf.predict(X_val)
accuracy_dt = accuracy_score(y_val,pred_dt)

print('dt_clf의 정확도:', np.round(accuracy_dt,4))

## Random forest: fit, predict, score
rf_clf.fit(X_train,y_train)
pred_rf = rf_clf.predict(X_val)
accuracy_rf = accuracy_score(y_val,pred_rf)

print('rf_clf의 정확도:', np.round(accuracy_rf,4))

## Logistic regression: fit, predict, score
lr_clf.fit(X_train,y_train)
pred_lr = lr_clf.predict(X_val)
accuracy_lr = accuracy_score(y_val,pred_lr)

print('lr_clf의 정확도:', np.round(accuracy_lr,4))

dt_clf의 정확도: 0.8027
rf_clf의 정확도: 0.8296
lr_clf의 정확도: 0.8475


In [81]:
# 하이퍼파라미터 튜닝했다고 가정하겠습니다. GridSearchCV라는 것을 씀
# Kfold 했다라고 가정하겠습니다.

In [92]:
# Load the test set and run it through the same preprocessing function.
# NOTE(review): preprocessing_ftrs creates new LabelEncoder objects and recomputes the
# Age mean on whatever frame it receives, so the codes and fill value used here are
# derived from test.csv rather than from the training data — confirm that the encoded
# categories (e.g. Embarked) line up with the ones the model was trained on.
X_test_all = pd.read_csv('./titanic/test.csv')
X1_test_all = preprocessing_ftrs(X_test_all)

In [115]:
# Scratch check: repeating a copy of a list builds a new, longer list and leaves `a` untouched.
a = [1, 2, 3]
b = list(a) * 3

In [85]:
X_train_all = X1_titan_df.copy() ## copy of all preprocessed training features (X)
y_train_all = y_titan_df.copy() ## copy of all training labels (y)

In [94]:
import warnings
# NOTE(review): this suppresses every warning raised from here on, not only the ones
# produced by the fit below — consider a narrower filter.
warnings.filterwarnings('ignore')

# Refit on the full training data, then predict the preprocessed test set

lr_clf.fit(X_train_all, y_train_all)
submit_pred = lr_clf.predict(X1_test_all)

In [99]:
# Load the sample submission file
submission_df = pd.read_csv('./titanic/gender_submission.csv')
submission_df.head()

# Store the model's predictions next to the file's own Survived column
submission_df['y_pred'] = submit_pred
submission_df.head(3)

## Share of rows where the model's prediction equals the file's Survived column
## NOTE(review): the original comment described this as accuracy against the actual test
## labels (y_test_all); gender_submission.csv is presumably Kaggle's gender-based sample
## submission rather than ground truth — confirm before reporting this as test accuracy.
accuracy_score(submission_df['Survived'],submission_df['y_pred'])

0.930622009569378

In [101]:
# Write the submission file.
# The previous cell keeps the sample file's own Survived column and stores the model's
# predictions in y_pred. Writing the frame as-is would therefore submit the sample
# values under Survived and add an unnamed index column. Replace Survived with the
# predictions and leave the index out.
# NOTE(review): assumes the remaining column(s) of gender_submission.csv are the
# passenger identifier expected by the competition — confirm.
(
    submission_df
    .drop(columns='Survived')
    .rename(columns={'y_pred': 'Survived'})
    .to_csv('./220608.csv', index=False)
)

In [102]:
# end of file