# ch8_7 전처리 과정 함수화
이전 챕터에서 스텝 바이 스텝으로 train 데이터 프레임 전처리 과정을 살펴보았습니다. 그런데 우리가 전처리해야 할 데이터 프레임에는 test도 있습니다. 그렇다면 전처리 과정을 함수로 예쁘게 코딩해 놓으면 편리하겠죠? 이번 챕터에서는 전처리 과정을 함수화하고, train과 test 데이터셋을 전처리하여 파일로 저장해보겠습니다.

In [11]:
import pandas as pd
import numpy as np

In [12]:
def drop_unusing_columns(df):
    """Return a new DataFrame without the Name, Ticket and Cabin columns.

    Args:
        df: DataFrame that contains the "Name", "Ticket" and "Cabin" columns.

    Returns:
        A new DataFrame with those three columns removed; ``df`` itself is
        not modified.
    """
    unused_columns = ["Name", "Ticket", "Cabin"]
    return df.drop(columns=unused_columns)

In [13]:
def add_drived_variables(df):
    """Add the derived columns FamilySize and IsAlone.

    FamilySize is ``SibSp + Parch``. IsAlone is 0 when FamilySize is at
    least 1 and 1 otherwise (a missing FamilySize therefore counts as alone).

    Args:
        df: DataFrame with numeric "SibSp" and "Parch" columns.

    Returns:
        A new DataFrame with the two columns added. The input frame is left
        untouched, so calling this repeatedly on the same raw frame is safe.
    """
    # Copy first: assigning columns below would otherwise modify the
    # caller's DataFrame as a hidden side effect.
    df = df.copy()
    df["FamilySize"] = df["SibSp"] + df["Parch"]
    # Negating ">= 1" (rather than testing "< 1") keeps IsAlone at 1 for
    # rows whose FamilySize is NaN.
    df["IsAlone"] = (~(df["FamilySize"] >= 1)).astype(int)
    return df

In [14]:
# Age used to fill a missing "Age", keyed by (Sex, Pclass).
# NOTE(review): presumably the per-group mean ages of the training data
# computed in an earlier chapter - confirm.
mean_age_dict = {
    ('female', 1): 34.61176470588235,
    ('female', 2): 28.722972972972972,
    ('female', 3): 21.75,
    ('male', 1): 41.28138613861386,
    ('male', 2): 30.74070707070707,
    ('male', 3): 26.507588932806325
}


def fill_missing_values(df):
    """Fill missing Age, Embarked and Fare values.

    * Age: replaced by the ``mean_age_dict`` entry for the row's
      (Sex, Pclass) pair.
    * Embarked: replaced by "S".
    * Fare: replaced by 0 (the test data set has a missing fare).

    Row order, index and columns are preserved.

    Args:
        df: DataFrame with "Sex", "Pclass", "Age", "Embarked" and "Fare"
            columns.

    Returns:
        A new DataFrame with the missing values filled; ``df`` itself is not
        modified.

    Raises:
        KeyError: if a row with a missing Age has a (Sex, Pclass) pair that
            is not in ``mean_age_dict``.
    """
    df = df.copy()

    # Look the fill value up per row instead of using groupby().apply():
    # this keeps the original row order and does not rely on the grouping
    # columns being passed to the applied function.
    missing_age = df["Age"].isna()
    if missing_age.any():
        group_keys = list(
            zip(df.loc[missing_age, "Sex"], df.loc[missing_age, "Pclass"])
        )
        unknown_keys = {key for key in group_keys if key not in mean_age_dict}
        if unknown_keys:
            raise KeyError(
                "no mean age available for (Sex, Pclass) groups: "
                f"{sorted(unknown_keys, key=str)}"
            )
        df.loc[missing_age, "Age"] = [mean_age_dict[key] for key in group_keys]

    df["Embarked"] = df["Embarked"].fillna("S")
    # The test data set contains a missing Fare value.
    df["Fare"] = df["Fare"].fillna(0)
    return df

In [15]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

def label_encode(df):
    """Replace the Embarked and Sex categories with fixed integer codes.

    The codes are the ones a label encoder fitted on the full category set
    produces (categories in sorted order): Embarked C/Q/S -> 0/1/2 and
    Sex female/male -> 0/1. Using fixed tables instead of re-fitting an
    encoder on every frame guarantees that train and test data receive the
    same code for the same category even when one of them lacks a category.

    Args:
        df: DataFrame with "Embarked" and "Sex" columns that contain no
            missing values.

    Returns:
        A new DataFrame with both columns converted to integers; ``df``
        itself is not modified.

    Raises:
        ValueError: if a column holds a value that is not in its code table
            (including a missing value).
    """
    code_tables = {
        "Embarked": {"C": 0, "Q": 1, "S": 2},
        "Sex": {"female": 0, "male": 1},
    }

    df = df.copy()
    for column, codes in code_tables.items():
        encoded = df[column].map(codes)
        # map() yields NaN for anything outside the table; fail loudly
        # rather than letting NaN codes flow into later steps.
        unmapped = encoded.isna()
        if unmapped.any():
            unexpected = sorted(df.loc[unmapped, column].astype(str).unique())
            raise ValueError(
                f"unexpected values in column {column!r}: {unexpected}"
            )
        df[column] = encoded.astype(int)
    return df

In [16]:
def one_hot_encode(df):
    """One-hot encode the Pclass and Embarked columns.

    The two source columns are removed and replaced by indicator columns
    named ``Pclass_<value>`` and ``Embarked_<value>``, appended after the
    remaining columns (Pclass indicators first).

    The indicator dtype is pinned to uint8 so the result holds 0/1 integers
    regardless of the pandas version (newer versions default to booleans).

    NOTE: an indicator column is created only for values that occur in
    ``df``; a frame lacking a category will lack the matching column.

    Args:
        df: DataFrame with "Pclass" and "Embarked" columns.

    Returns:
        A new DataFrame with the indicator columns; ``df`` itself is not
        modified.
    """
    encoded_columns = ["Pclass", "Embarked"]
    return pd.get_dummies(
        df,
        columns=encoded_columns,
        prefix=encoded_columns,
        dtype="uint8",
    )

In [17]:
import pickle

# Load the scalers that were saved to disk beforehand so that scale_values
# can apply them without fitting anything in this notebook.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles that come from a trusted source.
with open("./data/standard_scaler.pkl", "rb") as fr:
    standard_scaler = pickle.load(fr)
with open("./data/minmax_scaler.pkl", "rb") as fr:
    minmax_scaler = pickle.load(fr)
    
def scale_values(df):
    """Scale the numeric feature columns with the pre-loaded scalers.

    Steps:
      1. Fare is replaced by ``log1p(Fare)``.
      2. Age and the log-transformed Fare are passed through the module-level
         ``standard_scaler``.
      3. SibSp, Parch and FamilySize are passed through the module-level
         ``minmax_scaler``.

    Only ``transform`` is called; the scalers are never re-fitted here.

    Args:
        df: DataFrame with "Age", "Fare", "SibSp", "Parch" and "FamilySize"
            columns.

    Returns:
        A new DataFrame with the scaled columns. The input frame is left
        untouched, so calling this twice on the same frame does not apply
        the log transform and scaling twice.
    """
    standard_columns = ["Age", "Fare"]
    minmax_columns = ["SibSp", "Parch", "FamilySize"]

    # Copy first: the column assignments below would otherwise overwrite
    # the caller's data in place.
    df = df.copy()
    df["Fare"] = np.log1p(df["Fare"])
    df[standard_columns] = standard_scaler.transform(df[standard_columns])
    df[minmax_columns] = minmax_scaler.transform(df[minmax_columns])
    return df

In [18]:
def preprocess(df):
    """Run the full preprocessing pipeline on a raw Titanic DataFrame.

    The steps run in a fixed order, each one receiving the previous step's
    result: drop unused columns, add derived variables, fill missing values,
    label-encode, one-hot encode, and finally scale the numeric columns.

    Args:
        df: Raw DataFrame as read from the train or test CSV file.

    Returns:
        The DataFrame returned by the last step.
    """
    pipeline = (
        drop_unusing_columns,
        add_drived_variables,
        fill_missing_values,
        label_encode,
        one_hot_encode,
        scale_values,
    )
    for step in pipeline:
        df = step(df)
    return df

In [19]:
# Read the raw train and test data sets from CSV files.
train = pd.read_csv("./data/titanic_train.csv")
test = pd.read_csv("./data/titanic_test.csv")

In [20]:
# Preview the first rows of the raw train data.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [21]:
# Preview the first rows of the raw test data (it has no Survived column).
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [22]:
# Apply the same preprocessing pipeline to both data sets.
preprocessed_train = preprocess(train)
preprocessed_test = preprocess(test)

In [23]:
# Display the preprocessed train data.
preprocessed_train

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Pclass_1,Pclass_2,Pclass_3,Embarked_0,Embarked_1,Embarked_2
0,1,0,1,-0.551366,0.125,0.000000,-0.879741,0.1,0,0,0,1,0,0,1
1,2,1,0,0.654030,0.125,0.000000,1.361220,0.1,0,1,0,0,1,0,0
2,3,1,0,-0.250017,0.000,0.000000,-0.798540,0.0,1,0,0,1,0,0,1
3,4,1,0,0.428018,0.125,0.000000,1.062038,0.1,0,1,0,0,0,0,1
4,5,0,1,0.428018,0.000,0.000000,-0.784179,0.0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,1,-0.174680,0.000,0.000000,-0.333698,0.0,1,0,1,0,0,0,1
887,888,1,0,-0.777378,0.000,0.000000,0.487082,0.0,1,1,0,0,0,0,1
888,889,0,0,-0.570201,0.125,0.333333,0.242007,0.3,0,0,0,1,0,0,1
889,890,1,1,-0.250017,0.000,0.000000,0.487082,0.0,1,1,0,0,1,0,0


In [24]:
# Display the preprocessed test data.
preprocessed_test

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Pclass_1,Pclass_2,Pclass_3,Embarked_0,Embarked_1,Embarked_2
0,892,1,0.390349,0.000,0.000000,-0.809683,0.0,1,0,0,1,0,1,0
1,893,0,1.332065,0.125,0.000000,-0.911513,0.1,0,0,0,1,0,0,1
2,894,1,2.462123,0.000,0.000000,-0.612461,0.0,1,0,1,0,0,1,0
3,895,1,-0.174680,0.000,0.000000,-0.716562,0.0,1,0,0,1,0,0,1
4,896,0,-0.551366,0.125,0.166667,-0.387631,0.2,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,1,-0.211777,0.000,0.000000,-0.784179,0.0,1,0,0,1,0,0,1
414,1306,0,0.729367,0.000,0.000000,1.793823,0.0,1,1,0,0,1,0,0
415,1307,1,0.691698,0.000,0.000000,-0.879741,0.0,1,0,0,1,0,0,1
416,1308,1,-0.211777,0.000,0.000000,-0.784179,0.0,1,0,0,1,0,0,1


In [25]:
# Save both preprocessed data sets as CSV files; index=False leaves the
# DataFrame index out of the files.
preprocessed_train.to_csv("./data/titanic_preprocessed_train.csv", index=False)
preprocessed_test.to_csv("./data/titanic_preprocessed_test.csv", index=False)

## 정리
이번 챕터에서는 전처리 과정을 함수화했습니다. 그 과정에서 편의성을 위해 미리 파일로 저장해 두었던 scaler들을 pickle을 이용해서 불러오는 방법을 익혀보았습니다. 다음 챕터에서는 본격적으로 전처리한 데이터를 가지고 상관관계도 분석해보고, 예측 모델도 만들어보겠습니다.