# ch8_7 전처리 과정 함수화
이전 챕터에서는 train 데이터프레임의 전처리 과정을 단계별로 살펴보았습니다. 그런데 전처리해야 할 데이터프레임에는 test도 있습니다. 따라서 전처리 과정을 함수로 깔끔하게 정리해 두면 train과 test에 같은 처리를 그대로 재사용할 수 있어 편리합니다. 이번 챕터에서는 전처리 과정을 함수화하고, train과 test 데이터셋을 전처리하여 파일로 저장해 보겠습니다.

In [45]:
import pandas as pd
import numpy as np

In [5]:
# Load the raw train and test splits from the local ./data directory.
train_df = pd.read_csv("./data/titanic_train.csv")
test_df = pd.read_csv("./data/titanic_test.csv")

In [49]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [50]:
# Fresh, unfitted scaler instances.
# NOTE(review): both names are rebound further down to the scalers unpickled
# from ./data, so in the cells visible here these instances are never fitted.
standard_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

In [12]:
def drop_unusing_columns(df):
    """Return a new frame without the ``Name``, ``Ticket`` and ``Cabin`` columns.

    The input frame is not modified. pandas raises ``KeyError`` if any of the
    three columns is absent.
    """
    unused_columns = ["Name", "Ticket", "Cabin"]
    return df.drop(columns=unused_columns)

In [15]:
def add_derived_columns(df):
    """Return a copy of ``df`` with ``FamilySize`` and ``IsAlone`` columns added.

    ``FamilySize`` is ``SibSp + Parch``. ``IsAlone`` is 0 for rows whose
    ``FamilySize`` is at least 1 and 1 for every other row.

    The caller's frame is left untouched, so the function can be called
    repeatedly on the same input without side effects.
    """
    df = df.copy()  # never write the derived columns into the caller's frame
    df["FamilySize"] = df["SibSp"] + df["Parch"]
    # Negating ">= 1" (rather than testing "< 1") keeps the original rule that
    # any row not matching ">= 1" is flagged as alone.
    df["IsAlone"] = (~df["FamilySize"].ge(1)).astype(int)
    return df

In [21]:
# Age fill values keyed by (Sex, Pclass).
# NOTE(review): presumably the per-group mean ages of the training set
# computed in an earlier chapter -- confirm before reusing on other data.
mean_age_dict = {
    ('female', 1): 34.61176470588235,
    ('female', 2): 28.722972972972972,
    ('female', 3): 21.75,
    ('male', 1): 41.28138613861386,
    ('male', 2): 30.74070707070707,
    ('male', 3): 26.507588932806325,
}


def fill_missing_values(df):
    """Return a copy of ``df`` with missing Age, Embarked and Fare filled in.

    * ``Age``: each missing value is replaced by the ``mean_age_dict`` entry
      for that row's ``(Sex, Pclass)`` pair.
    * ``Embarked``: missing values become ``"S"``.
    * ``Fare``: missing values become ``0``.

    The row order and index of the input are preserved and the input frame
    is not modified.

    Raises:
        KeyError: if a row with a missing ``Age`` has a ``(Sex, Pclass)``
            pair that is not a key of ``mean_age_dict``.
    """
    df = df.copy()

    # Direct per-row lookup instead of groupby(...).apply(...): the result of
    # apply depends on the pandas version (group keys prepended to the index,
    # rows reordered by group, grouping columns withheld from the callback),
    # whereas this keeps the original index and order everywhere.
    age_missing = df["Age"].isna()
    if age_missing.any():
        group_keys = zip(df.loc[age_missing, "Sex"], df.loc[age_missing, "Pclass"])
        df.loc[age_missing, "Age"] = [mean_age_dict[key] for key in group_keys]

    df["Embarked"] = df["Embarked"].fillna("S")
    df["Fare"] = df["Fare"].fillna(0)
    return df

In [26]:
from sklearn.preprocessing import LabelEncoder

# Fit once on the known Sex categories so that every frame (train, test, ...)
# receives the same codes: female -> 0, male -> 1. Refitting per frame would
# derive the codes from whatever values happen to be present in that frame.
label_encoder = LabelEncoder()
label_encoder.fit(["female", "male"])


def label_encode(df):
    """Return a copy of ``df`` whose ``Sex`` column is integer-encoded.

    The encoding comes from the module-level ``label_encoder``, which is
    fitted once on ``["female", "male"]``; the input frame is not modified.

    Raises:
        ValueError: raised by ``LabelEncoder.transform`` when ``Sex`` contains
            a value the encoder was not fitted on.
    """
    return df.assign(Sex=label_encoder.transform(df["Sex"]))

In [35]:
def onehot_encode(df):
    """Replace ``Embarked`` and ``Pclass`` with one-hot indicator columns.

    The indicator columns are appended on the right, Embarked first and then
    Pclass, each named ``<column>_<value>``. The two source columns are
    removed from the result; the input frame itself is not modified.
    """
    categorical = ["Embarked", "Pclass"]
    indicator_frames = [pd.get_dummies(df[col], prefix=col) for col in categorical]
    widened = pd.concat([df, *indicator_frames], axis=1)
    return widened.drop(columns=categorical)

In [70]:
import pickle

# Load the previously saved scalers from disk. This rebinds the module-level
# names `standard_scaler` and `minmax_scaler`, which scale_numercial_values()
# reads when it runs.
# NOTE(review): presumably these were fitted on the training data in an
# earlier chapter -- confirm.
# NOTE: unpickling can execute arbitrary code; only load pickle files you
# created yourself.
with open("./data/standard_scaler.pkl", "rb") as fr:
    standard_scaler = pickle.load(fr)
    
with open("./data/minmax_scaler.pkl", "rb") as fr:
    minmax_scaler = pickle.load(fr)

In [73]:
def scale_numercial_values(df):
    """Return a copy of ``df`` with its numeric feature columns scaled.

    Steps, in order:

    1. ``Fare`` is replaced by ``log1p(Fare)``.
    2. ``Age`` and the log-transformed ``Fare`` go through the module-level
       ``standard_scaler``.
    3. ``SibSp``, ``Parch`` and ``FamilySize`` go through the module-level
       ``minmax_scaler``.

    Only ``transform`` is called, so both scalers must already be fitted.
    The input frame is not modified; without the copy, calling this twice on
    the same frame would apply ``log1p`` and the scalers twice.
    """
    df = df.copy()
    df["Fare"] = np.log1p(df["Fare"])

    standardized = ["Age", "Fare"]
    df[standardized] = standard_scaler.transform(df[standardized])

    range_scaled = ["SibSp", "Parch", "FamilySize"]
    df[range_scaled] = minmax_scaler.transform(df[range_scaled])
    return df

In [74]:
def preprocess(df):
    """Run the full preprocessing pipeline on ``df`` and return the result.

    Each step receives the frame returned by the previous one. The order is
    significant: derived columns are added before scaling because the
    scaling step reads ``FamilySize``.
    """
    pipeline = (
        drop_unusing_columns,
        add_derived_columns,
        fill_missing_values,
        label_encode,
        onehot_encode,
        scale_numercial_values,
    )
    for step in pipeline:
        df = step(df)
    return df

In [75]:
# Preview the preprocessed training frame; the return value is not stored.
preprocess(train_df)

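FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)

To adopt the future behavior and silence this warning, use

	>>> .groupby(..., group_keys=True)
  df = df.groupby(["Sex", "Pclass"]).apply(_fill_group)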


Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,1,0,1,-0.551366,0.125,0.000000,-0.879741,0.1,0,0,0,1,0,0,1
1,2,1,0,0.654030,0.125,0.000000,1.361220,0.1,0,1,0,0,1,0,0
2,3,1,0,-0.250017,0.000,0.000000,-0.798540,0.0,1,0,0,1,0,0,1
3,4,1,0,0.428018,0.125,0.000000,1.062038,0.1,0,0,0,1,1,0,0
4,5,0,1,0.428018,0.000,0.000000,-0.784179,0.0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,1,-0.174680,0.000,0.000000,-0.333698,0.0,1,0,0,1,0,1,0
887,888,1,0,-0.777378,0.000,0.000000,0.487082,0.0,1,0,0,1,1,0,0
888,889,0,0,-0.570201,0.125,0.333333,0.242007,0.3,0,0,0,1,0,0,1
889,890,1,1,-0.250017,0.000,0.000000,0.487082,0.0,1,1,0,0,1,0,0


In [76]:
# Run the same pipeline on both splits and keep the results for saving.
preprocessed_train_df = preprocess(train_df)
preprocessed_test_df = preprocess(test_df)

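FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)

To adopt the future behavior and silence this warning, use

	>>> .groupby(..., group_keys=True)
  df = df.groupby(["Sex", "Pclass"]).apply(_fill_group)
FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)

To adopt the future behavior and silence this warning, use

	>>> .groupby(..., group_keys=True)
  df = df.groupby(["Sex", "Pclass"]).apply(_fill_group)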


In [78]:
# Persist both preprocessed frames as CSV; index=False leaves the row index
# out of the written files.
preprocessed_train_df.to_csv("./data/titanic_train_preprocessed.csv", index=False)
preprocessed_test_df.to_csv("./data/titanic_test_preprocessed.csv", index=False)

## 정리
이번 챕터에서는 전처리 과정을 함수화했습니다. 그 과정에서, 편의를 위해 미리 파일로 저장해 두었던 scaler들을 pickle로 불러오는 방법도 익혀 보았습니다. 다음 챕터에서는 전처리한 데이터로 본격적으로 상관관계를 분석하고 예측 모델도 만들어 보겠습니다.