# BIG DATA ANALYSIS: Data 전처리
데이터셋 설명: [타이타닉](https://www.kaggle.com/c/titanic/overview)
---

## 1. 데이터셋 로딩

In [None]:
import pandas as pd

In [None]:
# Load the Titanic training data from the local CSV file into a DataFrame.
df = pd.read_csv("data/train.csv")

In [None]:
df

## 2. 무의미한 컬럼 제거 (Feature Selection)

In [None]:
# Remove the identifier and ticket-number columns; they are treated as
# carrying no useful signal for the analysis.
df = df.drop(columns=['아이디', '티켓번호'])
df

## 3. 결측값 처리

In [None]:
df.info()

In [None]:
# The cabin-number column has many missing values, so drop the whole column
# instead of trying to fill it in.
df = df.drop(columns=['객실번호'])

In [None]:
# The embarkation-port column has only 2 missing values, so drop those two
# rows rather than the column.
df = df.dropna(axis=0, subset=["승선항"])
df

In [None]:
# Report the number of missing values remaining in each listed column.
# The label keeps its trailing space so the printed text matches "<name>:  <count>".
for column in ('생존', '객실등급', '이름', '나이', '형제자매', '부모', '요금', '승선항'):
    print(f"{column}: ", df[column].isnull().sum())

In [None]:
# Option 1: fill missing ages with the mean of the whole age column.
avg = df['나이'].mean()
print(avg)
# NOTE: fillna returns a new Series that is only shown as the cell output;
# it is not assigned back, so df['나이'] still contains its missing values.
df['나이'].fillna(value=avg)

In [None]:
# Option 2: fill missing ages with the mean age of the passenger's own gender.
male_avg = df.loc[df['성별'] == "male", '나이'].mean()
female_avg = df.loc[df['성별'] == "female", '나이'].mean()
print(male_avg, female_avg)

# Fill female rows first, then male rows; rows whose gender is neither value
# are left untouched.
for gender, gender_avg in (("female", female_avg), ("male", male_avg)):
    mask = df['성별'] == gender
    df.loc[mask, '나이'] = df.loc[mask, '나이'].fillna(gender_avg)


In [None]:
df

In [None]:
df.info()

In [None]:
# Option 3 (idea): could titles such as Mr/Miss, together with the number of
# parents, siblings, etc., be used to predict age in more detail?

## 4. 정규화

In [None]:
def minmax(min_val, max_val, val):
    """Rescale ``val`` linearly so that ``min_val`` maps to 0 and ``max_val`` to 1.

    Args:
        min_val: Lower reference value of the range.
        max_val: Upper reference value of the range.
        val: Value to rescale.

    Returns:
        ``(val - min_val) / (max_val - min_val)``. No clipping is applied, so
        values outside ``[min_val, max_val]`` map outside ``[0, 1]``.

    Note:
        There is no guard for ``max_val == min_val``; the denominator is then
        zero, which raises ``ZeroDivisionError`` for plain Python numbers.
    """
    offset = val - min_val
    span = max_val - min_val
    return offset / span

In [None]:
#Test

In [None]:
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Min-max scale every numeric column of df and keep the result in a new
# DataFrame that has the same row index and column names.
scaler = MinMaxScaler()
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Keep only the columns whose dtype is one of the numeric dtypes listed above.
newdf = df.select_dtypes(include=numerics)

# Despite its name, scaled_age holds ALL selected numeric columns, not only age.
scaled_age = scaler.fit_transform(newdf)

# fit_transform returns a plain array, so re-attach the row labels and column names.
organized_df = pd.DataFrame(
    data=scaled_age,
    index=newdf.index.tolist(),
    columns=newdf.columns,
)
organized_df.head()


## 5. 인코딩

In [None]:
df

In [None]:
organized_df

In [None]:
# Label-encode gender as integer codes, numbered in order of first appearance.
# The codes are a plain array, so they are assigned by position; this relies
# on organized_df having the same rows in the same order as df.
organized_df['성별_b'] = pd.factorize(df['성별'])[0]

In [None]:
organized_df

In [None]:
from sklearn.preprocessing import OneHotEncoder


In [None]:
# One-hot encode the embarkation port.
encoder = OneHotEncoder()
# val: integer code per row; categories: the original label for each code.
val, categories = df['승선항'].factorize()
# The encoder expects a 2-D input, so turn the codes into a single column.
one_hot = encoder.fit_transform(val[:, None])
# fit_transform returns a sparse matrix; convert to a dense array for display.
one_hot.toarray()

In [None]:
categories

In [None]:
# Add one indicator column per embarkation port to organized_df.
# Column i of the one-hot matrix corresponds to factorize code i, whose label
# is categories[i]. Taking the names from `categories` (instead of hard-coding
# S, C, Q) keeps every column correctly labelled regardless of the order in
# which the ports first appear in the data, and also works when there are
# fewer or more than three ports.
# Assumes no missing ports remain (those rows were dropped earlier), so there
# is no -1 code shifting the columns.
dense_one_hot = one_hot.toarray()  # convert the sparse matrix only once
for position, category in enumerate(categories):
    organized_df[f'승선항_{category}'] = dense_one_hot[:, position]

In [None]:
organized_df