# BIG DATA ANALYTICS: 전처리 
- 데이터 전처리 중 범주형 데이터 인코딩을 연습해 보겠습니다.
---

## 1. 데이터 로딩

In [None]:
import pandas as pd

In [None]:
import os

# Names assigned to the columns of adult.data, in file order.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

# Columns kept for the encoding exercise.
kept_columns = [
    'age', 'workclass', 'education', 'gender', 'hours-per-week',
    'occupation', 'income',
]

# header=None together with explicit names: the first row is read as data.
# skipinitialspace=True drops the blanks that follow each delimiter.
data = pd.read_csv(
    "adult.data",
    names=column_names,
    header=None,
    index_col=False,
    skipinitialspace=True,
)
data = data[kept_columns]

display(data.head())

## 2. 요약통계

In [None]:
# Print the summary report of the loaded DataFrame.
data.info()

In [None]:
# For every object-dtype column, print its name, the frequency of each
# distinct value, and a separator line.
for column in data.columns:
    series = data[column]
    if series.dtype != "object":
        continue  # skip non-object columns
    print("Feature:{}".format(column))
    print(series.value_counts())
    print("======"*5)

In [None]:
# Show descriptive statistics for the DataFrame.
data.describe()

## One-hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder


In [None]:
# Create a one-hot encoder configured with handle_unknown='ignore'.
enc = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Fit the encoder on the workclass column (reshaped to a single-column 2-D
# array) and keep the encoded result.
encoded_feature = enc.fit_transform(data.workclass.values.reshape(-1, 1))

In [None]:
# Show the first ten raw workclass labels, then the matching ten encoded rows
# as a dense array.
print(data['workclass'].values[:10])
encoded_feature[:10].toarray()

## Ordinal Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder


### 학력 수준은 순서가 있지 않을까?

In [None]:

# List the distinct values found in the education column.
data.education.unique()

In [None]:
# Education levels from lowest to highest. The sequence follows the ranking
# of the Adult dataset's own `education-num` column: Some-college comes
# before the associate degrees, and Prof-school ranks between Masters and
# Doctorate rather than directly after HS-grad.
order = ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th',
         'HS-grad', 'Some-college', 'Assoc-voc', 'Assoc-acdm', 'Bachelors',
         'Masters', 'Prof-school', 'Doctorate']

In [None]:
# Create a stock scikit-learn LabelEncoder.
enc = LabelEncoder()

In [None]:
# Fit the stock encoder on the education levels and show the resulting codes.
# NOTE(review): the stock LabelEncoder is expected to number classes in sorted
# order rather than list order -- compare with enc.classes_ in the next cell.
enc.fit_transform(order)

In [None]:
# Inspect the classes the encoder learned, in the order it stores them.
enc.classes_

In [None]:
## 순서를 지정해서 인코딩 할 수 있도록 나만의 LabelEncoder 만들기
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import column_or_1d

class MyLabelEncoder(LabelEncoder):
    """LabelEncoder variant whose ``classes_`` keep first-seen order.

    Only ``fit`` is overridden; every other method is inherited from
    ``LabelEncoder``. The intent is that the inherited ``transform`` maps each
    label to its position in the caller-supplied order (verify against the
    installed scikit-learn version).
    """

    def fit(self, y):
        """Record the distinct values of *y* in order of first appearance.

        Args:
            y: Sequence of labels; passed through ``column_or_1d`` with
                ``warn=True`` before the distinct values are collected.

        Returns:
            The encoder itself.
        """
        labels = column_or_1d(y, warn=True)
        # Series.unique() keeps the order in which values first occur.
        first_seen = pd.Series(labels).unique()
        self.classes_ = first_seen
        return self

In [None]:
# Fit the order-preserving encoder on the education levels (fit returns the
# encoder itself) and show the code assigned to each level.
enc = MyLabelEncoder().fit(order)
enc.transform(order)

In [None]:
# Encode the education column with the fitted encoder and show the first ten
# codes.
enc.transform(data.education)[:10]

In [None]:
# Show the first ten raw education labels for comparison with the codes.
data.education[:10]

## 원본데이터에 인코딩 된 데이터 적용

In [None]:
# One-hot encode the workclass column and wrap the dense result in a
# DataFrame whose columns are named workclass0, workclass1, ...
enc = OneHotEncoder(handle_unknown='ignore')
workclass_2d = data['workclass'].values.reshape(-1, 1)
encoded_feature = enc.fit_transform(workclass_2d)
workclass_one_hot = encoded_feature.toarray()

# One generated name per encoded column.
n_workclass = workclass_one_hot.shape[1]
columns = ["workclass" + str(idx) for idx in range(n_workclass)]

work_class_df = pd.DataFrame(data=workclass_one_hot, columns=columns)
work_class_df

In [None]:
# Merge column-wise: append the one-hot workclass columns to the original data.
new_data = pd.concat([data, work_class_df], axis="columns")
new_data

In [None]:
# Merge the occupation feature as well: one-hot encode it, name the columns
# occupation0, occupation1, ..., and append them to new_data.
enc = OneHotEncoder(handle_unknown='ignore')
occupation_2d = data['occupation'].values.reshape(-1, 1)
encoded_feature = enc.fit_transform(occupation_2d)
occupation_one_hot = encoded_feature.toarray()

# One generated name per encoded column.
n_occupation = occupation_one_hot.shape[1]
columns = ["occupation" + str(idx) for idx in range(n_occupation)]

occupation_df = pd.DataFrame(data=occupation_one_hot, columns=columns)
new_data = pd.concat([new_data, occupation_df], axis=1)
new_data

In [None]:
# Simple cases are just hard-coded: each binary column gets a fixed
# label -> integer table, applied in place and then cast to int.
binary_codes = {
    'gender': {'Male': 0, 'Female': 1},
    'income': {'<=50K': 0, '>50K': 1},
}
for col, codes in binary_codes.items():
    for label, code in codes.items():
        # Overwrite only the rows that currently hold this label.
        new_data.loc[new_data[col] == label, col] = code
    new_data[col] = new_data[col].astype(int)
new_data

In [None]:
# Education level: replace the existing labels with their ordinal codes.
education_enc = MyLabelEncoder().fit(order)  # fit returns the encoder itself
education_codes = education_enc.transform(new_data['education'])
new_data['education'] = education_codes

In [None]:
# Show new_data after the education column has been replaced.
new_data

In [None]:
# Remove the raw workclass/occupation columns; their one-hot versions stay.
raw_categoricals = ['workclass', 'occupation']
new_data = new_data.drop(columns=raw_categoricals)

In [None]:
# Show new_data after dropping the raw workclass and occupation columns.
new_data

In [None]:
# Print the summary report of the fully encoded DataFrame.
new_data.info()

## 훈련/테스트 데이터 셋 분리 후 모델 훈련 및 검증

In [None]:
# Target is the income column; features are every remaining column.
target_column = 'income'
y = new_data[target_column]
X = new_data.drop(columns=target_column)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Random sampling: split with a fixed seed, fit a liblinear logistic
# regression on the training part, and report the test-set score.
split = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = split
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train, y_train)
test_score = logreg.score(X_test, y_test)
print("테스트 점수: {:.2f}".format(test_score))

In [None]:
# Share of rows labelled 1 in each split after the random split.
# Series.mean() replaces the Python-level sum()/len() loop with the
# vectorized equivalent; the printed value is the same.
print(y_train.mean())
print(y_test.mean())

In [None]:
# Stratified sampling: same seed, but the split is stratified on y. Fit a
# liblinear logistic regression and report the test-set score.
split = train_test_split(X, y, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = split
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train, y_train)
test_score = logreg.score(X_test, y_test)
print("테스트 점수: {:.2f}".format(test_score))

In [None]:
# Share of rows labelled 1 in each split after the stratified split.
# Series.mean() replaces the Python-level sum()/len() loop with the
# vectorized equivalent; the printed value is the same.
print(y_train.mean())
print(y_test.mean())