# BIG DATA ANALYTICS: Voting
- Voting을 활용하여 앙상블 모델을 만들어보겠습니다.
---

## 1. 데이터 로드 및 전처리
- 지난주에 진행한 과정을 한 번에 적용하겠습니다.

In [None]:
import pandas as pd

In [None]:

# Names for the 15 columns of adult.data. The file is read with header=None,
# so the names have to be supplied explicitly.
all_columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

# Features kept for this notebook, plus the 'income' target.
selected_columns = [
    'age', 'workclass', 'education', 'gender', 'hours-per-week',
    'occupation', 'income',
]

data = pd.read_csv(
    "adult.data",
    names=all_columns,
    header=None,
    index_col=False,
    skipinitialspace=True,  # skip the spaces that follow each delimiter
)
data = data[selected_columns]

data.head()

In [None]:
# Print a summary of the frame (DataFrame.info) to check the loaded columns.
data.info()

In [None]:
# Count how many rows carry each income label.
data['income'].value_counts()

In [None]:
# Share of rows that carry the most frequent income label, i.e. the accuracy
# of always predicting the majority class.
# value_counts() is sorted by count (descending) and indexed by the label
# strings, so the first entry is taken by position with .iloc. A bare integer
# key on a string-labelled Series relies on a deprecated positional fallback.
data['income'].value_counts().iloc[0] / len(data['income'])

In [None]:

# Encode the target as integers: '<=50K' -> 0, '>50K' -> 1.
# map() builds a fresh numeric column instead of writing ints into the string
# column in place, which strict pandas string dtypes reject.
# A label outside the mapping becomes NaN, which makes astype(int) raise, so
# unexpected values still fail loudly instead of slipping through.
data['income'] = data['income'].map({'<=50K': 0, '>50K': 1}).astype(int)

In [None]:
from sklearn.model_selection import train_test_split
# Separate the features from the income target.
y = data['income']
X = data.drop(columns='income')

# Hold out a test split; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Renumber the feature rows from 0 in both splits (the old index is dropped)
# so that frames built later with a default index line up row by row.
X_train, X_test = (part.reset_index(drop=True) for part in (X_train, X_test))
X_train

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import column_or_1d

class MyLabelEncoder(LabelEncoder):
    """LabelEncoder whose ``classes_`` keep the order in which labels are given.

    ``fit`` stores the distinct labels in order of first appearance instead of
    delegating to ``LabelEncoder.fit``.
    """

    def fit(self, y):
        """Store the distinct values of ``y``, in first-seen order, as ``classes_``.

        Args:
            y: Label sequence; validated and flattened by ``column_or_1d``
                (called with ``warn=True``).

        Returns:
            This encoder, so calls can be chained.
        """
        labels = pd.Series(column_or_1d(y, warn=True))
        self.classes_ = labels.unique()
        return self

In [None]:
# Education levels from lowest to highest. The list is used to fit the
# order-preserving label encoder, so a label's position here becomes its
# ordinal code. The ranking follows the dataset's own 'education-num' column
# (Preschool=1 ... Doctorate=16): Some-college sits right after HS-grad, and
# Prof-school ranks between Masters and Doctorate.
order = [
    'Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th',
    'HS-grad', 'Some-college', 'Assoc-voc', 'Assoc-acdm', 'Bachelors',
    'Masters', 'Prof-school', 'Doctorate',
]

# Work on a copy of the training features.
new_data = X_train.copy()

# Learn the min-max range from the training rows, then rescale the two
# numeric features with it.
numeric_features = ['age', 'hours-per-week']
scaler = MinMaxScaler()
scaler.fit(new_data[numeric_features])
new_data[numeric_features] = scaler.transform(new_data[numeric_features])




# One-hot encode workclass and append the indicator columns.
# handle_unknown='ignore': a category not seen during fit is encoded as all
# zeros at transform time instead of raising.
enc1 = OneHotEncoder(handle_unknown='ignore')
workclass_values = new_data['workclass'].values.reshape(-1, 1)
workclass_one_hot = enc1.fit_transform(workclass_values).toarray()

# Indicator columns are named workclass0, workclass1, ... by position.
work_class_df = pd.DataFrame(
    workclass_one_hot,
    columns=[f"workclass{i}" for i in range(workclass_one_hot.shape[1])],
)

# Both frames carry a 0..n-1 row index, so the columns line up row by row.
new_data = pd.concat([new_data, work_class_df], axis=1)




# One-hot encode occupation and append the indicator columns.
# handle_unknown='ignore': a category not seen during fit is encoded as all
# zeros at transform time instead of raising.
enc2 = OneHotEncoder(handle_unknown='ignore')
occupation_values = new_data['occupation'].values.reshape(-1, 1)
occupation_one_hot = enc2.fit_transform(occupation_values).toarray()

# Indicator columns are named occupation0, occupation1, ... by position.
occupation_df = pd.DataFrame(
    occupation_one_hot,
    columns=[f"occupation{i}" for i in range(occupation_one_hot.shape[1])],
)

# Both frames carry a 0..n-1 row index, so the columns line up row by row.
new_data = pd.concat([new_data, occupation_df], axis=1)



# Gender is a simple binary feature, so encode it directly:
# 'Male' -> 0, 'Female' -> 1.
# map() builds a fresh numeric column instead of writing ints into the string
# column in place, which strict pandas string dtypes reject. A value outside
# the mapping becomes NaN and makes astype(int) raise, so it cannot slip by.
new_data['gender'] = new_data['gender'].map({'Male': 0, 'Female': 1}).astype(int)




# Education level: replace each label with its position in `order`.
enc3 = MyLabelEncoder().fit(order)
education_codes = enc3.transform(new_data['education'])
new_data['education'] = education_codes

# The raw categorical columns have been replaced by their one-hot columns.
new_data = new_data.drop(columns=['workclass', 'occupation'])
X_train = new_data
X_train

### 테스트 데이터셋에 대해서는 위에서 생성한 인코더와 스케일러를 활용하여 transform만 적용합니다.

In [None]:
# Work on a copy of the test features and rescale the numeric columns with
# the scaler that was already fitted (transform only, nothing is refit).
new_data = X_test.copy()
test_numeric = new_data[['age', 'hours-per-week']]
new_data[['age', 'hours-per-week']] = scaler.transform(test_numeric)


# Encode workclass with the already-fitted encoder (transform only).
workclass_values = new_data['workclass'].values.reshape(-1, 1)
workclass_one_hot = enc1.transform(workclass_values).toarray()

# One column name per category the encoder learned when it was fitted.
workclass_names = [f"workclass{i}" for i, _ in enumerate(enc1.categories_[0])]
work_class_df = pd.DataFrame(workclass_one_hot, columns=workclass_names)
new_data = pd.concat([new_data, work_class_df], axis=1)


# Encode occupation with the already-fitted encoder.
# Only transform() is used here: refitting on the test rows would rebuild the
# category list from the test set, so the indicator columns could differ in
# number or meaning from the ones the models were trained on.
encoded_feature = enc2.transform(new_data.occupation.values.reshape(-1, 1))
occupation_one_hot = encoded_feature.toarray()

# One column name per category the encoder learned when it was fitted.
columns = [f"occupation{i}" for i in range(len(enc2.categories_[0]))]
occupation_df = pd.DataFrame(occupation_one_hot, columns=columns)
new_data = pd.concat([new_data, occupation_df], axis=1)

# Gender is a simple binary feature, so encode it directly:
# 'Male' -> 0, 'Female' -> 1.
# map() builds a fresh numeric column instead of writing ints into the string
# column in place, which strict pandas string dtypes reject. A value outside
# the mapping becomes NaN and makes astype(int) raise, so it cannot slip by.
new_data['gender'] = new_data['gender'].map({'Male': 0, 'Female': 1}).astype(int)

# Education level: reuse the existing label encoder (transform only).
education_codes = enc3.transform(new_data['education'])
new_data['education'] = education_codes

# The raw categorical columns have been replaced by their one-hot columns.
new_data = new_data.drop(columns=['workclass', 'occupation'])
X_test = new_data
X_test

## 2. Voting Classifier 생성 (Hard Voting: 다수결)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier


# Three base classifiers for the ensemble. The random forest and the SVC get
# a fixed random_state; the k-NN classifier uses its defaults.
clf1 = RandomForestClassifier(random_state=42)
clf2 = KNeighborsClassifier()
clf3 = SVC(random_state=42)

In [None]:
# Fit each base classifier in turn and print its score on the test split,
# as a baseline to compare the ensemble against.
for model in (clf1, clf2, clf3):
    model.fit(X_train, y_train)
    print(model.score(X_test, y_test))

In [None]:
# Hard voting: the ensemble predicts the class chosen by most base models.
base_models = [('rf', clf1), ('knn', clf2), ('svc', clf3)]
eclf1 = VotingClassifier(estimators=base_models, voting='hard')
eclf1 = eclf1.fit(X_train, y_train)


In [None]:
# Print the hard-voting ensemble's score on the held-out test split.
print(eclf1.score(X_test, y_test))