# BIG DATA ANALYTICS: Imbalanced 클래스 대응
- sampling을 다르게 하여 앙상블을 하겠습니다.
---

## 데이터 로드 및 전처리

In [None]:
import pandas as pd

In [None]:

# Column names to assign to the raw file, which is read without a header row.
raw_columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

# Columns kept for modelling; 'income' is the prediction target.
selected_columns = ['age', 'workclass', 'education', 'gender',
                    'hours-per-week', 'occupation', 'income']

data = pd.read_csv(
    "adult.data",
    names=raw_columns,
    header=None,
    index_col=False,
    skipinitialspace=True,  # strip the blank that follows each comma
)
data = data[selected_columns]

# Preview the first rows.
data.head()

In [None]:
# Print pandas' summary of the frame (columns, dtypes, non-null counts).
data.info()

In [None]:


# Encode the target in place: '<=50K' becomes 0 and '>50K' becomes 1,
# then cast the column to a proper integer dtype.
for label, code in (('<=50K', 0), ('>50K', 1)):
    data.loc[data['income'] == label, 'income'] = code
data['income'] = data['income'].astype(int)

In [None]:
# Number of rows per encoded target class.
data['income'].value_counts()

In [None]:
# Share of rows whose target is class 0, over the whole data set.
data['income'].value_counts().loc[0] / len(data)

In [None]:
from sklearn.model_selection import train_test_split
# Separate the feature columns from the target.
X = data.drop(columns='income')
y = data['income']

# Hold out a test set with a fixed seed, then give every piece a fresh
# 0..n-1 index.  y_test is reset as well (the original left it with the
# shuffled index) so that all four pieces are row-aligned by position AND by
# label; any later index-based operation on X_test / y_test stays consistent.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, random_state=0)
)
X_train

In [None]:
# Share of class-0 rows in the training target.
(y_train == 0).sum() / len(y_train)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import column_or_1d

class MyLabelEncoder(LabelEncoder):
    """Label encoder whose integer codes follow first-appearance order.

    ``fit`` stores the distinct labels in the order they are first seen, so
    fitting on a hand-ordered list yields codes that follow that list's order.

    NOTE(review): intended for string labels; confirm that the inherited
    ``transform`` handles unsorted ``classes_`` before using numeric labels.
    """

    def fit(self, y):
        """Record the distinct labels of ``y`` in first-appearance order.

        Args:
            y: 1-d array-like of labels.

        Returns:
            The fitted encoder (``self``).
        """
        y = column_or_1d(y, warn=True)
        # Series.unique() keeps the order of first appearance (no sorting).
        self.classes_ = pd.Series(y).unique()
        return self

    def fit_transform(self, y):
        """Fit on ``y`` and return its codes using the ordering from ``fit``.

        Overridden so that this call goes through the custom ``fit`` above
        instead of the parent implementation, which computes ``classes_`` on
        its own and would therefore disagree with ``fit(y).transform(y)``.
        """
        return self.fit(y).transform(y)

In [None]:
# Education levels in the order used for ordinal encoding: the position of a
# level in this list becomes its integer code in the 'education' column.
# NOTE(review): 'Prof-school' is placed between 'HS-grad' and the associate
# degrees, and 'Some-college' above them; the data set's own 'education-num'
# column appears to rank these differently -- confirm the intended order.
order = ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th', 'HS-grad', 'Prof-school',
         'Assoc-acdm', 'Assoc-voc', 'Some-college', 'Bachelors', 'Masters', 'Doctorate']

new_data = X_train.copy()

# Min-max scale the numeric columns; the scaler is fitted on the training
# data here and kept in `scaler` for reuse on the test data.
scaler = MinMaxScaler()
new_data[['age', 'hours-per-week']] = scaler.fit_transform(new_data[['age', 'hours-per-week']])

# One-hot encode 'workclass' and append the indicator columns.
enc1 = OneHotEncoder(handle_unknown='ignore')
encoded_feature = enc1.fit_transform(new_data.workclass.values.reshape(-1, 1))
workclass_one_hot = encoded_feature.toarray()
columns = [f"workclass{i}" for i in range(workclass_one_hot.shape[1])]
# Reuse new_data's index so the column-wise concat lines rows up correctly
# even if the frame does not carry a default 0..n-1 index.
work_class_df = pd.DataFrame(workclass_one_hot, columns=columns, index=new_data.index)
new_data = pd.concat([new_data, work_class_df], axis=1)

# One-hot encode 'occupation' and append the indicator columns.
enc2 = OneHotEncoder(handle_unknown='ignore')
encoded_feature = enc2.fit_transform(new_data.occupation.values.reshape(-1, 1))
occupation_one_hot = encoded_feature.toarray()
columns = [f"occupation{i}" for i in range(occupation_one_hot.shape[1])]
occupation_df = pd.DataFrame(occupation_one_hot, columns=columns, index=new_data.index)
new_data = pd.concat([new_data, occupation_df], axis=1)

# Simple binary column: hard-code the mapping ('Male' -> 0, 'Female' -> 1).
for label, code in (('Male', 0), ('Female', 1)):
    new_data.loc[new_data['gender'] == label, 'gender'] = code
new_data['gender'] = new_data['gender'].astype(int)

# Replace each education level by its position in `order`.
enc3 = MyLabelEncoder()
enc3.fit(order)
new_data['education'] = enc3.transform(new_data['education'])

# The raw categorical columns are now represented by their one-hot columns.
new_data = new_data.drop(['workclass', 'occupation'], axis=1)
X_train = new_data
X_train

In [None]:
# Apply the preprocessing fitted on the training data to the test data.
# Every transformer is only *applied* here (transform), never refitted, so
# the test columns have the same meaning and order as the training columns.
new_data = X_test.copy()

# Scale the numeric columns with the already-fitted scaler.
new_data[['age', 'hours-per-week']] = scaler.transform(new_data[['age', 'hours-per-week']])

# One-hot encode 'workclass' with the fitted encoder.
encoded_feature = enc1.transform(new_data.workclass.values.reshape(-1, 1))
workclass_one_hot = encoded_feature.toarray()
columns = [f"workclass{i}" for i in range(len(enc1.categories_[0]))]
# Reuse new_data's index so the column-wise concat aligns rows correctly.
work_class_df = pd.DataFrame(workclass_one_hot, columns=columns, index=new_data.index)
new_data = pd.concat([new_data, work_class_df], axis=1)

# One-hot encode 'occupation' with the fitted encoder.  The original called
# fit_transform here, which refitted the encoder on the test data and could
# change the number and order of the occupation columns relative to training.
encoded_feature = enc2.transform(new_data.occupation.values.reshape(-1, 1))
occupation_one_hot = encoded_feature.toarray()
columns = [f"occupation{i}" for i in range(len(enc2.categories_[0]))]
occupation_df = pd.DataFrame(occupation_one_hot, columns=columns, index=new_data.index)
new_data = pd.concat([new_data, occupation_df], axis=1)

# Simple binary column: hard-code the mapping ('Male' -> 0, 'Female' -> 1).
for label, code in (('Male', 0), ('Female', 1)):
    new_data.loc[new_data['gender'] == label, 'gender'] = code
new_data['gender'] = new_data['gender'].astype(int)

# Ordinal-encode education with the fitted encoder.
new_data['education'] = enc3.transform(new_data['education'])

# The raw categorical columns are now represented by their one-hot columns.
new_data = new_data.drop(['workclass', 'occupation'], axis=1)
X_test = new_data
X_test

## 1. Over Sampling

In [None]:
# Over-sampling: append one extra copy of every class-1 training row, which
# doubles the number of class-1 samples.
minority_mask = y_train == 1
over_sample_index = list(y_train[minority_mask].index)
over_sample_data = X_train.loc[over_sample_index, :]
over_sample_target = y_train[minority_mask]

# pd.concat replaces DataFrame.append / Series.append, which no longer exist
# in pandas 2.x; ignore_index=True yields the same fresh 0..n-1 index that
# the former append + reset_index(drop=True) produced.
X_train_over = pd.concat([X_train, over_sample_data], ignore_index=True)
y_train_over = pd.concat([y_train, over_sample_target], ignore_index=True)
X_train_over

In [None]:
# Share of class-0 rows after over-sampling.
(y_train_over == 0).sum() / len(y_train_over)

## 2. Under Sampling

In [None]:

# Under-sampling: randomly pick 40% of the class-0 training rows and remove
# them.  `under_sample_index` holds the labels of the rows being DROPPED.
# A fixed random_state makes the selection reproducible, in line with the
# seeded split and classifiers elsewhere in this notebook.
under_sample_index = list(
    y_train[y_train == 0].sample(frac=0.4, random_state=0).index
)

X_train_under = X_train.drop(under_sample_index).reset_index(drop=True)
y_train_under = y_train.drop(under_sample_index).reset_index(drop=True)
X_train_under

In [None]:
# Share of class-0 rows after under-sampling.
(y_train_under == 0).sum() / len(y_train_under)

## 다른 데이터 셋 구성을 이용한 앙상블: (Weighted Voting)

In [None]:
from sklearn.neural_network import MLPClassifier

### 1. 원본 비율 데이터

In [None]:
# Model 1: MLP trained on the training set with its original class ratio.
clf1 = MLPClassifier(random_state=42, max_iter=500).fit(X_train, y_train)

# Report the classifier's score on the held-out test set.
test_score_1 = clf1.score(X_test, y_test)
print(test_score_1)

### 2. Over Sampling 데이터

In [None]:
# Model 2: MLP trained on the over-sampled training set.
clf2 = MLPClassifier(random_state=42, max_iter=500).fit(X_train_over, y_train_over)

# Report the classifier's score on the held-out test set.
test_score_2 = clf2.score(X_test, y_test)
print(test_score_2)

### 3. Under Sampling 데이터

In [None]:
# Model 3: MLP trained on the under-sampled training set.
clf3 = MLPClassifier(random_state=42, max_iter=500).fit(X_train_under, y_train_under)

# Report the classifier's score on the held-out test set.
test_score_3 = clf3.score(X_test, y_test)
print(test_score_3)

### 4. Weighted Voting 계산

In [None]:
# Per-class probability estimates on the test set, one array per model
# (original-ratio, over-sampled, under-sampled -- in that order).
y_preds_1, y_preds_2, y_preds_3 = (
    model.predict_proba(X_test) for model in (clf1, clf2, clf3)
)




In [None]:
# Weighted soft vote: the original-ratio model counts twice, the two
# resampled models once each; dividing by 4 (the sum of the weights) keeps
# the result a weighted average of the three probability arrays.
y_preds = (2 * y_preds_1 + y_preds_2 + y_preds_3) / 4

In [None]:
# For each test row, take the column index with the highest averaged
# probability as the ensemble's prediction.
y_preds.argmax(axis=1)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy of the weighted-voting ensemble on the test set.
accuracy_score(y_test, y_preds.argmax(axis=1))