In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('adult.csv')
display(df.take([12,13,14]))

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
12,26,Private,82091,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,39,United-States,<=50K
13,58,?,299831,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,35,United-States,<=50K
14,48,Private,279724,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,3103,0,48,United-States,>50K


由上面的數據可看到：
- 許多特徵是類別型而非數值型
- 有遺漏值

In [2]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# 編碼 income 欄位
le = LabelEncoder()
df['income'] = le.fit_transform(df['income'].values)

# 處理類別型特徵，將 '?' 取代為''
catego_features = ['workclass', 'education', 'marital-status', 'occupation', 
                   'relationship', 'race', 'gender', 'native-country']
for i in catego_features:
    df[i] = df[i].replace('?', np.nan)
display(df.take([12,13,14]))
display(df.shape)
df.isnull().sum()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
12,26,Private,82091,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,39,United-States,0
13,58,,299831,HS-grad,9,Married-civ-spouse,,Husband,White,Male,0,0,35,United-States,0
14,48,Private,279724,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,3103,0,48,United-States,1


(48842, 15)

age                   0
workclass          2799
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64

In [3]:
df_drop = df.dropna()
display(df_drop.shape)
df_drop.isnull().sum()

(45222, 15)

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [4]:
# one-hot encoding
df_en = pd.get_dummies(df_drop)
display(df_en.head(3))

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Private,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,226802,7,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,38,89814,9,0,0,50,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,28,336951,12,0,0,40,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier

# 減少一點樣本，縮短執行時間
df_small = df_en.sample(n=5000, random_state=0)
X = df_small.drop('income', 1).values
y = df_small['income'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# 建立管道化
pipe_knn = make_pipeline(StandardScaler(), 
                         KNeighborsClassifier(n_neighbors=10, p=2))
pipe_knn.fit(X_train, y_train)
y_pred = pipe_knn.predict(X_test)
print('KNN 分類錯誤的樣本數： %d' % (y_test != y_pred).sum())
print(classification_report(y_pred, y_test))

KNN 分類錯誤的樣本數： 188
              precision    recall  f1-score   support

           0       0.92      0.85      0.88       824
           1       0.47      0.63      0.54       176

    accuracy                           0.81      1000
   macro avg       0.69      0.74      0.71      1000
weighted avg       0.84      0.81      0.82      1000



In [6]:
df_small['income'].value_counts()

0    3754
1    1246
Name: income, dtype: int64

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier(class_weight='balanced')
param_grid = [{'n_estimators': range(10, 101, 10),
               'max_depth': range(3, 11)}]
grid_s = GridSearchCV(estimator=clf,
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=5, n_jobs=-1)
grid_s.fit(X_train, y_train)
print(grid_s.best_params_)
y_pred = grid_s.predict(X_test)
print('RandomForest 分類錯誤的樣本數： %d' % (y_test != y_pred).sum())
print(classification_report(y_pred, y_test))

{'max_depth': 10, 'n_estimators': 80}
RandomForest 分類錯誤的樣本數： 219
              precision    recall  f1-score   support

           0       0.77      0.94      0.84       625
           1       0.83      0.52      0.64       375

    accuracy                           0.78      1000
   macro avg       0.80      0.73      0.74      1000
weighted avg       0.79      0.78      0.77      1000

