In [1]:
# 載入函式庫及資料集
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

df = pd.read_csv('income_evaluation.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
df.columns

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income'],
      dtype='object')

In [6]:
# 欄位名稱有空格, 先去除空格
df.columns = df.columns.str.strip()

In [7]:
df['income'].value_counts()

Unnamed: 0_level_0,count
income,Unnamed: 1_level_1
<=50K,24720
>50K,7841


In [None]:
df.drop(columns="fnlwgt",inplace=True)

In [10]:
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [11]:
# 查看空格
df.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
education,0
education-num,0
marital-status,0
occupation,0
relationship,0
race,0
sex,0
capital-gain,0


In [12]:
# 建立年齡分組
bins = [16,24,64,90]
labels=['young','adult','old']
df['age_types'] = pd.cut(df['age'], bins=bins,labels=labels)

# 把收入依 50k 為界限, 分為 0,1
df['income_num'] = np.where(df['income'] == " >50K",1,0).astype('int16')

In [13]:
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,age_types,income_num
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,adult,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,adult,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,adult,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,adult,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,adult,0


In [14]:
# 指定要處理的欄位
cols = ['workclass', 'occupation', 'native-country']

# 將指定欄位中的 '?' 全部替換成 NaN, 標記缺失
df[cols] = df[cols].replace('?', np.nan)

LabelEncoder()：將類別型資料（例如字串 "Male", "Female", "Other"）轉成數值（如 0, 1, 2）。

fit_transform(df[a])：學習欄位 a 的所有類別並轉換成整數。

df[a] = ...：將轉換結果覆蓋回原本欄位。

In [15]:
from sklearn.preprocessing import LabelEncoder
def label_encoder(a):
    le = LabelEncoder()
    df[a] = le.fit_transform(df[a])
label_list = ['workclass', 'education','marital-status',
       'occupation', 'relationship', 'race', 'sex','native-country', 'income']
for i in label_list:
    label_encoder(i)

In [16]:
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,age_types,income_num
0,39,7,9,13,4,1,1,4,1,2174,0,40,39,0,adult,0
1,50,6,9,13,2,4,0,4,1,0,0,13,39,0,adult,0
2,38,4,11,9,0,6,1,4,1,0,0,40,39,0,adult,0
3,53,4,1,7,2,6,0,2,1,0,0,40,39,0,adult,0
4,28,4,9,13,2,10,5,2,0,0,0,40,5,0,adult,0


### 做數值特徵的縮放 (Normalization)
df.drop([...], axis=1)：先排除掉這三個欄位：

'income' → 分類標籤（文字類別，不能直接縮放）

'age_types' → 分箱後的類別型欄位（不是連續數值）

'income_num' → 二元標籤（0/1，不需要縮放）

剩下的都是數值特徵（例如 age, fnlwgt, education-num, capital-gain, hours-per-week 等）。

scaler.fit(...)：計算每個欄位的 min 和 max，存起來供後續轉換使用。


In [17]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df.drop(['income','age_types','income_num'],axis=1))

In [18]:
scaled_features = scaler.transform(df.drop(['income','age_types','income_num'],axis=1))
columns=['age', 'workclass', 'education', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'sex', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country']
df_scaled = pd.DataFrame(scaled_features,columns=columns)
df_scaled.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,0.30137,0.875,0.6,0.8,0.666667,0.071429,0.2,1.0,1.0,0.02174,0.0,0.397959,0.95122
1,0.452055,0.75,0.6,0.8,0.333333,0.285714,0.0,1.0,1.0,0.0,0.0,0.122449,0.95122
2,0.287671,0.5,0.733333,0.533333,0.0,0.428571,0.2,1.0,1.0,0.0,0.0,0.397959,0.95122
3,0.493151,0.5,0.066667,0.4,0.333333,0.428571,0.0,0.5,1.0,0.0,0.0,0.397959,0.95122
4,0.150685,0.5,0.6,0.8,0.333333,0.714286,1.0,0.5,0.0,0.0,0.0,0.397959,0.121951


### 1. 定義特徵與標籤

```python
X = df_scaled
y = df.income

```

- `X`：縮放後的特徵矩陣（數值型資料 + 類別編碼）。
- `y`：標籤，這裡是 `income`（收入分類，<=50K / >50K）。

---

### 2. 過採樣 + 欠採樣處理

```python
from imblearn.combine import SMOTETomek
smk = SMOTETomek(random_state=42)
X_res, y_res = smk.fit_resample(X, y)

```

- **SMOTETomek** = SMOTE (過採樣) + Tomek Links (欠採樣) 的結合。
    - **SMOTE**：用插值方式生成少數類別的新樣本，平衡類別比例。
    - **Tomek Links**：刪掉決策邊界附近的「不純樣本」，讓類別分界更清晰。
- 輸出：`X_res`、`y_res`，這是平衡後的資料。

---

### 3. 訓練 / 測試集切分

```python
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.20, random_state=101, shuffle=True
)

```

- 隨機打亂後，80% 資料當訓練集，20% 當測試集。
- `random_state=101` 保證結果可重現。

---

### 4. 支援向量機 (SVM) 模型

```python
from sklearn.svm import SVC
svc = SVC(random_state=101)

```

- `SVC()`：建立支援向量分類器，預設 kernel = RBF。
- 適合處理非線性分類問題。

---

### 5. 交叉驗證 + 訓練模型

```python
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(svc, X_train, y_train, cv=5)
svc.fit(X_train, y_train)

```

- `cross_val_score(..., cv=5)`：在訓練集上做 5 折交叉驗證，得到每折的準確率。
- `np.mean(accuracies)`：交叉驗證的平均準確率（可視為模型在訓練集的穩定性）。
- `fit(...)`：用完整的 `X_train`、`y_train` 訓練模型。

---

### 6. 評估表現

```python
print("Train Score:", np.mean(accuracies))
print("Test Score:", svc.score(X_test, y_test))

```

- **Train Score**：交叉驗證平均準確率。
- **Test Score**：模型在測試集上的準確率。

In [19]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss

X = df_scaled
y= df.income

smk = SMOTETomek(random_state=42)
X_res,y_res=smk.fit_resample(X,y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_res,y_res,test_size=0.20,random_state=101,shuffle=True)
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
svc = SVC(random_state = 101)
accuracies = cross_val_score(svc, X_train, y_train, cv=5)
svc.fit(X_train,y_train)

print("Train Score:",np.mean(accuracies))
print("Test Score:",svc.score(X_test,y_test))

Train Score: 0.8306082581853399
Test Score: 0.8292505708947477
