# 環境準備

## 取得資料集

In [None]:
# Alternative (notebook shell magic): download unconditionally, without
# checking whether a copy already exists.
# ! wget https://raw.githubusercontent.com/cnchi/datasets/master/CarEvaluation.csv

# Download the dataset only when it is not already in the working directory.
import os
import shutil
import urllib.request

Dataset_File = "CarEvaluation.csv"
Dataset_URL = "https://raw.githubusercontent.com/cnchi/datasets/master/" + Dataset_File

if not os.path.isfile(Dataset_File):
    # Fetch with the standard library instead of shelling out to wget:
    # os.system() ignores the exit status (a failed download went unnoticed)
    # and requires a wget binary on PATH. urlopen raises on HTTP/network errors.
    partial_path = Dataset_File + ".part"
    with urllib.request.urlopen(Dataset_URL) as response, open(partial_path, "wb") as out_file:
        shutil.copyfileobj(response, out_file)
    # Rename only after the full body has been written, so an interrupted
    # download never leaves a truncated file that passes the isfile() check.
    os.replace(partial_path, Dataset_File)

# 資料集前處理

## 讀入 CSV 檔

In [None]:
import numpy as np
import pandas as pd

# Load the CSV file into a DataFrame. The bare name on the last line makes
# the notebook render the table as the cell output.
dataset = pd.read_csv(filepath_or_buffer="CarEvaluation.csv")
dataset

Unnamed: 0,City,Children,Age,Salary,ToBuy
0,Taipei,,44.0,72000.0,No
1,Taichung,0.0,27.0,48000.0,Yes
2,Kaohsiung,0.0,30.0,54000.0,No
3,Taichung,1.0,38.0,61000.0,No
4,Kaohsiung,2.0,40.0,,Yes
5,Taipei,2.0,35.0,58000.0,Yes
6,Taichung,1.0,,52000.0,No
7,Taipei,2.0,48.0,79000.0,Yes
8,Kaohsiung,1.0,50.0,83000.0,No
9,Taipei,2.0,37.0,67000.0,Yes


## 切分自變數、應變數

In [None]:
# Split the table by position: every column except the last becomes the
# feature matrix X, and the last column becomes the target vector Y.
X = dataset.iloc[:, :-1].to_numpy()
# Index the target with -1 rather than a hard-coded 4 so that it is always
# exactly the column X leaves out, whatever the number of columns.
Y = dataset.iloc[:, -1].to_numpy()

In [None]:
# Bare expression: the notebook renders the current feature matrix X as the cell output.
X

array([['Taipei', nan, 44.0, 72000.0],
       ['Taichung', 0.0, 27.0, 48000.0],
       ['Kaohsiung', 0.0, 30.0, 54000.0],
       ['Taichung', 1.0, 38.0, 61000.0],
       ['Kaohsiung', 2.0, 40.0, nan],
       ['Taipei', 2.0, 35.0, 58000.0],
       ['Taichung', 1.0, nan, 52000.0],
       ['Taipei', 2.0, 48.0, 79000.0],
       ['Kaohsiung', 1.0, 50.0, 83000.0],
       ['Taipei', 2.0, 37.0, 67000.0]], dtype=object)

In [None]:
# Bare expression: the notebook renders the current target vector Y as the cell output.
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## 處理缺失資料

In [None]:
from sklearn.impute import SimpleImputer

# Replace NaN entries in columns 1-3 of X with the mean of their column.
# fit_transform learns the per-column means and fills the gaps in one call;
# `imputer` is left fitted afterwards.
imputer = SimpleImputer(
    missing_values=np.nan,
    strategy="mean",
)
X[:, 1:4] = imputer.fit_transform(X[:, 1:4])
X

array([['Taipei', 1.2222222222222223, 44.0, 72000.0],
       ['Taichung', 0.0, 27.0, 48000.0],
       ['Kaohsiung', 0.0, 30.0, 54000.0],
       ['Taichung', 1.0, 38.0, 61000.0],
       ['Kaohsiung', 2.0, 40.0, 63777.77777777778],
       ['Taipei', 2.0, 35.0, 58000.0],
       ['Taichung', 1.0, 38.77777777777778, 52000.0],
       ['Taipei', 2.0, 48.0, 79000.0],
       ['Kaohsiung', 1.0, 50.0, 83000.0],
       ['Taipei', 2.0, 37.0, 67000.0]], dtype=object)

## 類別資料數位化

In [None]:
# Convert the target Y to numbers with a label encoder.
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()
# Learn the set of labels first, then map each label to its code and
# store the result as float64.
labelEncoder.fit(Y)
Y = labelEncoder.transform(Y).astype("float64")

In [None]:
# One-hot encode column 0 of X, then rebuild X as
# [dummy columns | original columns 1-3], cast to float64.
ary_dummies = pd.get_dummies(X[:, 0]).to_numpy()
X = np.hstack((ary_dummies, X[:, 1:4])).astype("float64")
X

array([[1.        , 0.        , 0.        , 1.        , 1.22222222],
       [1.        , 0.        , 1.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        , 0.        , 0.        ],
       [1.        , 0.        , 1.        , 0.        , 1.        ],
       [0.        , 1.        , 0.        , 0.        , 2.        ],
       [1.        , 0.        , 0.        , 1.        , 2.        ],
       [1.        , 0.        , 1.        , 0.        , 1.        ],
       [1.        , 0.        , 0.        , 1.        , 2.        ],
       [0.        , 1.        , 0.        , 0.        , 1.        ],
       [1.        , 0.        , 0.        , 1.        , 2.        ]])

## 切分訓練集、測試集

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. The fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

## 特徵縮放

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both the training and the test features.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)