### 取得資料集

In [21]:
import os
import urllib.request

# Name of the CSV file; also used as the local path in the working directory.
Dataset_File = "HealthCheck.csv"

# Download only when no local copy exists, so re-running the cell is cheap.
# urllib is used instead of shelling out to `wget`: it does not depend on an
# external binary being installed, and a failed download raises an exception
# instead of returning an exit status that nobody checks.
if not os.path.isfile(Dataset_File):
  urllib.request.urlretrieve(
      "https://raw.githubusercontent.com/cnchi/datasets/master/" + Dataset_File,
      Dataset_File,
  )

### 讀入資料

In [22]:
import pandas as pd
import numpy as np

# Read the CSV through Dataset_File (set in the download cell) so the file
# name is defined in exactly one place.
dataset = pd.read_csv(Dataset_File)
# Bare last expression: the notebook renders the whole frame as a table.
dataset

Unnamed: 0,Race,Height_cm,Weight_kg,Over70yr
0,White,186.0,90.0,Yes
1,African,185.0,98.0,No
2,Asian,175.0,80.0,No
3,African,170.0,75.0,Yes
4,Asian,164.0,,No
5,Asian,170.0,72.0,Yes
6,White,178.0,75.0,No
7,White,,89.0,Yes
8,African,165.0,79.0,Yes


### 切分自變數應變數並印出

In [23]:
# Independent variables: every column except the last one
# (Race, Height_cm, Weight_kg), as a NumPy array.
X = dataset.iloc[:, :-1].values
# Dependent variable: the last column (Over70yr). Indexing with -1 mirrors the
# `:-1` slice above, so the two stay in sync if the number of columns changes.
Y = dataset.iloc[:, -1].values

In [24]:
# Display the independent-variable array to check the column slice.
X

array([['White', 186.0, 90.0],
       ['African', 185.0, 98.0],
       ['Asian', 175.0, 80.0],
       ['African', 170.0, 75.0],
       ['Asian', 164.0, nan],
       ['Asian', 170.0, 72.0],
       ['White', 178.0, 75.0],
       ['White', nan, 89.0],
       ['African', 165.0, 79.0]], dtype=object)

In [25]:
# Display the dependent-variable array to check the column selection.
Y

array(['Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes'],
      dtype=object)

### 處理缺失資料

In [26]:
from sklearn.impute import SimpleImputer

# Replace NaN in the two numeric columns of X (indices 1 and 2) with the mean
# of the respective column.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
# fit_transform learns the column means and fills the gaps in one call; the
# result is written back into the same columns of X.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
X

array([['White', 186.0, 90.0],
       ['African', 185.0, 98.0],
       ['Asian', 175.0, 80.0],
       ['African', 170.0, 75.0],
       ['Asian', 164.0, 82.25],
       ['Asian', 170.0, 72.0],
       ['White', 178.0, 75.0],
       ['White', 174.125, 89.0],
       ['African', 165.0, 79.0]], dtype=object)

### 類別資料數位化


In [27]:
# Encode the independent variables.
# One-hot encode the category column (column 0 of X) into dummy columns.
ary_dummies = pd.get_dummies(X[:, 0]).to_numpy()
# Place the dummy columns in front of the two numeric columns and cast the
# combined array to float64.
# NOTE: X is rebound here, so column 0 is no longer the category column;
# re-run the earlier cells before running this cell a second time.
X = np.hstack((ary_dummies, X[:, 1:3])).astype("float64")
X

array([[  0.   ,   0.   ,   1.   , 186.   ,  90.   ],
       [  1.   ,   0.   ,   0.   , 185.   ,  98.   ],
       [  0.   ,   1.   ,   0.   , 175.   ,  80.   ],
       [  1.   ,   0.   ,   0.   , 170.   ,  75.   ],
       [  0.   ,   1.   ,   0.   , 164.   ,  82.25 ],
       [  0.   ,   1.   ,   0.   , 170.   ,  72.   ],
       [  0.   ,   0.   ,   1.   , 178.   ,  75.   ],
       [  0.   ,   0.   ,   1.   , 174.125,  89.   ],
       [  1.   ,   0.   ,   0.   , 165.   ,  79.   ]])

In [28]:
# Encode the dependent variable.
from sklearn.preprocessing import LabelEncoder

# Learn the distinct labels in Y, then replace each label with its integer
# code and store the codes as float64.
labelEncoder = LabelEncoder()
labelEncoder.fit(Y)
Y = labelEncoder.transform(Y).astype("float64")
Y

array([1., 0., 0., 1., 0., 1., 0., 1., 1.])

### 切分訓練集、測試集

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

### 特徵縮放

In [30]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# scaling to both splits so the test rows do not influence the scaler.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

### 印出結果

In [31]:
# Print each of the four arrays under its label: training features, training
# targets, test features, test targets.
for _label, _array in (
    ("自變數訓練集:\n", X_train),
    ("應變數訓練集:\n", Y_train),
    ("自變數測試集:\n", X_test),
    ("應變數測試集:\n", Y_test),
):
  print(_label, _array)

自變數訓練集:
 [[ 1.15470054 -0.63245553 -0.63245553  1.30941777  1.89111602]
 [-0.8660254   1.58113883 -0.63245553 -1.19037979  0.07416141]
 [ 1.15470054 -0.63245553 -0.63245553 -1.07134181 -0.30076573]
 [-0.8660254  -0.63245553  1.58113883  0.47615192 -0.76221452]
 [ 1.15470054 -0.63245553 -0.63245553 -0.47615192 -0.76221452]
 [-0.8660254  -0.63245553  1.58113883  1.42845575  0.96821844]
 [-0.8660254   1.58113883 -0.63245553 -0.47615192 -1.10830111]]
應變數訓練集:
 [0. 0. 1. 0. 1. 1. 1.]
自變數測試集:
 [[-0.8660254  -0.63245553  1.58113883  0.01487975  0.85285625]
 [-0.8660254   1.58113883 -0.63245553  0.11903798 -0.18540353]]
應變數測試集:
 [1. 0.]
