<a href="https://colab.research.google.com/github/yar31313/SNUH-AI-Lecture_Basic/blob/main/W103_ML_%EC%8B%A4%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ML 실습

1.   데이터 불러오기
2.   데이터 탐색 및 전처리
3.   모델 구성
4.   모델 훈련
5.   모델 검증

---


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras import optimizers

## 1. 데이터 불러오기

In [None]:
from tensorflow.keras.datasets import boston_housing

# Dataset reference:
# https://keras.io/api/datasets/boston_housing/#boston-housing-price-regression-dataset
(x_train, y_train), (x_test, y_test) = boston_housing.load_data()

# Feature names, listed in the order given by the dataset description.
label_names = [
    'CRIM',     # per capita crime rate by town
    'ZN',       # proportion of residential land zoned for lots over 25,000 sq.ft.
    'INDUS',    # proportion of non-retail business acres per town
    'CHAS',     # Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
    'NOX',      # nitric oxides concentration (parts per 10 million)
    'RM',       # average number of rooms per dwelling
    'AGE',      # proportion of owner-occupied units built prior to 1940
    'DIS',      # weighted distances to five Boston employment centres
    'RAD',      # index of accessibility to radial highways
    'TAX',      # full-value property-tax rate per $10,000
    'PTRATIO',  # pupil-teacher ratio by town
    'B',        # 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
    'LSTAT',    # % lower status of the population
]
# Target (y_train / y_test):
#   MEDV      Median value of owner-occupied homes in $1000's

## 2. 데이터 탐색 및 전처리

In [None]:
# Report the feature/target array shapes for the train and test splits.
for split_name, features, targets in (
    ('train', x_train, y_train),
    ('test', x_test, y_test),
):
    print(f'x_{split_name} :', features.shape, f'\ty_{split_name} :', targets.shape)

In [None]:
# Inspect the data.
# Label the columns with the feature names so that this preview and the
# later null-count / summary-statistics tables are readable, instead of
# showing anonymous integer column indices.
df = pd.DataFrame(x_train, columns=label_names)
df.head()

In [None]:
# Check for missing values: count the null entries in each column.
df.isna().sum()

In [None]:
# Check the summary statistics of each column in the feature table.
df.describe()

In [None]:
# Visualize the data: one histogram per feature on a 3x5 grid, with the
# target (MEDV) drawn in the bottom-right panel.
f, ax = plt.subplots(3, 5, figsize=(25,10))
n_features = x_train.shape[1]
for col in range(n_features):
    # Fill the grid row by row, five panels per row.
    grid_row, grid_col = divmod(col, 5)
    panel = ax[grid_row][grid_col]
    panel.hist(x_train[:, col])
    panel.set_title(label_names[col])
target_panel = ax[2, 4]
target_panel.hist(y_train)
target_panel.set_title("MEDV")
plt.show()

In [None]:
# Standardize the features.
# The per-column mean and standard deviation come from the training set
# only and are then applied to both splits, so no test-set statistics
# leak into preprocessing.
mean = np.mean(x_train, axis=0)
std = np.std(x_train, axis=0)
# Build new arrays rather than updating in place.
x_train, x_test = ((split - mean) / std for split in (x_train, x_test))

pd.DataFrame(data=x_train, columns=label_names).describe()

In [None]:
# Split the training data into training and validation sets.
# NOTE: this cell rebinds x_train / y_train, so running it a second time
# splits the already-reduced training set again.

# Number of feature columns; the target is appended right after them, so
# it lands at column index n_features (no hard-coded 13).
n_features = x_train.shape[1]

# Stack features and target side by side so that a single row shuffle
# keeps every sample paired with its label.
xy_train = np.column_stack((x_train, y_train))
np.random.shuffle(xy_train)

# Hold out the first fifth of the shuffled rows for validation.
n_val = xy_train.shape[0] // 5
xy_val = xy_train[:n_val]
xy_train = xy_train[n_val:]

print('xy_val :', xy_val.shape, '\t\t\txy_train :', xy_train.shape)

# Separate the stacked arrays back into features and target.
x_val = xy_val[:, :n_features]
y_val = xy_val[:, n_features]
x_train = xy_train[:, :n_features]
y_train = xy_train[:, n_features]

print('x_val :', x_val.shape, '\ty_val :', y_val.shape, '\tx_train :', x_train.shape, '\ty_train :', y_train.shape)

## 3. 모델 구성

In [None]:
# Fully connected regression network: three ReLU hidden layers
# (64 -> 48 -> 12 units) followed by a single linear output unit.
# The input width is taken from the number of feature columns.
model = Sequential([
    Dense(64, activation='relu', input_shape=[x_train.shape[1]]),
    Dense(48, activation='relu'),
    Dense(12, activation='relu'),
    Dense(1),
])

model.summary()

In [None]:
# Example test: run a forward pass on the first 10 training rows. In
# notebook order this comes before compile/fit, so the outputs come from
# the freshly initialized weights.
first_rows = x_train[:10]
model.predict(x=first_rows)

## 4. 모델 훈련

In [None]:
# Train with RMSprop, minimizing mean squared error and tracking mean
# absolute error as an additional metric.
# `learning_rate` is the supported keyword; the old `lr` alias is
# deprecated and rejected by current Keras releases.
model.compile(
    optimizer=optimizers.RMSprop(learning_rate=0.0005),
    loss='mse',
    metrics=['mae'],
)
# 50 epochs with mini-batches of 10 samples; the validation split is
# evaluated at the end of each epoch and recorded in `history`.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=50,
    batch_size=10,
)

In [None]:
# Show the metrics recorded for the first 15 epochs as a table.
pd.DataFrame(data=history.history).head(n=15)

In [None]:
# Plot the training and validation curves: loss on the left panel, MAE on
# the right panel.
fig, ax = plt.subplots(1,2, figsize=(15,5))

# (panel, history key, line format for training, line format for validation)
curve_specs = (
    (ax[0], 'loss', 'y.-', 'r.-'),
    (ax[1], 'mae', 'b.-', 'g.-'),
)
for panel, metric, train_fmt, val_fmt in curve_specs:
    # Validation series are stored under the same key prefixed with 'val_'.
    panel.plot(history.history[metric], train_fmt, label=f'train {metric}')
    panel.plot(history.history[f'val_{metric}'], val_fmt, label=f'val {metric}')
    panel.set_xlabel('epoch')
    panel.set_ylabel(metric)
    panel.legend(loc='upper right')

plt.show()

## 5. 모델 검증

In [None]:
# Validate on the held-out test data. With the compile settings used in
# this notebook the result is expected to be [MSE loss, MAE].
y_eval = model.evaluate(x=x_test, y=y_test)
print(y_eval)

In [None]:
# Compare predicted values against the true values.
# predict() returns one column per output unit; keep the single output
# column as a 1-D array.
y_pred = model.predict(x_test)[:,0]
print(y_pred.shape)

plt.plot(y_test, y_pred, '.')
# Identity line (perfect prediction). It must reach the larger of the true
# and predicted maxima, otherwise it stops short of the points whenever a
# true value exceeds the largest prediction.
axis_max = max(y_test.max(), y_pred.max())
plt.plot([0, axis_max], [0, axis_max])
plt.xlabel('True')
plt.ylabel('Predicted')
plt.show()

In [None]:
# Error histogram: distribution of (predicted - true) over the test set.
prediction_error = y_pred - y_test
plt.hist(prediction_error, bins=30)
plt.xlabel('Error')
plt.ylabel('Count')
plt.show()