* Boston 시의 주택가격 예측 Linear Regression 구현

주어진 csv 파일을 사용하여
Boston 시의 주택가격을 예측하는 tensorflow 프로그램을
작성하세요
9개의 컬럼 데이터를 사용하여 주택가격을 예측하는 
multi-variable linear regression 구현

* 사용되는 입력변수 목록
crim : per capita crime rate by town 
zn  : proportion of residential land zoned for lots over 25,000 sq.ft.  
indus : proportion of non-retail business acres per town.  
nox :  nitric oxides concentration  
rm : average number of rooms per dwelling  
age :  proportion of owner-occupied units built before 1940  
dis  : weighted distances to five Boston employment centers  
tax :  full-value property-tax rate per $10,000  
ptratio :  pupil-teacher ratio by town  
medv  : Median value of owner-occupied homes in thousands of dollars  
        주택가격, 우리가 예측할 값이다

* 제공되는 csv 파일 설명
boston_train.csv :  Train the model and obtain the weights  [400, 10]  
boston_test.csv :  Evaluate the performance of the model on unseen data  [100, 10]  


In [1]:
import tensorflow as tf
import numpy as np
# Fix TensorFlow's global random seed so the tf.random.normal draws used to
# initialise the model parameters are repeatable from run to run.
tf.random.set_seed(5)

In [2]:
# Load the training table: skip the header row, parse every field as float32.
xy = np.loadtxt(
    'boston_train.csv',
    delimiter=',',
    skiprows=1,
    dtype=np.float32,
)
print(xy.shape)  # (400, 10) in the recorded run

# Every column except the last is a feature; the last column is the target.
# Indexing with the list [-1] keeps the target two-dimensional: (rows, 1).
x_train, y_train = xy[:, :-1], xy[:, [-1]]
print(x_train.shape, y_train.shape)

(400, 10)
(400, 9) (400, 1)


In [3]:
# Trainable parameters of the linear model X * W + b.
# Shapes follow the matrix-product rule (m, n) x (n, l) -> (m, l):
# (400, 9) x (9, 1) -> (400, 1), so x_train times W has the shape of y_train.
# The bias has one entry, matching the last dimension of W.
# W is drawn before b, the same order as the recorded run.
W = tf.Variable(
    initial_value=tf.random.normal(shape=[9, 1]),
    name='weight',
)
b = tf.Variable(
    initial_value=tf.random.normal(shape=[1]),
    name='bias',
)
print(W, b)

<tf.Variable 'weight:0' shape=(9, 1) dtype=float32, numpy=
array([[-0.18030666],
       [-0.95028627],
       [-0.03964049],
       [-0.7425406 ],
       [ 1.3231523 ],
       [-0.61854804],
       [ 0.85406643],
       [-0.08899953],
       [ 2.4488697 ]], dtype=float32)> <tf.Variable 'bias:0' shape=(1,) dtype=float32, numpy=array([0.22652864], dtype=float32)>


In [4]:
# Hypothesis: H(X1, X2, X3, ...) = W1*X1 + W2*X2 + W3*X3 + ... + b
def hypothesis(X):
    """Return the linear prediction for a batch of feature rows.

    X is matrix-multiplied by the module-level weight ``W`` and the
    module-level bias ``b`` is added to the product.
    """
    weighted_sum = tf.matmul(X, W)
    return weighted_sum + b

In [5]:
# Cost: the mean of (H(x) - y)^2 over the training rows.
def cost_func():
    """Return the mean squared error of ``hypothesis`` on the training data.

    Takes no arguments because it reads the module-level ``x_train`` and
    ``y_train``; the training loop passes it to the optimizer as a callable.
    """
    residual = hypothesis(x_train) - y_train
    squared_residual = tf.square(residual)
    return tf.reduce_mean(squared_residual)

In [6]:
# Optimizer: create the optimizer object with the learning rate set to 0.01.
# Adam is the optimizer in use. Alternatives that were tried and disabled:
#   tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.01)
#   tf.compat.v1.train.AdamOptimizer(learning_rate=0.01)
optimizer = tf.optimizers.Adam(learning_rate=0.01)

In [7]:
# Training: 10001 optimizer steps, with a progress line every 1000th step.
print('***** Start Learning!!')
for step in range(10001):
    # One optimizer update of W and b against the training cost.
    optimizer.minimize(cost_func, var_list=[W, b])
    if step % 1000:
        continue
    # The cost is re-evaluated here, i.e. after the update of this step.
    print('{:04d}'.format(step), 'cost:[', cost_func().numpy(), ']',
          ' W:', W.numpy(), ' b:', b.numpy())

print('***** Learning Finished!!')

***** Start Learning!!
0000 cost:[ 3380.4128 ]  W: [[-0.17030635]
 [-0.940286  ]
 [-0.02964019]
 [-0.7325403 ]
 [ 1.3331527 ]
 [-0.60854775]
 [ 0.8640667 ]
 [-0.07899924]
 [ 2.45887   ]]  b: [0.23652895]
1000 cost:[ 62.33766 ]  W: [[-0.2290736 ]
 [ 0.15704152]
 [-0.2341586 ]
 [-0.5537422 ]
 [ 2.0969405 ]
 [-0.00413891]
 [-1.2355309 ]
 [-0.01346126]
 [ 1.1769981 ]]  b: [0.04301985]
2000 cost:[ 43.61921 ]  W: [[-0.21055721]
 [ 0.11845081]
 [-0.26855478]
 [-0.02376635]
 [ 4.2989554 ]
 [-0.02798401]
 [-1.5441736 ]
 [-0.00865479]
 [ 0.4551651 ]]  b: [0.9228278]
3000 cost:[ 36.122997 ]  W: [[-1.8018949e-01]
 [ 7.6397642e-02]
 [-2.0762946e-01]
 [ 4.2565547e-02]
 [ 6.0564122e+00]
 [-4.8381902e-02]
 [-1.4871418e+00]
 [-6.0389610e-03]
 [-1.9743495e-01]]  b: [1.7857502]
4000 cost:[ 34.84969 ]  W: [[-1.6464920e-01]
 [ 5.3313296e-02]
 [-1.6675086e-01]
 [-6.9250613e-01]
 [ 6.8752556e+00]
 [-5.4360442e-02]
 [-1.3683947e+00]
 [-4.4191186e-03]
 [-5.4338950e-01]]  b: [2.522034]
5000 cost:[ 34.630123 ]  

In [8]:
# Report the fitted regression coefficients: the weight vector, then the bias.
for label, variable in (('weight :', W), ('Bias: ', b)):
    print(label, variable.numpy())

weight : [[-1.5976070e-01]
 [ 4.4607934e-02]
 [-1.2352155e-01]
 [-1.1961572e+01]
 [ 6.7739682e+00]
 [-4.6308581e-02]
 [-1.5357161e+00]
 [-1.8167411e-03]
 [-8.0096614e-01]]
Bias:  [12.786661]


In [9]:
# Prediction on the test file.
print('***** Predict')

# Load the test table: skip the header row, parse every field as float32.
xxyy = np.loadtxt('boston_test.csv', delimiter=',', skiprows=1, dtype=np.float32)

# Every column except the last is a feature. The slice is already float32
# (loadtxt was asked for that dtype), so it goes to the model directly
# instead of being copied through np.array first.
x_data = xxyy[:, :-1]
x_test = x_data
print(hypothesis(x_test))


***** Predict
tf.Tensor(
[[22.036743 ]
 [33.6848   ]
 [23.324196 ]
 [31.547203 ]
 [34.15084  ]
 [24.94234  ]
 [23.865719 ]
 [14.034515 ]
 [28.240376 ]
 [21.429129 ]
 [26.845556 ]
 [19.387938 ]
 [20.396423 ]
 [42.227753 ]
 [15.0984745]
 [16.891142 ]
 [27.680002 ]
 [18.262903 ]
 [29.295385 ]
 [18.12585  ]
 [17.675734 ]
 [23.064438 ]
 [25.338455 ]
 [18.977695 ]
 [18.594187 ]
 [22.231216 ]
 [30.888582 ]
 [37.197014 ]
 [24.397018 ]
 [21.52744  ]
 [23.102768 ]
 [22.057217 ]
 [32.368744 ]
 [33.46273  ]
 [33.81089  ]
 [22.226871 ]
 [20.628586 ]
 [39.087135 ]
 [21.9385   ]
 [26.13983  ]
 [23.189102 ]
 [22.34973  ]
 [15.776873 ]
 [21.602798 ]
 [22.613781 ]
 [16.843504 ]
 [17.054596 ]
 [21.91208  ]
 [17.861937 ]
 [20.649174 ]
 [ 8.57162  ]
 [15.53927  ]
 [22.262535 ]
 [22.781776 ]
 [26.299536 ]
 [24.171799 ]
 [17.926558 ]
 [17.506302 ]
 [28.5208   ]
 [20.274227 ]
 [19.90683  ]
 [17.412987 ]
 [23.796013 ]
 [30.663778 ]
 [16.862791 ]
 [15.461723 ]
 [31.751997 ]
 [23.949856 ]
 [29.635908 ]
 [29.1740

In [11]:
# Accuracy metric: RMSE
def get_rmse(y_test, preds):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y_test: True target values, either 1-D with shape (n,) or 2-D with
            shape (n, k). Anything ``np.asarray`` accepts is allowed.
        preds: Predicted values with the same number of rows as ``y_test``.

    Returns:
        The RMSE as a NumPy scalar. For multi-column input the RMSE of the
        first column is returned, as the original loop did via ``rmse[0]``.

    Raises:
        ValueError: If ``y_test`` is empty or the two inputs have a
            different number of rows.
    """
    y_true = np.atleast_1d(np.asarray(y_test))
    y_pred = np.atleast_1d(np.asarray(preds))

    n_samples = y_true.shape[0]
    if n_samples == 0:
        raise ValueError("y_test must contain at least one sample")
    if y_pred.shape[0] != n_samples:
        raise ValueError(
            "y_test and preds must have the same number of rows, got "
            "{} and {}".format(n_samples, y_pred.shape[0])
        )

    # Give both inputs an explicit column axis so that (n,) and (n, 1)
    # inputs subtract row by row instead of broadcasting to (n, n).
    y_true = y_true.reshape(n_samples, -1)
    y_pred = y_pred.reshape(n_samples, -1)

    mse = np.mean(np.square(y_pred - y_true), axis=0)
    return np.sqrt(mse)[0]

# Score the model on the table held in xxyy: every column except the last is
# a feature, and the last column (kept 2-D by the list index) is the target.
x_test, y_test = xxyy[:, :-1], xxyy[:, [-1]]

preds = hypothesis(x_test).numpy()
rmse_value = get_rmse(y_test, preds)
print('RMSE:', rmse_value)  # the recorded run printed 4.00894

RMSE: 4.00894
