# 04 - Multi-variable Linear Regression

<img width="200" src="https://i.imgur.com/hbPVe1T.png">


In [1]:
import tensorflow as tf
import numpy as np

# Show the installed TensorFlow version this notebook runs against.
print(tf.__version__)

2.3.0


# Multi-variable linear regression

> - Predicting exam score - regression using three inputs (x1, x2, x3)

x1 (quiz 1) | x2 (quiz 2) | x3 (mid 1) | Y (final)
---- | ---- | ----| ----
73 | 80 | 75 | 152
93 | 88 | 93 | 185
89 | 91 | 90 | 180
96 | 98 | 100 | 196
73 | 66 | 70 | 142

> - Test Scores for General Psychology ( https://goo.gl/g2T8Kp )

# Matrix multiplication

### dot product(=scalar product, 내적)

<img src="https://www.mathsisfun.com/algebra/images/matrix-multiply-a.svg" >

[출처](https://www.mathsisfun.com/algebra/matrix-multiplying.html)

# Multi-feature regression

### Hypothesis

$$ H(x) = w x + b $$

$$ H(x_1, x_2, x_3) = w_1 x_1 + w_2 x_2 + w_3 x_3 + b $$

# Hypothesis using matrix

$$ H(x_1, x_2, x_3) = \underline{w_1 x_1 + w_2 x_2 + w_3 x_3} + b $$

$$ \Downarrow $$

$$ \begin{pmatrix} w_{1} & w_{2} & w_{3} \end{pmatrix} \cdot \begin{pmatrix} x_{1} \\ x_{2} \\ x_{3} \end{pmatrix}$$

=> $ WX $ ($W$, $X$ 는 matrix)

# Hypothesis without b

$$ H(x_1, x_2, x_3) = w_1 x_1 + w_2 x_2 + w_3 x_3 + b$$

$$ = b + w_1 x_1 + w_2 x_2 + w_3 x_3 $$

$$ = \begin{pmatrix} 1 & x_{ 1 } & x_{ 2 } & x_{ 3 } \end{pmatrix}\cdot \begin{pmatrix} b \\ w_{ 1 } \\ w_{ 2 } \\ w_{ 3 } \end{pmatrix} $$

$$ = XW $$

# Hypothesis using matrix 

### Many x instances

$$ \begin{pmatrix} x_{ 11 } & x_{ 12 } & x_{ 13 } \\ x_{ 21 } & x_{ 22 } & x_{ 23 } \\ x_{ 31 } & x_{ 32 } & x_{ 33 }\\ x_{ 41 } & x_{ 42 } & x_{ 43 }\\ x_{ 51 } & x_{ 52 } & x_{ 53 }\end{pmatrix} \cdot \begin{pmatrix} w_{ 1 } \\ w_{ 2 } \\ w_{ 3 } \end{pmatrix}=\begin{pmatrix} x_{ 11 }w_{ 1 }+x_{ 12 }w_{ 2 }+x_{ 13 }w_{ 3 } \\ x_{ 21 }w_{ 1 }+x_{ 22 }w_{ 2 }+x_{ 23 }w_{ 3 }\\ x_{ 31 }w_{ 1 }+x_{ 32 }w_{ 2 }+x_{ 33 }w_{ 3 } \\ x_{ 41 }w_{ 1 }+x_{ 42 }w_{ 2 }+x_{ 43 }w_{ 3 } \\ x_{ 51 }w_{ 1 }+x_{ 52 }w_{ 2 }+x_{ 53 }w_{ 3 } \end{pmatrix} $$

$$ [5, 3] \cdot [3, 1] = [5, 1] $$

$$ H(X) = XW $$

> - $5$는 데이터(instance)의 수, $3$은 변수(feature)의 수, $1$은 결과

# Simple Example (2 variables)

x1 | x2 | y
---- | ---- | ----
1  |  0  |  1
0  |  2  |  2
3  |  0  |  3
0  |  4  |  4
5  |  0  |  5

In [14]:
# Fix the global seed so the random initial parameters are repeatable.
tf.random.set_seed(0)

# Toy training set in which y equals x1 + x2 for every sample.
x1_data = [1, 0, 3, 0, 5]
x2_data = [0, 2, 0, 4, 0]
y_data = [1, 2, 3, 4, 5]


def _uniform_param():
    """Create a one-element variable drawn uniformly from [-10, 10)."""
    return tf.Variable(tf.random.uniform((1,), -10.0, 10.0))


# Keep this creation order so the seeded initial values stay the same.
W1 = _uniform_param()
W2 = _uniform_param()
b = _uniform_param()

learning_rate = tf.Variable(0.001)

n_steps = 1000
for step in range(n_steps + 1):
    # Forward pass: per-sample hypothesis and mean squared error.
    with tf.GradientTape() as tape:
        hypothesis = W1 * x1_data + W2 * x2_data + b
        cost = tf.reduce_mean(tf.square(hypothesis - y_data))

    # Plain gradient descent: param <- param - learning_rate * d(cost)/d(param).
    params = [W1, W2, b]
    grads = tape.gradient(cost, params)
    for param, grad in zip(params, grads):
        param.assign_sub(learning_rate * grad)

    # Report progress every 100 steps: step | cost | W1 | W2 | b.
    if step % 100 == 0:
        print("{:5} | {:10.5f} | {:10.4f} | {:10.4f} | {:10.6f}".format(
            step, cost.numpy(), W1.numpy()[0], W2.numpy()[0], b.numpy()[0]))

    0 |  335.28082 |    -4.0663 |     1.1220 |  -6.065215
  100 |   18.95926 |     0.7151 |     1.8781 |  -4.429109
  200 |    3.44508 |     1.7284 |     2.0768 |  -3.961648
  300 |    2.52540 |     1.9225 |     2.1184 |  -3.762738
  400 |    2.33730 |     1.9403 |     2.1114 |  -3.629400
  500 |    2.19633 |     1.9213 |     2.0881 |  -3.514729
  600 |    2.06604 |     1.8953 |     2.0595 |  -3.407385
  700 |    1.94368 |     1.8686 |     2.0293 |  -3.304398
  800 |    1.82860 |     1.8425 |     1.9990 |  -3.204873
  900 |    1.72033 |     1.8171 |     1.9693 |  -3.108468
 1000 |    1.61847 |     1.7926 |     1.9403 |  -3.015011


# Simple Example (2 variables with Matrix)

In [42]:
# Inputs as a (2, 5) matrix: one row per variable (x1, x2), one column per
# training sample.
x_data = [
    [1., 0., 3., 0., 5.],
    [0., 2., 0., 4., 0.]
]
y_data  = [1, 2, 3, 4, 5]

# A single (1, 2) weight matrix replaces the separate W1 / W2 scalars.
W = tf.Variable(tf.random.uniform((1, 2), -1.0, 1.0))
b = tf.Variable(tf.random.uniform((1,), -1.0, 1.0))

learning_rate = tf.Variable(0.01)

for i in range(1000+1):
    # Only the forward pass belongs inside the tape context.
    with tf.GradientTape() as tape:
        hypothesis = tf.matmul(W, x_data) + b # (1, 2) * (2, 5) = (1, 5)
        cost = tf.reduce_mean(tf.square(hypothesis - y_data))

    # Differentiate and update after leaving the tape context, the same way
    # the other training loops in this notebook do.
    W_grad, b_grad = tape.gradient(cost, [W, b])
    W.assign_sub(learning_rate * W_grad)
    b.assign_sub(learning_rate * b_grad)

    # Report progress every 100 steps: step | cost | W[0][0] | W[0][1] | b.
    if i % 100 == 0:
        print("{:5} | {:10.6f} | {:10.4f} | {:10.4f} | {:10.6f}".format(
            i, cost.numpy(), W.numpy()[0][0], W.numpy()[0][1], b.numpy()[0]))

    0 |   9.711032 |     0.4891 |    -0.5854 |   0.813789
  100 |   0.148628 |     0.7605 |     0.7157 |   0.911109
  200 |   0.080663 |     0.8236 |     0.7906 |   0.671238
  300 |   0.043778 |     0.8700 |     0.8458 |   0.494499
  400 |   0.023759 |     0.9042 |     0.8864 |   0.364296
  500 |   0.012895 |     0.9295 |     0.9163 |   0.268376
  600 |   0.006998 |     0.9480 |     0.9383 |   0.197711
  700 |   0.003798 |     0.9617 |     0.9546 |   0.145653
  800 |   0.002061 |     0.9718 |     0.9665 |   0.107302
  900 |   0.001119 |     0.9792 |     0.9753 |   0.079049
 1000 |   0.000607 |     0.9847 |     0.9818 |   0.058235


# Hypothesis without b

In [49]:
# Same problem as the previous cell, but with the bias folded into the weight
# matrix: a constant row of ones is added to the inputs, so the first weight
# takes over the role of b.
x_data = [
    [1., 1., 1., 1., 1.],  # constant input paired with the bias weight
    [1., 0., 3., 0., 5.],  # x1
    [0., 2., 0., 4., 0.],  # x2
]
y_data = [1, 2, 3, 4, 5]

# One (1, 3) weight matrix: [bias weight, w1, w2].
W = tf.Variable(tf.random.uniform((1, 3), -1.0, 1.0))

learning_rate = 0.01
optimizer = tf.keras.optimizers.SGD(learning_rate)

n_steps = 1000
for step in range(n_steps + 1):
    # Forward pass: (1, 3) x (3, 5) -> (1, 5) predictions, then mean squared error.
    with tf.GradientTape() as tape:
        hypothesis = tf.matmul(W, x_data)
        cost = tf.reduce_mean(tf.square(hypothesis - y_data))

    # Let the optimizer apply the SGD update instead of calling assign_sub by hand.
    W_grad = tape.gradient(cost, W)
    optimizer.apply_gradients([(W_grad, W)])

    # Report progress every 100 steps: step | cost | W[0][0] | W[0][1] | W[0][2].
    if step % 100 == 0:
        w_bias, w_x1, w_x2 = W.numpy()[0]
        print("{:5} | {:10.6f} | {:10.4f} | {:10.4f} | {:10.4f}".format(
            step, cost.numpy(), w_bias, w_x1, w_x2))

    0 |   7.723946 |    -0.9047 |     0.9896 |     0.0724
  100 |   0.026807 |    -0.3870 |     1.1017 |     1.1206
  200 |   0.014549 |    -0.2851 |     1.0749 |     1.0889
  300 |   0.007896 |    -0.2100 |     1.0552 |     1.0655
  400 |   0.004285 |    -0.1547 |     1.0407 |     1.0483
  500 |   0.002326 |    -0.1140 |     1.0300 |     1.0355
  600 |   0.001262 |    -0.0840 |     1.0221 |     1.0262
  700 |   0.000685 |    -0.0619 |     1.0163 |     1.0193
  800 |   0.000372 |    -0.0456 |     1.0120 |     1.0142
  900 |   0.000202 |    -0.0336 |     1.0088 |     1.0105
 1000 |   0.000110 |    -0.0247 |     1.0065 |     1.0077


### Custom Gradient
* tf.keras.optimizers.SGD(): optimizer
* optimizer.apply_gradients(): update

## Multi-variable linear regression
* random 초기화: tf.random.normal()

In [60]:
# Exam-score table; each row is one sample laid out as (X1, X2, X3, y).
data = np.array(
    [
        [73.0, 80.0, 75.0, 152.0],
        [93.0, 88.0, 93.0, 185.0],
        [89.0, 91.0, 90.0, 180.0],
        [96.0, 98.0, 100.0, 196.0],
        [73.0, 66.0, 70.0, 142.0],
    ],
    dtype=np.float32,
)

# Split column-wise: all but the last column are features, shape (5, 3);
# the last column is the target, kept two-dimensional with shape (5, 1).
X, y = data[:, :-1], data[:, [-1]]

# Trainable parameters, initialised from a normal distribution:
# a (3, 1) column of weights (one per feature) and a single bias.
W = tf.Variable(tf.random.normal(shape=(3, 1)))
b = tf.Variable(tf.random.normal(shape=(1,)))

learning_rate = 1e-5


def predict(X):
    """Return the linear hypothesis ``XW + b`` for a batch of inputs.

    Args:
        X: Array-like of samples with three features per row, so that it can
            be matrix-multiplied with the (3, 1) weight matrix ``W``.

    Returns:
        A tensor with one prediction per input row. The module-level ``W``
        and ``b`` are read at call time, so the result reflects their
        current values.
    """
    weighted_sum = tf.matmul(X, W)
    return weighted_sum + b

print("epoch | cost")

n_epochs = 2000
for epoch in range(n_epochs + 1):
    # Record the forward pass so the cost can be differentiated.
    with tf.GradientTape() as tape:
        residual = predict(X) - y
        cost = tf.reduce_mean(tf.square(residual))

    # Gradient-descent step on every parameter (W first, then b).
    params = [W, b]
    grads = tape.gradient(cost, params)
    for param, grad in zip(params, grads):
        param.assign_sub(learning_rate * grad)

    # Log the cost every 100 epochs.
    if epoch % 100 == 0:
        print("{:5} | {:10.4f}".format(epoch, cost.numpy()))

epoch | cost
    0 |  4075.5820
  100 |     6.6360
  200 |     6.3900
  300 |     6.1565
  400 |     5.9350
  500 |     5.7247
  600 |     5.5250
  700 |     5.3354
  800 |     5.1554
  900 |     4.9844
 1000 |     4.8220
 1100 |     4.6677
 1200 |     4.5211
 1300 |     4.3819
 1400 |     4.2495
 1500 |     4.1237
 1600 |     4.0041
 1700 |     3.8904
 1800 |     3.7823
 1900 |     3.6794
 2000 |     3.5816


In [63]:
# Display the current weights W and bias b as NumPy arrays.
W.numpy(), b.numpy()

(array([[ 0.663899  ],
        [-0.04354759],
        [ 1.3917944 ]], dtype=float32),
 array([-0.8917927], dtype=float32))

In [64]:
# Evaluate the hypothesis XW + b directly on the training inputs.
tf.matmul(X, W) + b

<tf.Tensor: shape=(5, 1), dtype=float32, numpy=
array([[148.4736 ],
       [186.4555 ],
       [179.49388],
       [197.75429],
       [142.1243 ]], dtype=float32)>

### predict

In [65]:
# Labels (actual values). Derived from `y`, because no `Y` variable is defined
# anywhere in this notebook; flattened to a plain list for side-by-side
# comparison with the predictions.
y.flatten().tolist()

[152.0, 185.0, 180.0, 196.0, 142.0]

In [66]:
predict(X).numpy() # predicted values

array([[148.4736 ],
       [186.4555 ],
       [179.49388],
       [197.75429],
       [142.1243 ]], dtype=float32)

In [67]:
# Predict on new, unseen data (two samples with three features each)

predict([[ 89.,  95.,  92.],[ 84.,  92.,  85.]]).numpy()

array([[182.10329],
       [169.17186]], dtype=float32)