In [None]:
import numpy as np
import matplotlib.pyplot as plt
from IPython import display
plt.ion()
# %matplotlib notebook

# Quick jupyter notebook intro

In [None]:
## this is just a chart function for use in the cells below
def chart(x, y, y_actual, y_estimate, error_history):
    """Draw a two-panel figure summarising the current fit.

    Left panel: a scatter of the samples (x, y) with the `y_actual` and
    `y_estimate` values plotted as lines over the same x values.
    Right panel: `error_history` plotted against its index.
    """
    plt.figure(figsize=(15, 5))

    # Left panel: samples plus the two lines (y_actual first, then y_estimate,
    # so the colour order of the lines stays the same).
    plt.subplot(1, 2, 1)
    plt.scatter(x, y)
    for line_values in (y_actual, y_estimate):
        plt.plot(x, line_values)
    plt.title("Data and sample")

    # Right panel: the error values recorded so far.
    plt.subplot(1, 2, 2)
    plt.plot(error_history)
    plt.title("Error history")
    
def chart_ab(a, b):
    """Plot the `a` and `b` sequences side by side, one panel each."""
    plt.figure(figsize=(15, 5))

    # (values, panel title) for the left and right panel respectively.
    panels = ((a, "a value"), (b, "b value"))
    for position, (values, heading) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(values)
        plt.title(heading)

# Free Online Courses
http://course.fast.ai/
https://www.coursera.org/specializations/deep-learning

# A linear function

## $ f(x) = ax + b $
## $y=ax+b$

In [None]:
# Inputs: the integers from -10 to 10 inclusive, stored as float32.
x = np.arange(-10, 11).astype("float32")
print("x:", x)

# Ground-truth line y = a * x + b with a = 0.5 and b = 2.
y_actual = 0.5 * x + 2
print("y_actual:", y_actual)

# Sample data: the true line plus random noise scaled by 2.
# The noise length is taken from x (instead of a hard-coded 21) so the two
# cannot drift out of sync if the range of x is edited.
y = y_actual + np.random.randn(x.shape[0]) * 2
y = y.astype("float32")
print("y (sample data):", y)

plt.plot(x, y_actual)
plt.scatter(x, y)
plt.show()

# A random guess

In [None]:
# Deliberately poor starting guess for the weights. The values are fixed
# (not random), so this cell gives the same estimate on every run.
w_a = -5
w_b = 0

y_estimate = w_a * x + w_b
print("y_estimate:", y_estimate)

plt.scatter(x, y)
plt.plot(x, y_actual)
plt.plot(x, y_estimate)
plt.show()

# Signed difference between each sample and the current estimate.
residual = y - y_estimate

# L1 error: the absolute value of the residual. Summing the signed residual
# instead would let positive and negative errors cancel each other out and
# under-report how wrong the guess is.
l1_error = np.abs(residual)
print("L1 error:")
print(l1_error)
print("L1 sum:", l1_error.sum(), "\n")

# L2 error: the residual squared (always non-negative, penalises large misses more).
print("L2 error:")
l2_error = residual * residual
print(l2_error)
print("L2 sum:", l2_error.sum())

### Discussion point: 
- Difference between L1 and L2 error
- How do L1 and L2 change as you change your guess of w_a?
- How do L1 and L2 change as you change your guess of w_b?
- If you didn't know the exact answer to a and b, could you have arrived at a fairly good guess after x number of tries? How would you go about it? 

# Gradient

![title](https://github.com/whathelll/Reinforcement-Learning/raw/master/FunctionApproximation/images/gradient.png)

# Gradient descent
https://en.wikipedia.org/wiki/Gradient_descent

## Find: $$\text{gradient} = \frac{\text{change in loss}}{\text{change in } w_a}$$
## Then: $$w_a = w_a - 0.01 \times \text{gradient}$$ 
#### where 0.01 is referred to as the learning rate which you can adjust

In [None]:
"""Restarting all variables"""

# Draw a fresh noisy sample around the true line. The noise length is taken
# from x (instead of a hard-coded 21) so it always matches the inputs.
y = y_actual + np.random.randn(x.shape[0]) * 2
y = y.astype("float32")
print("y (sample data):", y)
plt.plot(x, y_actual)
plt.scatter(x, y)
plt.show()

# Reset the weights to a fixed (not random) starting guess and recompute
# the estimate that goes with it.
w_a = -5
w_b = 0
y_estimate = w_a * x + w_b

# Three lists that record the training history for the charts.
error_history = []   # summed squared error, one entry per training step
w_a_history = [w_a]  # value of w_a, starting with the initial guess
w_b_history = [w_b]  # value of w_b, starting with the initial guess

In [None]:
# Step size for every weight update. It does not change inside the loop,
# so it is set once here.
learning_rate = 0.01

# Increase the range to run more gradient-descent steps per execution of this cell.
for _ in range(1):
    l1 = y - y_estimate   # signed residual for every sample
    l2_error = l1 * l1    # squared error for every sample
    # logging: this is the loss measured *before* this step's weight update
    error_history.append(l2_error.sum())

    # Chain rule, see https://www.khanacademy.org/math/ap-calculus-ab/ab-derivative-intro
    # de/da = de/dl1 * dl1/da = 2 * (l1) * -(x)
    # de/db = de/dl1 * dl1/db = 2 * (l1) * -1
    de_da = 2 * l1 * -x   # gradient of error with respect to a
    de_db = 2 * l1 * -1   # gradient of error with respect to b

    # Move each weight against its gradient, averaged over all samples.
    w_a = w_a - learning_rate * de_da.sum() / x.shape[0]
    w_b = w_b - learning_rate * de_db.sum() / x.shape[0]

    w_a_history.append(w_a)  # logging
    w_b_history.append(w_b)  # logging

    # Refresh the estimate so the next step (and the charts) use the new weights.
    y_estimate = w_a * x + w_b

chart(x, y, y_actual, y_estimate, error_history)
chart_ab(w_a_history, w_b_history)

print("w_a:", w_a, "w_b:", w_b)
print("y_estimate:", y_estimate)
# Most recent logged loss (recorded before the final weight update above).
print(error_history[-1])

### Discussion point: 
- Observe what happens when you run this several hundred times
- Does the orange line ever converge to the blue line? Will it ever? Why?
- What happens when you go back to cell 5 (the "Restarting all variables" cell) and change the code to introduce bigger random noise in your y values? Does it converge better or worse, and why?
- Why does the rate of change of w_a and w_b reduce over time?
- What happens when you increase/decrease the learning rate (after restarting), is there a learning rate that doesn't work and why?
- Why does w_b take longer to converge?
- Have we done a decent job at approximating a linear function just based on noisy samples?

# Let's use tensorflow for this

In [None]:
import tensorflow as tf

### Linear Function again
$ f(x) = ax + b $

In [None]:
"""Setting up a linear function again"""
# Inputs: the integers from -10 to 10 inclusive, stored as float32.
x = np.arange(-10, 11).astype("float32")
print("x:", x)
y_actual = 0.5 * x + 2  # a * x + b with a = 0.5 and b = 2
print("y_actual:", y_actual)

# Sample data: the true line plus random noise scaled by 0.5.
# The noise length is taken from x (instead of a hard-coded 21) so the two
# cannot drift out of sync if the range of x is edited.
y = y_actual + np.random.randn(x.shape[0]) * 0.5
y = y.astype("float32")
print("y (sample data):", y)

In [None]:
# define input, only 1 value in per sample
inputs = tf.keras.layers.Input(shape=(1,))
# add a layer with 1 node (no activation is specified)
hidden_layer = tf.keras.layers.Dense(1)(inputs)
# define output
predictions = hidden_layer

model = tf.keras.models.Model(inputs=inputs, outputs=predictions)  # instantiate our model
# define the method for optimizing, in this case Stochastic Gradient Descent.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
sgd = tf.keras.optimizers.SGD(learning_rate=0.01)
# assign optimizer and loss measure to model. Mean absolute error is reported
# as the extra metric because accuracy is a classification metric and carries
# no meaning for this regression problem.
model.compile(optimizer=sgd, loss='mse', metrics=['mae'])

# put data into the correct shape because tf expects data to come in batches
tf_x = np.expand_dims(x, axis=1)
tf_y = np.expand_dims(y, axis=1)
print(tf_x.shape, tf_y.shape)


print("Weights before:", model.get_weights())

# Train the model
history = model.fit(tf_x, tf_y, epochs=250, verbose=0)

# model.predict(tf_x) now does predictions based on the trained weights;
# run it once and reuse the result for both the printout and the plot.
y_predicted = model.predict(tf_x).squeeze()
print("Prediction:", y_predicted)
print("Weights after:", model.get_weights())
print("Loss:", history.history["loss"][-1])

plt.plot(x, y_actual)
plt.plot(x, y_predicted)
plt.show()

### Discussion point: 
- Observe the weight values before and after training
- Observe what happens when you change the number of epochs
- Observe what happens when you change the learning rate
- Change verbose=1 and see what happens

# A non linear function

## $ f(x) = x^2 $


In [None]:
# Inputs: the integers from -10 to 10 inclusive, cast to float32 so this cell
# produces the same dtype as the other data cells in this notebook.
x = np.arange(-10, 11).astype("float32")
print("x:", x)
y_actual = x * x  # f(x) = x^2
print("y_actual:", y_actual)

# No noise is added here: y is a separate copy of the exact curve.
# To experiment with noisy samples instead, use:
#   y = y_actual + np.random.randn(x.shape[0]) * 10
y = y_actual.copy()

plt.plot(x, y_actual)
plt.scatter(x, y)
plt.show()

### Discussion point: 
- What do we expect our linear model to do?

In [None]:
# define input, only 1 value in per sample
inputs = tf.keras.layers.Input(shape=(1,))

# Exactly one of the three options below should be active (uncommented) at a time.

# Option 1: original linear model
predictions = tf.keras.layers.Dense(1)(inputs)

# Option 2: a simple linear node with a non-linear activation
# predictions = tf.keras.layers.Dense(1, activation="relu")(inputs)

# Option 3: a deep model
# hidden_layer = tf.keras.layers.Dense(32, activation="relu")(inputs)
# hidden_layer = tf.keras.layers.Dense(32, activation="relu")(hidden_layer)
# hidden_layer = tf.keras.layers.Dense(32, activation="relu")(hidden_layer)
# hidden_layer = tf.keras.layers.Dense(32, activation="relu")(hidden_layer)
# hidden_layer = tf.keras.layers.Dense(32, activation="relu")(hidden_layer)
# predictions = tf.keras.layers.Dense(1, activation="relu")(hidden_layer)

model = tf.keras.models.Model(inputs=inputs, outputs=predictions)  # instantiate our model
# define the method for optimizing - Stochastic Gradient Descent.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
sgd = tf.keras.optimizers.SGD(learning_rate=0.0001)
# assign optimizer and loss measure to model. Mean absolute error is reported
# as the extra metric because accuracy is a classification metric and carries
# no meaning for this regression problem.
model.compile(optimizer=sgd, loss='mse', metrics=['mae'])

# put data into the correct shape because tf expects data to come in batches
tf_x = np.expand_dims(x, axis=1)
tf_y = np.expand_dims(y, axis=1)
print(tf_x.shape, tf_y.shape)


# print("Weights before:", model.get_weights())

# Train the model
history = model.fit(tf_x, tf_y, epochs=500, verbose=0)

# Predict once with the trained weights and reuse the result for both the
# printout and the plot.
y_predicted = model.predict(tf_x).squeeze()
print("Prediction:", y_predicted)
# print("Weights after:", model.get_weights())
print("Loss:", history.history["loss"][-1])

plt.plot(x, y_actual)
plt.plot(x, y_predicted)
plt.show()

# Rectified Linear Units (ReLU)
$ y = ax + b$  
  
$ReLU(y) =
\begin{cases}
y, & \text{if } y > 0 \\
0, & \text{if } y \le 0
\end{cases}$

### Discussion point: 
- What is happening when we combine a linear model with a non-linear activation? 
- Run the single-node scenario a few times and observe what happens. Why? 
- What happens if we change the number of nodes in our single layer?
- What is happening when we run a deep model with multiple layers?
- Play around with the learning rate, and different layers to see what happens. 

# Further learning
## Top down approach: http://course.fast.ai/
## Bottom up approach: https://www.coursera.org/specializations/deep-learning