In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation

# Build a minimal linear autoencoder: 2 inputs -> 1-unit bottleneck -> 2 outputs.
# Neither layer is given a bias or an activation, so the whole network consists
# of just two kernels (2 + 2 = 4 trainable weights).
model = Sequential()
model.add(Dense(1, input_shape=(2,), use_bias=False))  # encoder: 2 features -> 1 value
model.add(Dense(2, use_bias=False))                    # decoder: 1 value -> 2 features

# The reconstruction error is measured as mean absolute error. No optimizer is
# passed, so Keras falls back to its default one.
model.compile(loss='mean_absolute_error')

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1)                 2         
                                                                 
 dense_1 (Dense)             (None, 2)                 2         
                                                                 
Total params: 4
Trainable params: 4
Non-trainable params: 0
_________________________________________________________________


### Scenario 1: Not all features are independent of each other

In [2]:
# Scenario-1 data: two features, where the second one is fully determined by
# the first (it is exactly 100 times the first).
import random  # not needed by this cell any more; kept in case a later cell still relies on it

# Fixed seed so that "Restart & Run All" regenerates exactly the same data.
rng1 = np.random.default_rng(0)

# Draw the first feature uniformly from [0, 1) in one vectorised call instead
# of appending a million tuples in a Python loop.
first_feature = rng1.random(1_000_000)

# Shape (1000000, 2): column 0 is r, column 1 is 100 * r.
train1 = np.column_stack((first_feature, 100 * first_feature))

In [3]:
# An autoencoder is trained to reproduce its own input, so the same array is
# used as both the input and the target.
# 10% of the rows are held out for validation; with the default batch_size=32
# the number of iterations is 900000 / 32 = 28125.
# "loss" is reported during training from the batches processed so far;
# "val_loss" is computed once at the end on the whole validation split.
model.fit(x=train1, y=train1, validation_split=0.1)



<keras.callbacks.History at 0x282a55bf0d0>

In [6]:
# Inspect the two learned kernels: the encoder (shape (2, 1)) followed by the
# decoder (shape (1, 2)). There are no bias terms because use_bias=False.
model.weights

[<tf.Variable 'dense/kernel:0' shape=(2, 1) dtype=float32, numpy=
 array([[-1.6856214 ],
        [-0.71987677]], dtype=float32)>,
 <tf.Variable 'dense_1/kernel:0' shape=(1, 2) dtype=float32, numpy=array([[-0.01384243, -1.3580726 ]], dtype=float32)>]

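The decoder weights $-0.01384$ and $-1.3581$ differ by a factor of about 100 ($\approx 98$): the network has learned to output a scaled copy of the bottleneck value and roughly $100\times$ that value.  
Concretely, for an input $(x, 100x)$ the bottleneck value is $-1.6856\,x + (-0.71988)(100\,x) \approx -73.67\,x$, so the two outputs are $-0.01384 \cdot (-73.67\,x) \approx 1.02\,x$ and $-1.3581 \cdot (-73.67\,x) \approx 100.05\,x$, i.e. the input is reconstructed almost exactly.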

In [5]:
# One sample that follows the training pattern (second feature = 100 * first).
a = np.array([(0.5, 50)])

# The model has a single input, so the (1, 2) array is passed directly rather
# than wrapped in a list (the list form is meant for multi-input models).
# The prediction should be close to the input, i.e. the autoencoder was able
# to reconstruct it.
model.predict(a)



array([[ 0.50990885, 50.026848  ]], dtype=float32)

### Scenario 2: All features are independent of each other

In [8]:
# Scenario-2 data: two features drawn independently of each other.
# Both are uniform on [0, 100); the factor 100 gives the same range of numbers
# as in scenario 1.
# Fixed seed so that "Restart & Run All" regenerates exactly the same data.
rng2 = np.random.default_rng(1)

# One vectorised draw of shape (1000000, 2) replaces the Python-level loop.
train2 = rng2.random((1_000_000, 2)) * 100

# Start scenario 2 from freshly initialised weights. Without this, running the
# notebook top to bottom would keep training the network already fitted on the
# scenario-1 data, so the result would depend on the execution history.
# The architecture is the same bias-free 2 -> 1 -> 2 linear autoencoder.
model = Sequential([
  Dense(1, input_shape=(2,), use_bias=False),
  Dense(2, use_bias=False)
])
model.compile(loss='mean_absolute_error')

# As before, the input doubles as the target and 10% is held out for validation.
model.fit(train2, train2, validation_split=0.1)



<keras.callbacks.History at 0x282ae0a3e20>

We see that the loss is much higher than in the first scenario and cannot be lowered further.  
This is because two independent features cannot be compressed into a single bottleneck value (as was possible in the first scenario).

In [42]:
# Inspect the encoder and decoder kernels after training on the independent features.
model.weights

[<tf.Variable 'dense_2/kernel:0' shape=(2, 1) dtype=float32, numpy=
 array([[-1.2997323 ],
        [-0.00222956]], dtype=float32)>,
 <tf.Variable 'dense_3/kernel:0' shape=(1, 2) dtype=float32, numpy=array([[-0.76876694, -0.5793358 ]], dtype=float32)>]

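**Explanation of val_loss: 15.8892**  
The average distance between two independent points in the 0-100 interval is about 33.3 (computed below).  
A val_loss of 15.8892 shows that the network has reconstructed one of the features (almost 0 loss) but has learned essentially nothing about the other (loss of roughly 33.3). With 0 loss on one feature and full loss on the other, the average would be $33.3 / 2 \approx 16.7$. The observed value is slightly lower because the second output is not a plain copy of the first feature but about $0.75\times$ it, which is the scale $c$ that minimises $E\,|x_2 - c\,x_1|$ (about 31.25) and gives an average of $31.25 / 2 \approx 15.6$.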

In [12]:
# One sample whose two features are unrelated to each other.
a = np.array([(4, 20)])

# The model has a single input, so the (1, 2) array is passed directly rather
# than wrapped in a list (the list form is meant for multi-input models).
# Only one bottleneck value is available, so both features cannot be recovered.
model.predict(a)



array([[4.049759 , 3.0566564]], dtype=float32)

In [34]:
# Mean absolute difference |p - q| over every ordered pair of integers in
# 0..99, used as an estimate of the average distance between two points in
# the 0-100 interval.
out = [abs(p - q) for p in range(100) for q in range(100)]
np.mean(out)

33.33