Some fun experiments comparing the performance of using analytical solution vs gradient descent to solve linear regression.

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error as mse

randn = np.random.randn
l2norm = np.linalg.norm
inv = np.linalg.inv

import torch.utils.benchmark as torchbench

def bench(fn):
    t0 = torchbench.Timer(
    stmt='fn()',
    globals={'fn': fn})
    return t0.timeit(5).mean * 1000

Create the design matrix X and the label y arrays by a linear transformation plus some normally distributed error.

In [52]:
nfeatures = 1024 + 1        # n
nsamples = nfeatures * 20   # m
si = nfeatures*15

b = randn(nfeatures)
_X = randn(nsamples, nfeatures)
_X[:,0] = 1
_X[:,2] = _X[:,1] + randn(nsamples) * 0.01
X = _X[0:si]
X_test = _X[si:]

y_truth = _X @ b
_y = y_truth + randn(nsamples)
y = _y[0:si]
y_test_truth = _y[si:]

In [63]:


def direct_solution():
    # Complexity analysis
    # X.T @ X: X is m x n, so O(mn^2)
    # inv: O(n^3)
    # inv(...) @ X.T: O(mn^2)
    # last one is MV product: O(mn)
    b_hat = inv(X.T @ X + 100*np.identity(nfeatures)) @ X.T @ y
    return b_hat

b_hat = direct_solution()
print('b hat:  ', b_hat)
print('b truth:', b)
y_test = X_test @ b_hat
print(y_test)
print(y_test_truth)
print(mse(y_test, y_test_truth))

#print(f'time: {bench(direct_solution):.3f} ms')
#print(f'time of matmul: {bench(lambda: X.T @ X):.3f}')
#t = X.T @ X
#print(f'time of inverting: {bench(lambda: inv(t)):.3f}')

b hat:   [ 0.46939664 -0.43051188 -0.44378367 ...  0.79959653  1.84777393
  1.12738231]
b truth: [ 0.47624798  0.66586679 -1.54426316 ...  0.80751583  1.85427948
  1.13675101]
[ -6.39479709  32.2888496   28.7520582  ...  -8.80781922 -23.32479413
 -33.83633557]
[ -5.65151559  32.37736412  28.79186303 ... -10.19789198 -23.13249109
 -33.0176142 ]
1.0910558229128207


In [23]:
def gd_fixed_step(step=0.2, iters=1000):
    b = randn(nfeatures)
    for i in range(iters):
        grad = -2 * (y - X @ b) @ X / nsamples + 0.05*b
        b -= step * grad
        ng = l2norm(grad)
        #print(ng)
        if ng < 1e-3:
            break
    print(i)
    return b
    
b_hat = gd_fixed_step()
print('b hat:  ', b_hat)
print('b truth:', b)
print(f'L2norm: {l2norm(b-b_hat):.8f}')
print(f'time: {bench(gd_fixed_step):.3f} ms')

339
b hat:   [ 1.29247416  0.04862471 -0.05771728 ... -1.08123838 -1.22183517
  0.1896468 ]
b truth: [ 1.31137936  0.51396069 -0.51886663 ... -1.12443243 -1.24743364
  0.18186636]
L2norm: 1.13669733
218
34
379
285
424
289
421
time: 2329.868 ms


In [24]:
def gd_varied_step(init_step=0.2, iters=1000):
    step = init_step
    b = randn(nfeatures)
    error = (y - X @ b)
    prev_sse = np.sum(np.power(error, 2))
    for i in range(iters):
        error = (y - X @ b)
        sse = np.sum(np.power(error, 2))
        if sse < prev_sse:
            step *= 1.5
        else:
            step = init_step
        prev_sse = sse    
        grad = -2 * error @ X / nsamples + 0.05*b
        b -= step * grad
        ng = l2norm(grad)
        #print(ng)
        if ng < 1e-3:
            break
    print(i)
    return b
    
b_hat = gd_varied_step()
print('b hat:  ', b_hat)
print('b truth:', b)
print(f'L2norm: {l2norm(b-b_hat):.8f}')
print(f'time: {bench(gd_varied_step):.3f} ms')

226
b hat:   [ 1.29247415  0.04866589 -0.05775842 ... -1.08123834 -1.2218352
  0.1896468 ]
b truth: [ 1.31137936  0.51396069 -0.51886663 ... -1.12443243 -1.24743364
  0.18186636]
L2norm: 1.13666381
53
386
105
81
289
104
42
time: 824.497 ms
