Some fun experiments comparing the performance of using analytical solution vs gradient descent to solve linear regression.

In [56]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error as mse

randn = np.random.randn
l2norm = np.linalg.norm

import torch.utils.benchmark as torchbench

def bench(fn):
    t0 = torchbench.Timer(
    stmt='fn()',
    globals={'fn': fn})
    return t0.timeit(5).mean * 1000

Create the design matrix X and the label y arrays by a linear transformation plus some normally distributed error.

In [57]:
nfeatures = 1024 + 1        # n
nsamples = nfeatures * 15   # m

b = randn(nfeatures)
print('(true) feature vector:\n', b)
X = randn(nsamples, nfeatures)
X[0,:] = 1
print('design matrix:\n', X)

(true) feature vector:
 [ 0.6165659  -1.12961516  1.51448317 ... -0.45687138 -1.27717563
 -0.81561424]
design matrix:
 [[ 1.          1.          1.         ...  1.          1.
   1.        ]
 [-0.26066707 -0.06262741 -0.39251034 ... -1.09108437 -0.94859971
  -1.58804296]
 [ 0.65051288 -0.71301832  0.34022412 ... -0.74797108  0.58349247
   0.16346827]
 ...
 [-1.06871364 -0.06081977  0.31108845 ...  0.0381546   0.60568548
   0.14035323]
 [-0.14717475 -0.3520984   1.98482409 ... -0.34046461  0.99573805
  -0.58691011]
 [ 0.33086866  0.06742121 -0.38564413 ... -0.23308997 -0.94823448
   1.20469088]]


In [62]:
y_truth = X @ b
y = y_truth + randn(nsamples)

def direct_solution():
    # Complexity analysis
    # X.T @ X: X is m x n, so O(mn^2)
    # inv: O(n^3)
    # inv(...) @ X.T: O(mn^2)
    # last one is MV product: O(mn)
    b_hat = np.linalg.inv(X.T @ X) @ X.T @ y
    return b_hat

b_hat = direct_solution()
print('b hat:  ', b_hat)
print('b truth:', b)
print(f'MSE: {mse(b, b_hat):.8f}')
print(f'time: {bench(direct_solution):.3f} ms')
print(f'time of matmul: {bench(lambda: X.T @ X):.3f}')

b hat:   [ 0.60873505 -1.1364232   1.52194525 ... -0.45236377 -1.27560383
 -0.81333577]
b truth: [ 0.6165659  -1.12961516  1.51448317 ... -0.45687138 -1.27717563
 -0.81561424]
MSE: 0.00007474
time: 960.863 ms
time of matmul: 404.315


In [59]:
def gd_fixed_step(step=0.2, iters=1000):
    b = randn(nfeatures)
    for i in range(iters):
        grad = -2 * (y - X @ b) @ X / nsamples
        b -= step * grad
        ng = l2norm(grad)
        #print(ng)
        if ng < 1e-4:
            print(i)
            break
    return b
    
b_hat = gd_fixed_step()
print('b hat:  ', b_hat)
print('b truth:', b)
print(f'MSE: {mse(b, b_hat):.8f}')
print(f'time: {bench(gd_fixed_step):.3f} ms')

44
b hat:   [ 0.61289855 -1.12682652  1.52123435 ... -0.45501485 -1.29001648
 -0.80619029]
b truth: [ 0.6165659  -1.12961516  1.51448317 ... -0.45687138 -1.27717563
 -0.81561424]
MSE: 0.00007125
45
45
45
45
45
45
45
time: 291.080 ms


In [64]:
def gd_varied_step(init_step=0.2, iters=1000):
    step = init_step
    b = randn(nfeatures)
    error = (y - X @ b)
    prev_sse = np.sum(np.power(error, 2))
    for i in range(iters):
        error = (y - X @ b)
        sse = np.sum(np.power(error, 2))
        if sse < prev_sse:
            step *= 1.5
        else:
            step = init_step
        prev_sse = sse    
        grad = -2 * error @ X / nsamples
        b -= step * grad
        ng = l2norm(grad)
        if ng < 1e-4:
            print(i)
            break
    return b
    
b_hat = gd_varied_step()
print('b hat:  ', b_hat)
print('b truth:', b)
print(f'MSE: {mse(b, b_hat):.8f}')
print(f'time: {bench(gd_varied_step):.3f} ms')

17
b hat:   [ 0.60873463 -1.13642337  1.52194494 ... -0.45236386 -1.27560378
 -0.81333595]
b truth: [ 0.6165659  -1.12961516  1.51448317 ... -0.45687138 -1.27717563
 -0.81561424]
MSE: 0.00007474
17
17
17
17
17
17
17
time: 119.298 ms
