In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from gradient_descent import Objective, gradient_descent_momentum

In [2]:
# Load the training table: column 0 is taken as the target, all remaining
# columns as the feature matrix.
df_train = pd.read_csv('./task1a/train.csv')
x = df_train.iloc[:, 1:].to_numpy()
y = df_train.iloc[:, 0].to_numpy()
lda = 0.1  # weight of the squared-L2 penalty used by the objectives below
w = np.arange(x.shape[1])  # one starting weight per feature column

# x.T @ x is symmetric, so use the symmetric eigensolver: it always returns
# real eigenvalues, whereas the general solver gives no such guarantee.
egv, evec = np.linalg.eigh(np.transpose(x) @ x)
meig = egv.max()
# Base step size: reciprocal of the largest eigenvalue of x.T @ x.
eta = 1 / meig

In [3]:
def loss(y, x, w, lda=0):
    """Return the penalised sum of squared residuals.

    Computes ``sum((y - x @ w)**2) + lda * sum(w**2)``.

    Parameters
    ----------
    y : targets
    x : design matrix, multiplied with ``w`` as ``x @ w``
    w : weight vector
    lda : weight of the squared-L2 penalty on ``w`` (default 0, i.e. no penalty)
    """
    residual = y - x @ w
    data_term = np.sum(np.square(residual))
    penalty = lda * np.sum(np.square(w))
    return data_term + penalty

In [4]:
# Small demo: scale row k of x1 by x2[k], then sum the scaled rows.
x1 = np.arange(9.0).reshape(3, 3)
x2 = np.arange(3.0)
print(x1)
print(x2)
# Multiplying by a column vector scales each row by its own factor.
print(x1 * x2[:, np.newaxis])
print((x1 * x2[:, np.newaxis]).sum(axis=0))

[[0. 1. 2.]
 [3. 4. 5.]
 [6. 7. 8.]]
[0. 1. 2.]
[[ 0.  0.  0.]
 [ 3.  4.  5.]
 [12. 14. 16.]]
[15. 18. 21.]


In [5]:
# Start from an all-ones weight vector with one entry per feature column
# (derived from x instead of a hard-coded length).
w = np.ones(x.shape[1])
print(w.shape)

(13,)


In [6]:
# Reference gradient of sum((y - x @ w)**2) with respect to w, written with
# explicit loops:  g[i] = sum_k -2 * (y[k] - x[k] @ w) * x[k, i].
# The residual of sample k does not depend on i, so it is computed once per
# sample; each g[i] still accumulates over k in increasing order.
g = np.zeros(len(w))
for k in range(len(y)):
    residual_k = y[k] - x[k, :] @ w
    for i in range(len(w)):
        g[i] += -2 * residual_k * x[k, i]
        




In [7]:
# Vectorised version of the loop gradient: weight every row of x by its
# residual, negate, sum over the samples and double.
# The ridge term `+ 2*lda*w` is left out here.
residual = y - x @ w
per_sample = np.transpose(np.transpose(x) * residual)
g2 = 2 * np.sum(-per_sample, axis=0)
# Difference between the loop result and the vectorised result.
print(g - g2)

[ 9.31322575e-10  0.00000000e+00  9.31322575e-10  0.00000000e+00
  0.00000000e+00 -6.98491931e-10  3.72529030e-09  1.16415322e-10
  9.31322575e-10 -4.47034836e-08  9.31322575e-10  5.96046448e-08
 -9.31322575e-10]


In [8]:
class ToyObjective(Objective):
    """Ridge-regularised least-squares objective.

    Value:    ``sum((y - x @ w)**2) + lda * sum(w**2)``
    Gradient: ``-2 * x.T @ (y - x @ w) + 2 * lda * w``

    Both methods read the data and the penalty weight from the instance
    (``self.x``, ``self.y``, ``self.lda``), never from notebook globals, so an
    instance built on a different data set or penalty evaluates correctly.
    """

    def __init__(self, x, y, lda, delta):
        """Store the problem data.

        x     -- design matrix, used as ``x @ w``
        y     -- target vector
        lda   -- weight of the squared-L2 penalty
        delta -- stored on the instance; not used by the methods of this class
        """
        self.x = x
        self.y = y
        self.lda = lda
        self.delta = delta

    def __call__(self, w):
        """Return the objective value at ``w``."""
        residual = self.y - self.x @ w
        # Summing squared residuals directly avoids the cancellation between
        # large terms that the expanded quadratic form suffers from.
        return np.sum(np.square(residual)) + self.lda * np.sum(np.square(w))

    def grad(self, w):
        """Return the gradient of the objective at ``w``."""
        residual = self.y - self.x @ w
        return -2 * (self.x.T @ residual) + 2 * self.lda * w


In [9]:
delta = 1e-3  # step used for the finite-difference gradient check
obj = ToyObjective(x, y, lda, delta)
# Hard-coded weight vector that used to be assigned to `w` and was then
# overwritten on the next statement. Kept under its own name so it is not
# silently discarded (presumably a saved solution for warm starts -- TODO confirm).
w_warm_start = np.asarray([ 1.17906498e-01,  2.65164537e-01, -2.98959152e-01,  9.10244717e-03,
       -8.05311495e+01,  7.72027964e+00,  1.04495021e-01, -7.53719394e+00,
       -4.65910380e-01,  3.19629266e-02,  1.24905325e+00,  5.51701733e-02,
       -9.62625143e-01])
# Evaluation point actually used: all ones, one entry per feature column.
w = np.ones(x.shape[1])
obj.grad(w)

array([1.36453442e+06, 2.60013378e+06, 3.25741215e+06, 2.05640834e+04,
       1.51695525e+05, 1.66054856e+06, 1.89186640e+07, 9.86574602e+05,
       3.06095032e+06, 1.21525881e+08, 4.99641194e+06, 9.32659237e+07,
       3.77396939e+06])

In [10]:
# Forward-difference approximation of the gradient of `obj` at `w`:
#   g[i] ~= (obj(w + delta * e_i) - obj(w)) / delta
# obj(w) is the same for every coordinate, so it is evaluated once.
f_w = obj(w)
g = np.zeros(len(w))
for i in range(len(w)):
    dw = np.zeros(len(w))
    dw[i] = delta  # perturb coordinate i only
    g[i] = (obj(w + dw) - f_w) / delta
g

array([1.36454976e+06, 2.60022694e+06, 3.25743785e+06, 2.05640955e+04,
       1.51695573e+05, 1.66055445e+06, 1.89194708e+07, 9.86577561e+05,
       3.06097721e+06, 1.21556993e+08, 4.99646341e+06, 9.32855717e+07,
       3.77400363e+06])

In [11]:
# Settings passed to gradient_descent_momentum.
learning_rate = 0.9 * eta  # slightly below the base step size eta
tol = 1e-10
n_steps = 100000
# Copy so the optimiser cannot modify `w` through a shared array; re-running
# the optimisation then always starts from the same point.
w_init = w.copy()
normalize = False

In [12]:
# Minimise `obj` starting from `w_init` with the settings defined above.
results = gradient_descent_momentum(
    obj,
    w_init,
    learning_rate=learning_rate,
    tol=tol,
    n_steps=n_steps,
    normalize=normalize,
)
# Show element 2 of the returned value (presumably the per-step objective
# history -- confirm against gradient_descent_momentum).
results[2]

array([1.24918764e+08, 7.30894673e+08, 1.62852982e+08, ...,
       3.83259316e+03, 3.83259287e+03, 3.83259259e+03])

In [13]:
# Element 0 of the optimiser's return value is used from here on as the fitted
# weight vector (presumably the final iterate -- confirm against
# gradient_descent_momentum).
w_opt=results[0]

In [14]:
# Unpenalised sum of squared residuals at the gradient-descent solution.
np.square(y - x @ w_opt).sum()

3829.4620835590963

In [15]:
# Gradient of the objective at w_opt; entries close to zero would indicate
# that the optimiser stopped near a stationary point.
obj.grad(w_opt)

array([-1.88607595e-01,  2.46131641e-01,  8.18189194e-01, -2.80727977e+01,
        6.11300607e+00, -2.49610673e+01,  4.95356026e-01,  5.85294956e+00,
       -3.49964704e-01,  5.03370667e-03,  5.85402320e+00,  1.64170247e-02,
       -1.95838869e+00])

In [16]:
# Split the targets into n_fold contiguous folds of len_fold samples each and
# build the target vector with fold k removed.
n_fold = 10
len_fold = y.shape[0] // n_fold
k = 9
# Everything before fold k followed by everything after it.
k_fold = np.concatenate((y[:k * len_fold], y[(k + 1) * len_fold:]))

In [17]:
# Transposed design matrix.
xT = x.T

In [18]:
# Ordinary least-squares weights. Solving the least-squares problem on x
# directly avoids forming and inverting x.T @ x, which squares the condition
# number and loses accuracy when the features are badly scaled.
w = np.linalg.lstsq(x, y, rcond=None)[0]
w

array([-0.03833324,  0.03733255,  0.02237796,  3.71989865, -5.6979845 ,
        5.35579469,  0.01440385, -0.79532608,  0.35229042, -0.01114316,
       -0.29588   ,  0.02179952, -0.59271428])

In [19]:
# Predictions of the least-squares fit and their sum of squared residuals.
y_pred = x @ w
np.square(y - y_pred).sum()

3779.5630033782295

In [20]:
# Root-mean-squared error: square root of the mean squared error between the
# targets y and the predictions y_pred.
RMSE = mean_squared_error(y, y_pred)**0.5
RMSE

5.019669978114916