# Домашнее задание по теме "Функции потерь. Оптимизация"

In [1]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [115]:
N = 100
X = np.random.uniform(low=0, high=100, size=N)
Y = 2*X + 1 + np.random.normal(scale=5, size=N)

### Линейная регрессия методом Nesterov momentum

In [3]:
def NM(X, gamma, lr=0.25):
    Y = []
    v = 0
    for i in range(len(X)):
        v = gamma*v + lr*(linear_interpolation(X, i+gamma*v) if i+gamma*v < len(X) else 0)
        Y.append(v)
    return np.asarray(Y)

In [4]:
def cost_function(X, y, theta0, theta1):
    total_cost = 0
    for i in range(len(X)):
        total_cost += (theta0 + theta1*X[i] - y[i]) ** 2
    return total_cost / (2 * len(X))

In [5]:
def der_theta0(X, y, theta0, theta1):
    total_cost = 0
    for i in range(len(X)):
        total_cost += (theta0 + theta1*X[i] - y[i])
    return total_cost / (len(X))    

In [6]:
def der_theta1(X, y, theta0, theta1):
    total_cost = 0
    for i in range(len(X)):
        total_cost += (theta0 + theta1*X[i] - y[i]) * X[i]
    return total_cost / (len(X))  

In [57]:
# обычный градиентный спуск
theta0 = 1
theta1 = 1
gamma = 0.9
learning_rate = 0.0001
EPOCHS = 20
for i in range(EPOCHS):
    dt0 = der_theta0(X, Y, theta0, theta1)
    dt1 = der_theta1(X, Y, theta0, theta1)
    
    theta0 -= learning_rate * dt0
    theta1 -= learning_rate * dt1
    
    print('>%d theta0: %.5f, theta1: %.5f, cost: %.5f ' % (i, theta0, theta1, cost_function(X, Y, theta0, theta1)))

>0 theta0: 1.00483, theta1: 1.32480, cost: 741.48979 
>1 theta0: 1.00808, theta1: 1.54333, cost: 341.94671 
>2 theta0: 1.01027, theta1: 1.69037, cost: 161.07079 
>3 theta0: 1.01174, theta1: 1.78930, cost: 79.18701 
>4 theta0: 1.01274, theta1: 1.85587, cost: 42.11766 
>5 theta0: 1.01340, theta1: 1.90066, cost: 25.33610 
>6 theta0: 1.01386, theta1: 1.93079, cost: 17.73898 
>7 theta0: 1.01416, theta1: 1.95107, cost: 14.29971 
>8 theta0: 1.01437, theta1: 1.96471, cost: 12.74272 
>9 theta0: 1.01450, theta1: 1.97389, cost: 12.03787 
>10 theta0: 1.01460, theta1: 1.98006, cost: 11.71877 
>11 theta0: 1.01466, theta1: 1.98422, cost: 11.57432 
>12 theta0: 1.01471, theta1: 1.98702, cost: 11.50892 
>13 theta0: 1.01474, theta1: 1.98890, cost: 11.47932 
>14 theta0: 1.01476, theta1: 1.99016, cost: 11.46591 
>15 theta0: 1.01478, theta1: 1.99101, cost: 11.45985 
>16 theta0: 1.01479, theta1: 1.99159, cost: 11.45710 
>17 theta0: 1.01480, theta1: 1.99197, cost: 11.45586 
>18 theta0: 1.01480, theta1: 1.9922

In [113]:
# nesterov momentum
theta0 = 1
theta1 = 1
vt0 = 0  # экспоненциальные скользящие средние
vt1 = 0
gamma = 0.3 # коэффициент сохранения
lr = 0.0001
EPOCHS = 20

for i in range(EPOCHS):
    # градиент
    dt0 = der_theta0(X, Y, theta0, theta1)
    dt1 = der_theta1(X, Y, theta0, theta1)

    # накапливаем градиент
    vt0 = gamma*vt0 + lr*dt0
    vt1 = gamma*vt1 + lr*dt1
    
    # новые значения параметров
    theta0 -= vt0
    theta1 -= vt1
    
    print('>%d theta0: %.5f, theta1: %.5f, cost: %.5f ' % (i, theta0, theta1, cost_function(X, Y, theta0, theta1)))

>0 theta0: 1.00483, theta1: 1.32480, cost: 741.48979 
>1 theta0: 1.00953, theta1: 1.64077, cost: 214.17626 
>2 theta0: 1.01266, theta1: 1.85073, cost: 44.46564 
>3 theta0: 1.01429, theta1: 1.96018, cost: 13.19200 
>4 theta0: 1.01494, theta1: 2.00368, cost: 11.64970 
>5 theta0: 1.01508, theta1: 2.01316, cost: 12.13521 
>6 theta0: 1.01503, theta1: 2.00933, cost: 11.90375 
>7 theta0: 1.01494, theta1: 2.00276, cost: 11.61833 
>8 theta0: 1.01486, theta1: 1.99752, cost: 11.49183 
>9 theta0: 1.01482, theta1: 1.99439, cost: 11.45916 
>10 theta0: 1.01480, theta1: 1.99292, cost: 11.45487 
>11 theta0: 1.01480, theta1: 1.99243, cost: 11.45501 
>12 theta0: 1.01480, theta1: 1.99239, cost: 11.45506 
>13 theta0: 1.01481, theta1: 1.99250, cost: 11.45494 
>14 theta0: 1.01482, theta1: 1.99262, cost: 11.45486 
>15 theta0: 1.01482, theta1: 1.99270, cost: 11.45483 
>16 theta0: 1.01483, theta1: 1.99275, cost: 11.45483 
>17 theta0: 1.01483, theta1: 1.99277, cost: 11.45483 
>18 theta0: 1.01484, theta1: 1.99277

Наблюдаем, что метод с накоплением импульса Нестерова (при условии корректного подбора гиперпараметров) сошёлся на несколько итераций быстрее обычного градиентного спуска.

### Метод RMSProp

## *здесь что-то не так: алгоритм не может сойтись ни с какими гиперпараметрами*

In [125]:
# rmsprop
theta0 = 1
theta1 = 1
eg0 = 0  # бегущие средние квадратов градиента
eg1 = 0
gamma = 0.3 # коэффициент сохранения
lr = 0.25
eps = 10e-6

EPOCHS = 50

for i in range(EPOCHS):
    # градиент
    dt0 = der_theta0(X, Y, theta0, theta1)
    dt1 = der_theta1(X, Y, theta0, theta1)

    # в данном методе используем квадраты градиента
    eg0 = gamma*eg0 + (1-gamma)*dt0*dt0
    eg1 = gamma*eg1 + (1-gamma)*dt1*dt1
    
    # новые значения параметров
    theta0 -= (lr * dt0)/(np.sqrt(eg0 + eps))
    theta1 -= (lr * dt1)/(np.sqrt(eg1 + eps))
    
    print('>%d theta0: %.5f, theta1: %.5f, cost: %.5f ' % (i, theta0, theta1, cost_function(X, Y, theta0, theta1)))

>0 theta0: 1.29881, theta1: 1.29881, cost: 794.29636 
>1 theta0: 1.53389, theta1: 1.53387, cost: 350.79601 
>2 theta0: 1.73944, theta1: 1.73934, cost: 112.20836 
>3 theta0: 1.90912, theta1: 1.90884, cost: 20.08735 
>4 theta0: 2.00404, theta1: 2.00317, cost: 9.81100 
>5 theta0: 1.97545, theta1: 1.97302, cost: 9.90591 
>6 theta0: 2.03895, theta1: 2.03412, cost: 12.83613 
>7 theta0: 1.82427, theta1: 1.81750, cost: 57.91415 
>8 theta0: 2.11745, theta1: 2.11057, cost: 33.83770 
>9 theta0: 1.88349, theta1: 1.87580, cost: 30.57362 
>10 theta0: 2.12330, theta1: 2.11493, cost: 35.62633 
>11 theta0: 1.86931, theta1: 1.86037, cost: 36.71308 
>12 theta0: 2.12269, theta1: 2.11318, cost: 34.91017 
>13 theta0: 1.87506, theta1: 1.86492, cost: 34.81143 
>14 theta0: 2.12497, theta1: 2.11423, cost: 35.35130 
>15 theta0: 1.87497, theta1: 1.86364, cost: 35.32919 
>16 theta0: 2.12570, theta1: 2.11377, cost: 35.16904 
>17 theta0: 1.87647, theta1: 1.86396, cost: 35.19167 
>18 theta0: 2.12703, theta1: 2.11392,