# Домашнее задание по теме "Функции потерь. Оптимизация"

In [1]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
N = 100
X = np.random.uniform(low=0, high=100, size=N)
Y = 2*X + 1 + np.random.normal(scale=5, size=N)

### Линейная регрессия методом Nesterov momentum

In [3]:
def NM(X, gamma, lr=0.25):
    Y = []
    v = 0
    for i in range(len(X)):
        v = gamma*v + lr*(linear_interpolation(X, i+gamma*v) if i+gamma*v < len(X) else 0)
        Y.append(v)
    return np.asarray(Y)

In [4]:
def cost_function(X, y, theta0, theta1):
    total_cost = 0
    for i in range(len(X)):
        total_cost += (theta0 + theta1*X[i] - y[i]) ** 2
    return total_cost / (2 * len(X))

In [5]:
def der_theta0(X, y, theta0, theta1):
    total_cost = 0
    for i in range(len(X)):
        total_cost += (theta0 + theta1*X[i] - y[i])
    return total_cost / (len(X))     

In [6]:
def der_theta1(X, y, theta0, theta1):
    total_cost = 0
    for i in range(len(X)):
        total_cost += (theta0 + theta1*X[i] - y[i]) * X[i]
    return total_cost / (len(X))  

In [7]:
# обычный градиентный спуск
theta0 = 1
theta1 = 1
gamma = 0.9
learning_rate = 0.0001
EPOCHS = 20
for i in range(EPOCHS):
    dt0 = der_theta0(X, Y, theta0, theta1)
    dt1 = der_theta1(X, Y, theta0, theta1)
    
    theta0 -= learning_rate * dt0
    theta1 -= learning_rate * dt1
    
    print('>%d theta0: %.5f, theta1: %.5f, cost: %.5f ' % (i, theta0, theta1, cost_function(X, Y, theta0, theta1)))

>0 theta0: 1.00525, theta1: 1.35335, cost: 758.56630 
>1 theta0: 1.00866, theta1: 1.58221, cost: 326.95400 
>2 theta0: 1.01090, theta1: 1.73044, cost: 145.88620 
>3 theta0: 1.01237, theta1: 1.82645, cost: 69.92554 
>4 theta0: 1.01334, theta1: 1.88864, cost: 38.05890 
>5 theta0: 1.01399, theta1: 1.92892, cost: 24.69035 
>6 theta0: 1.01443, theta1: 1.95500, cost: 19.08202 
>7 theta0: 1.01474, theta1: 1.97190, cost: 16.72922 
>8 theta0: 1.01496, theta1: 1.98284, cost: 15.74217 
>9 theta0: 1.01512, theta1: 1.98993, cost: 15.32806 
>10 theta0: 1.01525, theta1: 1.99452, cost: 15.15432 
>11 theta0: 1.01535, theta1: 1.99750, cost: 15.08141 
>12 theta0: 1.01544, theta1: 1.99942, cost: 15.05081 
>13 theta0: 1.01551, theta1: 2.00067, cost: 15.03795 
>14 theta0: 1.01559, theta1: 2.00147, cost: 15.03253 
>15 theta0: 1.01565, theta1: 2.00200, cost: 15.03024 
>16 theta0: 1.01572, theta1: 2.00234, cost: 15.02926 
>17 theta0: 1.01578, theta1: 2.00255, cost: 15.02883 
>18 theta0: 1.01584, theta1: 2.0027

In [8]:
# nesterov momentum
theta0 = 1
theta1 = 1
vt0 = 0  # экспоненциальные скользящие средние
vt1 = 0
gamma = 0.3 # коэффициент сохранения
lr = 0.0001
EPOCHS = 20

for i in range(EPOCHS):
    # градиент
    dt0 = der_theta0(X, Y, theta0, theta1)
    dt1 = der_theta1(X, Y, theta0, theta1)

    # накапливаем градиент
    vt0 = gamma*vt0 + lr*dt0
    vt1 = gamma*vt1 + lr*dt1
    
    # новые значения параметров
    theta0 -= vt0
    theta1 -= vt1
    
    print('>%d theta0: %.5f, theta1: %.5f, cost: %.5f ' % (i, theta0, theta1, cost_function(X, Y, theta0, theta1)))

>0 theta0: 1.00525, theta1: 1.35335, cost: 758.56630 
>1 theta0: 1.01024, theta1: 1.68821, cost: 189.58308 
>2 theta0: 1.01342, theta1: 1.89956, cost: 33.86932 
>3 theta0: 1.01497, theta1: 1.99940, cost: 15.05148 
>4 theta0: 1.01551, theta1: 2.03060, cost: 16.37464 
>5 theta0: 1.01559, theta1: 2.03023, cost: 16.33845 
>6 theta0: 1.01553, theta1: 2.02051, cost: 15.57127 
>7 theta0: 1.01548, theta1: 2.01141, cost: 15.15458 
>8 theta0: 1.01548, theta1: 2.00571, cost: 15.04200 
>9 theta0: 1.01553, theta1: 2.00303, cost: 15.02870 
>10 theta0: 1.01560, theta1: 2.00220, cost: 15.02966 
>11 theta0: 1.01569, theta1: 2.00222, cost: 15.02956 
>12 theta0: 1.01578, theta1: 2.00248, cost: 15.02894 
>13 theta0: 1.01586, theta1: 2.00273, cost: 15.02858 
>14 theta0: 1.01595, theta1: 2.00288, cost: 15.02845 
>15 theta0: 1.01604, theta1: 2.00295, cost: 15.02839 
>16 theta0: 1.01612, theta1: 2.00297, cost: 15.02834 
>17 theta0: 1.01620, theta1: 2.00297, cost: 15.02829 
>18 theta0: 1.01629, theta1: 2.00296

Наблюдаем, что метод с накоплением импульса Нестерова (при условии корректного подбора гиперпараметров) сошёлся на несколько итераций быстрее обычного градиентного спуска.

### Метод RMSProp

In [9]:
# rmsprop
theta0 = 1
theta1 = 1
eg0 = 0  # бегущие средние квадратов градиента
eg1 = 0
gamma = 0.3 # коэффициент сохранения
lr = 0.01
eps = 10e-6

EPOCHS = 200

for i in range(EPOCHS):
    # градиент
    dt0 = der_theta0(X, Y, theta0, theta1)
    dt1 = der_theta1(X, Y, theta0, theta1)

    # в данном методе используем квадраты градиента
    eg0 = gamma*eg0 + (1-gamma)*dt0*dt0
    eg1 = gamma*eg1 + (1-gamma)*dt1*dt1
    
    # новые значения параметров
    theta0 -= (lr * dt0)/(np.sqrt(eg0 + eps))
    theta1 -= (lr * dt1)/(np.sqrt(eg1 + eps))
    
    print('>%d theta0: %.5f, theta1: %.5f, cost: %.5f ' % (i, theta0, theta1, cost_function(X, Y, theta0, theta1)))

>0 theta0: 1.01195, theta1: 1.01195, cost: 1744.80201 
>1 theta0: 1.02241, theta1: 1.02241, cost: 1707.96892 
>2 theta0: 1.03250, theta1: 1.03250, cost: 1672.76082 
>3 theta0: 1.04250, theta1: 1.04250, cost: 1638.26865 
>4 theta0: 1.05247, theta1: 1.05247, cost: 1604.24191 
>5 theta0: 1.06243, theta1: 1.06243, cost: 1570.60732 
>6 theta0: 1.07238, theta1: 1.07238, cost: 1537.34305 
>7 theta0: 1.08234, theta1: 1.08234, cost: 1504.44254 
>8 theta0: 1.09229, theta1: 1.09229, cost: 1471.90381 
>9 theta0: 1.10224, theta1: 1.10224, cost: 1439.72627 
>10 theta0: 1.11220, theta1: 1.11219, cost: 1407.90971 
>11 theta0: 1.12215, theta1: 1.12214, cost: 1376.45406 
>12 theta0: 1.13210, theta1: 1.13210, cost: 1345.35929 
>13 theta0: 1.14205, theta1: 1.14205, cost: 1314.62538 
>14 theta0: 1.15200, theta1: 1.15200, cost: 1284.25229 
>15 theta0: 1.16195, theta1: 1.16194, cost: 1254.24000 
>16 theta0: 1.17190, theta1: 1.17189, cost: 1224.58850 
>17 theta0: 1.18185, theta1: 1.18184, cost: 1195.29776 
>1