# Regular Gradient Descent

In [8]:
import numpy as np

X = [0.5, 2.5]
Y = [0.2, 0.9]

def f(w,b,x):
    return 1.0/(1.0 + np.exp(-(w*x + b)))

def error(w,b):
    err = 0.0
    for x,y in zip(X,Y):
        fx = f(w,b,x)
        err += 0.5 * (fx - y) ** 2
    return err

def grad_b(w,b,x,y):
    fx = f(w,b,x)
    return (fx - y) * fx * (1 - fx)

def grad_w(w,b,x,y):
    fx = f(w,b,x)
    return (fx - y) * fx * (1-fx) * x

def do_gradient_descent(w,b,eta,max_epochs):
    for i in range(max_epochs):
        dw,db = 0,0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)
        w = w - eta * dw
        b = b - eta * db
        print("W is {: >20}\t b is {: >20}\t Error is {: >20}".format(w,b,error(w,b)))
        # print("W is",w,"b is ",b,"Error is",error(w,b))
# do_gradient_descent(-2,-2,1.0,1000)
do_gradient_descent(-2,-2,1.0,200)

W is    -1.99450768078436	 b is  -1.9922888407847856	 Error is  0.41572985510220245
W is  -1.9889492123276025	 b is  -1.9845161253490138	 Error is   0.4156381130345323
W is  -1.9833229937607373	 b is  -1.9766809492962403	 Error is   0.4155446258111779
W is  -1.9776273639215822	 b is  -1.9687823878694068	 Error is  0.41544934240038417
W is  -1.9718605982115052	 b is  -1.9608194952149804	 Error is  0.41535220962939745
W is  -1.9660209052415965	 b is  -1.9527913036029303	 Error is  0.41525317206086976
W is  -1.9601064232510295	 b is  -1.9446968225987113	 Error is   0.4151521718599259
W is   -1.954115216278706	 b is  -1.9365350381830164	 Error is   0.4150491486510202
W is   -1.948045270067425	 b is  -1.9283049118145963	 Error is   0.4149440393636118
W is  -1.9418944876777522	 b is  -1.9200053794309366	 Error is  0.41483677806557867
W is  -1.9356606847864704	 b is  -1.9116353503809989	 Error is  0.41472729578316725
W is  -1.9293415846419129	 b is   -1.903193706283584	 Error is   0.414615520

# Momentum Based Gradient Descent

In [27]:
def do_momentum_gradient_descent(init_w,init_b,max_epochs):
    w,b,eta = init_w, init_b, 1.0
    prev_v_w, prev_v_b, gamma = 0 , 0,  0.9
    for i in range(max_epochs):
        dw, db = 0 , 0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)

        v_w = gamma * prev_v_w + eta*dw
        v_b = gamma * prev_v_b + eta*db

        w = w - v_w
        b = b - v_b
        prev_v_w = v_w
        prev_v_b = v_b
        print("W is {: >20}\t b is {: >20}\t Error is {: >20}".format(w,b,error(w,b)))
        # print("W is",w,"b is ",b,"Error is",error(w,b))

do_momentum_gradient_descent(-2,-2,100)

W is    -1.99450768078436	 b is  -1.9922888407847856	 Error is  0.41572985510220245
W is  -1.9840061250335266	 b is   -1.977576082055321	 Error is  0.41555558079371396
W is  -1.9688673360707378	 b is  -1.9564434677326807	 Error is  0.41529946239666427
W is  -1.9493633941856943	 b is  -1.9293606641030028	 Error is   0.4149608936462756
W is  -1.9256734075697135	 b is  -1.8966979801040313	 Error is    0.414536439411146
W is  -1.8978865395684548	 b is   -1.858736566437543	 Error is  0.41401974830972726
W is  -1.8660009795398573	 b is  -1.8156764531171905	 Error is  0.41340120448518153
W is  -1.8299182601553428	 b is  -1.7676426602264097	 Error is  0.41266725728375003
W is  -1.7894317045176555	 b is  -1.7146894808539017	 Error is  0.41179929569764534
W is  -1.7442068520770284	 b is  -1.6568028511091835	 Error is   0.4107718116013056
W is  -1.6937501830462272	 b is  -1.5939004203451528	 Error is   0.4095493642811055
W is   -1.637359805935997	 b is  -1.5258283744380419	 Error is   0.408081391

# Nestrov Accelerated Gradient Descent

In [31]:
def do_nestrov_accelerated_gradient_descent(init_w,init_b,max_epochs):
    w,b,eta = init_w,init_b,1.0
    prev_v_w , prev_v_b,gamma = 0,0,0.9

    for i in range(max_epochs):
        dw,db = 0,0
        # do partial updates
        v_w = gamma * prev_v_w
        v_b = gamma * prev_v_b

        for x,y in zip(X,Y):
            # Calculate the gradients after the partial update
            dw += grad_w( w - v_w , b - v_b , x , y)
            db += grad_b( w - v_w , b - v_b , x , y)

        # now do the full update
        v_w = gamma * prev_v_w + eta * dw
        v_b = gamma * prev_v_b + eta * db

        w = w - v_w
        b = b - v_b

        prev_v_w = v_w
        prev_v_b = v_b
        print("W is {: >20}\t b is {: >20}\t Error is {: >20}".format(w,b,error(w,b)))
        # print("W is",w,"b is ",b,"Error is",error(w,b))

do_nestrov_accelerated_gradient_descent(-2,-2,90)

W is    -1.99450768078436	 b is  -1.9922888407847856	 Error is  0.41572985510220245
W is  -1.9839458048573209	 b is  -1.9775204120444787	 Error is  0.41555479839614623
W is  -1.9686325151111426	 b is  -1.9562291127349407	 Error is  0.41529635291153616
W is  -1.9487882275159392	 b is  -1.9288428308870547	 Error is   0.4149530690540074
W is  -1.9245385014832348	 b is  -1.8956933523235946	 Error is   0.4145204835864222
W is  -1.8959126337084982	 b is   -1.857024517313151	 Error is  0.41399090353359225
W is    -1.86283720259399	 b is   -1.812998314105477	 Error is  0.41335287451236424
W is  -1.8251230554273352	 b is  -1.7636989429088605	 Error is   0.4125902233631629
W is  -1.7824430659823918	 b is  -1.7091346537418697	 Error is  0.41168045143791177
W is  -1.7342959979433499	 b is   -1.649236757622674	 Error is  0.41059204147120676
W is  -1.6799481900565303	 b is  -1.5838544198344047	 Error is  0.40927980394662133
W is  -1.6183378240251194	 b is   -1.512742183533591	 Error is   0.407676426

# Stochastic Gradient Descent

In [34]:
def do_stochastic_gradient_descent(w,b,eta,max_epochs):
    for i in range(max_epochs):
        dw,db = 0,0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)
            w = w - eta * dw
            b = b - eta * db
        print("W is {: >20}\t b is {: >20}\t Error is {: >20}".format(w,b,error(w,b)))        
        # print("W is",w,"b is ",b,"Error is",error(w,b))
    
do_stochastic_gradient_descent(-2,-2,1,150)

W is   -1.991029401024078	 b is  -1.9853832973517143	 Error is  0.41565658781447334
W is  -1.9819001493338824	 b is  -1.9705749936143444	 Error is  0.41548815574028225
W is  -1.9726070828837345	 b is   -1.955571610395328	 Error is   0.4153143832484931
W is  -1.9631447894488192	 b is   -1.940369648856349	 Error is   0.4151350365043807
W is  -1.9535075894668794	 b is  -1.9249655963301513	 Error is  0.41494986850871013
W is  -1.9436895172577084	 b is  -1.9093559338088817	 Error is  0.41475861813329884
W is  -1.9336843004198243	 b is  -1.8935371443824414	 Error is  0.41456100905919135
W is  -1.9234853371734617	 b is    -1.87750572270914	 Error is   0.4143567486037364
W is  -1.9130856713834918	 b is  -1.8612581856041932	 Error is   0.4141455264203731
W is  -1.9024779649540866	 b is  -1.8447910838339663	 Error is   0.4139270130519333
W is   -1.891654467237637	 b is  -1.8281010152050388	 Error is  0.41370085831461956
W is   -1.880606981042102	 b is  -1.8111846390366793	 Error is   0.413466689

# Mini Batch Gradient Descent

In [36]:
def do_mini_batch_gradient_descent(init_w,init_b,init_eta,max_epochs,batch_size):
    w,b,eta = init_w,init_b,init_eta
    mini_batch_size, num_points_seen = batch_size,0
    for i in range(max_epochs):
        dw,db,num_points = 0,0,0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)
            num_points_seen += 1

            if num_points_seen % mini_batch_size == 0:
                # seen one mini batch
                w = w - eta * dw
                b = b - eta * db
                dw, db = 0,0 # reset parameters
        print("W is {: >20}\t b is {: >20}\t Error is {: >20}".format(w,b,error(w,b)))
        # print("W is",w,"b is ",b,"Error is",error(w,b))
    
do_mini_batch_gradient_descent(-2,-2,1,200,2)

W is    -1.99450768078436	 b is  -1.9922888407847856	 Error is  0.41572985510220245
W is  -1.9889492123276025	 b is  -1.9845161253490138	 Error is   0.4156381130345323
W is  -1.9833229937607373	 b is  -1.9766809492962403	 Error is   0.4155446258111779
W is  -1.9776273639215822	 b is  -1.9687823878694068	 Error is  0.41544934240038417
W is  -1.9718605982115052	 b is  -1.9608194952149804	 Error is  0.41535220962939745
W is  -1.9660209052415965	 b is  -1.9527913036029303	 Error is  0.41525317206086976
W is  -1.9601064232510295	 b is  -1.9446968225987113	 Error is   0.4151521718599259
W is   -1.954115216278706	 b is  -1.9365350381830164	 Error is   0.4150491486510202
W is   -1.948045270067425	 b is  -1.9283049118145963	 Error is   0.4149440393636118
W is  -1.9418944876777522	 b is  -1.9200053794309366	 Error is  0.41483677806557867
W is  -1.9356606847864704	 b is  -1.9116353503809989	 Error is  0.41472729578316725
W is  -1.9293415846419129	 b is   -1.903193706283584	 Error is   0.414615520

# Stochastic Momentum Gradient Descent and Stochastic Nestrov Accelerated Gradient Descent

In [13]:
def do_stochastic_momentum_gradient_descent(init_w,init_b,max_epochs):
    w,b,eta = init_w, init_b, 1.0
    prev_v_w, prev_v_b, gamma = 0 , 0,  0.9
    for i in range(max_epochs):
        dw, db = 0 , 0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)
            # FOR STOCHASTIC
            v_w = gamma * prev_v_w + eta*dw
            v_b = gamma * prev_v_b + eta*db

            w = w - v_w
            b = b - v_b
            prev_v_w = v_w
            prev_v_b = v_b
        print("W is {: >20}\t b is {: >20}\t Error is {: >20}".format(w,b,error(w,b)))        
        # print("W is",w,"b is ",b,"Error is",error(w,b))

def do_stochastic_nestrov_accelerated_gradient_descent(init_w,init_b,max_epochs):
    w,b,eta = init_w,init_b,1.0
    prev_v_w , prev_v_b,gamma = 0,0,0.9

    for i in range(max_epochs):
        dw,db = 0,0
        # do partial updates
        v_w = gamma * prev_v_w
        v_b = gamma * prev_v_b

        for x,y in zip(X,Y):
            # Calculate the gradients after the partial update
            dw += grad_w( w - v_w , b - v_b , x , y)
            db += grad_b( w - v_w , b - v_b , x , y)

            # FOR STOCHASTIC
            # now do the update
            v_w = gamma * prev_v_w + eta * dw
            v_b = gamma * prev_v_b + eta * db

            w = w - v_w
            b = b - v_b

        prev_v_w = v_w
        prev_v_b = v_b
        print("W is {: >20}\t b is {: >20}\t Error is {: >20}".format(w,b,error(w,b)))
        # print("W is",w,"b is ",b,"Error is",error(w,b))

In [38]:
do_stochastic_momentum_gradient_descent(-2,-2,200)

W is   -1.987927645789039	 b is   -1.979179786881636	 Error is   0.4155904562195717
W is  -1.9607620307448692	 b is   -1.934145457791069	 Error is   0.4150697279824251
W is   -1.920466681233956	 b is  -1.8684900400710869	 Error is    0.414258329020515
W is  -1.8680743065256462	 b is  -1.7845642956483379	 Error is   0.4131256582453675
W is  -1.8037892928246164	 b is  -1.6837735136955356	 Error is   0.4116083203515281
W is   -1.726982008450368	 b is  -1.5668238968087014	 Error is   0.4095998240482958
W is  -1.6360412063372267	 b is  -1.4339637101815828	 Error is  0.40692531198692344
W is  -1.5279777394663128	 b is  -1.2852674089325442	 Error is   0.4032795678409704
W is  -1.3974883452044444	 b is  -1.1209985969815468	 Error is   0.3980533212452412
W is  -1.2346516869227167	 b is    -0.94197875262902	 Error is  0.38975849621837566
W is  -1.0186754871600636	 b is  -0.7493665045472188	 Error is   0.3737260537216556
W is  -0.6988238047978816	 b is  -0.5411442614978645	 Error is  0.3304875052

In [39]:
do_stochastic_nestrov_accelerated_gradient_descent(-2,-2,200)

W is -1.9909970205112286 b is  -1.9853703451465745 Error is 0.4156563048628465
W is -1.9717049106741067 b is  -1.9565077429100382 Error is 0.4153166347679565
W is -1.9425301480724873 b is  -1.91427181063635 Error is 0.41479268180265183
W is -1.9035315884381854 b is  -1.859201782052781 Error is 0.4140625603016133
W is -1.8543777335741165 b is  -1.7915333234777635 Error is 0.41308767827419657
W is -1.7942510889491008 b is  -1.7112022803310594 Error is 0.4118059051173131
W is -1.7216626229752487 b is  -1.617832756605361 Error is 0.41011750435424066
W is -1.6340938462399615 b is  -1.5106993722203619 Error is 0.4078546225715127
W is -1.5272700931467043 b is  -1.388630533869566 Error is 0.4047070020879354
W is -1.3935457326756668 b is  -1.2497415665302356 Error is 0.4000092343974468
W is -1.2178193132607127 b is  -1.090591384053973 Error is 0.3919841436450971
W is -0.9651862088526957 b is  -0.9030688066059374 Error is 0.37409071165336766
W is -0.5355533342348826 b is  -0.6612988919428238 Err

# Line Search Gradient Descent

In [48]:
def do_line_search_graident_descent(init_w,init_b,max_epochs):
    w,b,etas = init_w , init_b, [0.1,0.5,1.0,5.0,10.0]
    for i in range(max_epochs):
        dw,db = 0,0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)
        min_error = 10000 # some large value
        best_w ,best_b = w,b

        for eta in etas:
            tmp_w = w - eta * dw
            tmp_b = b - eta * db

            if error(tmp_w,tmp_b) < min_error:
                best_w = tmp_w
                best_b = tmp_b
                min_error = error(tmp_w,tmp_b)
        w,b = best_w,best_b
        print("W is {: >20}\t b is {: >20}\t Error is {: >20}".format(w,b,error(w,b)))
        # print("W is",w,"b is ",b,"Error is",error(w,b))

do_line_search_graident_descent(-2,-2,310)

8651038418e-09
W is    1.791409856907695	 b is  -2.2816799470564093	 Error is 1.8902244375535985e-09
W is   1.7914459305209387	 b is   -2.281730937756927	 Error is 1.5202234729415377e-09
W is   1.7914782810178798	 b is   -2.281776665066863	 Error is 1.2226609503450215e-09
W is   1.7915072928484408	 b is   -2.281817672621919	 Error is 9.833513712150386e-10
W is   1.7915333107266902	 b is   -2.281854447842147	 Error is 7.908881539187812e-10
W is   1.7915566437451784	 b is  -2.2818874277578653	 Error is 6.360989711197849e-10
W is    1.791577569061837	 b is  -2.2819170042292907	 Error is 5.116079171994819e-10
W is    1.791596335204118	 b is  -2.2819435286234793	 Error is 4.114835307201621e-10
W is    1.791613165030343	 b is  -2.2819673160053897	 Error is 3.3095582090219695e-10
W is   1.7916282583840024	 b is   -2.281988648893865	 Error is 2.661887589498705e-10
W is    1.791641794472989	 b is  -2.2820077806279526	 Error is 2.1409735464343363e-10
W is   1.7916539340023876	 b is   -2.28202493

# Generating Fake Data

In [20]:
import pandas as pd
fake_X = np.random.rand(20)
fake_Y = np.random.rand(100)
fake_dataset = pd.DataFrame(columns=['fake_X','fake_Y'])
fake_dataset['fake_X'] = fake_X

# my_list = []
# for i in range(80):
#     my_list.append(0)
# my_df = pd.DataFrame(data=my_list, columns = ['fake_X'])
# print(my_df)
# fake_dataset.append(my_df,ignore_index=True)

for i in range(80):
    fake_dataset = fake_dataset.append({'fake_X': 0}, ignore_index=True)
fake_dataset['fake_Y'] = fake_Y
fake_dataset.sample(80)['fake_X'] = 0

print(fake_dataset)
fake_dataset.info()

      fake_X    fake_Y
0   0.233592  0.179317
1   0.787621  0.977635
2   0.383234  0.396789
3   0.561447  0.407151
4   0.504800  0.685740
..       ...       ...
95  0.000000  0.075830
96  0.000000  0.893654
97  0.000000  0.841088
98  0.000000  0.642546
99  0.000000  0.168710

[100 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   fake_X  100 non-null    float64
 1   fake_Y  100 non-null    float64
dtypes: float64(2)
memory usage: 1.7 KB


# Generating Gaussian Data

In [21]:
import pandas as pd
gaussian_X = np.random.normal(0,1,20)
gaussian_Y = np.random.normal(0,1,100)
gaussian_dataset = pd.DataFrame(columns=['gaussian_X','gaussian_Y'])
gaussian_dataset['gaussian_X'] = gaussian_X

for i in range(80):
    gaussian_dataset = gaussian_dataset.append({'gaussian_X': 0}, ignore_index=True)
gaussian_dataset['gaussian_Y'] = gaussian_Y
gaussian_dataset.sample(80)['gaussian_X'] = 0

print(gaussian_dataset)
gaussian_dataset.info()

    gaussian_X  gaussian_Y
0     1.815102    1.186195
1    -0.371952    0.943884
2     0.722977   -0.777014
3     0.995099   -0.313576
4    -0.341309   -0.990444
..         ...         ...
95    0.000000    0.851144
96    0.000000   -1.547865
97    0.000000    0.706471
98    0.000000    0.550967
99    0.000000    1.536912

[100 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   gaussian_X  100 non-null    float64
 1   gaussian_Y  100 non-null    float64
dtypes: float64(2)
memory usage: 1.7 KB


# Adagrad Optimiser


In [22]:
def do_adagrad(X,Y,init_w , init_b,max_epochs):
    w,b,eta = init_w , init_b, 0.1
    v_w , v_b , eps = 0 , 0, 1e-8

    for i in range(max_epochs):
        dw,db = 0,0
        for x, y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)
        v_w = v_w + dw**2
        v_b = v_b + db**2

        # EMERGENCY
        prev_error = error(w,b)

        w = w - (eta/np.sqrt(v_w + eps))*dw
        b = b - (eta/np.sqrt(v_b + eps))*db
    
        # EMERGENCY
        print("W is {: >20}\t b is {: >20}\t Error is {: >20}".format(w,b,error(w,b)))
        # print("W is",w,"b is ",b,"Error is",error(w,b))
        if(prev_error < error(w,b)):
            break

do_adagrad(gaussian_dataset['gaussian_X'],gaussian_dataset['gaussian_Y'],2 , 2,1000)

W is    2.099999998119969	 b is   1.9000000000058785	 Error is   0.2863695336782898
W is    2.169365805382937	 b is   1.8274230172570392	 Error is   0.2850117891083402
W is    2.224762348024347	 b is   1.7672746117459093	 Error is  0.28381331993267983
W is   2.2716992378038254	 b is   1.7146278027468496	 Error is  0.28270708514138876
W is    2.312799042660235	 b is   1.6671470380005133	 Error is  0.28166168988818746
W is   2.3495517100875998	 b is    1.623508842327848	 Error is  0.28065953319056247
W is   2.3828987904017302	 b is   1.5828777612128653	 Error is  0.27968960119643066
W is   2.4134793651696262	 b is   1.5446865403454078	 Error is  0.27874446493120403
W is   2.4417500990060903	 b is   1.5085291894478612	 Error is    0.277818830310739
W is   2.4680502168847633	 b is   1.4741032927751243	 Error is   0.2769087616364624
W is    2.492639466484343	 b is   1.4411764055795941	 Error is  0.27601123258188626
W is    2.515721645849039	 b is   1.4095652834415346	 Error is   0.275123850

# RMS Prop Optimiser

In [56]:
def do_rmsprop(X,Y,init_w,init_b,max_epochs):
    w,b,eta = init_w , init_b, 0.1
    v_w , v_b , eps , beta1 = 0 , 0, 1e-8,0.9
    for i in range(max_epochs):
        dw,db = 0,0
        for x, y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)
       
        v_w = beta1 * v_w + (1-beta1)* dw**2
        v_b = beta1 * v_b + (1-beta1)* db**2

        # EMERGENCY
        prev_error = error(w,b)
        
        w = w - (eta/np.sqrt(v_w + eps))*dw
        b = b - (eta/np.sqrt(v_b + eps))*db

        # EMERGENCY
        print("W is {: >20}\t b is {: >20}\t Error is {: >20}".format(w,b,error(w,b)))
        # print("W is",w,"b is ",b,"Error is",error(w,b))
        if(prev_error < error(w,b)):
            break

do_rmsprop(gaussian_dataset['gaussian_X'],gaussian_dataset['gaussian_Y'],2 , 2,1000)

W is    2.316227706565055	 b is   1.6837722341690584	 Error is   0.2823797728336502
W is   2.5242409964854957	 b is   1.4381304398965684	 Error is    0.276572031868844
W is   2.6758128833135792	 b is   1.2262022156465466	 Error is   0.2703311087962589
W is    2.789678423900449	 b is   1.0358559862614334	 Error is   0.2635467430885194
W is   2.8725222872151317	 b is   0.8615087923188678	 Error is  0.25609022290254313
W is   2.9259421144577438	 b is   0.7000787261454796	 Error is  0.24778924156173662
W is    2.948749172232839	 b is   0.5496248487702694	 Error is   0.2384090880091552
W is     2.93792552180371	 b is   0.4087773368296995	 Error is  0.22763609189802056
W is    2.889624253442307	 b is  0.27646440510094017	 Error is   0.2150869329767424
W is    2.801168499873123	 b is  0.15177232356591813	 Error is  0.20039342501443103
W is    2.674096272338858	 b is 0.033877522355691664	 Error is   0.1834009639392905
W is   2.5159558682397876	 b is  -0.0779716999376176	 Error is    0.16438550

# Adaptive Moment Estimation Optimiser

## Usually:
- beta1 = 0.9 , beta2 = 0.999
- eps = 1e-8
- eta = 0.001 , 0.0001

In [55]:
import math
def do_adam(X,Y,init_w,init_b,max_epochs):
    w_b_dw_db = [(init_w , init_b,0,0)]
    w_history , b_history , error_history = [], [], []

    w, b, eta, mini_batch_size, num_points_seen = init_w , init_b , 0.1 , 10, 0 
    m_w , m_b , v_w , v_b , eps, beta1, beta2 = 0, 0, 0, 0, 1e-8, 0.9, 0.999

    for i in range(max_epochs):
        dw,db = 0,0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b( w,b,x,y)

        m_w = beta1 * m_w + (1 - beta1)*dw
        m_b = beta1 * m_b + (1 - beta1)*db

        v_w = beta2 * v_w + (1 - beta2)*dw**2
        v_b = beta2 * v_b + (1 - beta2)*db**2

        m_w = m_w / (1 - math.pow(beta1,i+1))
        m_b = m_b / (1 - math.pow(beta1,i+1))
        
        v_w = v_w / (1 - math.pow(beta2,i+1))
        v_b = v_b / (1 - math.pow(beta2,i+1))

         # EMERGENCY
        prev_error = error(w,b)

        w = w - (eta / np.sqrt(v_w + eps)) * m_w
        b = b - (eta / np.sqrt(v_b + eps)) * m_b

        # EMERGENCY
        print("W is {: >20}\t b is {: >20}\t Error is {: >20}".format(w,b,error(w,b)))
        # print("W is",w,"b is ",b,"Error is",error(w,b))
        if(prev_error < error(w,b)):
            break

do_adam(gaussian_dataset['gaussian_X'],gaussian_dataset['gaussian_Y'],2,2,20)

W is    2.099999998119969	 b is   1.9000000000058785	 Error is   0.2863695336782898
W is   2.1234454708167476	 b is   1.8763402550388995	 Error is  0.28594680248475535
W is   2.1277961578951317	 b is    1.871940046791548	 Error is  0.28586752871251575
W is    2.128520358906672	 b is   1.8712071045789043	 Error is    0.285854300447868
W is    2.128633100066872	 b is   1.8710929730190569	 Error is  0.28585223967319745
W is    2.128649878629786	 b is   1.8710759855316161	 Error is    0.285851932900302
W is    2.128652299211555	 b is   1.8710735346590615	 Error is  0.28585188863771804
W is    2.128652641017635	 b is   1.8710731885634035	 Error is   0.2858518823870317
W is    2.128652688596569	 b is   1.8710731403862866	 Error is  0.28585188151690566
W is   2.1286526951604374	 b is   1.8710731337397928	 Error is  0.28585188139686174
W is   2.1286526960616525	 b is   1.8710731328272232	 Error is   0.2858518813803795
W is   2.1286526961852053	 b is   1.8710731327021128	 Error is  0.2858518813