# Regular Gradient Descent

In [12]:
import numpy as np

X = [0.5, 2.5]
Y = [0.2, 0.9]

def f(w,b,x):
    return 1.0/(1.0 + np.exp(-(w*x + b)))

def error(w,b):
    err = 0.0
    for x,y in zip(X,Y):
        fx = f(w,b,x)
        err += 0.5 * (fx - y) ** 2
    return err

def grad_b(w,b,x,y):
    fx = f(w,b,x)
    return (fx - y) * fx * (1 - fx)

def grad_w(w,b,x,y):
    fx = f(w,b,x)
    return (fx - y) * fx * (1-fx) * x

def do_gradient_descent(w,b,eta,max_epochs):
    for i in range(max_epochs):
        dw,db = 0,0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)
        w = w - eta * dw
        b = b - eta * db
        print("W is",w,"b is ",b,"Error is",error(w,b))
do_gradient_descent(-2,-2,1.0,1000)

W is -1.99450768078436 b is  -1.9922888407847856 Error is 0.41572985510220245
W is -1.9889492123276025 b is  -1.9845161253490138 Error is 0.4156381130345323
W is -1.9833229937607373 b is  -1.9766809492962403 Error is 0.4155446258111779
W is -1.9776273639215822 b is  -1.9687823878694068 Error is 0.41544934240038417
W is -1.9718605982115052 b is  -1.9608194952149804 Error is 0.41535220962939745
W is -1.9660209052415965 b is  -1.9527913036029303 Error is 0.41525317206086976
W is -1.9601064232510295 b is  -1.9446968225987113 Error is 0.4151521718599259
W is -1.954115216278706 b is  -1.9365350381830164 Error is 0.4150491486510202
W is -1.948045270067425 b is  -1.9283049118145963 Error is 0.4149440393636118
W is -1.9418944876777522 b is  -1.9200053794309366 Error is 0.41483677806557867
W is -1.9356606847864704 b is  -1.9116353503809989 Error is 0.41472729578316725
W is -1.9293415846419129 b is  -1.903193706283584 Error is 0.4146155203061341
W is -1.9229348126456238 b is  -1.8946792998041406 

# Momentum Based Gradient Descent

In [13]:
def do_momentum_gradient_descent(init_w,init_b,max_epochs):
    w,b,eta = init_w, init_b, 1.0
    prev_v_w, prev_v_b, gamma = 0 , 0,  0.9
    for i in range(max_epochs):
        dw, db = 0 , 0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)

        v_w = gamma * prev_v_w + eta*dw
        v_b = gamma * prev_v_b + eta*db

        w = w - v_w
        b = b - v_b
        prev_v_w = v_w
        prev_v_b = v_b
        print("W is",w,"b is ",b,"Error is",error(w,b))

do_momentum_gradient_descent(-2,-2,1000)

W is -1.99450768078436 b is  -1.9922888407847856 Error is 0.41572985510220245
W is -1.9840061250335266 b is  -1.977576082055321 Error is 0.41555558079371396
W is -1.9688673360707378 b is  -1.9564434677326807 Error is 0.41529946239666427
W is -1.9493633941856943 b is  -1.9293606641030028 Error is 0.4149608936462756
W is -1.9256734075697135 b is  -1.8966979801040313 Error is 0.414536439411146
W is -1.8978865395684548 b is  -1.858736566437543 Error is 0.41401974830972726
W is -1.8660009795398573 b is  -1.8156764531171905 Error is 0.41340120448518153
W is -1.8299182601553428 b is  -1.7676426602264097 Error is 0.41266725728375003
W is -1.7894317045176555 b is  -1.7146894808539017 Error is 0.41179929569764534
W is -1.7442068520770284 b is  -1.6568028511091835 Error is 0.4107718116013056
W is -1.6937501830462272 b is  -1.5939004203451528 Error is 0.4095493642811055
W is -1.637359805935997 b is  -1.5258283744380419 Error is 0.4080813912376985
W is -1.5740469247910656 b is  -1.4523529449551191 

# Nestrov Accelerated Gradient Descent

In [14]:
def do_nestrov_accelerated_gradient_descent(init_w,init_b,max_epochs):
    w,b,eta = init_w,init_b,1.0
    prev_v_w , prev_v_b,gamma = 0,0,0.9

    for i in range(max_epochs):
        dw,db = 0,0
        # do partial updates
        v_w = gamma * prev_v_w
        v_b = gamma * prev_v_b

        for x,y in zip(X,Y):
            # Calculate the gradients after the partial update
            dw += grad_w( w - v_w , b - v_b , x , y)
            db += grad_b( w - v_w , b - v_b , x , y)

        # now do the full update
        v_w = gamma * prev_v_w + eta * dw
        v_b = gamma * prev_v_b + eta * db

        w = w - v_w
        b = b - v_b

        prev_v_w = v_w
        prev_v_b = v_b
        print("W is",w,"b is ",b,"Error is",error(w,b))

do_nestrov_accelerated_gradient_descent(-2,-2,1000)

W is -1.99450768078436 b is  -1.9922888407847856 Error is 0.41572985510220245
W is -1.9839458048573209 b is  -1.9775204120444787 Error is 0.41555479839614623
W is -1.9686325151111426 b is  -1.9562291127349407 Error is 0.41529635291153616
W is -1.9487882275159392 b is  -1.9288428308870547 Error is 0.4149530690540074
W is -1.9245385014832348 b is  -1.8956933523235946 Error is 0.4145204835864222
W is -1.8959126337084982 b is  -1.857024517313151 Error is 0.41399090353359225
W is -1.86283720259399 b is  -1.812998314105477 Error is 0.41335287451236424
W is -1.8251230554273352 b is  -1.7636989429088605 Error is 0.4125902233631629
W is -1.7824430659823918 b is  -1.7091346537418697 Error is 0.41168045143791177
W is -1.7342959979433499 b is  -1.649236757622674 Error is 0.41059204147120676
W is -1.6799481900565303 b is  -1.5838544198344047 Error is 0.40927980394662133
W is -1.6183378240251194 b is  -1.512742183533591 Error is 0.4076764267411583
W is -1.5479123524324296 b is  -1.4355334842315641 E

# Stochastic Gradient Descent

In [15]:
def do_stochastic_gradient_descent(w,b,eta,max_epochs):
    for i in range(max_epochs):
        dw,db = 0,0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)
            w = w - eta * dw
            b = b - eta * db
        print("W is",w,"b is ",b,"Error is",error(w,b))
    
do_stochastic_gradient_descent(-2,-2,1,1000)

W is -1.991029401024078 b is  -1.9853832973517143 Error is 0.41565658781447334
W is -1.9819001493338824 b is  -1.9705749936143444 Error is 0.41548815574028225
W is -1.9726070828837345 b is  -1.955571610395328 Error is 0.4153143832484931
W is -1.9631447894488192 b is  -1.940369648856349 Error is 0.4151350365043807
W is -1.9535075894668794 b is  -1.9249655963301513 Error is 0.41494986850871013
W is -1.9436895172577084 b is  -1.9093559338088817 Error is 0.41475861813329884
W is -1.9336843004198243 b is  -1.8935371443824414 Error is 0.41456100905919135
W is -1.9234853371734617 b is  -1.87750572270914 Error is 0.4143567486037364
W is -1.9130856713834918 b is  -1.8612581856041932 Error is 0.4141455264203731
W is -1.9024779649540866 b is  -1.8447910838339663 Error is 0.4139270130519333
W is -1.891654467237637 b is  -1.8281010152050388 Error is 0.41370085831461956
W is -1.880606981042102 b is  -1.8111846390366793 Error is 0.4134666894853851
W is -1.8693268247517971 b is  -1.7940386921026574 Er

# Mini Batch Gradient Descent

In [16]:
def do_mini_batch_gradient_descent(init_w,init_b,init_eta,max_epochs,batch_size):
    w,b,eta = init_w,init_b,init_eta
    mini_batch_size, num_points_seen = batch_size,0
    for i in range(max_epochs):
        dw,db,num_points = 0,0,0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)
            num_points_seen += 1

            if num_points_seen % mini_batch_size == 0:
                # seen one mini batch
                w = w - eta * dw
                b = b - eta * db
                dw, db = 0,0 # reset parameters

        print("W is",w,"b is ",b,"Error is",error(w,b))
    
do_mini_batch_gradient_descent(-2,-2,1,1000,2)

W is -1.99450768078436 b is  -1.9922888407847856 Error is 0.41572985510220245
W is -1.9889492123276025 b is  -1.9845161253490138 Error is 0.4156381130345323
W is -1.9833229937607373 b is  -1.9766809492962403 Error is 0.4155446258111779
W is -1.9776273639215822 b is  -1.9687823878694068 Error is 0.41544934240038417
W is -1.9718605982115052 b is  -1.9608194952149804 Error is 0.41535220962939745
W is -1.9660209052415965 b is  -1.9527913036029303 Error is 0.41525317206086976
W is -1.9601064232510295 b is  -1.9446968225987113 Error is 0.4151521718599259
W is -1.954115216278706 b is  -1.9365350381830164 Error is 0.4150491486510202
W is -1.948045270067425 b is  -1.9283049118145963 Error is 0.4149440393636118
W is -1.9418944876777522 b is  -1.9200053794309366 Error is 0.41483677806557867
W is -1.9356606847864704 b is  -1.9116353503809989 Error is 0.41472729578316725
W is -1.9293415846419129 b is  -1.903193706283584 Error is 0.4146155203061341
W is -1.9229348126456238 b is  -1.8946792998041406 

# Stochastic Momentum Gradient Descent and Stochastic Nestrov Accelerated Gradient Descent

In [17]:
def do_stochastic_momentum_gradient_descent(init_w,init_b,max_epochs):
    w,b,eta = init_w, init_b, 1.0
    prev_v_w, prev_v_b, gamma = 0 , 0,  0.9
    for i in range(max_epochs):
        dw, db = 0 , 0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)
            # FOR STOCHASTIC
            v_w = gamma * prev_v_w + eta*dw
            v_b = gamma * prev_v_b + eta*db

            w = w - v_w
            b = b - v_b
            prev_v_w = v_w
            prev_v_b = v_b
        print("W is",w,"b is ",b,"Error is",error(w,b))

def do_stochastic_nestrov_accelerated_gradient_descent(init_w,init_b,max_epochs):
    w,b,eta = init_w,init_b,1.0
    prev_v_w , prev_v_b,gamma = 0,0,0.9

    for i in range(max_epochs):
        dw,db = 0,0
        # do partial updates
        v_w = gamma * prev_v_w
        v_b = gamma * prev_v_b

        for x,y in zip(X,Y):
            # Calculate the gradients after the partial update
            dw += grad_w( w - v_w , b - v_b , x , y)
            db += grad_b( w - v_w , b - v_b , x , y)

            # FOR STOCHASTIC
            # now do the update
            v_w = gamma * prev_v_w + eta * dw
            v_b = gamma * prev_v_b + eta * db

            w = w - v_w
            b = b - v_b

        prev_v_w = v_w
        prev_v_b = v_b
        print("W is",w,"b is ",b,"Error is",error(w,b))

In [18]:
do_stochastic_momentum_gradient_descent(-2,-2,1000)

W is -1.987927645789039 b is  -1.979179786881636 Error is 0.4155904562195717
W is -1.9607620307448692 b is  -1.934145457791069 Error is 0.4150697279824251
W is -1.920466681233956 b is  -1.8684900400710869 Error is 0.414258329020515
W is -1.8680743065256462 b is  -1.7845642956483379 Error is 0.4131256582453675
W is -1.8037892928246164 b is  -1.6837735136955356 Error is 0.4116083203515281
W is -1.726982008450368 b is  -1.5668238968087014 Error is 0.4095998240482958
W is -1.6360412063372267 b is  -1.4339637101815828 Error is 0.40692531198692344
W is -1.5279777394663128 b is  -1.2852674089325442 Error is 0.4032795678409704
W is -1.3974883452044444 b is  -1.1209985969815468 Error is 0.3980533212452412
W is -1.2346516869227167 b is  -0.94197875262902 Error is 0.38975849621837566
W is -1.0186754871600636 b is  -0.7493665045472188 Error is 0.3737260537216556
W is -0.6988238047978816 b is  -0.5411442614978645 Error is 0.33048750525283993
W is -0.14032059046588574 b is  -0.29889931940528236 Erro

In [19]:
do_stochastic_nestrov_accelerated_gradient_descent(-2,-2,1000)

W is -1.9909970205112286 b is  -1.9853703451465745 Error is 0.4156563048628465
W is -1.9717049106741067 b is  -1.9565077429100382 Error is 0.4153166347679565
W is -1.9425301480724873 b is  -1.91427181063635 Error is 0.41479268180265183
W is -1.9035315884381854 b is  -1.859201782052781 Error is 0.4140625603016133
W is -1.8543777335741165 b is  -1.7915333234777635 Error is 0.41308767827419657
W is -1.7942510889491008 b is  -1.7112022803310594 Error is 0.4118059051173131
W is -1.7216626229752487 b is  -1.617832756605361 Error is 0.41011750435424066
W is -1.6340938462399615 b is  -1.5106993722203619 Error is 0.4078546225715127
W is -1.5272700931467043 b is  -1.388630533869566 Error is 0.4047070020879354
W is -1.3935457326756668 b is  -1.2497415665302356 Error is 0.4000092343974468
W is -1.2178193132607127 b is  -1.090591384053973 Error is 0.3919841436450971
W is -0.9651862088526957 b is  -0.9030688066059374 Error is 0.37409071165336766
W is -0.5355533342348826 b is  -0.6612988919428238 Err

# Line Search Gradient Descent

In [20]:
def do_line_search_graident_descent(init_w,init_b,max_epochs):
    w,b,etas = init_w , init_b, [0.1,0.5,1.0,5.0,10.0]
    for i in range(max_epochs):
        dw,db = 0,0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)
        min_error = 10000 # some large value
        best_w ,best_b = w,b

        for eta in etas:
            tmp_w = w - eta * dw
            tmp_b = b - eta * db

            if error(tmp_w,tmp_b) < min_error:
                best_w = tmp_w
                best_b = tmp_b
                min_error = error(tmp_w,tmp_b)
        w,b = best_w,best_b
        print("W is",w,"b is ",b,"Error is",error(w,b))

do_line_search_graident_descent(-2,-2,1000)

W is -1.9450768078436014 b is  -1.9228884078478556 Error is 0.41488064635640237
W is -1.8831054469148645 b is  -1.8394812734377968 Error is 0.41374348776767655
W is -1.8120649114070353 b is  -1.74879674330885 Error is 0.41233641572605906
W is -1.7288979331678107 b is  -1.6495830521438286 Error is 0.4105419172106144
W is -1.6286478879869966 b is  -1.5401434516427601 Error is 0.4081466093396266
W is -1.5024889119343525 b is  -1.4179069791426575 Error is 0.40469724955090564
W is -1.3324880080245631 b is  -1.2781578983999018 Error is 0.39897489824883353
W is -1.0742640410063662 b is  -1.1093136409328814 Error is 0.38618923417094675
W is -0.5761918954515619 b is  -0.8684548153122909 Error is 0.3285096872564685
W is 1.0521274161535668 b is  -0.2743502967854967 Error is 0.06582860162793036
W is 0.8158000271804455 b is  -0.7257960257203186 Error is 0.030719385795436517
W is 1.0131745621058676 b is  -1.0782325540564965 Error is 0.016922873823372667
W is 1.170174973330187 b is  -1.31223246323187

# Generating Fake Data

In [21]:
import pandas as pd
fake_X = np.random.rand(20)
fake_Y = np.random.rand(100)
fake_dataset = pd.DataFrame(columns=['fake_X','fake_Y'])
fake_dataset['fake_X'] = fake_X

# my_list = []
# for i in range(80):
#     my_list.append(0)
# my_df = pd.DataFrame(data=my_list, columns = ['fake_X'])
# print(my_df)
# fake_dataset.append(my_df,ignore_index=True)

for i in range(80):
    fake_dataset = fake_dataset.append({'fake_X': 0}, ignore_index=True)
fake_dataset['fake_Y'] = fake_Y
fake_dataset.sample(80)['fake_X'] = 0

print(fake_dataset)
fake_dataset.info()

      fake_X    fake_Y
0   0.277156  0.512258
1   0.661666  0.695145
2   0.358733  0.861846
3   0.153300  0.237351
4   0.801693  0.027216
..       ...       ...
95  0.000000  0.574934
96  0.000000  0.083682
97  0.000000  0.917966
98  0.000000  0.557888
99  0.000000  0.323357

[100 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   fake_X  100 non-null    float64
 1   fake_Y  100 non-null    float64
dtypes: float64(2)
memory usage: 1.7 KB


# Generating Gaussian Data

In [43]:
import pandas as pd
gaussian_X = np.random.normal(0,1,20)
gaussian_Y = np.random.normal(0,1,100)
gaussian_dataset = pd.DataFrame(columns=['gaussian_X','gaussian_Y'])
gaussian_dataset['gaussian_X'] = gaussian_X

for i in range(80):
    gaussian_dataset = gaussian_dataset.append({'gaussian_X': 0}, ignore_index=True)
gaussian_dataset['gaussian_Y'] = gaussian_Y
gaussian_dataset.sample(80)['gaussian_X'] = 0

print(gaussian_dataset)
gaussian_dataset.info()

    gaussian_X  gaussian_Y
0    -0.323795    0.663843
1    -1.011214   -2.634424
2    -1.104429    0.853929
3    -0.524441    0.542493
4     0.163618    0.137819
..         ...         ...
95    0.000000   -1.199485
96    0.000000   -1.300307
97    0.000000   -0.005835
98    0.000000   -0.095065
99    0.000000    1.523199

[100 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   gaussian_X  100 non-null    float64
 1   gaussian_Y  100 non-null    float64
dtypes: float64(2)
memory usage: 1.7 KB


# Adagrad Optimiser


In [44]:
def do_adagrad(X,Y,init_w , init_b,max_epochs):
    w,b,eta = init_w , init_b, 0.1
    v_w , v_b , eps = 0 , 0, 1e-8

    for i in range(max_epochs):
        dw,db = 0,0
        for x, y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)
        v_w = v_w + dw**2
        v_b = v_b + db**2

        # EMERGENCY
        prev_error = error(w,b)

        w = w - (eta/np.sqrt(v_w + eps))*dw
        b = b - (eta/np.sqrt(v_b + eps))*db
    
        # EMERGENCY
        print("W is",w,"b is ",b,"Error is",error(w,b))
        if(prev_error < error(w,b)):
            break

do_adagrad(gaussian_dataset['gaussian_X'],gaussian_dataset['gaussian_Y'],2 , 2,1000)

W is 2.0999999999072636 b is  1.900000000005048 Error is 0.2863695337102963
W is 2.168626404205504 b is  1.828001837410815 Error is 0.2850193525972884
W is 2.223338235740657 b is  1.7686424986817106 Error is 0.28383782744137065
W is 2.269716094403006 b is  1.7168947227534863 Error is 0.2827560103335142
W is 2.310375403646874 b is  1.6703793456786236 Error is 0.2817410078911649
W is 2.3467903948567157 b is  1.6277491821380325 Error is 0.2807741800093169
W is 2.37988963100743 b is  1.5881540988206395 Error is 0.27984380021654565
W is 2.410303316812016 b is  1.5510169682515913 Error is 0.2789419524491451
W is 2.4384828046392837 b is  1.5159247783969727 Error is 0.2780630143243187
W is 2.4647647672995903 b is  1.4825699222970317 Error is 0.27720283660648604
W is 2.4894084579872837 b is  1.4507160105699588 Error is 0.27635826449252004
W is 2.512618690568163 b is  1.4201767423878036 Error is 0.27552684148514595
W is 2.53456071195401 b is  1.390802211479972 Error is 0.2747066174374192
W is 2.

# RMS Prop Optimiser

In [45]:
def do_rmsprop(X,Y,init_w,init_b,max_epochs):
    w,b,eta = init_w , init_b, 0.1
    v_w , v_b , eps , beta1 = 0 , 0, 1e-8,0.9
    for i in range(max_epochs):
        dw,db = 0,0
        for x, y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b(w,b,x,y)
       
        v_w = beta1 * v_w + (1-beta1)* dw**2
        v_b = beta1 * v_b + (1-beta1)* db**2

        # EMERGENCY
        prev_error = error(w,b)
        
        w = w - (eta/np.sqrt(v_w + eps))*dw
        b = b - (eta/np.sqrt(v_b + eps))*db

        # EMERGENCY
        print("W is",w,"b is ",b,"Error is",error(w,b))
        if(prev_error < error(w,b)):
            break

do_rmsprop(gaussian_dataset['gaussian_X'],gaussian_dataset['gaussian_Y'],2 , 2,1000)

W is 2.3162277630842576 b is  1.6837722341427988 Error is 0.28237977393660973
W is 2.519504388540514 b is  1.443528725186991 Error is 0.27670334453822554
W is 2.669161452863981 b is  1.2389544950455413 Error is 0.270785143756792
W is 2.7861340600882154 b is  1.056477684261111 Error is 0.26454764982475343
W is 2.880706530793475 b is  0.8898548352332081 Error is 0.2579775429764516
W is 2.958837494900904 b is  0.73573750193376 Error is 0.25109358218200306
W is 3.024354913201495 b is  0.5921321677271417 Error is 0.24393386517287197
W is 3.079897097575033 b is  0.45773866313381595 Error is 0.23654828896839825
W is 3.1273739678664665 b is  0.33162887137556263 Error is 0.22899288681527505
W is 3.1682163112784094 b is  0.21307994347328335 Error is 0.2213252650817649
W is 3.20352133602257 b is  0.10148553316310552 Error is 0.21360102969342082
W is 3.234143352849394 b is  -0.0036910324745756673 Error is 0.2058712055935271
W is 3.260753460068573 b is  -0.10294051568344355 Error is 0.1981806002308

# Adaptive Moment Estimation Optimiser

## Usually:
- beta1 = 0.9 , beta2 = 0.999
- eps = 1e-8
- eta = 0.001 , 0.0001

In [46]:
import math
def do_adam(X,Y,init_w,init_b,max_epochs):
    w_b_dw_db = [(init_w , init_b,0,0)]
    w_history , b_history , error_history = [], [], []

    w, b, eta, mini_batch_size, num_points_seen = init_w , init_b , 0.1 , 10, 0 
    m_w , m_b , v_w , v_b , eps, beta1, beta2 = 0, 0, 0, 0, 1e-8, 0.9, 0.999

    for i in range(max_epochs):
        dw,db = 0,0
        for x,y in zip(X,Y):
            dw += grad_w(w,b,x,y)
            db += grad_b( w,b,x,y)

        m_w = beta1 * m_w + (1 - beta1)*dw
        m_b = beta1 * m_b + (1 - beta1)*db

        v_w = beta2 * v_w + (1 - beta2)*dw**2
        v_b = beta2 * v_b + (1 - beta2)*db**2

        m_w = m_w / (1 - math.pow(beta1,i+1))
        m_b = m_b / (1 - math.pow(beta1,i+1))
        
        v_w = v_w / (1 - math.pow(beta2,i+1))
        v_b = v_b / (1 - math.pow(beta2,i+1))

         # EMERGENCY
        prev_error = error(w,b)

        w = w - (eta / np.sqrt(v_w + eps)) * m_w
        b = b - (eta / np.sqrt(v_b + eps)) * m_b

        # EMERGENCY
        print("W is",w,"b is ",b,"Error is",error(w,b))
        if(prev_error < error(w,b)):
            break

do_adam(gaussian_dataset['gaussian_X'],gaussian_dataset['gaussian_Y'],2,2,1000)

W is 2.0999999999072636 b is  1.900000000005048 Error is 0.2863695337102963
W is 2.1234000255747305 b is  1.8763811076062964 Error is 0.2859474463471161
W is 2.1277404431529283 b is  1.8719903172909729 Error is 0.28586832629168535
W is 2.1284628430573114 b is  1.8712590363077044 Error is 0.2858551253740114
W is 2.128575298398709 b is  1.871145168919974 Error is 0.28585306895967105
W is 2.128592034058393 b is  1.871128221123127 Error is 0.2858527628425932
W is 2.1285944484231756 b is  1.8711257760047533 Error is 0.28585271867513734
W is 2.1285947893491177 b is  1.8711254307239367 Error is 0.28585271243792565
W is 2.1285948368053393 b is  1.8711253826604473 Error is 0.2858527115696793
W is 2.12859484335226 b is  1.8711253760296482 Error is 0.28585271144989494
W is 2.1285948442511464 b is  1.8711253751192354 Error is 0.2858527114334483
W is 2.12859484437438 b is  1.8711253749944208 Error is 0.2858527114311936
W is 2.128594844391294 b is  1.8711253749772894 Error is 0.28585271143088403
W i



 Error is 0.28585271143083446
W is 2.1285948443939975 b is  1.871125374974551 Error is 0.28585271143083446
W is 2.1285948443939975 b is  1.871125374974551 Error is 0.28585271143083446
W is 2.1285948443939975 b is  1.871125374974551 Error is 0.28585271143083446
W is 2.1285948443939975 b is  1.871125374974551 Error is 0.28585271143083446
W is 2.1285948443939975 b is  1.871125374974551 Error is 0.28585271143083446
W is 2.1285948443939975 b is  1.871125374974551 Error is 0.28585271143083446
W is 2.1285948443939975 b is  1.871125374974551 Error is 0.28585271143083446
W is 2.1285948443939975 b is  1.871125374974551 Error is 0.28585271143083446
W is 2.1285948443939975 b is  1.871125374974551 Error is 0.28585271143083446
W is 2.1285948443939975 b is  1.871125374974551 Error is 0.28585271143083446
W is 2.1285948443939975 b is  1.871125374974551 Error is 0.28585271143083446
W is 2.1285948443939975 b is  1.871125374974551 Error is 0.28585271143083446
W is 2.1285948443939975 b is  1.87112537497455