Kaynak
 - http://colah.github.io/posts/2015-08-Backprop/

"Fundamentally, it’s a technique for calculating derivatives quickly."
> Backpropagation is the key algorithm that makes training deep models computationally tractable. For modern neural networks, it can make training with gradient descent as much as ten million times faster, relative to a naive implementation. That’s the difference between a model taking a week to train and taking 200,000 years.


If $a$ directly affects $c$, then we want to know how it affects $c$. 
If $a$ changes a little bit, how does $c$ change? We call this the partial derivative of $c$ with respect to $a$, as $$\frac{dc}{da}$$

![computationgraph.png](computationgraph.png)

___

![computationgraph2.png](computationgraph2.png)

## Backpropagation = Chain Rule
> When I first understood what backpropagation was, my reaction was: “Oh, that’s just the chain rule! How did it take us so long to figure out?” I’m not the only one who’s had that reaction. 



In [171]:
import random
import numpy as np

class yapay_sinir_agi():
    def __init__(self, katmanlar):
        self.katmanlar = katmanlar
        self.b = [np.random.randn(k, 1) for k in self.katmanlar[1:]] # bias degerleri (ilk katman haric)
        self.W = [np.random.randn(k2, k1) for k1, k2 in zip(self.katmanlar[:-1],self.katmanlar[1:])]

    def ag(self):
        return self.W, self.b
    
    def ileribesleme(self, a):
        """Katman katman yeni a degerleri hesaplaniyor"""
        for w, b in zip(self.W, self.b):
            z = np.dot(w, a)+b
            a = self.sigmoid(z)
        return a
    
    def geribesleme(self,X,y):
        delta_b = [np.zeros(b.shape) for b in self.b]
        delta_w = [np.zeros(w.shape) for w in self.W]
        a = X; A, Z = [a], []  # A, Z degerleri
        for w, b in zip(self.W, self.b):# z ve a degerlerini depolayalim
            z = np.dot(w, a) + b
            a = self.sigmoid(z)
            Z.append(z); A.append(a)
           
        hata = A[-1] - y # En son katmandaki hata 
        delta = hata * self.sigmoid_turevi(Z[-1])
        delta_b[-1] = delta # Son katmanda W, b'deki degisim  
        delta_w[-1] = delta * A[-2].T # ERROR: np.dot(delta, A[-2].T)
        
        for k in range(2, len(katmanlar)): # Hatanin geriye yayilimi
            delta = np.dot(self.W[-k+1].T, delta) * self.sigmoid_turevi(Z[-k])
            delta_b[-k] = delta
            delta_w[-k] = delta * A[-k-1].T # ERROR: np.dot(delta, A[-k-1].T)
        return (delta_b, delta_w)  
    
    
    def gradyan_inis(self, X_train, y_train, alpha, number_steps):
        for s in range(number_steps):
            i, m = 0,X_train.shape[1]
            X, y = X_train[:,[i]], y_train[:,[i]]
            tum_delta_b, tum_delta_w = self.geribesleme(X,y)
            for i in range(1,m): # Tum X kolonlari icin
                X, y = X_train[:,[i]], y_train[:,[i]]
                delta_b, delta_w = self.geribesleme(X,y)
                tum_delta_b = [tdb + db for tdb, db in zip(tum_delta_b, delta_b)]
                tum_delta_w = [tdw + dw for tdw, dw in zip(tum_delta_w, delta_w)]
            tum_delta_b = [alpha*tdb/m for tdb in tum_delta_b]
            tum_delta_w = [alpha*tdw/m for tdw in tum_delta_w]
        
        self.W = [w - dw for w, dw in zip(self.W, tum_delta_w)]

    def fit(self, X_train, y_train, alpha = 0.05, number_steps = 10000):
        # X verileri kolon=gozlem, satir=oznitelik (alistigimizin tersi)
        return self.gradyan_inis(X_train, y_train, alpha, number_steps)
    
    def predict(self, X_test):
        return [np.argmax(self.feedforward(a)) for x in X_test] 
    
    #### Yardimci Fonksiyonlar
    def sigmoid(self,z):
        return 1.0/(1.0+np.exp(-z))
    def sigmoid_turevi(self,z):
        return self.sigmoid(z)*(1-self.sigmoid(z))

In [163]:
katmanlar = [3, 4, 2]
ysa = yapay_sinir_agi(katmanlar)
agirlik, bias = ysa.ag()

print("\n\n\n\t\t\t\tBaslangic:\n")
print("agirlik matrisleri:")
for w in agirlik:
    print(w, "\n")
    
print("bias vektorleri:")
for b in bias:
    print(b, "\n")





				Baslangic:

agirlik matrisleri:
[[-0.79987567  0.74673723 -0.07141501]
 [ 0.90087227 -0.0412857   0.07956082]
 [ 0.08595711 -1.72847513  0.45546386]
 [ 1.23054466 -0.24416912  0.54879855]] 

[[ 0.37039086  0.97974311  0.08695518 -0.25879618]
 [-0.7213087   1.16720158  0.88319045 -0.71914994]] 

bias vektorleri:
[[-1.03681877]
 [ 1.00470544]
 [-0.24692503]
 [ 0.7565814 ]] 

[[-0.68158179]
 [ 0.36649893]] 



In [164]:
# X verileri kolon=gozlem, satir=oznitelik (alistigimizin tersi)
X_train, y_train = np.array([[1,1,0.1,0],[1,0.9, 0, 0],[1,1,0,0]]), np.array([[1,1,0,0],[0,0,1,1]])
print("Girdi: \n",X_train)
print("\nCikti:\n",y_train)

Girdi: 
 [[1.  1.  0.1 0. ]
 [1.  0.9 0.  0. ]
 [1.  1.  0.  0. ]]

Cikti:
 [[1 1 0 0]
 [0 0 1 1]]


In [165]:
ysa.fit(X_train, y_train, alpha = 0.05, number_steps = 100)
agirlik, bias = ysa.ag()


for w in agirlik:
    print(w, "\n")
    
for b in bias:
    print(b, "\n")

[[-0.79921948  0.7473805  -0.0707386 ]
 [ 0.90071402 -0.04142347  0.07941585]
 [ 0.08547766 -1.7289408   0.45497239]
 [ 1.2306969  -0.24402237  0.54895293]] 

[[ 0.37030571  0.98010482  0.0862146  -0.25818351]
 [-0.72180426  1.16501862  0.88305182 -0.7215263 ]] 

[[-1.03681877]
 [ 1.00470544]
 [-0.24692503]
 [ 0.7565814 ]] 

[[-0.68158179]
 [ 0.36649893]] 



In [117]:
i = 0
print("\n\n\n\t\t\t\tGERI YAYILIM {}:\n".format(i))
X, y = X_train[:,[i]], y_train[:,[i]]
print("Cikti:"); print(ysa.ileribesleme(X))

delta_b, delta_w = ysa.geribesleme(X,y)
print("\nagirlik matrislerinde degisim:")
for w in delta_w:
    print(w, "\n")

print("bias vektorlerinde degisim:")
for b in delta_b:
    print(b, "\n") 





				GERI YAYILIM 0:

Cikti:
[[0.84424832]
 [0.81554698]]

agirlik matrislerinde degisim:
[[-0.00341196 -0.00341196 -0.00341196]
 [ 0.00762659  0.00762659  0.00762659]
 [ 0.00949011  0.00949011  0.00949011]
 [ 0.04454836  0.04454836  0.04454836]] 

[[-0.00453394 -0.01706953 -0.00541298 -0.00619417]
 [ 0.02715961  0.10225151  0.03242533  0.03710491]] 

bias vektorlerinde degisim:
[[-0.00341196]
 [ 0.00762659]
 [ 0.00949011]
 [ 0.04454836]] 

[[-0.02048027]
 [ 0.12268282]] 



In [118]:
for i in range(1,X_train.shape[1]):
    print("\n\n\n\t\t\t\tGERI YAYILIM {}:\n".format(i))
    X, y = X_train[:,[i]], y_train[:,[i]]
    print("Cikti:"); print(ysa.ileribesleme(X))

    delta_b, delta_w = ysa.geribesleme(X,y)
    print("\nagirlik matrislerinde degisim:")
    for w in delta_w:
        print(w, "\n")

    print("bias vektorlerinde degisim:")
    for b in delta_b:
        print(b, "\n") 




				GERI YAYILIM 1:

Cikti:
[[0.84420588]
 [0.80954765]]

agirlik matrislerinde degisim:
[[-0.00350965 -0.00315869 -0.00350965]
 [ 0.00851181  0.00766063  0.00851181]
 [ 0.00954476  0.00859029  0.00954476]
 [ 0.0439596   0.03956364  0.0439596 ]] 

[[-0.00480299 -0.0166748  -0.00536015 -0.00587169]
 [ 0.0292572   0.10157369  0.03265108  0.0357671 ]] 

bias vektorlerinde degisim:
[[-0.00350965]
 [ 0.00851181]
 [ 0.00954476]
 [ 0.0439596 ]] 

[[-0.0204904 ]
 [ 0.12481626]] 




				GERI YAYILIM 2:

Cikti:
[[0.77562697]
 [0.79695245]]

agirlik matrislerinde degisim:
[[ 0.00329571  0.          0.        ]
 [ 0.00205121  0.          0.        ]
 [-0.00257503 -0.         -0.        ]
 [ 0.00030612  0.          0.        ]] 

[[ 0.02434781  0.08141862  0.07152268  0.0366569 ]
 [-0.00592668 -0.0198187  -0.01740986 -0.00892292]] 

bias vektorlerinde degisim:
[[ 0.03295712]
 [ 0.02051208]
 [-0.02575033]
 [ 0.00306121]] 

[[ 0.13498219]
 [-0.032857  ]] 




				GERI YAYILIM 3:

Cikti:
[[0.77639

In [141]:
alpha = 0.05
i, m = 0,X_train.shape[1]
X, y = X_train[:,[i]], y_train[:,[i]]
tum_delta_b, tum_delta_w = ysa.geribesleme(X,y)
for i in range(1,m): # Tum X kolonlari icin
    X, y = X_train[:,[i]], y_train[:,[i]]
    delta_b, delta_w = ysa.geribesleme(X,y)
    tum_delta_b = [tdb + db for tdb, db in zip(tum_delta_b, delta_b)]
    tum_delta_w = [tdw + dw for tdw, dw in zip(tum_delta_w, delta_w)]
tum_delta_b = [alpha*tdb/m for tdb in tum_delta_b]
tum_delta_w = [alpha*tdw/m for tdw in tum_delta_w]

In [142]:
agirlik

[array([[-0.49720992, -0.74019726,  1.44383667],
        [-0.06633068,  1.35568053, -0.10435108],
        [-0.63965511,  0.13977896, -0.70742847],
        [-0.1555358 ,  0.76482218, -0.47383811]]),
 array([[ 1.68050243,  0.77540919, -0.69730469,  0.55620609],
        [ 0.11919274,  0.57731016,  0.28141489,  1.81401582]])]

In [143]:
tum_delta_w

[array([[-4.53237414e-05, -8.21330702e-05, -8.65201355e-05],
        [ 2.27370010e-04,  1.91090153e-04,  2.01729910e-04],
        [ 2.05748013e-04,  2.26004967e-04,  2.37935921e-04],
        [ 1.11017599e-03,  1.05139998e-03,  1.10634948e-03]]),
 array([[0.00050413, 0.00161486, 0.0016789 , 0.00077014],
        [0.00055524, 0.00205577, 0.00037536, 0.00068841]])]

In [146]:
yeni_agirlik = [w - dw for w, dw in zip(agirlik, tum_delta_w)]
yeni_agirlik

[array([[-0.4971646 , -0.74011512,  1.44392319],
        [-0.06655805,  1.35548944, -0.1045528 ],
        [-0.63986086,  0.13955295, -0.7076664 ],
        [-0.15664598,  0.76377078, -0.47494446]]),
 array([[ 1.67999831,  0.77379433, -0.69898359,  0.55543595],
        [ 0.1186375 ,  0.57525439,  0.28103953,  1.81332741]])]

In [140]:
delta_b

[array([[ 0.03396865],
        [ 0.02052209],
        [-0.02555464],
        [ 0.00325627]]), array([[ 0.13478699],
        [-0.03231739]])]

In [137]:
tum_delta_b = [tdb + db for tdb, db in zip(tum_delta_b, delta_b)]
tum_delta_b

[array([[ 0.03696886],
        [ 0.02338072],
        [-0.02716815],
        [ 0.00799754]]), array([[ 0.14622692],
        [-0.02320116]])]

In [138]:
[alpha*tdb/2 for tdb in tum_delta_b]

[array([[ 0.00369689],
        [ 0.00233807],
        [-0.00271681],
        [ 0.00079975]]), array([[ 0.01462269],
        [-0.00232012]])]

In [124]:
list(zip(delta_b,tum_delta_b))[0]

(array([[ 0.03396865],
        [ 0.02052209],
        [-0.02555464],
        [ 0.00325627]]), array([[-0.00341196],
        [ 0.00762659],
        [ 0.00949011],
        [ 0.04454836]]))

In [126]:
list(zip(delta_b,tum_delta_b))[1]

(array([[ 0.13478699],
        [-0.03231739]]), array([[-0.02048027],
        [ 0.12268282]]))

In [89]:
X

array([[0.],
       [0.],
       [0.]])

In [166]:
y

array([[0],
       [1]])

In [169]:
s = np.array([[43],[5],[8],[4]])
np.argmax(s)

0

In [74]:
X.shape[1]

2

In [77]:
y[:,[0]]

array([[0],
       [1]])

In [69]:
y[:,i]

array([0, 1])

In [75]:
list(range(X.shape[1]))

[0, 1]

Ağırlıktaki değişim
$$
\frac{d Hata}{d w_{ji}(t)} = \frac{d Hata}{d z_{j}(t)} \frac{d z_{j}(t)}{d w_{ji}(t)} 
= \Delta_{j}(t)  a_i(t-1)
$$

In [93]:
W = np.array([[1,1,2,2],[2,2,1,1]])
a = np.array([[4],[3],[2],[1]])
delta = np.array([[2],[1]])
print(W, "\n")
print(a, "\n")
print(delta)

[[1 1 2 2]
 [2 2 1 1]] 

[[4]
 [3]
 [2]
 [1]] 

[[2]
 [1]]


In [97]:
# W ile ayni boyutlarda DEGIL
delta.T * a

array([[8, 4],
       [6, 3],
       [4, 2],
       [2, 1]])

In [95]:
# ERROR
# np.dot(delta, a.T)

In [96]:
delta *a.T # W ile ayni boyutlarda 

array([[8, 6, 4, 2],
       [4, 3, 2, 1]])

### Nöronlardaki hatanın güncellenmesi

Çıktı katmanında
$$
\Delta(T) = \frac{d Hata}{d z(T)} = (a(T)-y) \sigma'(z(T))
$$

$(t+1)$'inci ara katmandan $(t)$'inci ara katmana hatanın akışı,

$$
\Delta(t) =  w^T(t+1) \cdot  \Delta(t+1) \times \sigma'(z(t))
$$

### Ağırlık ve bias değerlerinin güncellenmesi

Hatayı minimize eden en iyi parametreleri arıyoruz.

Ağırlıktaki değişim
$$
\frac{d Hata}{d w_{ji}(t)} = \frac{d Hata}{d z_{j}(t)} \frac{d z_{j}(t)}{d w_{ji}(t)} 
= \Delta_{j}(t)  a_i(t-1)
$$

Biasdeki değişim
$$
\frac{d Hata}{d b_{j}(t)} = \frac{d Hata}{d z_{j}(t)} \frac{d z_{j}(t)}{d b_{j}(t)} 
= \Delta_{j}(t) 
$$

Geri Besleme algoritması neden hızlıdır?

> Dikkat ederseniz $\Delta_{j}(t)$ değerini sadece bir kez hesaplayıp, yeniden hesaplamadan bir çok yerde (Ağırlıktaki ve Biasdeki değişimde) tekrar tekrar kullanıyoruz. Bu bize hız kazandırıyor.



In [None]:
#Rakamlar veri kümesini yüklüyoruz.
from sklearn.datasets import load_digits
#numpy paketini yüklüyoruz.
import numpy as np
#Veri kümesini ikiye bölmek için kullanıyoruz.
from sklearn.model_selection import train_test_split
#Resimleri incelemek için matplotlib kullanıyoruz.
%matplotlib inline
import matplotlib.pyplot as plt

#Veri kümesini etiket değerleriyle birlikte yükleyelim.
X,y = load_digits(return_X_y=True)
print('Verinin boyutları (Gözlem, öznitelik) = ' + str(np.shape(X)))
print('Etiketlerin boyutları (Gözlem) = ' + str(np.shape(y)))

rakam1 = X[0]
rakam1 = np.reshape(rakam1, (8,8))

plt.figure(figsize= (2,2))
plt.imshow(rakam1, cmap="gray_r")
plt.show()
etiket1 = y[0]
print('Etiket: ' + str(etiket1))

In [None]:
#3 ve 8 rakamlarını seçmek için filtreleme kullanacağız.
#Filtrelemeyi etiket değerlerine (y) bakarak yapacağız. 
#Etiket değeri beşe bölündüğünde kalan üç ise bu gözlemleri veri kümesinde bırakıyoruz.
X= X[y%5 == 3]
y= y[y%5 == 3]

#Son olarak etiketleri 0 ve 1 değerlerine çevirelim.
#Rakam 3 ise etiket 0, 8 ise 1 olacak.
y = (y==8)*1
print('Verinin boyutları (Gözlem, öznitelik) = ' + str(np.shape(X)))
print('Etiketlerin boyutları (Gözlem) = ' + str(np.shape(y)))

In [None]:
#Stratify değeri veri kümesinin etiket yüzdelerini korumak için kullanılıyor.
#Öğrenme veri kümesini küçük tutarak sonuçların çok iyi olmasını engelliyoruz.
#random_state değeri sonuçların her seferinde aynı çıkmasını sağlamak için kullanılıyor.
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.5, test_size = 0.5, random_state = 0, stratify = y)

print('Öğrenme verisinin boyutları (Gözlem, öznitelik) = ' + str(np.shape(X_train)))
print('Sınama verisinin boyutları (Gözlem, öznitelik) = ' + str(np.shape(X_test)))

## Data Processing  - Normalize your data

Kaynak: https://www.springboard.com/blog/beginners-guide-neural-network-in-python-scikit-learn-0-18/

> The neural network in Python may have difficulty converging before the maximum number of iterations allowed if the data is not normalized. Multi-layer Perceptron is sensitive to feature scaling, so it is highly recommended to scale your data. Note that you must apply the same scaling to the test set for meaningful results. There are a lot of different methods for normalization of data, we will use the built-in StandardScaler for standardization.

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit only to the training data
scaler.fit(X_train)

# Now apply the transformations to the data:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(activation= 'logistic', hidden_layer_sizes=(13,13,13),max_iter=500)
mlp.fit(X_train,y_train)

In [None]:
predictions = mlp.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

# Elle Deneme