In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import LinearSVR
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.datasets import load_digits

# Introduction to Neural Networks

A neural network is composed of layers of neurons. \
The architecture (layers) depends on the task:
- Input layer - the first layer accepts the input data (a vector). The size of the input layer should match the number of features in the data.
- Hidden layers - the number and size of the hidden layers depend on the complexity of the task. Activation functions such as ReLU are used to add nonlinearity.
- Output layer - the number of neurons depends on the task. For example, for multi-class classification the output layer has one neuron per class and uses the softmax activation function.

In [2]:
# A single logistic-regression estimator with default settings.
# NOTE(review): presumably this stands in for one "neuron" in the analogy built below - confirm.
log_reg = LogisticRegression()

In [3]:
# Twenty independent, unfitted logistic-regression estimators.
# NOTE(review): this rebinds `log_reg` from a single estimator to a list of estimators.
log_reg = [LogisticRegression() for _ in range(20)]
# Bare last expression: display the list.
log_reg

[LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression()]

In [4]:
# Twenty (name, estimator) pairs with unique names "log_regr_0" .. "log_regr_19";
# this list is what gets passed to VotingClassifier in the following cells.
log_regs = [(f"log_regr_{i}", LogisticRegression()) for i in range(20)]

In [6]:
# Combine the 20 named logistic regressions into a single voting ensemble (not fitted here).
voting = VotingClassifier(log_regs)
# Bare last expression: display the ensemble.
voting

In [7]:
# Five named voting ensembles ("voting_0" .. "voting_4"), each constructed from the same `log_regs` list.
voting_classifiers = [(f"voting_{i}", VotingClassifier(log_regs)) for i in range(5)]

In [8]:
# Analogy for one neural network: a stack built from the five voting ensembles.
# The object is only displayed here - it is neither assigned to a name nor fitted.
StackingClassifier(voting_classifiers)

A neural network is a stack of layers, and each layer can be thought of as an ensemble of simple models.

In [9]:
# the dimensions of the hidden layers
real_nn = MLPClassifier(hidden_layer_sizes = (3, 8, 15), max_iter = 5000)
real_nn

In [10]:
# Load the digits dataset bundled with scikit-learn.
digits = load_digits()

In [11]:
# Pull the feature matrix and the target labels out of the loaded dataset
# under separate, descriptive names.
digits_attributes = digits["data"]
digits_classes = digits["target"]

In [12]:
# Train the network on the full digits dataset (no train/test split is made in this notebook).
real_nn.fit(digits_attributes, digits_classes)

In [13]:
# The learned weights are stored as a plain Python list (one entry per weight matrix).
type(real_nn.coefs_)

list

In [14]:
# One weight matrix per connection between consecutive layers: 3 hidden layers + 1 output layer.
len(real_nn.coefs_) # number of weight matrices; the 4th one feeds the output layer

4

In [17]:
# coefs_[0] is the weight matrix between the input layer and the first hidden layer:
# 64 input features (rows) x 3 neurons in the first hidden layer (columns), i.e. one
# weight for every (input feature, neuron) pair.
# Biases are not part of this matrix: they are stored separately in intercepts_,
# with one bias per neuron (not one per layer).
real_nn.coefs_[0].shape

(64, 3)

In [16]:
# Bias vectors, one array per layer after the input layer; each array holds one bias
# per neuron (lengths 3, 8, 15 for the hidden layers and 10 for the output layer).
real_nn.intercepts_

[array([0.13319703, 0.06962544, 0.2379966 ]),
 array([ 1.39724262, -1.41484389,  1.12101644, -0.3786774 , -0.46185044,
        -0.24933253, -0.82366316, -0.59744644]),
 array([-0.17908054,  1.45210742, -0.09525538, -0.60165006, -0.45726207,
        -1.41066208,  0.03057346, -1.08472586,  1.12292055,  0.93896153,
        -0.76528312, -0.37623766,  0.89620366,  0.95861487,  1.05355787]),
 array([ 0.17778574,  0.06483139, -1.11412335, -0.87744796,  1.00989919,
        -0.64722162, -0.64136139,  1.12778523,  0.75614067,  0.42509887])]

In [18]:
real_nn.coefs_[2].shape  # (inputs, outputs): 8 neurons in hidden layer 2 -> 15 neurons in hidden layer 3

(8, 15)

In [19]:
# Predicted class labels for the first 20 samples.
# Note: these samples were part of the data the model was fitted on.
real_nn.predict(digits_attributes[:20])

array([0, 1, 1, 3, 4, 8, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 3])

In [20]:
# Class probabilities for the first 20 samples: one row per sample,
# one column per class (10 columns here); the largest value in a row is the predicted class.
real_nn.predict_proba(digits_attributes[:20]) 

array([[9.99996210e-01, 1.11378796e-06, 9.96922766e-16, 1.13750880e-24,
        2.00560761e-06, 2.32852418e-12, 3.34144437e-14, 6.57465658e-09,
        7.73669058e-09, 6.56571272e-07],
       [4.02259472e-13, 9.04390104e-01, 3.51647433e-02, 2.60783801e-06,
        2.89541373e-12, 3.17348735e-05, 4.03883440e-04, 6.92418419e-06,
        5.99946049e-02, 5.39757394e-06],
       [1.28174652e-15, 6.92450802e-01, 2.87512435e-01, 3.82969591e-05,
        2.13916228e-14, 7.67935656e-06, 2.93384613e-06, 2.69209159e-06,
        1.99027031e-02, 8.24585550e-05],
       [1.80061709e-16, 5.52588256e-05, 4.22825813e-02, 9.04511761e-01,
        1.08341617e-16, 7.11435396e-03, 1.58542691e-11, 4.93089631e-10,
        3.56328726e-02, 1.04031716e-02],
       [2.02603392e-06, 2.81735888e-04, 5.48991074e-06, 1.03765591e-07,
        9.97971885e-01, 3.74906789e-05, 3.85948954e-04, 1.06761004e-03,
        2.47693260e-04, 1.62063873e-08],
       [2.94773677e-09, 1.06695183e-05, 2.80816729e-05, 3.44204755e-02,
   

The first layer takes the data, multiplies it by the weights of the first layer, adds the bias, and passes the result through the activation function. The result is fed to the second layer, the output of that to the third layer, and so on. \
The whole network is therefore a composition of functions, and it is differentiated with the chain rule: the derivative of a composite function is the derivative of the outer function (evaluated at the inner function) multiplied by the derivative of the inner function, $(f \circ g)'(x) = f'(g(x)) \cdot g'(x)$.
Many graphs together make a layer, and many such layers make a neural network.

A site for ML tasks and code challenges: https://www.deep-ml.com/