In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [2]:
iris = datasets.load_iris()

In [3]:
x = iris.data
y = iris.target

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0)

## Sklearn Multi Layered Perceptrons (MLP) Classifier

In [5]:
from sklearn.neural_network import MLPClassifier

In [8]:
clf = MLPClassifier(hidden_layer_sizes=(20,), max_iter=3000)
clf.fit(x_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=3000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

## Some important Parameters
1. hidden_layer_sizes : tuple, length = n_layers - 2, default (100,) The ith element represents the number of neurons in the ith hidden layer.

2. activation : {‘identity’, ‘logistic’, ‘tanh’, ‘relu’}, 
    * default ‘relu’.
    ##### Activation function for the hidden layer  :
    * ‘identity’, no-op activation, useful to implement linear bottleneck, returns f(x) = x
    * ‘logistic’, the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)).
    * ‘tanh’, the hyperbolic tan function, returns f(x) = tanh(x).
    * ‘relu’, the rectified linear unit function, returns f(x) = max(0, x)

3. alpha : 
    * float, optional, default 0.0001.  
    * L2 penalty (regularization term) parameter.

4. batch_size: 
    * int, optional, default ‘auto’.
    * Size of minibatches for stochastic optimizers. If the solver is ‘lbfgs’, the classifier will not use minibatch.
    * When set to “auto”, batch_size=min(200, n_samples)
    

In [9]:
clf.score(x_test, y_test)

0.97368421052631582

In [11]:
## Weights excluding the bias term
len(clf.coefs_)

2

In [14]:
clf.coefs_[0].shape, clf.coefs_[1].shape
## Because 4 perceptrons in input layer
## 20 in first hidden layer
## 3 unit in the output layer

((4, 20), (20, 3))

In [16]:
## Weights for the bias term
len(clf.intercepts_)

2

In [20]:
clf.intercepts_[0].shape, clf.intercepts_[1].shape

((20,), (3,))