In [331]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
from  sklearn.metrics import accuracy_score
warnings.filterwarnings("ignore")
import random

In [332]:
# 上述函数，其输入是包含1个多个枚举类别的2D数组，需要reshape成为这种数组
# from sklearn.preprocessing import CategoricalEncoder  #后面会添加这个方法

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder

from scipy import sparse

# 后面再去理解
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical features as a numeric array.
    The input to this transformer should be a matrix of integers or strings,
    denoting the values taken on by categorical (discrete) features.
    The features can be encoded using a one-hot aka one-of-K scheme
    (``encoding='onehot'``, the default) or converted to ordinal integers
    (``encoding='ordinal'``).
    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.
    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
    Parameters
    ----------
    encoding : str, 'onehot', 'onehot-dense' or 'ordinal'
        The type of encoding to use (default is 'onehot'):
        - 'onehot': encode the features using a one-hot aka one-of-K scheme
          (or also called 'dummy' encoding). This creates a binary column for
          each category and returns a sparse matrix.
        - 'onehot-dense': the same as 'onehot' but returns a dense array
          instead of a sparse matrix.
        - 'ordinal': encode the features as ordinal integers. This results in
          a single column of integers (0 to n_categories - 1) per feature.
    categories : 'auto' or a list of lists/arrays of values.
        Categories (unique values) per feature:
        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories are sorted before encoding the data
          (used categories can be found in the ``categories_`` attribute).
    dtype : number type, default np.float64
        Desired dtype of output.
    handle_unknown : 'error' (default) or 'ignore'
        Whether to raise an error or ignore if a unknown categorical feature is
        present during transform (default is to raise). When this is parameter
        is set to 'ignore' and an unknown category is encountered during
        transform, the resulting one-hot encoded columns for this feature
        will be all zeros.
        Ignoring unknown categories is not supported for
        ``encoding='ordinal'``.
    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting. When
        categories were specified manually, this holds the sorted categories
        (in order corresponding with output of `transform`).
    Examples
    --------
    Given a dataset with three features and two samples, we let the encoder
    find the maximum value per feature and transform the data to a binary
    one-hot encoding.
    >>> from sklearn.preprocessing import CategoricalEncoder
    >>> enc = CategoricalEncoder(handle_unknown='ignore')
    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
    ... # doctest: +ELLIPSIS
    CategoricalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
              encoding='onehot', handle_unknown='ignore')
    >>> enc.transform([[0, 1, 1], [1, 0, 4]]).toarray()
    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.],
           [ 0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.]])
    See also
    --------
    sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
      integer ordinal features. The ``OneHotEncoder assumes`` that input
      features take on values in the range ``[0, max(feature)]`` instead of
      using the unique values.
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
      dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
      encoding of dictionary items or strings.
    """

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Fit the CategoricalEncoder to X.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            The data to determine the categories of each feature.
        Returns
        -------
        self
        """

        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):
        """Transform X using one-hot encoding.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.
        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input.
        """
        X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=np.int)
        X_mask = np.ones_like(X, dtype=np.bool)

        for i in range(n_features):
            valid_mask = np.in1d(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue `The rows are marked `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out

In [333]:
# load data
train_df = pd.read_csv("./datasets/train.csv")
test_df = pd.read_csv("./datasets/test.csv")
combine = [train_df, test_df]

In [334]:
X_train = train_df.drop(["label"], axis=1)
y_train = train_df["label"]
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1)

In [335]:
cat_encoder = CategoricalEncoder(encoding='onehot-dense')
y_train_encoder = cat_encoder.fit_transform(y_train.reshape(-1, 1))
# y_test_encoder = cat_encoder.transform(y_test.reshape(-1, 1))
y_test_encoder = y_test
X_train_small = X_train.values
y_train_small = y_train_encoder

X_test_small = X_test.values
y_test_small = y_test_encoder
# test_data = zip(X_test_small, y_test_small)


In [336]:
from sklearn.preprocessing import StandardScaler

In [337]:
sd_scaler = StandardScaler()
X_train_small = sd_scaler.fit_transform(X_train_small)


In [338]:
X_train_small = [array.reshape(-1, 1) for array in X_train_small]
y_train_small = [array.reshape(-1, 1) for array in y_train_small]
training_data = zip(X_train_small, y_train_small)

X_test_small = [array.reshape(-1, 1) for array in X_test_small]
# y_test_small = [array.reshape(-1, 1) for array in y_test_small]

test_data = zip(X_test_small, y_test_small)

In [339]:
# test_df_encoder = cat_encoder.transform(test_df.values.reshape(-1, 1))
test_df_values = test_df.values

In [340]:
test_df_values = sd_scaler.fit_transform(test_df_values)

In [341]:
test_df_values = [array.reshape(-1, 1) for array in test_df_values]

In [342]:
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))
def sigmoid_prime(z):
    return sigmoid(z) * (1 - sigmoid(z))

In [343]:
class Network(object):
    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]



    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """
        desc: 随机梯度下降
        :param training_data: list of tuples (x,y)
        :param epochs: 训练次数
        :param mini_batch_size: 随机的最小集合
        :param eta: learning rate： 学习速率
        :param test_data: 测试数据，有的话会评估算法，但会降低运行速度
        :return:
        """
        if test_data:
            test_data = list(test_data)
            n_test = len(test_data)
        training_data = list(training_data)
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k: k + mini_batch_size]
                for k in range(0, n, mini_batch_size)
            ]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                result = self.evaluate(test_data)
                print("Epoch {}: {} / {}, Accuracy score: {}".format(
                    j, result, n_test, result / n_test))
            else:
                print("Epoch {} complete".format(j))

    def evaluate(self, test_data):
        """
        评估测试集的准确性
        :param test_data:
        :return:
        """
        test_results = [(np.argmax(self.feedforward(x)), y) for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)
    
    def feedforward(self, a):
        """return the output of the network if "a" is input"""
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return a
    
    def predict(self, X):
        X = []
        return np.argmax(X)

    def update_mini_batch(self, mini_batch, eta):
        """
        梯度下降更新weights和biases， 用到backpropagation反向传播。
        :param mini_batch:
        :param eta:
        :return:
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]

        self.weights = [w - (eta / len(mini_batch)) * nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - (eta / len(mini_batch)) * nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """
        :param x:
        :param y:
        :return: (nabla_b, nabla_w): gradient for 损失函数，类似于biaes， weight。
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # feedforward
        activation = x
        activations = [x]  # 存储所有的激活值
        zs = []  # 存储所有的z向量
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        # backward pass
        delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())
        return (nabla_b, nabla_w)

    def cost_derivative(self, output_activations, y):
        """
        :param output_activations:
        :param y:
        :return: 给定输出激发， 返回偏导数向量
        """
        return (output_activations - y)

In [344]:
sizes = [784, 50, 20, 10]
networ = Network(sizes)
networ.SGD(training_data, 30, 50, 0.5, test_data)

Epoch 0: 913 / 4200, Accuracy score: 0.21738095238095237
Epoch 1: 1403 / 4200, Accuracy score: 0.33404761904761904
Epoch 2: 1745 / 4200, Accuracy score: 0.4154761904761905
Epoch 3: 2057 / 4200, Accuracy score: 0.4897619047619048
Epoch 4: 2218 / 4200, Accuracy score: 0.5280952380952381
Epoch 5: 2396 / 4200, Accuracy score: 0.5704761904761905
Epoch 6: 2559 / 4200, Accuracy score: 0.6092857142857143
Epoch 7: 2571 / 4200, Accuracy score: 0.6121428571428571
Epoch 8: 2560 / 4200, Accuracy score: 0.6095238095238096
Epoch 9: 2627 / 4200, Accuracy score: 0.6254761904761905
Epoch 10: 2625 / 4200, Accuracy score: 0.625
Epoch 11: 2598 / 4200, Accuracy score: 0.6185714285714285
Epoch 12: 2594 / 4200, Accuracy score: 0.6176190476190476
Epoch 13: 2646 / 4200, Accuracy score: 0.63
Epoch 14: 2689 / 4200, Accuracy score: 0.6402380952380953
Epoch 15: 2667 / 4200, Accuracy score: 0.635
Epoch 16: 2721 / 4200, Accuracy score: 0.6478571428571429
Epoch 17: 2740 / 4200, Accuracy score: 0.6523809523809524
Epoch

In [345]:
X = [networ.feedforward(i) for i in test_df_values]

In [346]:
y = [np.argmax(i) for i in X]

In [347]:
ImageId = np.arange(1, 28001)
result = pd.DataFrame({
    "ImageId": ImageId,
    "Label": y
})
result.to_csv("result_03.csv", index=False)
# dnn_clf.predict(test_tf_full_pipeline)