# ニューラルネットワーク学習 (RNNで自然言語処理)

In [1]:
import numpy as np
import pandas as pd

## 事前学習済みの単語分散表現モデル

In [2]:
# 分散表現の次元数 = 2
word2vec_dict = {
    "の": [0.02, 0.03],
    "好き": [0.01, 0.9],
    "嫌い": [0.9, 0.01],
    "ラーメン": [0.03, 0.9],
    "寿司": [0.03, 0.89],
    "ステーキ": [0.9, 0.03],
    "バーベキュー": [0.89, 0.03],
    "が": [0.01, 0.3],
    "東京": [0.5, 0.1],
    "テキサス": [0.1, 0.5]
}

## 文字列<->数値辞書

In [3]:
# 通常パディングは0で未知語は1
stoi = {
    "PAD": 0,
    "UNK": 1,
    "の": 2,
    "好き": 3,
    "嫌い": 4,
    "ラーメン": 5,
    "寿司": 6,
    "ステーキ": 7,
    "バーベキュー": 8,
    "が": 9,
    "東京": 10,
    "テキサス": 11
}

## テキストデータを読み込む

In [4]:
df = pd.read_csv('data/data_ja.csv', sep='\t')
df

Unnamed: 0,文,ラベル
0,東京 の ラーメン が 好き,1
1,東京 の 寿司 が 好き,1
2,東京 の ラーメン が 嫌い,0
3,東京 の 寿司 が 嫌い,0
4,テキサス の ステーキ が 好き,0
5,テキサス の バーベキュー が 好き,0
6,テキサス の ステーキ が 嫌い,1
7,テキサス の バーベキュー が 嫌い,1


## トークン化 (文字列を数値に変換) (分かち書き済み)

In [5]:
df.文[0].split(" ")

['東京', 'の', 'ラーメン', 'が', '好き']

In [6]:
# 空白を区切りに単語に分ける。文の長さは全て5なのでパディング無し。
df['X'] = df.apply(lambda x: [stoi[i] for i in x.文.split(" ")], axis=1)

In [7]:
df[['X', 'ラベル']]

Unnamed: 0,X,ラベル
0,"[10, 2, 5, 9, 3]",1
1,"[10, 2, 6, 9, 3]",1
2,"[10, 2, 5, 9, 4]",0
3,"[10, 2, 6, 9, 4]",0
4,"[11, 2, 7, 9, 3]",0
5,"[11, 2, 8, 9, 3]",0
6,"[11, 2, 7, 9, 4]",1
7,"[11, 2, 8, 9, 4]",1


## 単語を分散表現に変換する

In [8]:
# 数値<->文字列
itos = {v: k for k, v in stoi.items()}

In [9]:
df['X'] = df.apply(lambda x: [word2vec_dict[itos[i]] for i in x.X], axis=1)

In [10]:
df[['X', 'ラベル']]

Unnamed: 0,X,ラベル
0,"[[0.5, 0.1], [0.02, 0.03], [0.03, 0.9], [0.01,...",1
1,"[[0.5, 0.1], [0.02, 0.03], [0.03, 0.89], [0.01...",1
2,"[[0.5, 0.1], [0.02, 0.03], [0.03, 0.9], [0.01,...",0
3,"[[0.5, 0.1], [0.02, 0.03], [0.03, 0.89], [0.01...",0
4,"[[0.1, 0.5], [0.02, 0.03], [0.9, 0.03], [0.01,...",0
5,"[[0.1, 0.5], [0.02, 0.03], [0.89, 0.03], [0.01...",0
6,"[[0.1, 0.5], [0.02, 0.03], [0.9, 0.03], [0.01,...",1
7,"[[0.1, 0.5], [0.02, 0.03], [0.89, 0.03], [0.01...",1


## 特徴量とラベルに分ける

In [11]:
X = np.array([i for i in df.X.values])

# (バッチ, 系列, 分散表現の次元数)
X.shape

(8, 5, 2)

In [12]:
# 最初の文
X[0]

array([[0.5 , 0.1 ],
       [0.02, 0.03],
       [0.03, 0.9 ],
       [0.01, 0.3 ],
       [0.01, 0.9 ]])

In [13]:
# ラベルをワンホット表現
y = np.array([[0., 1.] if i == 1 else [1., 0.] for i in df.ラベル.values])

In [14]:
y

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.]])

## 活性化関数

In [15]:
# 分類のためのソフトマックス関数
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T 

    x = x - np.max(x) 
    return np.exp(x) / np.sum(np.exp(x))

# ランプ関数(順伝搬)
def relu(x):
    return np.maximum(0, x)

# ランプ関数(逆伝搬)
def relu_grad(x):
    勾 = np.zeros(x.shape)
    勾[x>=0] = 1
    return 勾

# 双曲線正接関数(逆伝搬)
def tanh_grad(導, 重, 隠):
    return np.multiply(np.dot(導, 重.T), 1 - np.power(隠, 2))

## 損失関数

In [16]:
# 交差エントロピー誤差
def cross_entropy_error(y, t):
    delta = 1e-7
    batch_size = y.shape[0]
    return -np.sum(t * np.log(y + delta)) / batch_size

## Architecture

In [17]:
class RNN:
    # 重みと閾値をランダムで初期化する
    # 重入 (入力->隠れ層)
    # 重隠 (隠れ層->隠れ層)
    def __init__(self, word_vector_size:int, hidden_size:int, output_size:int, weight_init_std:float=0.01):
        self.params = {}
        self.params['重入1'] = weight_init_std * np.random.randn(word_vector_size, hidden_size)
        self.params['重隠1'] = weight_init_std * np.random.randn(hidden_size, hidden_size)
        self.params['閾1'] = np.zeros(hidden_size)
        self.params['重入2'] = weight_init_std * np.random.randn(word_vector_size, hidden_size)
        self.params['重隠2'] = weight_init_std * np.random.randn(hidden_size, hidden_size)
        self.params['閾2'] = np.zeros(hidden_size)
        self.params['重入3'] = weight_init_std * np.random.randn(word_vector_size, hidden_size)
        self.params['重隠3'] = weight_init_std * np.random.randn(hidden_size, hidden_size)
        self.params['閾3'] = np.zeros(hidden_size)
        self.params['重入4'] = weight_init_std * np.random.randn(word_vector_size, hidden_size)
        self.params['重隠4'] = weight_init_std * np.random.randn(hidden_size, hidden_size)
        self.params['閾4'] = np.zeros(hidden_size)
        self.params['重入5'] = weight_init_std * np.random.randn(word_vector_size, hidden_size)
        self.params['重隠5'] = weight_init_std * np.random.randn(hidden_size, hidden_size)
        self.params['閾5'] = np.zeros(hidden_size)
        self.params['重6'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['閾6'] = np.zeros(output_size)

    # 予測
    def predict(self, x):
        重入1, 重入2, 重入3, 重入4, 重入5 = self.params['重入1'], self.params['重入2'], self.params['重入3'], self.params['重入4'], self.params['重入5']
        重隠1, 重隠2, 重隠3, 重隠4, 重隠5 = self.params['重隠1'], self.params['重隠2'], self.params['重隠3'], self.params['重隠4'], self.params['重隠5']
        重6 = self.params['重6']
        閾1, 閾2, 閾3, 閾4, 閾5, 閾6 = self.params['閾1'], self.params['閾2'], self.params['閾3'], self.params['閾4'], self.params['閾5'], self.params['閾6']

        # 1番目の単語埋め込み
        特1 = np.dot(x[:,0], 重入1)
        入1 = relu(特1)
        
        # 2番目の単語埋め込み
        特2 = np.dot(x[:,1], 重入2)
        入2 = relu(特2)
        
        # 3番目の単語埋め込み
        特3 = np.dot(x[:,2], 重入3)
        入3 = relu(特3)
        
        # 4番目の単語埋め込み
        特4 = np.dot(x[:,3], 重入4)
        入4 = relu(特4)
        
        # 5番目の単語埋め込み
        特5 = np.dot(x[:,4], 重入5)
        入5 = relu(特5)
        
        隠0 = np.zeros(入1.shape)
        隠1 = np.dot(隠0+入1, 重隠1) + 閾1
        隠1 = np.tanh(隠1)
        隠2 = np.dot(隠1+入2, 重隠2) + 閾2
        隠2 = np.tanh(隠2)
        隠3 = np.dot(隠2+入3, 重隠3) + 閾3
        隠3 = np.tanh(隠3)
        隠4 = np.dot(隠3+入4, 重隠4) + 閾4
        隠4 = np.tanh(隠4)
        隠5 = np.dot(隠4+入5, 重隠5) + 閾5
        隠5 = np.tanh(隠5)
        
        # 隠れ層->出力
        全1 = np.dot(隠5, 重6) + 閾6
        y = softmax(全1)
        
        return y
    
    # 損失関数
    def loss(self, x, t):
        y = self.predict(x)
        
        return cross_entropy_error(y, t)
    
    # 評価関数
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)
        
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy
        
    # 勾配
    def gradient(self, x, t):
        # 初めはランダムな初期値。以降は更新されたパラメータを使用。
        重入1, 重入2, 重入3, 重入4, 重入5 = self.params['重入1'], self.params['重入2'], self.params['重入3'], self.params['重入4'], self.params['重入5']
        重隠1, 重隠2, 重隠3, 重隠4, 重隠5 = self.params['重隠1'], self.params['重隠2'], self.params['重隠3'], self.params['重隠4'], self.params['重隠5']
        重6 = self.params['重6']
        閾1, 閾2, 閾3, 閾4, 閾5, 閾6 = self.params['閾1'], self.params['閾2'], self.params['閾3'], self.params['閾4'], self.params['閾5'], self.params['閾6']

        # 学習パラメータの初期化
        grads = {}
        grads['重入1'] = None
        grads['重隠1'] = None
        grads['閾1'] = None
        grads['重入2'] = None
        grads['重隠2'] = None
        grads['閾2'] = None
        grads['重入3'] = None
        grads['重隠3'] = None
        grads['閾3'] = None
        grads['重入4'] = None
        grads['重隠4'] = None
        grads['閾4'] = None
        grads['重入5'] = None
        grads['重隠5'] = None
        grads['閾5'] = None
        grads['重6'] = None
        grads['閾6'] = None
        
        batch_num = x.shape[0]
        
        # 順伝搬
        # 1番目の単語埋め込み
        特1 = np.dot(x[:,0], 重入1)
        入1 = relu(特1)
        
        # 2番目の単語埋め込み
        特2 = np.dot(x[:,1], 重入2)
        入2 = relu(特2)
        
        # 3番目の単語埋め込み
        特3 = np.dot(x[:,2], 重入3)
        入3 = relu(特3)
        
        # 4番目の単語埋め込み
        特4 = np.dot(x[:,3], 重入4)
        入4 = relu(特4)
        
        # 5番目の単語埋め込み
        特5 = np.dot(x[:,4], 重入5)
        入5 = relu(特5)
        
        隠0 = np.zeros(入1.shape)
        隠1_0 = np.dot(隠0+入1, 重隠1) + 閾1
        隠1 = np.tanh(隠1_0)
        隠2_0 = np.dot(隠1+入2, 重隠2) + 閾2
        隠2 = np.tanh(隠2_0)
        隠3_0 = np.dot(隠2+入3, 重隠3) + 閾3
        隠3 = np.tanh(隠3_0)
        隠4_0 = np.dot(隠3+入4, 重隠4) + 閾4
        隠4 = np.tanh(隠4_0)
        隠5_0 = np.dot(隠4+入5, 重隠5) + 閾5
        隠5 = np.tanh(隠5_0)
        
        # 隠れ層->出力
        全1 = np.dot(隠5, 重6) + 閾6
        y = softmax(全1)
        print("予測: {}".format(y))
    
        # 逆伝搬
        導推 = (y - t) / batch_num
        導重6 = np.dot(隠5.T, 導推)
        導閾6 = np.sum(導推, axis=0)
        grads['重6'] = 導重6
        grads['閾6'] = 導閾6

        導活5 = tanh_grad(導推, 重6, 隠5)
        導重隠5 = np.dot((隠4+入5).T, 導活5)
        導閾5 = np.sum(導活5, axis=0)
        導入5 = relu_grad(入5) * np.dot(導活5, 重隠5.T)
        導重入5 = np.dot(X[:,4].T, 導入5)
        grads['重隠5'] = 導重隠5
        grads['閾5'] = 導閾5
        grads['重入5'] = 導重入5

        導活4 = tanh_grad(導活5, 重隠5, 隠4)
        導重隠4 = np.dot((隠3+入4).T, 導活4)
        導閾4 = np.sum(導活4, axis=0)
        導入4 = relu_grad(入4) * np.dot(導活4, 重隠4.T)
        導重入4 = np.dot(X[:,3].T, 導入4)
        grads['重隠4'] = 導重隠4
        grads['閾4'] = 導閾4
        grads['重入4'] = 導重入4

        導活3 = tanh_grad(導活4, 重隠4, 隠3)
        導重隠3 = np.dot((隠2+入3).T, 導活3)
        導閾3 = np.sum(導活3, axis=0)
        導入3 = relu_grad(入3) * np.dot(導活3, 重隠3.T)
        導重入3 = np.dot(X[:,2].T, 導入3)
        grads['重隠3'] = 導重隠3
        grads['閾3'] = 導閾3
        grads['重入3'] = 導重入3

        導活2 = tanh_grad(導活3, 重隠3, 隠2)
        導重隠2 = np.dot((隠1+入2).T, 導活2)
        導閾2 = np.sum(導活2, axis=0)
        導入2 = relu_grad(入2) * np.dot(導活2, 重隠2.T)
        導重入2 = np.dot(X[:,1].T, 導入2)
        grads['重隠2'] = 導重隠2
        grads['閾2'] = 導閾2
        grads['重入2'] = 導重入2

        導活1 = tanh_grad(導活2, 重隠2, 隠1)
        導重隠1 = np.dot((隠0+入1).T, 導活1)
        導閾1 = np.sum(導活1, axis=0)
        導入1 = relu_grad(入1) * np.dot(導活1, 重隠1.T)
        導重入1 = np.dot(X[:,0].T, 導入1)
        grads['重隠1'] = 導重隠1
        grads['閾1'] = 導閾1
        grads['重入1'] = 導重入1
        
        return grads

## ネットワークを初期化する

In [18]:
learning_rate = 0.1
network = RNN(word_vector_size=2, hidden_size=4, output_size=2)

## 学習

In [19]:
def train_1_epoch(X, y):
    # 勾配を計算
    grad = network.gradient(X, y)
    print("重みと閾値: ")
    print(grad)
    
    # パラメータを更新
    for key in ('重入1', '重隠1', '閾1', '重入2', '重隠2', '閾2', '重入3', '重隠3', '閾3', '重入4', '重隠4', '閾4', '重入5', '重隠5', '閾5', '重6', '閾6'):
        network.params[key] -= learning_rate * grad[key]
    
    # 損失を計算
    loss = network.loss(X, y)
    print("損失: "+str(loss))
    
    # 精度を計算
    accuracy = network.accuracy(X, y)
    print("精度: "+str(accuracy))

### 10エポック

In [20]:
for i in range(10):
    print("エポック "+str(i+1))
    train_1_epoch(X, y)
    print("\n")

エポック 1
予測: [[0.50000074 0.49999926]
 [0.50000074 0.49999926]
 [0.50000075 0.49999925]
 [0.50000075 0.49999925]
 [0.50000074 0.49999926]
 [0.50000074 0.49999926]
 [0.50000075 0.49999925]
 [0.50000075 0.49999925]]
重みと閾値: 
{'重入1': array([[-2.15298167e-18,  2.44357672e-18, -1.43485224e-17,
        -1.74066463e-18],
       [-2.22009462e-18,  2.53324857e-18, -1.47310953e-17,
        -1.70708005e-18]]), '重隠1': array([[ 4.10205290e-18,  8.76028040e-19, -3.45166317e-18,
        -9.79453854e-19],
       [ 1.24257067e-19,  2.65515469e-20, -1.04519313e-19,
        -2.97601981e-20],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 9.16980737e-18,  1.94614860e-18, -7.74470076e-18,
        -2.11764353e-18]]), '閾1': array([ 1.18556126e-15,  2.51224690e-16, -1.00223962e-15, -2.71468557e-16]), '重入2': array([[ 2.37112253e-17,  5.02449381e-18, -2.00447927e-17,
        -5.42937118e-18],
       [ 3.55668379e-17,  7.53674072e-18, -3.00671891e-17,
        -8.144056