### Wikipedia gold standard (WG) corpus
* PER(Person)
* LOC(Location)
* ORG(Organization)
* MISC(Miscellaneous, 이것저것 다양한)

http://downloads.schwa.org/wikiner/wikigold.conll.txt

In [1]:
import numpy as np
import random
from scipy.special import expit # expit for sigmoid function.
from pprint import pprint
from preprocess_window_classification import Preprocess

In [2]:
pre = Preprocess(file_dir='/ai/home/hanseok/sandbox/seminar/wikigold.conll.txt')

In [7]:
pre.sents[:3]

[[['<S>', 'O'],
  ['<S>', 'O'],
  ['010', 'I-MISC'],
  ['is', 'O'],
  ['the', 'O'],
  ['tenth', 'O'],
  ['album', 'O'],
  ['from', 'O'],
  ['Japanese', 'I-MISC'],
  ['Punk', 'O'],
  ['Techno', 'O'],
  ['band', 'O'],
  ['The Mad Capsule Markets', 'I-ORG'],
  ['.', 'O'],
  ['<E>', 'O'],
  ['<E>', 'O']],
 [['<S>', 'O'],
  ['<S>', 'O'],
  ['This', 'O'],
  ['album', 'O'],
  ['proved', 'O'],
  ['to', 'O'],
  ['be', 'O'],
  ['more', 'O'],
  ['commercial', 'O'],
  ['and', 'O'],
  ['more', 'O'],
  ['techno-based', 'O'],
  ['than', 'O'],
  ['Osc-Dis', 'I-MISC'],
  [',', 'O'],
  ['with', 'O'],
  ['heavily', 'O'],
  ['synthesized', 'O'],
  ['songs', 'O'],
  ['like', 'O'],
  ['Introduction 010', 'I-MISC'],
  ['and', 'O'],
  ['Come', 'I-MISC'],
  ['.', 'O'],
  ['<E>', 'O'],
  ['<E>', 'O']],
 [['<S>', 'O'],
  ['<S>', 'O'],
  ['Founding', 'O'],
  ['member', 'O'],
  ['Kojima Minoru', 'I-PER'],
  ['played', 'O'],
  ['guitar', 'O'],
  ['on', 'O'],
  ['Good Day', 'I-MISC'],
  [',', 'O'],
  ['and', 'O'],
 

In [4]:
pre.word_idx

{'bench': 0,
 'food': 1,
 'Anastasiya Vertinskaya': 4138,
 'Beauford': 2,
 'Don Wright': 3,
 'You': 4139,
 'using': 4,
 'Advisory': 5,
 'catch': 6,
 'effort': 7,
 'Ultimate Hockey': 3430,
 'favorable': 9,
 'streets': 10,
 'eleven': 11,
 'thirty': 4142,
 'New York Stock Exchange': 12,
 'BEL20': 13,
 'families': 4838,
 'indicating': 14,
 'charts': 15,
 'transformed': 16,
 'scientific': 4145,
 'Suite': 6169,
 'favourite': 4146,
 'Later': 4147,
 'Irene Radford': 17,
 'because': 4148,
 'inspires': 18,
 'majority': 19,
 'Newcastle upon Tyne': 7682,
 '1952': 20,
 'Center for Science and Culture': 4140,
 'nutrition': 23,
 'border': 24,
 'Israel': 25,
 'He': 4150,
 'Agotes': 4151,
 'Friedrich von Schlegel': 26,
 '11:00': 27,
 'boosted': 28,
 'AMEL': 30,
 'Asebi': 8,
 'opponent': 4152,
 'recognised': 32,
 'Prussian': 33,
 'Bridgeton': 4154,
 'listeners': 34,
 'rules.': 4155,
 'supporters': 35,
 'teach': 36,
 'Cards': 37,
 'industrial-agricultural': 4157,
 'Mistress to an Age': 4158,
 '6/40': 41,

In [5]:
pre.look_up.shape

(10, 8157)

In [6]:
window_size = pre.window_size
word_dim = pre.word_dim
window_dim = window_size*2 + 1
hidden_dim = 10

In [8]:
def get_window_vector(words): # words = 2w+1 word
    global window_size
    x = []
    for word in words:
        x.extend(pre.look_up[:,pre.word_idx[word[0]]])
    words_c = words.copy()
    while words_c[window_size][1] == 'I-LOC':
        random.shuffle(words_c)
    x_c = []
    for word in words_c:
        x_c.extend(pre.look_up[:,pre.word_idx[word[0]]])
    x = np.array(x).reshape(len(x), 1) # 10, -> 10, 1
    x_c = np.array(x_c).reshape(len(x_c), 1)
    return x, words, x_c, words_c

In [9]:
def feedforward(x, x_c):
    global W, U, b
    z = np.dot(W, x) + b
    z_c = np.dot(W, x_c) + b
    a = expit(z)
    a_c = expit(z_c)
    s = np.dot(U.transpose(), a)
    s_c = np.dot(U.transpose(), a_c)
    return z, a, s, z_c, a_c, s_c

In [10]:
def backprop():
    global W, U, b
    global x, z, a, s, x_c, z_c, a_c, s_c
    dU, dU_c = a, a_c
    error = np.multiply(U, np.multiply(expit(z), 1-expit(z))) # np.multiply(A, B) <=> For all i, a_i * b_i
    error_c = np.multiply(U, np.multiply(expit(z_c), 1-expit(z_c)))
    dW = np.dot(error, x.transpose()) # np.dot(A, B) <=> Inner product of A and B
    dW_c = np.dot(error_c, x_c.transpose())
    db = error
    db_c = error_c
    dx = np.dot(W.transpose(), error)
    dx_c = np.dot(W.transpose(), error_c)
    return dU, dW, db, dx, dU_c, dW_c, db_c, dx_c

In [11]:
def update():
    global U, W, b, dU, dW, db, dU_c, dW_c, db_c, learning_rate
    U -= learning_rate * (-dU + dU_c)
    W -= learning_rate * (-dW + dW_c)
    b -= learning_rate * (-db + db_c)
    return U, W, b

In [12]:
def update_lookup():
    global word, word_c, pre, word_dim, x, x_c, dx, dx_c, learning_rate
    look_up = np.copy(pre.look_up)
    for n, pair in enumerate(word):
        n_c = word_c.index(pair)
        target_dx = dx[n*word_dim:(n+1)*word_dim].reshape(word_dim,)
        target_dx_c = dx_c[n_c*word_dim:(n_c+1)*word_dim].reshape(word_dim,)
        look_up[:,pre.word_idx[pair[0]]] -= learning_rate * (-target_dx + target_dx_c) 
    return look_up

In [13]:
W = np.random.normal(loc=0, scale=0.5, 
                     size=(hidden_dim, window_dim*pre.word_dim))
U = np.random.normal(loc=0, scale=0.5, size=(hidden_dim, 1))
b = np.random.normal(loc=0, scale=0.5, size=(hidden_dim, 1))

In [14]:
W.shape, U.shape, b.shape

((10, 50), (10, 1), (10, 1))

In [15]:
W_old, U_old, b_old, look_up_old = np.copy(W), np.copy(U), np.copy(b), np.copy(pre.look_up)

In [16]:
look_up_old[:,pre.word_idx['UK']]

array([ 1.38104121, -0.36026122,  0.5625093 , -0.72992135, -0.15014624,
       -0.87295697,  1.29532456, -0.3446441 , -1.90868303, -0.74725082])

In [17]:
print(W)
print(U)
print(b)

[[  2.43200332e-01   6.76929703e-01   8.86985031e-02   5.16660075e-01
   -1.55152978e-01  -5.06688708e-02  -3.24277388e-01  -1.04097893e-02
   -2.19701041e-01   1.69051685e-01  -1.10132669e-01   6.51976065e-02
    4.03210139e-01   5.90684117e-01  -4.91265565e-01   2.37408359e-01
   -1.16821098e-01  -6.37384093e-01  -2.85391082e-01   3.10446757e-01
    3.43354243e-01   3.98176800e-01   6.80154727e-02  -3.90165628e-01
   -5.05090262e-01  -2.48118912e-01   2.71693450e-01  -9.88097087e-02
    5.32333560e-01  -4.81993580e-01   2.72004487e-01  -6.77590064e-01
    6.38265847e-02  -6.04214617e-01   2.11728440e-01  -1.63654750e-01
   -3.23491088e-01   1.89022070e-01   2.60317616e-01   1.10302022e-01
   -3.73982974e-01  -7.00924200e-01   4.02302199e-01   2.72390347e-01
   -1.02078373e-01   3.71892337e-01   1.44547256e-01  -3.49748407e-01
   -2.40694211e-01   1.34762755e-01]
 [ -2.52445856e-03   2.63402910e-01  -9.44116439e-01  -6.53720212e-01
    4.29362739e-01  -6.37495560e-01   2.93187599e-01 

In [18]:
learning_rate=0.1
epoch = 20

In [19]:
for t in range(epoch):
    pprint("[%d/%d]" % (t+1, epoch))
    J_pass, J_count = 0, 0
    for sent in pre.sents:
        for n in range(len(sent) - pre.window_size*2):
            nums = [n+i for i in range(pre.window_size*2+1)]
            words = [sent[num] for num in nums]
            if words[pre.window_size][1] != 'I-LOC':
                continue
            x, word, x_c, word_c = get_window_vector(words)
            z, a, s, z_c, a_c, s_c = feedforward(x, x_c)
            J = max(0, 1 - s[0][0] + s_c[0][0])

            J_count += 1
            if J > 0:
                dU, dW, db, dx, dU_c, dW_c, db_c, dx_c = backprop()
                U, W, b= update()
                pre.look_up = update_lookup()
            else:
                J_pass += 1
                continue
    pprint("%d / %d" % (J_pass, J_count))

'[1/20]'
'564 / 1014'
'[2/20]'
'734 / 1014'
'[3/20]'
'795 / 1014'
'[4/20]'
'853 / 1014'
'[5/20]'
'868 / 1014'
'[6/20]'
'883 / 1014'
'[7/20]'
'906 / 1014'
'[8/20]'
'925 / 1014'
'[9/20]'
'919 / 1014'
'[10/20]'
'923 / 1014'
'[11/20]'
'941 / 1014'
'[12/20]'
'931 / 1014'
'[13/20]'
'951 / 1014'
'[14/20]'
'943 / 1014'
'[15/20]'
'943 / 1014'
'[16/20]'
'962 / 1014'
'[17/20]'
'965 / 1014'
'[18/20]'
'970 / 1014'
'[19/20]'
'968 / 1014'
'[20/20]'
'973 / 1014'


In [20]:
pre.look_up[:,pre.word_idx['UK']]

array([ 1.92266745,  0.65821441,  0.56778065, -2.15965321, -0.18903798,
       -0.87155705,  1.47740683, -1.01838206, -0.97944603, -0.57275121])

In [21]:
print(W)
print(U)
print(b)

[[-0.54678644  1.1465987  -0.51765668 -0.24567722  1.01976957 -1.10536732
  -0.21756396 -0.74293868  0.53323297 -0.80530814 -2.37564963 -0.0715684
   1.25015666  1.09819217  0.10817743 -0.56291918 -1.10036713 -0.32921334
   0.17980226  0.58243646  0.35454741  1.96639962  0.63510313 -2.68920088
  -1.21624252 -0.76542782 -0.07046284 -1.09132623  2.09807814 -0.07529415
   0.24535771 -1.55210567 -0.76731511  0.51996708 -0.11912168  0.66971634
   0.05624902  0.15474465  1.59063026  1.01563783  0.06493863 -0.11745417
  -0.0457221   0.16944778 -0.54450902  0.08227546 -0.50382554 -0.36925855
   1.32323487 -0.4389935 ]
 [-0.2554074  -0.05455256 -1.29694532 -0.76692052  0.08824678 -0.45115434
  -0.3754048   1.2319611   1.22499434 -0.50818254  0.26037529  0.28981754
   0.63749331 -0.65032219 -0.57636648  0.02738162 -0.28987779 -1.00066895
  -1.70040838  0.20782449  1.02962504  1.05125849 -0.02155495 -0.93429029
   0.7688841  -0.27348878  0.45277341 -0.30408077  1.39101052  0.05470791
  -0.9413423