In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# Placeholder that read_conll substitutes for every all-digit token.
NUM = "NNNUMMM"
# Padding tokens that makeWindowedData adds before/after each sentence.
START = "<s>"
END = "</s>"
# Stand-in for tokens that are missing from the token-to-number mapping.
UNK = "UNKKK"
# Passed as `embed_size` in the (commented-out) training call; note that
# embeddingLayer hard-codes its own width of 50 rather than reading this.
WORDEMBSIZE = 50
# Label inventory; the position of a label in this list is its one-hot index.
LABELS = ["O","B-PER", "B-ORG", "B-LOC", "B-MISC", "I-PER", "I-ORG", "I-LOC", "I-MISC"] 
# Weight matrices are created for range(1, num_layers), i.e. num_layers - 1 of them.
num_layers = 3 
# Number of context tokens taken on each side of the centre token.
window_size = 1

# Seed NumPy's global RNG (used by initialize_weights) for repeatable runs.
np.random.seed(1)

In [2]:
def read_conll(fstream):
    '''
    Parses CoNLL-style lines into sentences.

    Every non-blank line is split on whitespace; its first field is the token
    and its last field is the label. A blank line or a line starting with
    "-DOCSTART-" closes the sentence collected so far. Tokens made only of
    digits are replaced by NUM.

    fstream: iterable of text lines (e.g. an open file).
    Returns a list of (tokens, labels) tuples, one per non-empty sentence.
    '''
    sentences = []
    tokens, labels = [], []

    for raw in fstream:
        stripped = raw.strip()
        is_boundary = (not stripped) or stripped.startswith("-DOCSTART-")

        if not is_boundary:
            fields = stripped.split()
            word = fields[0]
            tokens.append(NUM if word.isdigit() else word)
            labels.append(fields[-1])
            continue

        # Sentence boundary: emit what was collected, then start afresh.
        if tokens:
            assert len(tokens) == len(labels)
            sentences.append((tokens, labels))
        tokens, labels = [], []

    # The input may end without a trailing blank line.
    if tokens:
        assert len(tokens) == len(labels)
        sentences.append((tokens, labels))

    return sentences

In [3]:
def createMapping(data):
    '''
    Creates unique 'token to number' and 'number to token' mappings.

    data: iterable of (tokens, labels) pairs; only the tokens are used and
          they are lower-cased before being counted.

    Word ids start at 1 (id 0 is left unused) and follow build_dict's
    frequency ordering. The four casing features, START, END and UNK are then
    given the next seven consecutive ids.

    Returns (token_to_number, number_to_token); the second dict is the exact
    inverse of the first.
    '''
    token_to_number = build_dict((word.lower() for sentence, _ in data for word in sentence), offset=1)

    # Use a running counter instead of len()+1: if one of the special tokens
    # already occurs in the data (e.g. a literal "<s>"), re-assigning it does
    # not grow the dict and len()+1 would hand the same id to the next token.
    next_id = len(token_to_number) + 1
    for special in ("CASE: aa", "CASE: aA", "CASE: Aa", "CASE: AA", START, END, UNK):
        token_to_number[special] = next_id
        next_id += 1

    # Previously this dict was returned empty.
    number_to_token = {number: token for token, number in token_to_number.items()}

    return token_to_number, number_to_token

In [4]:
def build_dict(words, max_words=None, offset=0):
    '''
    Maps each word to an integer rank, most frequent word first.

    words:     iterable of hashable items to count.
    max_words: keep only this many of the most common words; a falsy value
               (None or 0) keeps all of them.
    offset:    number assigned to the most frequent word; later words get
               offset+1, offset+2, ...

    Returns a dict {word: number}.
    '''
    counts = Counter(words)
    ranked = counts.most_common(max_words if max_words else None)

    mapping = {}
    for position, entry in enumerate(ranked):
        mapping[entry[0]] = offset + position
    return mapping

In [5]:
def findCase(word):
    '''
    Returns the casing feature key of a word.

    "CASE: aa" - all cased characters are lowercase
    "CASE: AA" - all cased characters are uppercase
    "CASE: Aa" - mixed case with an uppercase first character
    "CASE: aA" - everything else (including words with no cased characters)

    An empty word is returned unchanged.
    '''
    if len(word) < 1:
        return word

    if word.islower():
        return "CASE: aa"
    if word.isupper():
        return "CASE: AA"
    return "CASE: Aa" if word[0].isupper() else "CASE: aA"

In [6]:
def embeddingLayer(fstream, token_to_num):
    '''
    Returns the Embedding Layer Matrix.

    fstream:      iterable of lines, each "<word> <v1> <v2> ..." separated by
                  whitespace.
    token_to_num: dict mapping tokens to row indices.

    The result has len(token_to_num) + 1 rows and 50 columns. Row
    token_to_num[word] holds the vector read for that word; row 0 and the row
    of every token without a vector in fstream stay all-zero.
    '''
    pretrained = {}
    for row in fstream:
        fields = row.split()
        pretrained[fields[0]] = np.asarray(fields[1:], "float32")

    matrix = np.zeros((len(token_to_num) + 1, 50))
    for token in pretrained:
        if token in token_to_num:
            matrix[token_to_num[token]] = pretrained[token]
    return matrix

In [7]:
def oneHot(n_dim, i):
    '''
    Returns a one Hot encoded array.

    n_dim: length of the returned vector.
    i:     position to set to 1.0; a Python int or a NumPy integer.

    Returns a float array of zeros with a single 1.0 at index i.
    Raises TypeError when i is not an integer. The function used to fall
    through and return None in that case, which only surfaced later as an
    unrelated error.
    '''
    if not isinstance(i, (int, np.integer)):
        raise TypeError("oneHot index must be an integer, got %s" % type(i).__name__)

    ret = np.zeros(n_dim)
    ret[i] = 1.0
    return ret

In [8]:
def makeWindowedData(dat, size_wind, embedding_layer, token_to_number, LABELS):
    '''
    Returns windowed word embeddings.

    dat:             iterable of (tokens, labels) pairs.
    size_wind:       number of context tokens on each side of the centre token.
    embedding_layer: 2-D array indexed by the numbers in token_to_number.
    token_to_number: dict from lower-cased token / special key to row index.
    LABELS:          list of label names; a label's position is its class index.

    Each token is represented by its word row concatenated with the row of its
    casing feature. For every real token the representations of the
    2*size_wind+1 surrounding tokens are concatenated into one feature vector.

    Returns an object array of shape (n_tokens, 2) whose rows are
    [feature_vector, one_hot_label]; an empty input gives an empty array.
    '''
    final = list()
    for el in dat:
        lab = el[1]
        padded = [START] * size_wind + list(el[0]) + [END] * size_wind

        features = []
        for tok in padded:
            key = tok.lower()
            if key not in token_to_number:
                # NOTE(review): the casing feature below is computed on UNK
                # itself, so an unknown word always gets "CASE: AA" and its
                # real casing is lost. Kept as-is because the saved weights
                # were presumably trained with this behaviour - confirm
                # before changing.
                tok = UNK
                key = UNK
            features.append(np.hstack((embedding_layer[token_to_number[key]],
                                       embedding_layer[token_to_number[findCase(tok)]])))

        for k, centre in enumerate(range(size_wind, len(padded) - size_wind)):
            window = np.concatenate(features[centre - size_wind:centre + size_wind + 1])
            final.append((window, oneHot(len(LABELS), LABELS.index(lab[k]))))

    if not final:
        return np.array(final)

    # The feature vector and the one-hot label have different lengths. Letting
    # np.array() infer a shape from such a ragged pair raises ValueError on
    # NumPy >= 1.24, so the object array is filled cell by cell instead.
    out = np.empty((len(final), 2), dtype=object)
    for row, (window, target) in enumerate(final):
        out[row, 0] = window
        out[row, 1] = target
    return out

In [9]:
def classFreqDict(word_emb):
    '''
    Counts how many samples carry each label.

    word_emb: sequence of [features, one_hot_label] entries, where the label
              is a NumPy array (its .tolist() is used).

    Returns a dict {label_name: count} containing only labels that occur.
    '''
    freq_dict = {}
    for el in word_emb:
        # The index of the largest entry of the one-hot vector selects the label.
        one_hot = el[1].tolist()
        val = LABELS[one_hot.index(max(one_hot))]
        freq_dict[val] = freq_dict.get(val, 0) + 1
    return freq_dict

In [10]:
def initialize_weights(n,m): # m:input layer nodes, n:output layer nodes
    '''
    Xavier Glorot Initialization of Weights.

    Draws an (n, m) matrix uniformly from [-r, r) with r = sqrt(6 / (m + n)),
    using NumPy's global random state.
    '''
    limit = np.sqrt(6/(m+n))
    return np.random.uniform(low=-limit, high=limit, size=(n, m))

def sigmoid(Z):
    '''
    Element-wise logistic function 1 / (1 + exp(-Z)).
    '''
    denominator = 1 + np.exp(-Z)
    return 1 / denominator


def tanh(Z):
    '''
    Element-wise hyperbolic tangent (thin wrapper around np.tanh).
    '''
    return np.tanh(Z)


def relu(Z):
    '''
    Element-wise rectified linear unit, max(0, Z).

    The result is returned in extended precision. np.longdouble is used
    instead of np.float128 because the latter name only exists on platforms
    whose long double is 128 bits wide; where np.float128 does exist the two
    are the same type.
    '''
    return np.array(np.maximum(0, Z), dtype=np.longdouble)

def softmax(x):
    '''
    Softmax over ALL elements of x (no axis argument).

    The global maximum is subtracted before exponentiating so that exp()
    cannot overflow; this does not change the result.
    '''
    shifted = x - x.max()
    exps = np.exp(shifted)
    return exps / exps.sum()

def cross_entropy_loss(output, Y_batch, freq_dict, total, hidden_weights, lambd):
    '''
    Returns the Weighted Cross Entropy Loss with Regularization.

    output:         predicted class probabilities, one row per sample.
    Y_batch:        one-hot targets of shape (batch, n_classes).
    freq_dict:      {label_name: count}; each sample's term is weighted by
                    freq_dict[label] / total for its true label.
    total:          denominator for the class weights.
    hidden_weights: dict {1..L: weight matrix}.
    lambd:          L2 regularization strength.

    Returns an array of shape (1,) holding the batch loss.

    Fixes relative to the earlier version:
    * m is the number of samples (rows of Y_batch), not the number of classes.
    * probabilities are clipped away from zero before the log; previously
      log(0) was still evaluated and a constant .001 was added to every log.
    * the L2 term now covers every layer 1..L; the last layer was skipped
      although its gradient is regularized in bwdprop.
    '''
    Y_batch = np.asarray(Y_batch)
    m = Y_batch.shape[0]
    # A single-sample batch may arrive squeezed to 1-D; align it with Y_batch.
    output = np.reshape(output, Y_batch.shape)

    # Per-sample class weight, looked up from the position of the 1 in each row.
    class_idx = np.argmax(Y_batch, axis=1)
    freq = np.array([freq_dict[LABELS[k]] / total for k in class_idx])
    freq = np.reshape(freq, (-1, 1))

    eps = 1e-12
    log_probs = np.log(np.maximum(output, eps))
    out_sum = np.sum(np.sum(np.multiply(freq, Y_batch) * log_probs, axis=0, keepdims=True), axis=1)
    loss = -(1/m)*out_sum

    L = len(hidden_weights)
    L2_regularization_cost = 0
    for i in range(1, L + 1):
        L2_regularization_cost += np.sum(np.square(hidden_weights[i]))

    L2_regularization_cost = L2_regularization_cost*(lambd/(2*m))
    loss += L2_regularization_cost
    return loss


In [11]:
def fwdprop(X_batch, hidden_weights, bias, hidden_layers_activation_fn="relu"):
    '''
    Performs forward propagation part of the Network.

    X_batch:        sequence of input vectors, one per sample.
    hidden_weights: dict {1..L: weight matrix}; layers 1..L-1 are hidden,
                    layer L is the softmax output layer.
    bias:           dict {1..L: column vector}.
    hidden_layers_activation_fn:
                    the hidden activation, given either as a callable or as
                    one of the names "relu", "sigmoid", "tanh".

    Returns (probabilities, caches):
    * probabilities has shape (n_samples, n_classes);
    * caches[l-1] is (A_prev, W, b, Z) for layer l, where Z holds only that
      layer's pre-activations.

    Raises ValueError for an unknown activation name.

    Fixes relative to the earlier version: the string default is resolved to
    a function instead of being called; Z is reset for every layer instead of
    accumulating all earlier layers; the batch axis is kept for a one-sample
    batch; np.longdouble replaces the platform-specific np.float128.
    '''
    if isinstance(hidden_layers_activation_fn, str):
        named = {"relu": relu, "sigmoid": sigmoid, "tanh": tanh}
        if hidden_layers_activation_fn not in named:
            raise ValueError("unknown activation function: %r" % (hidden_layers_activation_fn,))
        activation = named[hidden_layers_activation_fn]
    else:
        activation = hidden_layers_activation_fn

    n_samples = len(X_batch)
    L = len(hidden_weights)
    caches = []
    A = X_batch

    # Hidden layers 1 .. L-1, one sample (column vector) at a time.
    for l in range(1, L):
        A_prev = A
        Z = []
        A = []
        for i in range(n_samples):
            z = np.dot(hidden_weights[l], np.reshape(A_prev[i], (-1, 1))) + bias[l]
            Z.append(z)
            A.append(activation(z))
        caches.append((np.array(A_prev, dtype=np.longdouble), hidden_weights[l], bias[l],
                       np.array(Z, dtype=np.longdouble)))
        A = np.array(A)

    # One flat row per sample. Unlike np.squeeze this never removes the batch axis.
    A = np.array(A)
    if n_samples:
        A = A.reshape(n_samples, -1)

    # Output layer L with softmax.
    Z = []
    A_last = []
    for i in range(n_samples):
        z = np.dot(hidden_weights[L], np.reshape(A[i], (-1, 1))) + bias[L]
        Z.append(z)
        A_last.append(softmax(np.array(z, dtype=np.longdouble)))

    caches.append((A, hidden_weights[L], bias[L], np.array(Z)))

    probabilities = np.array(A_last, dtype=np.longdouble)
    if n_samples:
        probabilities = probabilities.reshape(n_samples, -1)
    return probabilities, caches


In [12]:
def sigmoid_backprop(dA, Z):
    '''
    Gradient of the loss w.r.t. Z for a sigmoid layer.

    dA: gradient w.r.t. the layer's activation.
    Z:  the layer's pre-activation.

    Returns dA * s * (1 - s) with s = sigmoid(Z).

    sigmoid() returns a single array; the earlier `A, Z = sigmoid(Z)` either
    failed to unpack or silently split a two-row array into A and Z.
    '''
    A = sigmoid(Z)
    dZ = dA * A * (1 - A)

    return dZ


def tanh_backprop(dA, Z):
    '''
    Gradient of the loss w.r.t. Z for a tanh layer.

    dA: gradient w.r.t. the layer's activation.
    Z:  the layer's pre-activation.

    Returns dA * (1 - tanh(Z)**2).

    The activation is a single array; the earlier `A, Z = tanh(Z)` either
    failed to unpack or silently split a two-row array into A and Z.
    '''
    A = np.tanh(Z)
    dZ = dA * (1 - np.square(A))

    return dZ


def relu_backprop(dA, Z):
    '''
    Gradient of the loss w.r.t. Z for a ReLU layer.

    dA: gradient w.r.t. the layer's activation.
    Z:  the layer's pre-activation.

    Returns dA where Z > 0 and 0 elsewhere, in extended precision.

    relu(Z) > 0 exactly when Z > 0, so the mask is taken from Z directly
    rather than re-running the forward activation. np.longdouble is the
    portable name for what np.float128 denotes on platforms that have it.
    '''
    mask = np.int64(np.asarray(Z) > 0)
    dZ = np.multiply(dA, mask)

    return np.array(dZ, dtype=np.longdouble)

def softmax_backprop(target, predicted):
    '''
    Returns predicted - target, the output-layer error term used by bwdprop
    for the softmax layer.
    '''
    error = predicted - target
    return error



In [13]:

def bwd_step(dZ, A_prev, W, b):
    '''
    Performs one step of Backward Propagation.

    dZ:     gradient w.r.t. this layer's pre-activation, shape (n, m).
    A_prev: activations fed into this layer, shape (n_prev, m).
    W:      this layer's weights, shape (n, n_prev).
    b:      this layer's bias, shape (n, 1).

    Returns (dW, db, dA_prev) in extended precision, where dW and db are
    divided by m = A_prev.shape[1] and dA_prev = W.T @ dZ.

    np.longdouble is used instead of np.float128, which is not defined on
    every platform; where np.float128 exists the two are the same type.
    '''
    m = A_prev.shape[1]

    dW = (1 / m) * np.dot(dZ, A_prev.T)
    db = (1 / m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    # Shape checks catch a transposed A_prev early.
    assert dA_prev.shape == A_prev.shape
    assert dW.shape == W.shape
    assert db.shape == b.shape

    return (np.array(dW, dtype=np.longdouble),
            np.array(db, dtype=np.longdouble),
            np.array(dA_prev, dtype=np.longdouble))

def bwdprop(output, Y_batch, caches, hidden_weights, lambd, hidden_layers_activation_fn="relu"):
    '''
    Peforms Backward Propagation.

    output:         probabilities returned by fwdprop, one row per sample.
    Y_batch:        one-hot targets of shape (batch, n_classes).
    caches:         list returned by fwdprop; caches[l-1] = (A_prev, W, b, Z).
    hidden_weights: dict {1..L: weight matrix}, used for the L2 term.
    lambd:          L2 regularization strength.
    hidden_layers_activation_fn:
                    the hidden activation, as a name ("relu", "sigmoid",
                    "tanh") or as the function of that name.

    Returns (dW, dB): dicts {layer: list of per-sample gradients}. Every
    per-sample weight gradient already contains the (lambd/m) * W term.

    Raises ValueError when the activation cannot be matched to a derivative.

    Fixes relative to the earlier version:
    * the activation was compared against strings although callers pass the
      function, so the ReLU derivative was always used; it is now resolved
      by name for both strings and functions;
    * the string branches passed a transposed A_prev to bwd_step (a shape
      error) and overwrote the regularized gradient; there is now a single
      code path;
    * m is the number of samples, not the number of classes.
    '''
    backprops = {"relu": relu_backprop, "sigmoid": sigmoid_backprop, "tanh": tanh_backprop}
    if isinstance(hidden_layers_activation_fn, str):
        fn_name = hidden_layers_activation_fn
    else:
        fn_name = getattr(hidden_layers_activation_fn, "__name__", None)
    if fn_name not in backprops:
        raise ValueError("no derivative known for activation %r" % (hidden_layers_activation_fn,))
    activation_backprop = backprops[fn_name]

    m = Y_batch.shape[0]

    output = np.asarray(output)
    Y_batch = Y_batch.reshape(output.shape)
    L = len(caches)
    dW = {}
    dB = {}
    dA = {}

    # Output layer: softmax + cross-entropy gives dZ = predicted - target.
    dZ = np.atleast_2d(softmax_backprop(Y_batch, output))
    n_samples = len(dZ)

    A_prev, W, b, Z = caches[L-1]
    A_prev = np.reshape(A_prev, (n_samples, -1))
    dW[L] = []
    dB[L] = []
    dA[L-1] = []

    for i in range(n_samples):
        dz = np.reshape(dZ[i], (-1, 1))
        dw = np.dot(dz, np.reshape(A_prev[i], (1, -1))) + (lambd/m)*hidden_weights[L]
        dW[L].append(dw)
        dB[L].append(dz)
        dA[L-1].append(np.dot(W.T, dz))

    # Hidden layers, last to first.
    for l in range(L - 1, 0, -1):
        A_prev, W, b, Z = caches[l - 1]
        # One flat row per sample; reshape (not squeeze) keeps the batch axis.
        A_prev = np.reshape(A_prev, (n_samples, -1))
        Z = np.reshape(Z, (n_samples, -1))
        dW[l] = []
        dB[l] = []
        dA[l-1] = []

        for i in range(n_samples):
            dz = np.array(activation_backprop(dA[l][i], np.reshape(Z[i], (-1, 1))))
            # bwd_step expects column vectors: A_prev is (n_prev, 1), dz is (n, 1).
            dw, db, da = bwd_step(dz, np.reshape(A_prev[i], (-1, 1)), W, b)
            dw += (lambd/m)*hidden_weights[l]

            dW[l].append(dw)
            dB[l].append(db)
            dA[l-1].append(da)
    return dW,dB


def update_parameters(hidden_weights, b, dW, db, alpha):
    '''
    Returns the updated weights.

    hidden_weights: dict {1..L: weight matrix}, updated in place.
    b:              dict {1..L: bias column}, updated in place.
    dW, db:         dicts {layer: list of per-sample gradients}.
    alpha:          learning rate.

    Every per-sample gradient is subtracted in turn, scaled by alpha. The
    earlier loop started at index 1 and so silently dropped the first
    sample's gradient of every batch.

    Returns (hidden_weights, b), the same dict objects that were passed in.
    '''
    L = len(hidden_weights)

    for l in range(1, L + 1):
        dweights = np.array(dW[l])
        dbias = np.array(db[l])

        for i in range(len(dweights)):
            hidden_weights[l] -= alpha*(dweights[i])
            b[l] -= alpha*dbias[i]

    return hidden_weights,b

    

In [14]:
def neuralNetwork_train(data, vocabulary_size, num_layers, H, embed_size, alpha, batch_size, epochs,window_size, activation_fn, freq_dict, lambd):
    '''
    Trains the Neural Network.

    data:          sequence of [feature_vector, one_hot_label] samples.
    num_layers:    weight matrices are created for layers 1 .. num_layers-1.
    H:             layer sizes; layer i maps H[i-1] inputs to H[i] outputs.
    alpha:         learning rate.
    batch_size:    number of samples per forward/backward pass.
    epochs:        number of passes over the data.
    activation_fn: hidden-layer activation, forwarded to fwdprop and bwdprop.
    freq_dict:     per-label counts, forwarded to cross_entropy_loss.
    lambd:         L2 regularization strength.
    vocabulary_size, embed_size, window_size: accepted but not used here.

    Plots the summed loss of each epoch with matplotlib.

    Returns (store_val, hidden_weights, b): the per-batch network outputs of
    the last epoch (an empty list when epochs == 0), and the trained weight
    and bias dicts.
    '''
    # np.longdouble is the portable name for what np.float128 denotes on the
    # platforms that define it.
    X = np.array([sample[0] for sample in data], dtype=np.longdouble)
    Y = np.array([sample[1] for sample in data], dtype=np.longdouble)

    hidden_weights = {}
    b = {}

    for i in range(1, num_layers):
        hidden_weights[i] = initialize_weights(H[i], H[i-1])
        b[i] = np.zeros((H[i], 1), dtype=np.longdouble)
    no_windows = len(data)
    costs = []
    # Defined before the loop so that epochs == 0 does not hit an unbound
    # name at the return statement.
    store_val = []

    for epoch in range(epochs):
        cost = 0
        store_val = []
        # NOTE(review): the stop value is no_windows-1, so a trailing batch
        # that would contain only the very last sample is never formed.
        # Possibly a guard against single-sample batches - confirm.
        for i in range(0, no_windows-1, batch_size):
            X_batch = X[i:i+batch_size, :]
            Y_batch = Y[i:i+batch_size, :]

            output, caches = fwdprop(X_batch, hidden_weights, b, activation_fn)
            loss = cross_entropy_loss(output, Y_batch, freq_dict, len(data), hidden_weights, lambd)
            dW, db = bwdprop(output, Y_batch, caches, hidden_weights, lambd, activation_fn)
            store_val.append(output)
            hidden_weights, b = update_parameters(hidden_weights, b, dW, db, alpha)
            cost += np.squeeze(loss)
        costs.append(cost)

    plt.plot(np.arange(epochs), costs)
    plt.xlabel('# of epochs')
    plt.ylabel('cost')
    return store_val,hidden_weights,b
    

In [15]:
def precision_recall_f1score(y_true, predicted, LABELS):
    '''
    Prints the Precision, Recall and F1-score.

    y_true:    sequence of gold label names.
    predicted: sequence of predicted label names, same length as y_true.
    LABELS:    label names to report on.

    For every label the actual and predicted counts are printed, followed by
    precision, recall and F1 averaged over the labels with each label
    weighted by its share of y_true.

    A ratio with a zero denominator (label never predicted, never present, or
    precision + recall == 0) counts as 0 instead of raising
    ZeroDivisionError. Per-label statistics are derived from the LABELS
    argument rather than a hard-coded label set.

    Raises ValueError if the two sequences differ in length.
    '''
    if len(y_true) != len(predicted):
        raise ValueError("y_true and predicted must have the same length")

    def ratio(numerator, denominator):
        # Safe division: undefined ratios are reported as 0.
        return numerator / denominator if denominator else 0.0

    true_counts = Counter(y_true)
    pred_counts = Counter(predicted)
    hit_counts = Counter(t for t, p in zip(y_true, predicted) if t == p)
    total = len(y_true)

    prec3 = 0
    rec3 = 0
    f1_3 = 0
    for label in LABELS:
        tp = hit_counts[label]
        fp = pred_counts[label] - tp   # predicted as label, but gold differs
        fn = true_counts[label] - tp   # gold is label, but predicted differs

        prec = ratio(tp, tp + fp)
        rec = ratio(tp, tp + fn)
        f1 = ratio(2 * prec * rec, prec + rec)

        weight = ratio(true_counts[label], total)
        prec3 += weight * prec
        rec3 += weight * rec
        f1_3 += weight * f1

    for label in LABELS:
        print('Label :',label,'\nActual :',true_counts[label],'Predicted :', pred_counts[label],'\n')
    print('Precision :',prec3)
    print('Recall :',rec3)
    print('F1-Score :',f1_3)

In [16]:
def predict(w,b):
    '''
    Runs the model on the test set and prints the class-wise data and the evaluation metrics values.

    w: dict {layer: weight matrix} as expected by fwdprop.
    b: dict {layer: bias column} as expected by fwdprop.

    Reads 'test.txt' and 'embeddings.txt' from the working directory; both
    files are closed as soon as they have been read.
    '''
    with open('test.txt','r') as test_file:
        test_data = read_conll(test_file)

    # NOTE(review): the vocabulary is rebuilt from the test set itself, so no
    # test token is ever mapped to UNK here - confirm this is intended.
    token_to_number_test, number_to_token_test = createMapping(test_data)

    with open('embeddings.txt') as embeddings_file:
        embedding_layer = embeddingLayer(embeddings_file, token_to_number_test)

    word_emb_test = makeWindowedData(test_data, window_size, embedding_layer, token_to_number_test, LABELS)

    X = [sample[0] for sample in word_emb_test]
    output, caches = fwdprop(X,w,b,relu)
    output = output.tolist()

    # Both predictions and gold labels are decoded by the position of the
    # largest entry in the probability / one-hot vector.
    predicted = [LABELS[row.index(max(row))] for row in output]

    y_true = []
    for sample in word_emb_test:
        one_hot = sample[1].tolist()
        y_true.append(LABELS[one_hot.index(max(one_hot))])

    precision_recall_f1score(y_true, predicted, LABELS)


In [17]:
# #Training
# #Uncomment this cell to Train the model
# data = read_conll(open('train.txt','r'))
# H =[300,150,9]
# token_to_number, num2tok = createMapping(data)
# embedding_layer = embeddingLayer(open('embeddings.txt'), token_to_number)
# word_emb = makeWindowedData(data, window_size, embedding_layer, token_to_number, LABELS)
# freq_dict = classFreqDict(word_emb)
# y_pred, w, b = neuralNetwork_train(word_emb, len(token_to_number), num_layers, H, WORDEMBSIZE, 0.001, 32, 20, window_size, relu, freq_dict, 0.2)
# np.save('weights', w)
# np.save('bias', b)

In [18]:
#Testing
# SECURITY: allow_pickle=True makes np.load unpickle arbitrary Python objects,
# which can execute code. Only load .npy files that come from a trusted source.
# NOTE(review): the commented-out training cell saves to 'weights' / 'bias',
# not to the 'trained_*' names read here - presumably the files were renamed
# by hand; confirm.
w = np.load('trained_weights.npy', allow_pickle=True)
b = np.load('trained_bias.npy', allow_pickle=True)
weights = {}
bias = {}
# .item() extracts the single Python object held by the loaded array
# (presumably the {layer: array} dict that was saved); .get(i) yields None
# for a layer that is missing from it.
for i in range(1, num_layers):
    weights[i] = w.item().get(i)
    bias[i] = b.item().get(i)
# Evaluate on test.txt and print per-label counts and the averaged metrics.
predict(weights, bias)

Label : O 
Actual : 38323 Predicted : 37006 

Label : B-PER 
Actual : 1617 Predicted : 1096 

Label : B-ORG 
Actual : 1661 Predicted : 867 

Label : B-LOC 
Actual : 1668 Predicted : 2949 

Label : B-MISC 
Actual : 702 Predicted : 1303 

Label : I-PER 
Actual : 1156 Predicted : 1582 

Label : I-ORG 
Actual : 835 Predicted : 485 

Label : I-LOC 
Actual : 257 Predicted : 843 

Label : I-MISC 
Actual : 216 Predicted : 304 

Precision : 0.8429413818807842
Recall : 0.8230644987617101
F1-Score : 0.8281321474207554
