In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import string
import random
from tqdm import tqdm
from datetime import datetime
from heapq import heappush,heappop
from collections import Counter,defaultdict
import multiprocessing as mp
from sklearn.utils import shuffle
import time
import networkx as nx
import copy
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
print("Is GPU available? ",torch.cuda.is_available())

def preOrder(tree):
    levels = dict()
    def dfs(root,level,i):
        if root==None:
            return i
        ls = levels.get(level,[])
        ls.append(root.word+":"+str(i))
        i+=1
        levels[level]=ls
        i = dfs(root.left,level+1,i)
        i = dfs(root.right,level+1,i)
        return i
    dfs(tree,0,0)
    return levels

class Tree:
    def __init__(self,left=None,right=None,word=""):
        self.left = left
        self.right = right
        self.word = word

class Solution:
    def solve(self,string):
        self.i=0
        self.word=""
        tree = Tree()
        def dfs(root): # preorder
            if self.i==len(string):
                return
            if string[self.i]=="(":
                if self.word.strip()!="":
                    root.word=self.word.strip()
                else:
                    self.i+=1
                    return dfs(root)
                self.word=""
                self.i+=1
                root.left=Tree()
                dfs(root.left)
                while string[self.i]==")":
                    self.i+=1
                root.right=Tree()
                dfs(root.right)
            elif string[self.i]==")":
                root.word=self.word.strip()
                self.word=""
                self.i+=1
            else:
                self.word+=string[self.i]
                self.i+=1
                dfs(root)
        dfs(tree)
        return tree
    
def draw(levels):
    levels_ = copy.deepcopy(levels)
    stack = [levels_[0][0]]
    G = nx.Graph()
    i = 1
    while stack:
        prev = stack.pop(0)
        if i not in levels_:
            break
        cur1 = levels_[i].pop(0)
        cur2 = levels_[i].pop(0)
        if cur1.split(":")[0].strip() in string.digits:
            stack.append(cur1)
        if cur2.split(":")[0].strip() in string.digits:
            stack.append(cur2)
        G.add_edge(prev,cur1)
        G.add_edge(prev,cur2)
        if len(levels_[i])==0:
            i+=1
    pos = {}
    for key in levels.keys():
        width = len(levels[key])
        x0 = -0.5-(width//2-1)
        for j in range(len(levels[key])):
            try:
                x = int(levels[key][j].split(":")[1])
                pos[levels[key][j]]=(x0+j,10-key*0.5)
            except:
                pass
    d = dict(G.degree)
    plt.figure(figsize=(20,20*len(levels)/8))
    nx.draw_networkx(G,pos,
        node_size=[5000 for v in d.values()])
    plt.axis("off")
    
s = Solution()
        
PATHglove="/home/yui/Documents/data/nlp/glove.6B/glove.6B.50d.txt"
PATH_Train_Tree="/home/yui/Documents/data/nlp/trainDevTestTrees_PTB/trees/train.txt"
PATH_Test_Tree="/home/yui/Documents/data/nlp/trainDevTestTrees_PTB/trees/test.txt"

class preprocess:
    def __init__(self,PATHglove,PATH_Train_Tree,
                PATH_Test_Tree):
        self.PATHglove = PATHglove
        self.PATH_Train_Tree=PATH_Train_Tree
        self.PATH_Test_Tree=PATH_Test_Tree
        self.l2id,self.id2l=dict(),dict()
        self.initEmbeddings()
        self.loadTreeString()
        self.getRequiredVocabs()
        self.buildEmbeddingsMatrix()
    def initEmbeddings(self):
        self.w2id,self.id2w,self.res={},{},[]
        with open(self.PATHglove) as f:
            lines = f.readlines()
            for i,line in enumerate(lines):
                line = line.strip()
                tokens = line.split(" ")
                word = tokens[0]
                vec = list(map(float,tokens[1:]))
                self.res.append(vec)
                self.w2id[word]=i
                self.id2w[i]=word
        self.k = len(self.res[0])
        print("[INFO] Loading Embeddings ... Done")
        print("Vocabulary size: ",len(self.w2id))
        print("Embeddings: ",self.res[0])
        print("----------")
    def loadTreeString(self):
        self.data = []
        with open(self.PATH_Train_Tree) as f:
            lines = f.readlines()
            for line in lines:
                self.data.append(line)
        self.tdata = []
        with open(self.PATH_Test_Tree) as f:
            lines = f.readlines()
            for line in lines:
                self.tdata.append(line)
    def extractLevels(self,l):
        words,j = [],0
        for key in l.keys():
            ls = l[key]
            for ele in ls:
                partition=ele.split(":")[0].strip().split(" ")
                label = partition[0].strip()
                if label not in self.l2id:
                    self.l2id[label]=j
                    self.id2l[j]=label
                    j+=1
                if len(partition)==2:
                    word = partition[1].lower()
                    self.vocabs.add(word)
                    words.append(word)
                    if word not in self.w2id:
                        i=len(self.w2id)
                        self.w2id[word]=i
                        self.id2w[i]=word
                
        return words
    def getRequiredVocabs(self):
        self.trainTrees,self.testTrees=[],[]
        self.vocabs=set()
        s = Solution()
        for d in tqdm(self.data):
            t = s.solve(d)
            self.trainTrees.append(t)
            l = preOrder(t)
            words = self.extractLevels(l)
        for d in tqdm(self.tdata):
            t = s.solve(d)
            self.testTrees.append(t)
            l = preOrder(t)
            words = self.extractLevels(l)
            
        print("Vocabularies required in Tree: ",len(self.vocabs))
        print("----------")
        
    def buildEmbeddingsMatrix(self):
        self.embedMatrix=np.zeros((len(self.vocabs),self.k))
        self.w2id_exp,self.id2w_exp=dict(),dict()
        for i,vocab in enumerate(self.vocabs):
            self.w2id_exp[vocab]=i
            self.id2w_exp[i]=vocab
            try:
                self.embedMatrix[i]=self.res[self.w2id[vocab]]
            except:
                self.embedMatrix[i]=np.random.randn(self.k)
        print("Embedding Matrix: ",self.embedMatrix)
        print("----------")
pp = preprocess(PATHglove,PATH_Train_Tree,PATH_Test_Tree)

Is GPU available?  True


  9%|▉         | 761/8544 [00:00<00:01, 7606.43it/s]

[INFO] Loading Embeddings ... Done
Vocabulary size:  400000
Embeddings:  [0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.044457, -0.49688, -0.17862, -0.00066023, -0.6566, 0.27843, -0.14767, -0.55677, 0.14658, -0.0095095, 0.011658, 0.10204, -0.12792, -0.8443, -0.12181, -0.016801, -0.33279, -0.1552, -0.23131, -0.19181, -1.8823, -0.76746, 0.099051, -0.42125, -0.19526, 4.0071, -0.18594, -0.52287, -0.31681, 0.00059213, 0.0074449, 0.17778, -0.15897, 0.012041, -0.054223, -0.29871, -0.15749, -0.34758, -0.045637, -0.44251, 0.18785, 0.0027849, -0.18411, -0.11514, -0.78581]
----------


100%|██████████| 8544/8544 [00:01<00:00, 4306.86it/s]
100%|██████████| 2210/2210 [00:00<00:00, 7943.81it/s]


Vocabularies required in Tree:  18646
----------
Embedding Matrix:  [[-0.019331  -0.98478   -0.090353  ...  0.5815    -0.56574   -0.54883  ]
 [-1.0163     0.2053    -0.013379  ...  0.81246   -0.36699    0.62712  ]
 [ 0.40235   -0.45165   -0.90334   ...  0.68559    0.97844    0.36966  ]
 ...
 [-0.72956    0.38623    1.0412    ... -1.2386    -0.86671    0.46243  ]
 [-0.73887   -0.88399   -0.0055762 ... -0.91796   -0.31749   -1.2714   ]
 [-0.22895    0.30128    1.523     ... -0.1907     0.42533    0.2329   ]]
----------


In [2]:
class TNN(nn.Module):
    def __init__(self,pp):
        super().__init__()
        self.pp = pp
        self.emb = nn.Embedding(len(pp.w2id_exp),
                        len(pp.res[0]))
        self.emb.load_state_dict({'weight':\
                        torch.Tensor(pp.embedMatrix)})
        self.emb.weight.requires_grad = False
        self.Wl = nn.parameter.Parameter(\
            torch.randn(len(pp.res[0]),len(pp.res[0])))
        self.Wr= nn.parameter.Parameter(\
            torch.randn(len(pp.res[0]),len(pp.res[0])))
        self.b = nn.parameter.Parameter(\
            torch.randn(len(pp.res[0])))
        self.Wo = nn.parameter.Parameter(\
            torch.randn(len(pp.res[0]),len(pp.l2id)))
        self.bo = nn.parameter.Parameter(\
            torch.randn(len(pp.l2id)))
        self.loss = nn.CrossEntropyLoss()
        self.outF = nn.Softmax(dim=2)
        self.actF = nn.ReLU()
    def postOrder(self,tree):
        def dfs(root):
            if root.word.strip() not in string.digits:
                tokens = root.word.strip().split(" ")
                score,word=tokens[0],tokens[1].lower()
                v=torch.tensor([[self.pp.w2id_exp[word]]])
                v=self.emb(v)
                out=self.outF(torch.matmul(v,
                    self.Wo)+self.bo).squeeze(0)
                target=torch.tensor([self.pp.l2id[score]])
                return v,self.loss(out,target),\
                    1*(torch.argmax(out)==target.squeeze())
            xl,lossl,hitl=dfs(root.left)
            xr,lossr,hitr= dfs(root.right)
            h = torch.matmul(xl,self.Wl)+self.b+\
                    torch.matmul(xr,self.Wr)
            h = self.actF(h)
            tokens = root.word.strip().split(" ")
            out=self.outF(torch.matmul(h,
                self.Wo)+self.bo).squeeze(0)
            target=torch.tensor([self.pp.l2id[tokens[0]]])
            return h,self.loss(out,target),\
                1*(torch.argmax(out)==target.squeeze())
        return dfs(tree)
    def forward(self,x):
        if type(x)==Tree:
            return self.postOrder(x)
        elif type(x)==list:
            h,loss = torch.empty((len(x),1,self.pp.k)),0
            hit,ignore=0,[]
            for i,x_ in enumerate(x):
                try:
                    h[i],loss_,hit_=self.postOrder(x_)
                    hit+=hit_
                    loss+=loss_
                except:
                    ignore.append(i)
            return h,loss,hit,(len(x)-len(ignore))

In [3]:
model = TNN(pp)
batch_size = 128
optimizer = optim.SGD(model.parameters(),lr=0.001,momentum=0.9)
# optimizer = optim.Adam(model.parameters(),lr=1e-3)
epochs = 200
printInterval=1
losses = []
for epoch in range(200):
    random.shuffle(pp.trainTrees)
    running_loss,numExamples,numHits=0,0,0
    trainSize = len(pp.trainTrees)
    steps = trainSize//batch_size
    for i in tqdm(range(steps)):
        optimizer.zero_grad()
        _,loss,hits,cases=model(\
            pp.trainTrees[i*batch_size:(i+1)*batch_size])
        loss.backward()
        optimizer.step()
        numHits+=hits
        numExamples+=cases
        running_loss+=loss.item()
    if epoch%printInterval==0:
        print("Epoch {} training loss: {:.4f}".format(epoch,
                    running_loss/numExamples))
        print("Epoch {} training accuracy: {:.4f}".format(epoch,
                    numHits/numExamples))
        losses.append(running_loss)
        

100%|██████████| 66/66 [00:24<00:00,  2.65it/s]
  0%|          | 0/66 [00:00<?, ?it/s]

Epoch 0 training loss: 1.6453
Epoch 0 training accuracy: 0.2595


100%|██████████| 66/66 [00:25<00:00,  2.62it/s]
  0%|          | 0/66 [00:00<?, ?it/s]

Epoch 1 training loss: 1.6464
Epoch 1 training accuracy: 0.2584


100%|██████████| 66/66 [00:24<00:00,  2.65it/s]
  0%|          | 0/66 [00:00<?, ?it/s]

Epoch 2 training loss: 1.6457
Epoch 2 training accuracy: 0.2591


100%|██████████| 66/66 [00:25<00:00,  2.63it/s]
  0%|          | 0/66 [00:00<?, ?it/s]

Epoch 3 training loss: 1.6463
Epoch 3 training accuracy: 0.2585


 15%|█▌        | 10/66 [00:04<00:23,  2.39it/s]


KeyboardInterrupt: 