In [1]:
import re
import pickle
import json
import string
import gensim
import xml.etree.ElementTree as ET
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import os
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import math
from difflib import SequenceMatcher
import pandas as pd

In [2]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
year_regex = re.compile(r'((19[0-9]{2})|(20[0-9]{2}))[a-z]?')
conversion_dict = {}
stop_words = [',', '.', '(', ')', ':', '-', "+", ";", "a", "about", "al", "al.", "all", 
	"already", "also", "although", "am", "an", "and", "another", "any", "anyhow", "are", 
	"aren", "aren't", "around", "as", "at", "back", "be", "because", "been", 
	"being", "beyond", "but", "by", "can", "cannot", "cant", "co", "con", "could", "couldn", 
	"couldnt", "d", "de", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", 
	"doing", "don", "don't", "done", "due", "each", "either", "else", "elsewhere", "et", 
	"etc", "even", "ever", "except", "for", "found", "from", "further", "had", "hadn", 
	"hadn't", "has", "hasn", "hasn't", "hasnt", "have", "haven", "haven't", "having", 
	"he", "hence", "her", "here", "hereafter", "hereby", "hers", 
	"herself", "him", "himself", "his", "how", "however", "i", "ie", "if", "in", "inc", 
	"indeed", "interest", "into", "is", "isn", "isn't", "it", "it's", "its", "itself", 
	"just", "ltd", "ll", "m", "may", "me", "meanwhile", "might", "mightn", 
	"mightn't", "mine", "moreover", "most", "mostly", "move", "much", "must", "mustn", 
	"mustn't", "my", "myself", "name", "namely", "need", "needn", "needn't", "neither", 
	"nevertheless", "no", "nobody", "noone", "nor", "not", "now", "nowhere", "o", "of", 
	"off", "often", "on", "only", "onto", "or", "other", "others", "otherwise", "our", "own", 
	"per", "perhaps", "put", "rather", "re", "s", "same", "see", "seem", "seemed", 
	"seeming", "seems", "serious", "she", "should", "shouldn", "shouldn't", "since", 
	"sincere", "so", "some", "somehow", "someone", "something", "somewhere", "still", 
	"such", "t", "take", "than", "that", "that'll", "the", "their", "theirs", 
	"them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", 
	"therein", "thereupon", "these", "they", "this", "those", "though", "throughout", 
	"thru", "thus", "to", "together", "too", "toward", "towards", "un", "until", "upon", 
	"us", "ve", "very", "via", "was", "wasn", "wasn't", "we", "well", "were", "weren", 
	"weren't", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", 
	"whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", 
	"whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", 
	"without", "won", "won't", "would", "wouldn", "wouldn't", "y", "yet", "you", "your", 
	"yours", "yourself", "yourselves", "from SVM import SVCone", "two", "three", "four", "five", "six", "seven",
	"eight", "nine", "zero", "between", 'below', 'ourselves', "you'll", 'again', 'once', 'over', 'shan', 'few', 
    'against', 'before', 'out', 'down', 'both', 'up', "you've", "shan't", "you're", "should've", 'ours', 'ma', 
    "couldn't", 'during', 'more', 'ain', 'through', 'after', 'above', "she's", "you'd", 'under' ]

In [3]:
def get_citations(folder):
    citation_list = {}
    for file in os.listdir(folder) :
        tree = ET.parse(folder+file)
        root = tree.getroot()
        id = file[:8]
        for element in root.iterfind("algorithm"):
            if(element.attrib['name']=="ParsCit"):
                citlist = element.getchildren()
                cits = citlist[0].getchildren()
                citations = []
                for cit in cits:
                    cit_dict = {}
                    if(cit.attrib['valid']=="true"):
                        try :
                            title = cit.find('title').text.lower()
                        except :
                            title = cit.find('rawString').text.lower()
                        cit_dict['title'] = title
                        cit_dict['cit'] = cit
                        citations.append(cit_dict)
                
                citation_list[id] = citations
            
    return citation_list 

In [4]:
citations = get_citations("../xmls/")

In [5]:
def get_words(string, cit_auths=None):

    string = string.replace('-', '').lower()
    context_words = word_tokenize(string)
    final_context_words = []

    for word in context_words:
        if word in stop_words:
            continue
        if re.fullmatch(year_regex, word): #or match_auths(word, cit_auths):
            continue
        if re.fullmatch(r'[0-9]+([.][0-9]+)?', word):
            word = '<number>'
            final_context_words.append(word)
        elif 'this_citation' in word:
            final_context_words.append('<this_citation>')
        elif re.fullmatch(r'[a-z]+', word):
            original_word = word
            final_word = lemmatizer.lemmatize(word)
            
            if final_word not in conversion_dict:
                conversion_dict[final_word] = {}

            if original_word not in conversion_dict[final_word]:
                conversion_dict[final_word][original_word] = 0
            conversion_dict[final_word][original_word]+=1

            final_context_words.append(final_word)

    return final_context_words

In [6]:
def get_contexts(citations) :
    dataset = {}
    count = 0
    for key in citations.keys():
        context_list = []
        for cit in citations[key] :
            dict1 = {}
            dict1['paper_name'] = cit['title']
            dict1['context'] = ""
            for context in cit['cit'].findall('contexts/context') :
                text = context.text.lower()
                citstr = context.get('citStr').lower()
                text = text.replace(citstr, "this_citation")
                dict1['context']+= text+" "
            dict1['context'] = get_words(dict1['context'])
            context_list.append(dict1)
        dataset[key] = context_list
        count+=1
        if(count%100==0) :
            print(count)
    return dataset

In [7]:
contexts = get_contexts(citations)

100
200
300
400
500
600
700
800
900
1000
1100


In [8]:
max1 = 0
avg = 0
number = 0
for key in contexts :
    papers = contexts[key]
    for pap in papers :
        val = len(pap['context'])
        avg += val
        number += 1
        if(val>max1) :
            max1 = val
            
print(max1)
print(avg/number)

1141
140.41749366058292


In [9]:
embeddings_dict = {}
with open("glove.6B.100d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [10]:
print(len(embeddings_dict.keys()))

400000


In [12]:
print(embeddings_dict['unk'].shape)

(100,)


In [13]:
mapping = {}
mapping_rev = {}
values = []
count = 0
for key in contexts :
    papers = contexts[key]
    for pap in papers :
        name = pap['paper_name']
        words = pap['context']
        if(len(words)>0) :
            mapping[key+"_|_"+name] = count
            mapping_rev[count] = key+"_|_"+name
            count+=1
            embed = []
            for word in words :
                if(word in embeddings_dict) :
                    embed.append(embeddings_dict[word])
                else :
                    embed.append(embeddings_dict['unk'])
            diff = 1141-len(words)
            if(diff>0) :
                for i in range(diff) :
                    embed.append(np.zeros((100,)))
            values.append(embed)  

In [14]:
print(len(values))

27635


In [15]:
tags = pickle.load(open("../pickles_data/baseline_tags.pkl","rb"))

In [16]:
output = []
for i in range(count) :
    key,name = mapping_rev[i].split("_|_")
    papers = tags[key]
    for pap in papers :
        if(pap['paper_name']==name) :
            output.append(pap['tag'])
            break

In [17]:
print(len(output))

27635


In [18]:
size = len(values)
trainset = values[:int(0.8*size)]
trainlabel = output[:int(0.8*size)]
testset = values[int(0.8*size):]
testlabel = output[int(0.8*size):]

In [46]:
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import CuDNNLSTM, Bidirectional, Dropout

def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)
    

class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 2.0.6
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]
    
def model_lstm_atten(batch_size, timesteps, vectorsize):
    inp = Input(batch_shape=(batch_size, timesteps, vectorsize))
    x = Bidirectional(CuDNNLSTM(32, return_sequences=True))(inp)
    x = AttentionWithContext()(x)
    x = Dense(32, activation="relu")(x)
    x = Dropout(0.25)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    return model

In [22]:
# from sklearn.metrics import precision_recall_fscore_support
# from sklearn.metrics import classification_report

# def eval_model(model, val_loader) :
#     final_out = []
#     final_lab = []

#     for idx, (val_input, val_label) in enumerate(val_loader):
#     # val_input = val_input.permute(1,0,2)
#     # print(train_input.shape)
#         val_input = val_input.type(torch.FloatTensor)
#         val_input = val_input.cuda()
#         output = model(val_input)
#         output = output.cpu().detach().numpy()
#         val_label = val_label.cpu().detach().numpy()
#         ar= []
#         for i in range(output.shape[0]) :
#             index = -1
#             max_val = -1
#             for j in range(len(output[i])) :
#                 if(output[i][j]>max_val) :
#                     index = j
#                     max_val = output[i][j]
#             ar.append(index)
    
#         val_label = list(val_label)
#         final_out.extend(ar)
#         final_lab.extend(val_label)

        
#     return classification_report(final_lab, final_out)

In [35]:
def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [39]:
import tensorflow as tf
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

['/job:localhost/replica:0/task:0/device:GPU:0']


In [40]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17708579766986834680
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 6636438457831287673
physical_device_desc: "device: XLA_GPU device"
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 11351868100505684653
physical_device_desc: "device: XLA_CPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 9922186445
locality {
  bus_id: 2
  numa_node: 1
  links {
  }
}
incarnation: 13917449353595200715
physical_device_desc: "device: 0, name: GeForce RTX 2080 Ti, pci bus id: 0000:af:00.0, compute capability: 7.5"
]


In [41]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [54]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
batch_size = 128
timesteps = 250
vectorsize = 100
model = model_lstm_atten(batch_size, timesteps, vectorsize)
model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=[get_f1])
file_path = ".model.hdf5"
ckpt = ModelCheckpoint(file_path, monitor='val_loss', verbose=1,
                       save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=15)
trainset1 = trainset[:22080]
trainlabel1 = trainlabel[:22080]
testset1 = testset[:5504]
testlabel1 = testlabel[:5504]

In [55]:
model.fit(np.array(trainset1), np.array(trainlabel1), batch_size=64, epochs=15, validation_data=(np.array(testset1), np.array(testlabel1)), callbacks=[ckpt, early])

Train on 22080 samples, validate on 5504 samples
Epoch 1/15

Epoch 00001: val_loss improved from inf to 0.23412, saving model to .model.hdf5
Epoch 2/15

Epoch 00002: val_loss improved from 0.23412 to 0.21257, saving model to .model.hdf5
Epoch 3/15

Epoch 00003: val_loss improved from 0.21257 to 0.20626, saving model to .model.hdf5
Epoch 4/15

Epoch 00004: val_loss improved from 0.20626 to 0.20140, saving model to .model.hdf5
Epoch 5/15

Epoch 00005: val_loss did not improve from 0.20140
Epoch 6/15

Epoch 00006: val_loss did not improve from 0.20140
Epoch 7/15

Epoch 00007: val_loss did not improve from 0.20140
Epoch 8/15

Epoch 00008: val_loss improved from 0.20140 to 0.20119, saving model to .model.hdf5
Epoch 9/15

Epoch 00009: val_loss did not improve from 0.20119
Epoch 10/15

Epoch 00010: val_loss did not improve from 0.20119
Epoch 11/15

Epoch 00011: val_loss did not improve from 0.20119
Epoch 12/15

Epoch 00012: val_loss did not improve from 0.20119
Epoch 13/15

Epoch 00013: val_l

<keras.callbacks.callbacks.History at 0x7fa9f0053748>

In [44]:
np.array(trainset).shape

AttributeError: 'list' object has no attribute 'shape'

In [23]:
# learning_rate = 0.005
# num_epochs = 15
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# torch.cuda.empty_cache()
# model = Attention_Net().to(device)
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# for epoch in range(num_epochs):
#     for batch_id, (train_input, train_label) in enumerate(train_dataloader):
#         optimizer.zero_grad() 
#         train_input = train_input.type(torch.FloatTensor)
#         train_input = train_input.cuda()
#         output = model(train_input)

#         train_label = train_label.type(torch.LongTensor)
#         train_label = train_label.cuda()
#         loss = criterion(output, train_label)
#         loss.backward()
#         optimizer.step()
        
#         if batch_id % 100 == 0:
#             print('Epoch [{}/{}], Loss:{:.4f}'
#                 .format(epoch+1, num_epochs, loss.data))
#             print("classification_report")
#             print(eval_model(model,val_dataloader))
#             print("--------------------------------------------------------------")
#             # losses.append(loss.data)
            
#         del train_input
#         del train_label
#         del output
#         del loss
#         torch.cuda.empty_cache()

Epoch [1/15], Loss:0.6940
classification_report


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      5000
           1       0.10      1.00      0.17       527

    accuracy                           0.10      5527
   macro avg       0.05      0.50      0.09      5527
weighted avg       0.01      0.10      0.02      5527

--------------------------------------------------------------
Epoch [2/15], Loss:0.5060
classification_report
              precision    recall  f1-score   support

           0       0.95      0.87      0.91      5000
           1       0.33      0.59      0.42       527

    accuracy                           0.84      5527
   macro avg       0.64      0.73      0.67      5527
weighted avg       0.89      0.84      0.86      5527

--------------------------------------------------------------
Epoch [3/15], Loss:0.4954
classification_report
              precision    recall  f1-score   support

           0       0.96      0.79      0.87      5000
           1   

In [25]:
# print(eval_model(model, val_dataloader))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96      5000
           1       0.65      0.43      0.52       527

    accuracy                           0.92      5527
   macro avg       0.80      0.70      0.74      5527
weighted avg       0.91      0.92      0.92      5527



In [None]:
# torch.save(model.state_dict, "models/2layer_bilstm_atten_statedict.pt")

In [None]:
# torch.save(model, "models/2layer_bilstm_atten_model.pt")

In [None]:
# class Attention_Net_Features(nn.Module) :
    
    