In [1]:
import re
import pickle
import json
import xml.etree.ElementTree as ET
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import os
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import math
from difflib import SequenceMatcher
import pandas as pd

In [2]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
year_regex = re.compile(r'((19[0-9]{2})|(20[0-9]{2}))[a-z]?')
conversion_dict = {}

In [3]:
stop_words = [',', '.', '(', ')', ':', '-', "+", ";", "a", "about", "al", "al.", "all", 
	"already", "also", "although", "am", "an", "and", "another", "any", "anyhow", "are", 
	"aren", "aren't", "around", "as", "at", "back", "be", "because", "been", 
	"being", "beyond", "but", "by", "can", "cannot", "cant", "co", "con", "could", "couldn", 
	"couldnt", "d", "de", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", 
	"doing", "don", "don't", "done", "due", "each", "either", "else", "elsewhere", "et", 
	"etc", "even", "ever", "except", "for", "found", "from", "further", "had", "hadn", 
	"hadn't", "has", "hasn", "hasn't", "hasnt", "have", "haven", "haven't", "having", 
	"he", "hence", "her", "here", "hereafter", "hereby", "hers", 
	"herself", "him", "himself", "his", "how", "however", "i", "ie", "if", "in", "inc", 
	"indeed", "interest", "into", "is", "isn", "isn't", "it", "it's", "its", "itself", 
	"just", "ltd", "ll", "m", "may", "me", "meanwhile", "might", "mightn", 
	"mightn't", "mine", "moreover", "most", "mostly", "move", "much", "must", "mustn", 
	"mustn't", "my", "myself", "name", "namely", "need", "needn", "needn't", "neither", 
	"nevertheless", "no", "nobody", "noone", "nor", "not", "now", "nowhere", "o", "of", 
	"off", "often", "on", "only", "onto", "or", "other", "others", "otherwise", "our", "own", 
	"per", "perhaps", "put", "rather", "re", "s", "same", "see", "seem", "seemed", 
	"seeming", "seems", "serious", "she", "should", "shouldn", "shouldn't", "since", 
	"sincere", "so", "some", "somehow", "someone", "something", "somewhere", "still", 
	"such", "t", "take", "than", "that", "that'll", "the", "their", "theirs", 
	"them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", 
	"therein", "thereupon", "these", "they", "this", "those", "though", "throughout", 
	"thru", "thus", "to", "together", "too", "toward", "towards", "un", "until", "upon", 
	"us", "ve", "very", "via", "was", "wasn", "wasn't", "we", "well", "were", "weren", 
	"weren't", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", 
	"whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", 
	"whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", 
	"without", "won", "won't", "would", "wouldn", "wouldn't", "y", "yet", "you", "your", 
	"yours", "yourself", "yourselves", "from SVM import SVCone", "two", "three", "four", "five", "six", "seven",
	"eight", "nine", "zero", "between", 'below', 'ourselves', "you'll", 'again', 'once', 'over', 'shan', 'few', 
    'against', 'before', 'out', 'down', 'both', 'up', "you've", "shan't", "you're", "should've", 'ours', 'ma', 
    "couldn't", 'during', 'more', 'ain', 'through', 'after', 'above', "she's", "you'd", 'under' ]

In [4]:
def match_titles(title1, title2):
    title1 = re.sub(r'[\W_]+', '', title1).lower()
    title2 = re.sub(r'[\W_]+', '', title2).lower()
    if title1 in title2 or title2 in title1:
        return True
    return False

In [5]:
def find_common_words(paper_title, cit_title):

	paper_title_words = word_tokenize(paper_title.replace('-', ' ').lower())
	cit_title_words = word_tokenize(cit_title.replace('-', ' ').lower())

	final_paper_title_words = []
	for word in paper_title_words:
		if word in stop_words:
			continue
		elif re.fullmatch(r'[a-z]+', word):
			word = lemmatizer.lemmatize(word)
			final_word = stemmer.stem(word)
			word = final_word

			final_paper_title_words.append(word)

	final_cit_title_words = []
	for word in cit_title_words:
		if word in stop_words:
			continue
		elif re.fullmatch(r'[a-z]+', word):
			word = lemmatizer.lemmatize(word)
			final_word = stemmer.stem(word)
			word = final_word

			final_cit_title_words.append(word)

	return len(set(final_cit_title_words)&set(final_paper_title_words))/(len(set(final_paper_title_words)|set(final_cit_title_words)))

In [6]:
def get_citations(folder):
    citation_list = {}
    for file in os.listdir(folder) :
        tree = ET.parse(folder+file)
        root = tree.getroot()
        id = file[:8]
        for element in root.iterfind("algorithm"):
            if(element.attrib['name']=="ParsCit"):
                citlist = element.getchildren()
                cits = citlist[0].getchildren()
                citations = []
                for cit in cits:
                    cit_dict = {}
                    if(cit.attrib['valid']=="true"):
                        try :
                            title = cit.find('title').text.lower()
                        except :
                            title = cit.find('rawString').text.lower()
                        cit_dict['title'] = title
                        cit_dict['cit'] = cit
                        citations.append(cit_dict)
                
                citation_list[id] = citations
            
    return citation_list 

In [11]:
def get_popularity_feat(citations) :
    global_citStr = []
    for key in citations :
        papers = citations[key]
        for paper in papers :
            global_citStr.extend(list(set([context.get('citStr').lower() for context in paper['cit'].findall('contexts/context')])))
    global_citStr = list(set(global_citStr))
    
    dataset = {}
    count=0
    for key in citations :
        papers = citations[key]
        data = []
        for paper in papers :
            dict1 = {}
            dict1['paper_name'] = paper['title']
            citStrs = list(set([context.get('citStr').lower() for context in paper['cit'].findall('contexts/context')]))
            other_cits = list(set(global_citStr)-set(citStrs))
            count_pop = 0
            for context in paper['cit'].findall('contexts/context') :
                con_text = context.text.lower()
                sentences = sent_tokenize(con_text)
                for sent in sentences :
                    for citstr in citStrs :
                        if(citstr in sent) :
                            for cit in other_cits :
                                count_pop+= con_text.count(cit)                
            dict1['popularity'] = count_pop
            data.append(dict1)
        if(count%100==0) :
            print(count)
        count+=1
        dataset[key] = data
        
    return global_citStr, dataset

In [22]:
def get_paper_titles(folder) :
    names = {}
    papers = []
    count = 0
    for file in os.listdir(folder) :
        tree = ET.parse(folder+file)
        root = tree.getroot()
        id = file[:8]
        for element in root.iterfind("algorithm"):
            if(element.attrib['name']=="SectLabel"):
                for elem in element.iterfind('variant/'):
                    if(elem.tag == "title"):
                        names[id] = elem.text
        papers.append(id)
                        
    ids_not_found = list(set(papers)-set(names.keys()))
    for id in ids_not_found :
        try :
            tree = ET.parse("titles/"+id[0]+"/"+id[:3]+"/"+id[:3]+".xml")
            root = tree.getroot()
            for element in root.iterfind("paper"):
                if(element.attrib['id']==id[4:]):
                    names[id] = element.find('title').text.lower()
        except :
            names[id] = ''
            count+=1
    
    print(count)
    return names

In [23]:
paper_titles = get_paper_titles("xmls/")

1


In [24]:
pickle.dump(paper_titles, open("pickles_data/acl_paper_titles.pkl", "wb"))

In [33]:
paper_tits = {}
authors_acl = pickle.load(open("pickles_data/authors_acl.pkl","rb"))
count = 0
for key in paper_titles :
    data = {}
    data['paper_name'] = paper_titles[key].lower().replace("\n", " ")
    year = int(key[1:3])
    year_str = ""
    if(year>=0 and year<=20) :
        if(len(str(year))==1):
            year_str = "200"+str(year)
        else :
            year_str = "20"+str(year)
    else :
        year_str = "19" + str(year)
    data['year'] = year_str    
    data['authors'] = authors_acl[key]
    if(len(data['authors'])==0) :
        count+=1
    paper_tits[key] = data 
print(count)

148


In [34]:
paper_tits

{'P10-1026': {'paper_name': ' a bayesian method for robust estimation of distributional similarities ',
  'year': '2010',
  'authors': ['jun’ichi kazama',
   'stijn de saeger',
   'kow kuroda',
   'masaki murata',
   'kentaro torisawa']},
 'P13-1111': {'paper_name': ' handling ambiguities of bilingual predicate-argument structures for statistical machine translation ',
  'year': '2013',
  'authors': ['feifei zhai', 'jiajun zhang', 'yu zhou', 'chengqing zong']},
 'W14-4407': {'paper_name': ' a template-based abstractive meeting summarization: leveraging summary and source text relationships ',
  'year': '2014',
  'authors': []},
 'D10-1003': {'paper_name': ' utilizing extra-sentential context for parsing ',
  'year': '2010',
  'authors': []},
 'P13-1171': {'paper_name': ' question answering using enhanced lexical semantic models ',
  'year': '2013',
  'authors': ['wen-tau yih',
   'ming-wei chang',
   'christopher meek',
   'andrzej pastusiak']},
 'J15-1005': {'paper_name': ' automatic 

In [35]:
pickle.dump(paper_tits, open("pickles_data/acl_titles_year_auth.pkl","wb"))

In [19]:
f = open("acl_paper_titles.txt", "w") 
for key in paper_titles :
    name = paper_titles[key]
    if(len(name)>0) :
        f.write(name.lower().replace("\n", " ")+"\n")

In [8]:
def get_cue_count(words) :
    cue_word_list = ['among', 'precis', 'implement', 'maximum', 'experi', 'strategi', 'overal', 'previou', 'correl', 'calcul', 'achiev', 'gold', 'top', 'obtain', 'significantli', 'report', 'best', 'evalu', 'result', 'base', 'modifi', 'extend', 'metric', 'baselin', 'accord', 'perform', 'model', 'recal', 'standard', 'figur', 'comparison', 'yield', 'outperform', 'increas', 'stateoftheart', 'accuraci', 'method', 'procedur', 'score', 'origin', 'highest', 'signific', 'higher', 'fscore', 'compar']
    count = 0
    for word in words :
        if(word in cue_word_list) :
            count+=1
    return count

In [11]:
def get_citation_titles(citation_list) :
    citation_titles = {}
    for id in citation_list.keys() :
        citations = citation_list[id]
        titles = []
        for cit in citations :
            title = cit['title']
            titles.append(title)
        citation_titles[id] = titles
    return citation_titles

In [9]:
def get_overlap(citation_titles, names) :
    dataset = {}
    ids = names.keys()
    for key in citation_titles.keys() :
        paper_title = names[key]
        list_val = []
        for paper in citation_titles[key] :
            overlap = find_common_words(paper_title, paper)
            dict1 = {}
            dict1['paper_name'] = paper
            dict1['overlap'] = overlap
            list_val.append(dict1)
        dataset[key] = list_val
    return dataset   

In [9]:
citations = get_citations("xmls/")
# cit_titles = get_citation_titles(citations)

In [12]:
popularity = get_popularity_feat(citations)

0
100
200
300
400
500
600
700
800
900
1000
1100


In [14]:
pickle.dump(popularity[1], open("pickles_data/popularity_sent.pkl", "wb"))

In [52]:
pickle.dump(cit_titles, open("pickles_data/citation_titles.pkl", "wb"))

In [16]:
paper_titles = get_paper_titles("xmls/")

In [48]:
def get_year_difference(citations) :
    dataset = {}
    count = 0
    count1 = 0
    sum1 = 0
    numb = 0
    for key in citations.keys() :
        count_list = []
        try :
            tree = ET.parse("../Data/titles/"+key[0]+"/"+key[:3]+"/"+key[:3]+".xml")
            root = tree.getroot()
            for element in root.iterfind("paper"):
                if(element.attrib['id']==key[4:]):
                    paper_year = element.find('year').text.lower()
        except :
            paper_year = ""
            count1+=1
            if(int(key[1:3])>=0 and int(key[1:3])<=20):
                year = "20"+key[1:3]
            else :
                year = "19"+key[1:3]
        for cit in citations[key] :
            dict1 = {}
            dict1['paper_name'] = cit['title']
            dict1['paper_year'] = paper_year
            try :
                dict1['cit_year'] = cit['cit'].find('date').text.lower()
            except :
                year_re =  re.findall(year_regex,cit['cit'].find('rawString').text.lower())
                if(len(year_re)>=1) :
                    year_re_ag = year_re[0]
                    if(len(year_re_ag)>=1) :
                        year = re.findall(year_regex,cit['cit'].find('rawString').text.lower())[0][0]
                        dict1['cit_year'] = year
                    else :
                        dict1['cit_year']=""
                else :
                    dict1['cit_year']=""
                    
            if(dict1['paper_year']!="" and dict1['cit_year']!="") :
                dict1['diff'] = int(dict1['paper_year']) - int(dict1['cit_year'])
                sum1+= dict1['diff']
                numb+= 1
                
            else :
                count+=1
                dict1['diff'] = 7
                
            count_list.append(dict1)
            
        dataset[key] = count_list
        
    print(count)
    return dataset                  

In [23]:
def get_section_feature(section_labels, citations):
    dataset = {}
    other_map = {'table_captions':'tables', 'figure_captions':'figures', 'overall':'body'}
    for key in citations.keys() :
        feat_list = []
        for cit in citations[key] :
            data1 = {}
            data1['paper_name'] = cit['title']
            citStrs = list(set([context.get('citStr').lower() for context in cit['cit'].findall('contexts/context')]))
            section_feature = {'experiment':{'body':0, 'figures':0, 'tables':0, 'note':0, 'overall':0}, 'conclusion':{'body':0, 'figures':0, 'tables':0, 'note':0, 'overall':0}, 'discussion':{'body':0, 'figures':0, 'tables':0, 'note':0, 'overall':0}, 'other_sections':{'body':0, 'figures':0, 'tables':0, 'note':0, 'overall':0}, 'related_work':{'body':0, 'figures':0, 'tables':0, 'note':0, 'overall':0}, 'abstract':{'body':0, 'figures':0, 'tables':0, 'note':0, 'overall':0}, 'introduction':{'body':0, 'figures':0, 'tables':0, 'note':0, 'overall':0}}
            for section in section_labels[key]:
                if section=='overall':
                    continue
                for subsection in section_labels[key][section]:
                    text = section_labels[key][section][subsection]
                    for citStr in citStrs:
                        count = text.count(citStr)
                        try:
                            section_feature[section][subsection]+=count
                        except KeyError:
                            section_feature[section][other_map[subsection]]+=count
                        text = text.replace(citStr, '')
            data1['section_feature'] = section_feature
            feat_list.append(data1)
        dataset[key] = feat_list 
    return dataset

In [28]:
def get_location_feature(sect_feat):
    dataset = {}
    dataset1 = {}
    for key in sect_feat.keys():
        citations = sect_feat[key]
        count_list = []
        count_list1 = []
        for cit in citations :
            dict1 = {}
            dict2 = {}
            dict1['paper_name'] = cit['paper_name']
            dict2['paper_name'] = cit['paper_name']
            dict2['num_table'] = 0
            sect = cit['section_feature']
            data = []
            data1 = []
            sections = list(sect.keys())
            sections.sort()
            for section in sections :
                if(section!='discussion'):
                    subsections = list(sect[section].keys())
                    subsections.sort()
                    count = 0
                    for subsection in subsections :
                        if(subsection!='overall'):
                            count+= sect[section][subsection]
                        if(subsection=='tables'):
                            dict2['num_table']+= sect[section][subsection]  
                            data1.append(sect[section][subsection])
                    data.append(count)
            count_list1.append(dict2)
            dict1['location_feature'] = data
            dict2['table_loc'] = data1
            count_list.append(dict1)
        dataset[key] = count_list  
        dataset1[key] = count_list1
    return dataset,dataset1

In [44]:
def get_baselines():
    f = open("./annotation/annotations2")
    data = []
    for line in f:
        data.append(line)

    dataset = {}
    for line in data:
        x = line.split("_")
        dataset[x[0]] = x[1:]
        
    f = open("./annotation/django_annotations")
    data = []
    for line in f:
        data.append(line)
        
    for line in data :
        x = line.split("_")
        if(len(x)>1) :
            dataset[x[0]] = x[1:]
            
    df = pd.read_excel("./annotation/Baselines.xlsx")
    ar = []
    for i in range(586) :
        if(df['Baseline Papers'][i]!='None' and type(df['Baseline Papers'][i])!=type(0.1)):
            ar.append(i)
    
    for ind in ar :
        paperId = df['PaperId'][ind]
        if(len(paperId)!=8):
            paperId = paperId[:-1]      
        x = df['Baseline Papers'][ind].split("_")
        
        dataset[paperId] = x
            
    return dataset

In [38]:
def get_fixed_context(citations):
    dataset = {}
    for key in citations.keys():
        context_list = []
        for cit in citations[key] :
            dict1 = {}
            dict1['paper_name'] = cit['title']
            dict1['fixed_context'] = []
            for context in cit['cit'].findall('contexts/context') :
                words = get_words(context.text.lower())
                if(len(words)==0):
                    dict1['fixed_context'].append(['<citation>'])
                else :
                    a = int(len(words)/2)
                    dict1['fixed_context'].append(words[a-20//2:a+20//2])
            if(len(dict1['fixed_context'])==0):
                dict1['fixed_context'].append(['<citation>'])
            context_list.append(dict1)
        dataset[key] = context_list
        
    return dataset

In [32]:
def get_contexts(citations) :
    dataset = {}
    count = 0
    for key in citations.keys():
        context_list = []
        for cit in citations[key] :
            dict1 = {}
            dict1['paper_name'] = cit['title']
            dict1['context'] = ""
            for context in cit['cit'].findall('contexts/context') :
                text = context.text.lower()
                citstr = context.get('citStr').lower()
                text = text.replace(citstr, "this_citation")
                dict1['context']+= text+" "
            dict1['context'] = get_words(dict1['context'])
            context_list.append(dict1)
        dataset[key] = context_list
        count+=1
        if(count%100==0) :
            print(count)
    return dataset

In [34]:
def get_words(string, cit_auths=None):

    string = string.replace('-', '').lower()
    context_words = word_tokenize(string)
    final_context_words = []

    for word in context_words:
        if word in stop_words:
            continue
        if re.fullmatch(year_regex, word): #or match_auths(word, cit_auths):
            continue
        if re.fullmatch(r'[0-9]+([.][0-9]+)?', word):
            word = '<number>'
            final_context_words.append(word)
        elif 'this_citation' in word:
            final_context_words.append('<this_citation>')
        elif re.fullmatch(r'[a-z]+', word):
            original_word = word
            word = lemmatizer.lemmatize(word)
            final_word = stemmer.stem(word)

            if final_word not in conversion_dict:
                conversion_dict[final_word] = {}

            if original_word not in conversion_dict[final_word]:
                conversion_dict[final_word][original_word] = 0
            conversion_dict[final_word][original_word]+=1

            final_context_words.append(final_word)

    return final_context_words

In [45]:
def get_baseline_tags(baselines, citations):
    dataset = {}
    ids = citations.keys()
    for id in ids :
        cits = citations[id]
        baselist = baselines[id]
        citlist = []
        for cit in cits :
            title = cit['title']
            dict1 = {}
            dict1['paper_name'] = title
            check = 0
            for base in baselines[id] :
                if(match_titles(base, title)):
                    check = 1
                    dict1['tag'] = 1
                    break
            if(check==0) :
                dict1['tag']= 0
            citlist.append(dict1)
        dataset[id] = citlist
    return dataset 

In [21]:
citations = get_citations("xmls_total/")

In [41]:
paper_titles = get_paper_titles("xmls_total/")

In [22]:
def get_overlap(citation_titles, names) :
    dataset = {}
    ids = names.keys()
    for key in citation_titles.keys() :
        paper_title = names[key]
        list_val = []
        for paper in citation_titles[key] :
            overlap = find_common_words(paper_title, paper)
            dict1 = {}
            dict1['paper_name'] = paper
            dict1['overlap'] = overlap
            list_val.append(dict1)
        dataset[key] = list_val
    return dataset    

In [42]:
cit_titles = get_citation_titles(citations)
overlap = get_overlap(cit_titles,paper_titles)

In [33]:
pickle.dump(cit_titles, open("pickles/citation_titles.pkl","wb"))

In [43]:
pickle.dump(overlap, open("pickles/title_overlap.pkl","wb"))

In [22]:
section_labels = pickle.load(open("pickles_data/section_labels.pkl", "rb"))

In [24]:
sect_feat = get_section_feature(section_labels, citations)

In [25]:
pickle.dump(sect_feat, open("pickles_data/section_feature.pkl","wb"))

In [26]:
print(len(sect_feat.keys()))

1181


In [29]:
loc_feat, num_table = get_location_feature(sect_feat)

In [30]:
pickle.dump(loc_feat, open("pickles_data/location_feature.pkl","wb"))
pickle.dump(num_table, open("pickles_data/num_table.pkl","wb"))

In [35]:
contexts = get_contexts(citations)

100
200
300
400
500
600
700
800
900
1000
1100


In [36]:
pickle.dump(contexts, open("pickles_data/context_words.pkl","wb"))

In [39]:
fixed_context = get_fixed_context(citations)

In [40]:
pickle.dump(fixed_context, open("pickles_data/fixed_context.pkl","wb"))

In [41]:
cue_word_list = ['among', 'precis', 'implement', 'maximum', 'experi', 'strategi', 'overal', 'previou', 'correl', 'calcul', 'achiev', 'gold', 'top', 'obtain', 'significantli', 'report', 'best', 'evalu', 'result', 'base', 'modifi', 'extend', 'metric', 'baselin', 'accord', 'perform', 'recal', 'standard', 'figur', 'comparison', 'yield', 'outperform', 'increas', 'stateoftheart', 'accuraci', 'method', 'procedur', 'score', 'origin', 'highest', 'signific', 'higher', 'fscore', 'compar']

In [42]:
def get_cue_words(contexts) :
    dataset = {}
    for key in contexts.keys() :
        count_list = []
        for cit in contexts[key] :
            dict1 = {}
            dict1['paper_name'] = cit['paper_name']
            context_words = cit['context']
            cue_words = list(set(cue_word_list)&set(context_words))
            count = 0
            for word in cue_words :
                count+=context_words.count(word)
            dict1['cue_count'] = count
            count_list.append(dict1)
        dataset[key] = count_list
    return dataset

In [43]:
cue_count = get_cue_words(contexts)
pickle.dump(cue_count, open("pickles_data/cue_count.pkl", "wb"))

In [20]:
def get_context_count(citations):
    dataset = {}
    for key in citations :
        papers = citations[key]
        feat_list = []
        for paper in papers :
            dict1 = {}
            dict1['paper_name'] = paper['title']
            dict1['context_count'] = len(paper['cit'].findall("contexts/context"))
            feat_list.append(dict1)
        dataset[key] = feat_list
    return dataset

In [21]:
context_count = get_context_count(citations)
pickle.dump(context_count, open("pickles_data/context_count.pkl","wb"))

In [46]:
baselines = get_baselines()
tags = get_baseline_tags(baselines, citations)

In [47]:
pickle.dump(tags, open("pickles_data/baseline_tags.pkl","wb"))

In [49]:
year_diff = get_year_difference(citations)

31943


In [50]:
pickle.dump(year_diff, open("pickles_data/year_diff.pkl", "wb"))

In [22]:
def get_dist_words(string, cit_auths=None):
    
    string = string.replace('-', '').lower()
    context_words = word_tokenize(string)
    final_context_words = []

    # conversion_dict = pickle.load(open('pickles/conversion_dict.pkl', 'rb'))

    orig_pos = []

    for i in range(len(context_words)):
        word = context_words[i]
        if word in stop_words:
            continue
        if re.fullmatch(year_regex, word): #or match_auths(word, cit_auths):
            continue
        if re.fullmatch(r'[0-9]+([.][0-9]+)?', word):
            word = '<number>'
            orig_pos.append(i)
            final_context_words.append(word)
        elif 'this_citation' in word:
            orig_pos.append(i)
            final_context_words.append('<this_citation>')
        elif re.fullmatch(r'[a-z]+', word):
            original_word = word
            word = lemmatizer.lemmatize(word)
            final_word = stemmer.stem(word)

            if final_word not in conversion_dict:
                conversion_dict[final_word] = {}

            if original_word not in conversion_dict[final_word]:
                conversion_dict[final_word][original_word] = 0
            conversion_dict[final_word][original_word]+=1

            orig_pos.append(i)
            final_context_words.append(final_word)

    # pickle.dump(conversion_dict, open('pickles/conversion_dict.pkl', 'wb'))

    word_dist = calc_dist(final_context_words, orig_pos)
    return final_context_words, word_dist

In [23]:
def calc_dist(context_words, orig_pos) :  
    distances = []
    pivot_positions = []
    max_pos = []
    for i in range(len(context_words)):
        if context_words[i]=='<this_citation>':	
            pivot_positions.append(orig_pos[i])
            max_pos.append(max(i, len(context_words)-i))

    for i in range(len(context_words)):
        if orig_pos[i] in pivot_positions:
            distances.append(0)
            continue
        else:
            dist = 1
            for k in range(len(pivot_positions)):
                pos = pivot_positions[k]
                dist = min(dist, (abs(pos-orig_pos[i])/max_pos[k])**2)
            distances.append(dist)

    return distances

In [24]:
def get_contexts_dist(citations) :
    dataset = {}
    count = 0
    for key in citations.keys():
        context_list = []
        for cit in citations[key] :
            dict1 = {}
            dict1['paper_name'] = cit['title']
            dict1['context'] = ""
            for context in cit['cit'].findall('contexts/context') :
                text = context.text.lower()
                citstr = context.get('citStr').lower()
                text = text.replace(citstr, "this_citation")
                dict1['context']+= text+" "
            dict1['context'], dict1['distances'] = get_dist_words(dict1['context'])
            context_list.append(dict1)
        dataset[key] = context_list
        count+=1
        if(count%100==0) :
            print(count)
    return dataset          

In [25]:
contexts_dist = get_contexts_dist(citations)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300


In [26]:
pickle.dump(contexts_dist, open("pickles/contexts_dist.pkl","wb"))

In [19]:
def get_reduced_context(citations):
    dataset = {}
    count = 0
    for key in citations.keys():
        context_list = []
        for cit in citations[key] :
            dict1 = {}
            dict1['paper_name'] = cit['title']
            dict1['context'] = ""
            for context in cit['cit'].findall('contexts/context') :
                text = context.text.lower()
                citstr = context.get('citStr').lower()
                text = text.replace(citstr, "this_citation")
                a = len(text)
                text = text[a//2-500:a//2+500]
                dict1['context']+= text+" "
            dict1['context'] = get_words(dict1['context'])
            context_list.append(dict1)
        dataset[key] = context_list
        count+=1
        if(count%100==0) :
            print(count)
    return dataset
    

In [20]:
citations = get_citations("xmls/")

In [21]:
reduced_context = get_reduced_context(citations)

100
200
300
400
500
600
700
800
900
1000
1100


In [25]:
cue_words_reduced = get_cue_words(reduced_context)

In [26]:
pickle.dump(cue_words_reduced, open("pickles/cue_words_reduced.pkl", "wb"))

In [27]:
pickle.dump(reduced_context, open("pickles/reduced_context.pkl", "wb"))

In [10]:
tags = pickle.load(open("pickles_data/baseline_tags.pkl", "rb"))

In [7]:
def get_abstract_similarity() :
    abstracts = pickle.load(open("pickles_data/abstracts_total.pkl", "rb"))
    paper_info = pickle.load(open("pickles/paper_info.pickle","rb"))
    
    dataset = {}
    for key in paper_info :
        papers = paper_info[key]
        abstract = abstracts[key]
        data = []
        for paper in papers :
            abcit = paper['abstract']
            try :
                abcit = str(abcit)
            except :
                abcit = ""
            sim = find_common_words(abcit, abstract)
            dict1 = {}
            dict1['paper_name'] = paper['paper_name']
            dict1['abs_sim'] = sim
            data.append(dict1)
        dataset[key] = data
    return dataset

In [12]:
abs_sim = get_abstract_similarity()

In [14]:
pickle.dump(abs_sim, open("pickles_data/abstract_similarity.pkl", "wb"))