In [2]:
import re
import pickle
import json
import xml.etree.ElementTree as ET
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import os
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import math
from difflib import SequenceMatcher
import pandas as pd

In [3]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
year_regex = re.compile(r'((19[0-9]{2})|(20[0-9]{2}))[a-z]?')
conversion_dict = {}
stop_words = [',', '.', '(', ')', ':', '-', "+", ";", "a", "about", "al", "al.", "all", 
	"already", "also", "although", "am", "an", "and", "another", "any", "anyhow", "are", 
	"aren", "aren't", "around", "as", "at", "back", "be", "because", "been", 
	"being", "beyond", "but", "by", "can", "cannot", "cant", "co", "con", "could", "couldn", 
	"couldnt", "d", "de", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", 
	"doing", "don", "don't", "done", "due", "each", "either", "else", "elsewhere", "et", 
	"etc", "even", "ever", "except", "for", "found", "from", "further", "had", "hadn", 
	"hadn't", "has", "hasn", "hasn't", "hasnt", "have", "haven", "haven't", "having", 
	"he", "hence", "her", "here", "hereafter", "hereby", "hers", 
	"herself", "him", "himself", "his", "how", "however", "i", "ie", "if", "in", "inc", 
	"indeed", "interest", "into", "is", "isn", "isn't", "it", "it's", "its", "itself", 
	"just", "ltd", "ll", "m", "may", "me", "meanwhile", "might", "mightn", 
	"mightn't", "mine", "moreover", "most", "mostly", "move", "much", "must", "mustn", 
	"mustn't", "my", "myself", "name", "namely", "need", "needn", "needn't", "neither", 
	"nevertheless", "no", "nobody", "noone", "nor", "not", "now", "nowhere", "o", "of", 
	"off", "often", "on", "only", "onto", "or", "other", "others", "otherwise", "our", "own", 
	"per", "perhaps", "put", "rather", "re", "s", "same", "see", "seem", "seemed", 
	"seeming", "seems", "serious", "she", "should", "shouldn", "shouldn't", "since", 
	"sincere", "so", "some", "somehow", "someone", "something", "somewhere", "still", 
	"such", "t", "take", "than", "that", "that'll", "the", "their", "theirs", 
	"them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", 
	"therein", "thereupon", "these", "they", "this", "those", "though", "throughout", 
	"thru", "thus", "to", "together", "too", "toward", "towards", "un", "until", "upon", 
	"us", "ve", "very", "via", "was", "wasn", "wasn't", "we", "well", "were", "weren", 
	"weren't", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", 
	"whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", 
	"whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", 
	"without", "won", "won't", "would", "wouldn", "wouldn't", "y", "yet", "you", "your", 
	"yours", "yourself", "yourselves", "from SVM import SVCone", "two", "three", "four", "five", "six", "seven",
	"eight", "nine", "zero", "between", 'below', 'ourselves', "you'll", 'again', 'once', 'over', 'shan', 'few', 
    'against', 'before', 'out', 'down', 'both', 'up', "you've", "shan't", "you're", "should've", 'ours', 'ma', 
    "couldn't", 'during', 'more', 'ain', 'through', 'after', 'above', "she's", "you'd", 'under' ]

In [4]:
import xml.etree.ElementTree as ET
import re
import pickle
import json
import os

In [5]:
def match_titles(title1, title2):
    title1 = re.sub(r'[\W_]+', '', title1).lower()
    title2 = re.sub(r'[\W_]+', '', title2).lower()
    if title1 in title2 or title2 in title1:
        return True
    return False

In [6]:
tag_to_type_mapping = {'bodyText':'body', 'listItem':'body', 'figureCaption':'figure_captions', 'tableCaption':'table_captions', 'table':'tables', 'figure':'figures', 'note':'note', 'footnote':'note'}

In [7]:
def assign_section_heading_type(sectionHeader):

    exp_regex = re.compile(r'experiment|empiric')
    meth_regex = re.compile(r'method|approach|architect')
    eval_regex = re.compile(r'evaluat|result|analys|compar|perform|discussion')
    int_regex = re.compile(r'introduction')
    rel_regex = re.compile(r'related work|background|previous work|study')
    con_regex = re.compile(r'conclusion|future work')
    ref_regex = re.compile(r'referenc')


    if exp_regex.search(sectionHeader):
        heading_type = 'experiment'
    elif meth_regex.search(sectionHeader):
        heading_type = 'method'
    elif eval_regex.search(sectionHeader) :
        heading_type = 'evaluation'
    elif int_regex.search(sectionHeader):
        heading_type = 'introduction'
    elif rel_regex.search(sectionHeader):
        heading_type = 'related_work'
    elif con_regex.search(sectionHeader):
        heading_type = 'conclusion'
    elif ref_regex.search(sectionHeader):
        heading_type = 'reference'
    else:
        heading_type = 'other_sections'
        # print('section', sectionHeader.lstrip())

    return heading_type

In [8]:
def sectlabel(tree_root):

    section_labeling = {'overall':'', 'experiment':{'body':'', 'figures':'', 'figure_captions':'', 'tables':'', 'table_captions':'', 'note':'', 'overall':''},
                        'method':{'body':'', 'figures':'', 'figure_captions':'', 'tables':'', 'table_captions':'', 'note':'', 'overall':''} ,
                        'conclusion':{'body':'', 'overall':'', 'figures':'', 'figure_captions':'', 'tables':'', 'table_captions':'', 'note':''},
                        'evaluation':{'body':'', 'overall':'', 'figures':'', 'figure_captions':'', 'tables':'', 'table_captions':'', 'note':''},
                        'other_sections':{'body':'', 'overall':'', 'figures':'', 'figure_captions':'', 'tables':'', 'table_captions':'', 'note':''}, 
                        'related_work':{'body':'', 'overall':'', 'figures':'', 'figure_captions':'', 'tables':'', 'table_captions':'', 'note':''}, 
                        'introduction':{'body':'', 'overall':'', 'figures':'', 'figure_captions':'', 'tables':'', 'table_captions':'', 'note':''}}

    curr_section = {'experiment':-1, 'method':-1, 'introduction':-1, 'related_work':-1, 'other_sections':-1, 'evaluation':-1, 'conclusion':-1, 'reference':-1}

    for element in tree_root.iterfind('variant/'):

        if element.tag=='sectionHeader':

            for sec in curr_section:
                curr_section[sec]=-1

            sectionHeader = element.text.lower()
            heading_type = assign_section_heading_type(sectionHeader)

            if heading_type=='reference':
                continue

            curr_section[heading_type] = 0

        elif 'Header' in element.tag:

            i = 0
            while i<4:
                if 'sub'*i+'sectionHeader'==element.tag:
                    break
                i+=1

            for sec in curr_section:
                if curr_section[sec]>=i:
                    curr_section[sec]=-1

            sectionHeader = element.text.lower()
            heading_type = assign_section_heading_type(sectionHeader)

            if heading_type=='reference':
                continue

            if curr_section[heading_type]>i or curr_section[heading_type]<0:
                curr_section[heading_type] = i

        elif element.tag in ['bodyText', 'listItem', 'figureCaption', 'tableCaption', 'table', 'figure', 'note', 'footnote']:
            element_text = element.text.lower().replace('-\n', '').replace('\n', ' ').rstrip()+' '
            section_labeling['overall']+=element_text
            for sec in curr_section:
                if curr_section[sec]>=0:
                    if sec=='other_sections':
                        if any([(curr_section[sec]>=0 and sec not in ['other_sections']) for sec in curr_section]):
                            continue
                    section_labeling[sec]['overall']+=element_text
                    section_labeling[sec][tag_to_type_mapping[element.tag]]+=element_text

        elif element.tag in ['title', 'author', 'affiliation', 'page', 'equation', 'reference', 'email', 'address', 'construct']:
            pass

        else:
            # print('tag', element.tag)
            pass

    return section_labeling

In [9]:
section_labels = {}
for file in os.listdir('../xmls/'):
    tree = ET.parse("../xmls/"+file)
    root = tree.getroot()
    id = file[:8]
    for element in root.iterfind("algorithm"):
        if(element.attrib['name']=="SectLabel"):
            sect_labelling = sectlabel(element)
            section_labels[id] = sect_labelling

In [10]:
for file in os.listdir('../all_acl_xmls/all_files/') :
    try :
        tree = ET.parse("../all_acl_xmls/all_files/"+file)
        root = tree.getroot()
        id = file[:8]
        for element in root.iterfind("algorithm"):
            if(element.attrib['name']=="SectLabel"):
                sect_labelling = sectlabel(element)
                section_labels[id] = sect_labelling
    except :
        pass

In [11]:
print(len(section_labels.keys()))

2801


In [12]:
pickle.dump(section_labels, open("base_pickles/section_labels.pkl", "wb"))

In [42]:
def get_section_feature(section_labels, citations):
    dataset = {}
    dataset1 = {}
    sent_list = []
    count = 0 
    for key in citations.keys() :
        for cit in citations[key] :
            for context in cit['cit'].findall('contexts/context') :
                context_text = context.text.lower()
                section_feature = {'experiment':0, 'conclusion':0, 'other_sections':0, 'related_work':0, 'method':0, 'introduction':0, 'evaluation':0}
                sents = sent_tokenize(context_text)
                citstr = context.get('citStr').lower()
                con_sent = ""
                for sent in sents :
                    if(citstr in sent.lower()) :
                        count+=1
                        con_sent = sent
                        sent_list.append(sent)
                        break
                        
                for section in section_labels[key]:
                    if section=='overall':
                        continue
                    section_text = ""
                    for subsection in section_labels[key][section]:
                        text = section_labels[key][section][subsection]
                        section_text += text + " "
                    if match_titles(context_text, section_text) :
                        dataset[context_text] = section
                        dataset1[con_sent] = section
                        break
    
    print(count)
    return dataset, dataset1, sent_list

In [21]:
def get_section_feat_our(section_labels, citations) :
    dataset = {}
    dataset1 = {}
    sent_list = []
    for key in citations.keys() :
        all_list = []
        all_list1 = []
        for cit in citations[key] :
            section_feature = {'experiment':0, 'conclusion':0, 'other_sections':0, 'related_work':0, 'method':0, 'introduction':0, 'evaluation':0}
            map_val = {'experiment':0, 'conclusion':1, 'other_sections':2, 'related_work':3, 'method':4, 'introduction':5, 'evaluation':6}
            paper_name = cit['title']
            con_data = {}
            con_data1 = {}
            con_data['paper_name'] = paper_name
            con_data1['paper_name'] = paper_name
            con_data['context'] = ""
            con_data1['context'] = ""
            con_data['loc_feat'] = [0,0,0,0,0,0,0]
            con_data1['loc_feat'] = [0,0,0,0,0,0,0]
            
            for context in cit['cit'].findall('contexts/context') :
                context_text = context.text.lower()
                sents = sent_tokenize(context_text)
                citstr = context.get('citStr').lower()
                con_sent = ""
                for sent in sents :
                    if(citstr in sent.lower()) :
                        con_sent = sent
                        sent_list.append(sent)
                        break
                        
                for section in section_labels[key]:
                    if section=='overall':
                        continue
                    section_text = ""
                    for subsection in section_labels[key][section]:
                        text = section_labels[key][section][subsection]
                        section_text += text + " "
                    if match_titles(context_text, section_text) :
                        con_data["context"]+=context_text+" "
                        con_data1["context"]+=con_sent+" "
                        con_data['loc_feat'][map_val[section]]+=1
                        con_data1['loc_feat'][map_val[section]]+=1
                        break
                        
            all_list.append(con_data)
            all_list1.append(con_data1)
            
        dataset[key] = all_list
        dataset1[key] = all_list1
        
    return dataset, dataset1

In [43]:
section_cont, sec_con_sent, sent_list = get_section_feature(section_labels, citations)

64468


In [22]:
our_full, our_sent = get_section_feat_our(section_labels, citations)

In [24]:
print(len(our_full.keys()))

2801


In [23]:
pickle.dump(our_full, open("our_model/section_con_loc_feat.pkl","wb"))
pickle.dump(our_sent, open("our_model/section_sent_loc_feat.pkl","wb"))

In [49]:
for sent in sent_list :
    if(sent_list.count(sent)>1) :
        print(sent)
        break

the data sparseness problem is usually solved by smoothing, regularization, margin maximization and so on (chen and goodman, 1998; chen and rosenfeld, 2000; cortes and vapnik, 1995).


In [50]:
pickle.dump(section_cont, open("base_pickles/section_contexts.pkl","wb"))
pickle.dump(sec_con_sent, open("base_pickles/section_sentences.pkl","wb"))

In [51]:
print(len(section_cont.keys()))
print(len(sec_con_sent.keys()))

63315
38591


In [31]:
def get_citation_worthiness(citations, section_labels) :
    dataset = {}
    count0 = 0
    count1 = 0
    
    for key in citations :
        all_text = " "
        for section in section_labels[key] :
            if(section=='overall') :
                continue
            else :
                all_text+= section_labels[key][section]['overall']
            
        all_citstrs = []
        for cit in citations[key] :
            all_citstrs.extend(list(set([context.get('citStr').lower() for context in cit['cit'].findall('contexts/context')])))
            
            
        sentences = sent_tokenize(all_text)
        for sent in sentences :
            sent = sent.lower()
            check = 0
            for citstr in all_citstrs :
                if(citstr in sent) :
                    sent = sent.replace(citstr," ")
                    check = 1
                    dataset[sent] = 1
                    count1+=1
                    break
            if(check==0) :
                dataset[sent] = 0
                count0+=1
            
    print(count0)
    print(count1)
    return dataset

In [32]:
worthy = get_citation_worthiness(citations, section_labels)

494373
47891


In [33]:
pickle.dump(worthy, open("base_pickles/citation_worthy.pkl","wb"))

In [13]:
def get_citations():
    citation_list = {}
    for file in os.listdir("../xmls/") :
        tree = ET.parse("../xmls/"+file)
        root = tree.getroot()
        id = file[:8]
        for element in root.iterfind("algorithm"):
            if(element.attrib['name']=="ParsCit"):
                citlist = element.getchildren()
                cits = citlist[0].getchildren()
                citations = []
                for cit in cits:
                    cit_dict = {}
                    if(cit.attrib['valid']=="true"):
                        try :
                            title = cit.find('title').text.lower()
                        except :
                            title = cit.find('rawString').text.lower()
                        cit_dict['title'] = title
                        cit_dict['cit'] = cit
                        citations.append(cit_dict)
                
                citation_list[id] = citations
                
    for file in os.listdir("../all_acl_xmls/all_files/") :
        try :
            tree = ET.parse("../all_acl_xmls/all_files/"+file)
            root = tree.getroot()
            id = file[:8]
            for element in root.iterfind("algorithm"):
                if(element.attrib['name']=="ParsCit"):
                    citlist = element.getchildren()
                    cits = citlist[0].getchildren()
                    citations = []
                    for cit in cits:
                        cit_dict = {}
                        if(cit.attrib['valid']=="true"):
                            try :
                                title = cit.find('title').text.lower()
                            except :
                                title = cit.find('rawString').text.lower()
                            cit_dict['title'] = title
                            cit_dict['cit'] = cit
                            citations.append(cit_dict)

                    citation_list[id] = citations
        except :
            pass
            
    return citation_list 

In [14]:
citations = get_citations()

In [18]:
def get_contexts(citations) :
    dataset = {}
    dataset1 = {}
    count = 0
    for key in citations.keys():
        context_list = []
        con_list = []
        for cit in citations[key] :
            dict1 = {}
            dict1['paper_name'] = cit['title']
            dict1['context'] = []
            dict2 = {}
            dict2['paper_name'] = cit['title']
            dict2['sents'] = []
            for context in cit['cit'].findall('contexts/context') :
                text = context.text
                sents = sent_tokenize(text)
                citstr = context.get('citStr')
                con = ""
                for sent in sents :
                    if(citstr in sent) :
                        ind = sents.index(sent)
#                         sent = sent.replace(citstr, "this_citation")
                        con+= sents[ind-1] + " "
                        con+= sent + " "
                        dict2['sents'].append(sent)
                        if(ind+1<len(sents)):
                            con+= sents[ind+1] 
                        break
                con = con.lower()
                dict1['context'].append(text)
            context_list.append(dict1)
            con_list.append(dict2)
        dataset[key] = context_list
        dataset1[key] = con_list
        count+=1
        if(count%100==0) :
            print(count)
    return dataset, dataset1

In [19]:
contexts, context_sents = get_contexts(citations)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800


In [21]:
pickle.dump(contexts, open("base_pickles/all_contexts.pkl","wb"))
pickle.dump(context_sents, open("base_pickles/all_citation_sents.pkl", "wb"))