In [4]:
import re
import pickle
import json
import xml.etree.ElementTree as ET
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import os
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import math
from difflib import SequenceMatcher
import pandas as pd
from statistics import stdev

In [9]:
def pickle_dump(obj, filename) :
    pickle.dump(obj, open("pickles/"+filename+".pkl", "wb"))

In [41]:
"""
Get reduced citation context 
1. context features 
2. similarity features :

2We use the vector space based model
(Turney and Pantel, 2010) after stemming the words using
Porter stammer (Porter, 1997).

     Then we calculate the cosine-similarity2
    between the title (T) of Pj and (i) SF:TTitle. the title, (ii) SF:TAbs. the abstract, SF:TIntro. the introduction, (iv) SF:TConcl. the conclusion, and (v)
    SF:TRest. the rest of the sections (sections other
    than abstract, introduction and conclusion) of Pi
    
    . Therefore,
we take the same similarity based approach mentioned above, but replace the title of Pj with its RC
and obtain five more features: (vi) SF:RCTitle, (vii)
SF:RCAbs, (viii) SF:RCIntro, (ix) SF:RCConcl and
(x) SF:RCRest. If a reference appears multiple times
in a citing paper, we consider the aggregation of all
RCs together.
.
3. frequency 
We count the frequency
of Rij in (i) FF:Whole. the entire content, (ii)
FF:Intro. the introduction, (iii) FF:Rel. the related
work, (iv) FF:Rest. the rest of the sections (as mentioned in Section 3.3.2) of Pi
. We also introduce (v)
FF:Sec. to measure the fraction of different sections
of Pi where Rij occurs (assuming that appearance of
Rij in different sections is more influential). These
features are further normalized using the number of
sentences in Pi
in order to avoid unnecessary bias on
the size of the paper.

4. Position 
For the first two features, we divide
the entire paper into two parts equally based on the
sentence count and then see whether Rij appears (i)
PF:Begin. in the beginning or (ii) PF:End. in the
end of Pi
. Importantly, if Rij appears multiple times
in Pi
, we consider the fraction of times it occurs in
each part.

 (iii) PF:Mean. mean position of appearance, (iv)
PF:Std. standard deviation of different appearances.
These features are normalized by the total length
(number of sentences) of Pi
. , thus ranging from 0
(indicating beginning of Pi) to 1 (indicating the end
of Pi).

5. Linguistic
6. Miscellaneous

MS:GCount. To answer
whether a highly-cited paper has more academic influence on the citing paper than the one which is less
cited, we measure the number of other papers (except Pi) citing Pj .
(ii) MS:SelfC. To see the effect of self-citation, we
check whether at least one author is common in both
Pi and Pj .
(iii) MG:Time. The fact that older papers are rarely
cited, may not stipulate that these are less influential.
Therefore, we measure the difference of the publication years of Pi and Pj .
(iv) MG:CoCite. It measures the co-citation counts
of Pi and Pj defined by |Ri∩Rj |
|Ri∪Rj |
, which in turn answers the significance of reference-based similarity
driving the academic influence """

'\nGet reduced citation context \n1. context features \n2. similarity features :\n\n2We use the vector space based model\n(Turney and Pantel, 2010) after stemming the words using\nPorter stammer (Porter, 1997).\n\n     Then we calculate the cosine-similarity2\n    between the title (T) of Pj and (i) SF:TTitle. the title, (ii) SF:TAbs. the abstract, SF:TIntro. the introduction, (iv) SF:TConcl. the conclusion, and (v)\n    SF:TRest. the rest of the sections (sections other\n    than abstract, introduction and conclusion) of Pi\n    \n    . Therefore,\nwe take the same similarity based approach mentioned above, but replace the title of Pj with its RC\nand obtain five more features: (vi) SF:RCTitle, (vii)\nSF:RCAbs, (viii) SF:RCIntro, (ix) SF:RCConcl and\n(x) SF:RCRest. If a reference appears multiple times\nin a citing paper, we consider the aggregation of all\nRCs together.\n.\n3. frequency \nWe count the frequency\nof Rij in (i) FF:Whole. the entire content, (ii)\nFF:Intro. the introduc

In [33]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
year_regex = re.compile(r'((19[0-9]{2})|(20[0-9]{2}))[a-z]?')
conversion_dict = {}

In [5]:
def get_citations(folder):
    citation_list = {}
    for file in os.listdir(folder) :
        tree = ET.parse(folder+file)
        root = tree.getroot()
        id = file[:8]
        for element in root.iterfind("algorithm"):
            if(element.attrib['name']=="ParsCit"):
                citlist = element.getchildren()
                cits = citlist[0].getchildren()
                citations = []
                for cit in cits:
                    cit_dict = {}
                    if(cit.attrib['valid']=="true"):
                        try :
                            title = cit.find('title').text.lower()
                        except :
                            title = cit.find('rawString').text.lower()
                        cit_dict['title'] = title
                        cit_dict['cit'] = cit
                        citations.append(cit_dict)
                
                citation_list[id] = citations
            
    return citation_list 

In [11]:
def get_contexts(citations) :
    dataset = {}
    dataset1 = {}
    count = 0
    for key in citations.keys():
        context_list = []
        con_list = []
        for cit in citations[key] :
            dict1 = {}
            dict1['paper_name'] = cit['title']
            dict1['context'] = []
            dict2 = {}
            dict2['paper_name'] = cit['title']
            dict2['sents'] = []
            for context in cit['cit'].findall('contexts/context') :
                text = context.text
                sents = sent_tokenize(text)
                citstr = context.get('citStr')
                con = ""
                for sent in sents :
                    if(citstr in sent) :
                        ind = sents.index(sent)
#                         sent = sent.replace(citstr, "this_citation")
                        con+= sents[ind-1] + " "
                        con+= sent + " "
                        dict2['sents'].append(sent)
                        if(ind+1<len(sents)):
                            con+= sents[ind+1] 
                        break
                con = con.lower()
                dict1['context'].append(text)
            context_list.append(dict1)
            con_list.append(dict2)
        dataset[key] = context_list
        dataset1[key] = con_list
        count+=1
        if(count%100==0) :
            print(count)
    return dataset, dataset1

In [7]:
citations = get_citations("../xmls/")

In [12]:
contexts, citation_sents = get_contexts(citations)

100
200
300
400
500
600
700
800
900
1000
1100


In [13]:
pickle_dump(contexts, "contexts_info")
pickle_dump(citation_sents, "citation_sents")

In [38]:
len(contexts.keys())

1181

In [39]:
citation_sents[list(citation_sents.keys())[0]]

[{'paper_name': 'on a measure of divergence between two statistical populations defined by their probability distributions.',
  'sents': ['Our technical contribution in this paper is to show that in the case where the context profiles are multinomial distributions, the priors are Dirichlet, and the base similarity measure is the Bhattacharyya coefficient (this_citation), we can derive an analytical form for Eq.',
   '2, even in the Dirichlet prior case.l In this study, we employ the Bhattacharyya coefficient (this_citation) (BC for short), which is defined as follows: BC(p1, p2) = The BC is also a similarity measure on probability distributions and is suitable for our purposes as we describe in the next section.',
   'BC The Bhattacharyya coefficient (this_citation) between p(fk|w1) and p(fkIw2).']},
 {'paper_name': 'an empirical study of smoothing techniques for language modeling.',
  'sents': ['The data sparseness problem is usually solved by smoothing, regularization, margin maximiz

In [23]:
contexts.keys()

dict_keys(['P10-1026', 'P13-1111', 'W14-4407', 'D10-1003', 'P13-1171', 'J15-1005', 'N09-1036', 'P11-1130', 'P13-1029', 'D15-1136', 'P13-1087', 'N06-1047', 'Q14-1008', 'P91-1041', 'W10-2915', 'D15-1218', 'P15-1135', 'P00-1039', 'P13-1118', 'D12-1002', 'W15-4902', 'S15-1005', 'W06-2914', 'P15-1127', 'P13-1109', 'D12-1120', 'W13-3514', 'P14-1095', 'D12-1019', 'N13-1132', 'P02-1059', 'J12-1003', 'D12-1134', 'W14-4338', 'Q13-1009', 'P13-1059', 'P06-1009', 'N10-1014', 'P10-1107', 'N01-1011', 'P02-1065', 'P14-1136', 'P08-1048', 'D08-1034', 'D08-1090', 'D09-1092', 'P10-1125', 'D15-1089', 'D12-1051', 'E12-1008', 'E14-1075', 'D14-1123', 'D09-1036', 'S15-1017', 'P12-1072', 'P12-1007', 'P03-1011', 'P14-1012', 'W06-1649', 'W15-5618', 'N07-1011', 'D08-1069', 'D08-1113', 'D14-1008', 'P98-2234', 'P11-1149', 'P08-1049', 'P14-1015', 'D13-1191', 'D14-1038', 'P10-1004', 'P14-1033', 'D13-1014', 'D11-1030', 'P05-1004', 'D15-1045', 'P13-1013', 'P01-1041', 'J01-1002', 'P00-1011', 'D13-1009', 'P05-1015', 'D14-

In [28]:
pickle.dump(contexts, open("pickles/reduced_context.pkl", "wb"))

In [40]:
pickle.dump(citation_sents, open("pickles/citation_sentences.pkl", "wb"))

In [50]:
def year_diff() :
    year_diff = pickle.load(open("../pickles_data/year_diff.pkl", "rb"))
    pickle.dump(year_diff, open("pickles/year_diff.pkl", "wb"))

In [46]:
def location() :
    loc_feat = pickle.load(open("../pickles_data/location_feature.pkl", "rb"))
    dataset = {}
    for key in loc_feat :
        papers = loc_feat[key]
        pap_list = []
        for pap in papers :
            dict1 = {}
            dict1['paper_name'] = pap['paper_name']
            locs = pap['location_feature']
            dict1['whole'] = sum(locs)
            dict1['intro'] = locs[3]
            dict1['relwork'] = locs[-1]
            dict1['rest'] = sum(locs) - locs[3] -locs[1]
            pap_list.append(dict1)
        dataset[key] = pap_list
        
    return dataset         

In [51]:
loc = location()
year_diff()

In [52]:
pickle.dump(loc, open("pickles/alf_location.pkl", "wb"))

In [54]:
def get_section_feature(section_labels, citations):
    dataset = {}
    for key in citations.keys() :
        feat_list = []
        for cit in citations[key] :
            data1 = {}
            data1['paper_name'] = cit['title']
            citStrs = list(set([context.get('citStr').lower() for context in cit['cit'].findall('contexts/context')]))
            section_feature = {'experiment':0, 'conclusion':0, 'other_sections':0, 'related_work':0, 'method':0, 'introduction':0, 'evaluation':0}
            for section in section_labels[key]:
                if section=='overall':
                    continue
                for subsection in section_labels[key][section]:
                    text = section_labels[key][section][subsection]
                    for citStr in citStrs:
                        count = text.count(citStr)
                        section_feature[section]+=count
                        text = text.replace(citStr, '')
            data1['section_feature'] = section_feature
            feat_list.append(data1)
        dataset[key] = feat_list 
    return dataset

In [56]:
sect_feat = get_section_feature(pickle.load(open("pickles/en_section_labels.pkl", "rb")),citations)

In [57]:
pickle.dump(sect_feat, open("pickles/en_section_feature.pkl", "wb"))

In [None]:
"""
Code for all citation strings
    global_citStr = []
    for key in citations :
        papers = citations[key]
        for paper in papers :
            global_citStr.extend(list(set([context.get('citStr').lower() for context in paper['cit'].findall('contexts/context')])))
    global_citStr = list(set(global_citStr))
"""

In [28]:
def position_feat(citations) :
    section_labels = pickle.load(open("../pickles_data/section_labels.pkl", "rb"))
    dataset = {}
    count = 0
    ct = 0
    ct2 = 0
    for key in citations :
        paper = section_labels[key]['overall']
        first_half = paper[:len(paper)//2]
        second_half = paper[len(paper)//2:]
        first_half = first_half.lower()
        second_half = second_half.lower()
        papers = citations[key] 
        data_list = []
        for pap in papers :
            data = {}
            data['paper_name'] = pap['title']
            cit = pap['cit']
            citStrs = list(set([context.get('citStr').lower() for context in pap['cit'].findall('contexts/context')]))
            upper = 0
            lower = 0
            for citstr in citStrs :
                upper = first_half.count(citstr)
                lower = second_half.count(citstr)
                
            if(upper+lower != 0) :
                data['position_feat'] = []
                data['position_feat'].append(upper/(upper+lower))
                data['position_feat'].append(lower/(upper+lower))
                sentences = sent_tokenize(paper)
                indices = []
                for i in range(len(sentences)) :
                    for citstr in citStrs :
                        if(citstr in sentences[i]) :
                            indices.append(i)

                if(len(indices)==0) :
                    data['position_feat'].append(0)
                    data['position_feat'].append(0)
                    ct2+=1
                else :
                    data['position_feat'].append(sum(indices)/len(indices))
                    if(len(indices)>2) :
                        data['position_feat'].append(stdev(indices))
                    else :
                        data['position_feat'].append(0)
                data_list.append(data)
            else :
                data['position_feat'] = [0,0,0,0]
                ct+=1
                
        dataset[key] = data_list
        count+=1
        if(count%1000==0) :
            print(count)

    print(ct)
    print(ct2)
    return dataset       

In [29]:
pos_feat = position_feat(citations)
pickle_dump(pos_feat, "alf_pos_feat")

1000
4344
1980


In [None]:
"""
Rel = "pivotal, comparable, innovative, relevant, relevantly, inspiring, related, relatedly, similar, similarly, applicable, appropriate,
pertinent, influential, influenced, original, originally, useful, suggested, interesting, inspired, likewise
recent, recently, latest, later, late, latest, up-to-date, continuing, continued, upcoming, expected, update, renewed, extended"
Rec = [subsequent, subsequently, initial, initially, sudden, current, currently, future, unexpected, previous, previously, old,
ongoing, imminent, anticipated, unprecedented, proposed, startling, preliminary, ensuing, repeated, reported, new, earlier,
earliest, early, existing, further, revised, improved]
Ext = [greatly, awfully, drastically, intensely, acutely, almighty, exceptionally, excessively, exceedingly, tremendously, importantly
significantly, notably, outstandingly]
Comp = [easy, easier, easiest, vague, vaguer, vaguest, weak, weaker, weakest, strong, stronger, strongest, bogus, unclear]
"""

In [31]:
rel = []
rec = []
ext = []
comp = []
rels = "pivotal, comparable, innovative, relevant, relevantly, inspiring, related, relatedly, similar, similarly, applicable, appropriate, pertinent, influential, influenced, original, originally, useful, suggested, interesting, inspired, likewise, recent, recently, latest, later, late, latest, up-to-date, continuing, continued, upcoming, expected, update, renewed, extended"
recs = "subsequent, subsequently, initial, initially, sudden, current, currently, future, unexpected, previous, previously, old, ongoing, imminent, anticipated, unprecedented, proposed, startling, preliminary, ensuing, repeated, reported, new, earlier, earliest, early, existing, further, revised, improved"
exts = "greatly, awfully, drastically, intensely, acutely, almighty, exceptionally, excessively, exceedingly, tremendously, importantly, significantly, notably, outstandingly"
comps = "easy, easier, easiest, vague, vaguer, vaguest, weak, weaker, weakest, strong, stronger, strongest, bogus, unclear"
rel = rels.split(", ")
rec = recs.split(", ")
ext = exts.split(", ")
comp = comps.split(", ")




36
28313


In [38]:
def context_feat(citations, rel, rec,ext,comp) :
    global_citStr = []
    for key in citations :
        papers = citations[key]
        for paper in papers :
            global_citStr.extend(list(set([context.get('citStr').lower() for context in paper['cit'].findall('contexts/context')])))
    global_citStr = list(set(global_citStr))
    
    dataset = {}
    count=0
    for key in citations :
        papers = citations[key]
        data = []
        for paper in papers :
            dict1 = {}
            dict1['paper_name'] = paper['title']
            dict1['context_feat'] = []
            citStrs = list(set([context.get('citStr').lower() for context in paper['cit'].findall('contexts/context')]))
            other_cits = list(set(global_citStr)-set(citStrs))
            count_pop = 0
            alone = 0
            first = 1
            relevant = 0
            recent = 0
            extreme = 0
            compare = 0
            for context in paper['cit'].findall('contexts/context') :
                con_text = context.text.lower()
                other_con = []
                for other in other_cits :
                    if(other in con_text) :
                        other_con.append(other)
                if(len(other_con)>0) :
                    alone = 1

                other_ind = []
                for other in other_con :
                    other_ind.append(con_text.find(other))

                con_ind = con_text.find(citStrs[0])
                for i in range(1, len(citStrs)) :
                    ind = con_text.find(citStrs[i])
                    if(ind<con_ind) :
                        con_ind = ind

                for ind in other_ind :
                    if(ind<con_ind) :
                        first = 0
                        break
                
                for word in rel :
                    if(word in con_text) :
                        relevant = 1
                        break
                        
                for word in rec :
                    if(word in con_text) :
                        recent = 1
                        break
                        
                for word in ext :
                    if(word in con_text) :
                        extreme = 1
                        break
                    
                for word in comp :
                    if(word in con_text) :
                        compare = 1
                        break
                
            dict1['context_feat'] = []
            dict1['context_feat'].append(alone)
            dict1['context_feat'].append(first)
            dict1['context_feat'].append(relevant)
            dict1['context_feat'].append(recent)
            dict1['context_feat'].append(extreme)
            dict1['context_feat'].append(compare)
            
            
            data.append(dict1)
        dataset[key] = data
        count+=1
        if(count%100==0) :
            print(count)
        
        
    return dataset   

In [39]:
confeat = context_feat(citations, rel, rec, ext, comp)
pickle_dump(confeat, "alf_context_feat")

100
200
300
400
500
600
700
800
900
1000
1100
