In [1]:
from collections import Counter
import pickle
import re
import string


import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import xml.etree.ElementTree as ET

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""


Gets final and initial lib data from endnote library exported to xml format

In [2]:
finaltree = ET.parse('finalLibrary.xml')
finalroot = finaltree.getroot()
initialtree = ET.parse('initialLibrary.xml')
initialroot = initialtree.getroot()

class for the individual papers

In [3]:
class Paper(object):
    
    # takes info list --> [[title], [abstract], [keywords], isbn, relevant] 
    def __init__(self, info):
        
        # base attributes
        
        self.OrigTitle     = info[0]
        self.title         = info[1]
        self.OrigKeywords  = info[2]
        self.keywords      = info[3]
        self.OrigAbstract  = info[4]
        self.abstract      = info[5] 
        self.relevant      = info[6]
        
        self.title_vec = None
        self.keyword_vec = None
        self.abstract_vec = None
             

methods for cleaning and processing text

In [4]:
def stemmer(text):
    ps = PorterStemmer()
    stemmed_text = []
    for word in text:
        stemmed_text.append(ps.stem(word))
    return stemmed_text


def lemmatizer(text):
    lz = WordNetLemmatizer()
    lemma_text = []
    for word in text:
        
        lemma_text.append(lz.lemmatize(word))
    return lemma_text

# text = "One of the key challenges in osteochondral tissue engineering is to achieve mechanical properties in the engineered tissue that are equivalent to the native tissue. Detailed knowledge of the mechanotransduction pathways occuring in the native tissue is necessary before manipulation towards the aimed properties in the engineered tissue"

# print(lemmatizer(text.split()))

# cleans and processes text
removes stopwords <br>
lowercases <br>
remove punctuation (except hyphens)<br>
lemmatizes<br>

In [5]:
def cleanText(text):    
    stopWords = set(stopwords.words('english'))
    words = text.split()    
    words = [word.lower() for word in words]  

    # remove prevent '-' from being removed when removing punctuation    
    puncList = list(string.punctuation)
    puncList.remove('-')
    punc = ""
    for p in puncList:
        punc += p
    
    table  = str.maketrans('', '', punc)
    words = [word.translate(table) for word in words]    
        
    cleanedWords = []
    for word in words:
        if word in stopWords:
            pass
        else:
            cleanedWords.append(word)
            
            
#   cleanedWords = lemmatizer(cleanedWords)
    cleanedWords = stemmer(cleanedWords)
    
    return cleanedWords


cleanText() example output

In [36]:
# text = "One of the key challenges in osteochondral tissue engineering is to achieve mechanical properties in the engineered tissue that are equivalent to the native tissue. Detailed knowledge of the mechanotransduction pathways occuring in the native tissue is necessary before manipulation towards the aimed properties in the engineered tissue"
# print(text)
# cleanText(text)

function for cleaning the keywords

In [37]:
# NOTE: should we stem or lemmatize keywords?

def cleanKeyWords(keywords):
    
    cleanedWords = []
    for keyword in keywords:
        
        # splits cases like  '*Actins/me [Metabolism]' into just 'Actins'
        keyword = keyword.split('/')
        keyword  = keyword[0] 

        cleanedWords.append(keyword)
        
    # lowercase, remove punc
    cleanedWords = [word.lower() for word in cleanedWords]
    
    puncList = list(string.punctuation)
    puncList.remove('-')
    punc = ""
    for p in puncList:
        punc += p
    
    table  = str.maketrans('', '', punc)
    cleanedWords = [word.translate(table) for word in cleanedWords]
    
    
    # removes keywords that contain numbers for simplicity
    final_set = []
    for word in cleanedWords:
        
        contains_digit = False
        for char in word:
                        
            if char.isdigit():
                contains_digit = True
        if not contains_digit:
            final_set.append(word)
            
    
    # removes duplicates
    
    final_set = list(set(final_set))                     
    return final_set
        


cleanKeywords() example output

In [38]:
# words = ["apple", "Purinergic (ATP-mediated) signaling", "*Actins/me [Metabolism]"]
# print(words)
# cleanKeyWords(words)

loops through xml tree, gets relevant info into record data to be instantiated into Paper object

In [49]:
def getENLdata(root, relevant):
    data = []   

    for record in root.iter('record'): # iterate through all records
        entry = []
        
        try: # capture title if present
            titles = record.find('titles')
            title = titles.find('title')
            title_text = title.find('style').text
            entry.append(title_text)
            entry.append(cleanText(title_text))

        except AttributeError:
            entry.append('')
            entry.append('')



        
        try: # capture keywords if present
            keywords = record.find('keywords')
            word_list = []
            for word in keywords:
                word_text = word.find('style').text
                word_list.append(word_text)    
            entry.append(word_list)
            entry.append(cleanKeyWords(word_list))            

        except TypeError:
            entry.append([])       
            entry.append([])
            
        
        try: # capture abstract if present            
            abstract = record.find('abstract')
            abstract_text = abstract.find('style').text
            entry.append(abstract_text)
            entry.append(cleanText(abstract_text))
        
        except AttributeError:
            entry.append([])      
            entry.append([])
            

        entry.append(relevant)        
        data.append(entry)
        
    return data

In [50]:
final_data   = getENLdata(finalroot, True)
initial_data = getENLdata(initialroot, False)

convert entry lists into objects


In [53]:
def objectifier(entryList):
    records = []
    for rec in entryList:
#         print(len(rec))
        record = Paper(rec)
        records.append(record)
    return records   


In [54]:
finalObjs = objectifier(final_data)
initialObjs = objectifier(initial_data)

mark initial papers as relevant or not

In [55]:
# hacky temporary solution

final_titles = []
for rec in finalObjs:
    final_titles.append(rec.title)    
collection = finalObjs

for rec in initialObjs:
    if rec.title in final_titles:
        pass
    else:
        collection.append(rec)


turns list of paper objects into dataframe

In [60]:
def convertToFrame(collection):
    frame_data = []
    for paper in collection:
        data = [paper.OrigTitle, paper.title,paper.OrigKeywords, paper.keywords,paper.OrigAbstract, paper.abstract, paper.relevant]
        frame_data.append(data)
    
    df = pd.DataFrame(frame_data, columns = ['original title', 'title', 'original keywords','keywords','original abstract', 'abstract', 'relevant'])
    return df


In [61]:
papers_df = convertToFrame(collection)

In [62]:
# papers_df.head()

In [63]:
def nGrammer(corpus, n):
    # NOTE: automatically includes uni-grams for now
    ''' corpus: string
        n: list of ngram options
            e.g. n = [2,3] will return a list consisting of the uni, bi
            and trigrams found in corpus
    '''
    grams = []
    for ng in n:
        n_gram = ngrams(corpus.split(), ng)
        for gram in n_gram:
            gram = gram[0] + " " + gram[1]
            grams.append(gram)
    return grams
        


# we want to get every uni and bigram from the titles and abstract in the vocabulary

In [64]:

titles = papers_df['title'].values.tolist() # gets list of all title lists
titles = [' '.join(title) for title in titles] # converts each title from list of words to string
title_corpus = []
for title in titles:
    grams = nGrammer(title, [2])
    title_corpus.append(title.split() + grams)
# each entry in title corpus is a list of the unigrams and bigrams in the title


abstracts = papers_df['abstract'].values.tolist()
abstracts = [' '.join(title) for title in abstracts]

abstract_corpus = []
for abstract in abstracts:
    grams = nGrammer(abstract, [2])
    abstract_corpus.append(abstract.split() + grams)
    

keyword_corpus = papers_df['keywords'].values.tolist()
keywords = [' '.join(title) for title in keyword_corpus]


all_paper_text = []
for index, rec in enumerate(title_corpus):
    entry_text = rec + keyword_corpus[index] + abstract_corpus[index]
    all_paper_text.append(" ".join(entry_text))
    
all_text_corpus = []

for index, title in enumerate(title_corpus):
    corpus = title + keyword_corpus[index] + abstract_corpus[index]
    corpus = list(set(corpus))
    all_text_corpus.append(corpus)


 # flattens list to create vocab list
flat_title_corpus = [item for sublist in title_corpus for item in sublist]
flat_abstract_corpus = [item for sublist in abstract_corpus for item in sublist]
flat_keyword_corpus = [item for sublist in keyword_corpus for item in sublist]


# each entry will have uni and bigrams from a paper's title, keywords, and abstract

    
    
corpus = flat_title_corpus + flat_keyword_corpus + flat_abstract_corpus
corpus = [word for word in corpus if len(word) > 1]

# CORPUS IS NOW LIST OF ALL UNI AND BIGRAMS THAT APPEAR IN THE TITLES, KEYWORDS, AND ABSTRACTS OF ALL PAPERS
# CORPUS CAN NOW BE USED AS THE VOCABULARY

with open('corpus.pkl', 'wb') as f:
    pickle.dump(corpus, f)
    

In [65]:
all_text_corpus[0]

['energi deficit',
 'biosynthesi',
 'grown',
 '24',
 'declin 15',
 'releas atp',
 'also found',
 '24 h',
 'animals',
 'membran subject',
 'use rhodamine-123',
 'assay astrocyt',
 'h postinjuri',
 'follow injuri',
 'atp',
 'may',
 'injur',
 'demonstr injuri',
 'declin 43-52',
 'deltapsim declin',
 'plu glial',
 'sever',
 'astrocyt injuri',
 'brain',
 'hypothesi stretch-induc',
 'glial cultur',
 'adenosine triphosphate',
 'physical stimulation',
 'glial',
 'dysfunct astrocyt',
 'drop moder',
 'drop significantli',
 'cellular atp',
 '24-48',
 'plu',
 'vitro model',
 'atp cultur',
 'transport use',
 'deficit traumat',
 'potenti deltapsim',
 'luciferin-luciferas assay',
 'declin 22-28',
 'cultur astrocyt',
 'model',
 'deficit',
 'atp astrocyt',
 'decreas',
 'model tbi',
 'moder stretch',
 'energi',
 'find demonstr',
 'stretch',
 'cultur declin',
 'stretch-induc',
 'measur use',
 'cultur grown',
 'deltapsim drop',
 'homeostasi neurotransmiss',
 'neuron releas',
 'deltapsim atp',
 'cellular',

In [66]:
# custom method for getting enforcing max_features on our corpus
# necessary because we're using custom vocab

def nMostCommon(term_list, n):
    '''
        term_list: list of ngrams
        n: number of most common terms to return
        e.g. n = 1000 returns list of 1000 most common terms in corpus        
    
    '''
    count = Counter(term_list)    
    highest_freq_terms = count.most_common(n)
    
    # get just the txt
    highest_freq_terms = [term[0] for term in highest_freq_terms]
    return highest_freq_terms
 
        


here the text info for each paper is vectorized

In [67]:
# NOTE, TRY FREQ BoW, TF-IDF

vect = CountVectorizer(binary = True, vocabulary = nMostCommon(corpus, 10000))

title_vectors = vect.fit_transform(titles)
abstract_vectors = vect.fit_transform(abstracts)
keyword_vectors = vect.fit_transform(keywords)
combined_text_vectors = vect.fit_transform(all_paper_text)


add text corpora to dataframe

In [68]:
papers_df = papers_df.assign(title_corpus = title_corpus)
papers_df = papers_df.assign(keyword_corpus = keyword_corpus)
papers_df = papers_df.assign(abstract_corpus = abstract_corpus)
papers_df = papers_df.assign(all_text_corpus = all_text_corpus)
papers_df = papers_df.assign(all_text = all_paper_text)

In [69]:
index = list(np.arange(1,len(papers_df)+1, 1))
max_id_length = (len(str(index[-1])))
IDs = []
for ID in index:
    
#     print(max_id_length - len(str(ID)))
    prepend = ''
    for index in range(max_id_length - len(str(ID))):
        prepend += '0'
#     print(prepend)
    IDs.append(prepend+str(ID))
papers_df.insert(1, 'paper_id', IDs)

converts text vectors from sparse matrix to an array, adds them to dataframe

In [70]:
title_vectors         = [title for title in title_vectors.toarray()]
keyword_vectors       = [keywords for keywords in keyword_vectors.toarray()]
abstract_vectors      = [abstract for abstract in abstract_vectors.toarray()]
combined_text_vectors = [combined_text for combined_text in combined_text_vectors.toarray()]

In [71]:
# type(abstract_vectors)

In [72]:
papers_df = papers_df.assign(title_vector = title_vectors)
papers_df = papers_df.assign(keyword_vector = keyword_vectors)
papers_df = papers_df.assign(abstract_vector = abstract_vectors)

papers_df = papers_df.assign(combined_text_vector = combined_text_vectors)



In [73]:
papers_df.iloc[0].title_vector

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [74]:
len(papers_df.iloc[0].combined_text_vector)

10000

In [75]:
papers_df.to_pickle("data_frame_no_paper.pkl")

In [76]:
papers_df

Unnamed: 0,original title,paper_id,title,original keywords,keywords,original abstract,abstract,relevant,title_corpus,keyword_corpus,abstract_corpus,all_text_corpus,all_text,title_vector,keyword_vector,abstract_vector,combined_text_vector
0,Stretch-induced injury alters mitochondrial me...,0001,"[stretch-induc, injuri, alter, mitochondri, me...","[*Adenosine Triphosphate/me [Metabolism], Anim...","[astrocytes, rats, rats sprague-dawley, time f...",Energy deficit after traumatic brain injury (T...,"[energi, deficit, traumat, brain, injuri, tbi,...",True,"[stretch-induc, injuri, alter, mitochondri, me...","[astrocytes, rats, rats sprague-dawley, time f...","[energi, deficit, traumat, brain, injuri, tbi,...","[energi deficit, biosynthesi, grown, 24, decli...",stretch-induc injuri alter mitochondri membran...,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ..."
1,Imaging exocytosis of ATP-containing vesicles ...,0002,"[imag, exocytosi, atp-contain, vesicl, tirf, m...",[],[],Nucleotide release constitutes the first step ...,"[nucleotid, releas, constitut, first, step, pu...",True,"[imag, exocytosi, atp-contain, vesicl, tirf, m...",[],"[nucleotid, releas, constitut, first, step, pu...","[atp efflux, storag site, bioluminesc, fluores...",imag exocytosi atp-contain vesicl tirf microsc...,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, ...","[1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, ..."
2,Low-intensity pulsed ultrasound-dependent oste...,0003,"[low-intens, puls, ultrasound-depend, osteobla...",[Adenosine Triphosphate/aa [Analogs & Derivati...,"[cell count, cell proliferation, rats, rats wi...",Low-intensity pulsed ultrasound (LIPUS) is com...,"[low-intens, puls, ultrasound, lipu, commonli,...",True,"[low-intens, puls, ultrasound-depend, osteobla...","[cell count, cell proliferation, rats, rats wi...","[low-intens, puls, ultrasound, lipu, commonli,...","[shown furthermor, osteoblasts, osteoblast, no...",low-intens puls ultrasound-depend osteoblast p...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ..."
3,ATP signalling is crucial for the response of ...,0004,"[atp, signal, crucial, respons, human, keratin...",[Adenosine Triphosphatases/ai [Antagonists & I...,"[osmotic pressure, mechanotransduction cellula...",Touch is detected through receptors located in...,"[touch, detect, receptor, locat, skin, activ, ...",True,"[atp, signal, crucial, respons, human, keratin...","[osmotic pressure, mechanotransduction cellula...","[touch, detect, receptor, locat, skin, activ, ...","[capacit, adenosin triphosph, sensori, phospho...",atp signal crucial respons human keratinocyt m...,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, ..."
4,Mechanosensitive release of adenosine 5'-triph...,0005,"[mechanosensit, releas, adenosin, 5-triphosph,...","[*Adenosine Triphosphate/me [Metabolism], Anim...","[disease models animal, eye proteins, animals,...",As adenosine 5'-triphosphate (ATP) released fr...,"[adenosin, 5-triphosph, atp, releas, astrocyt,...",True,"[mechanosensit, releas, adenosin, 5-triphosph,...","[disease models animal, eye proteins, animals,...","[adenosin, 5-triphosph, atp, releas, astrocyt,...","[sustain strain, pharmacolog analysi, reduc ba...",mechanosensit releas adenosin 5-triphosph pann...,"[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, ...","[0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, ..."
5,Feline interstitial cystitis results in mechan...,0006,"[felin, interstiti, cystiti, result, mechan, h...","[*Adenosine Triphosphate/me [Metabolism], Anim...","[female, cystitis interstitial, cat diseases, ...",ATP can be released from a variety of cell typ...,"[atp, releas, varieti, cell, type, mechan, sti...",True,"[felin, interstiti, cystiti, result, mechan, h...","[female, cystitis interstitial, cat diseases, ...","[atp, releas, varieti, cell, type, mechan, sti...","[solut, swelling-evok exposur, bioluminesc, no...",felin interstiti cystiti result mechan hyperse...,"[0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, ...","[1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, ..."
6,Altered urinary bladder function in mice lacki...,0007,"[alter, urinari, bladder, function, mice, lack...","[Acetic Acid/pd [Pharmacology], *Adenosine Tri...","[neurons afferent, acetic acid, muscle contrac...","In the urinary bladder, the capsaicin-gated io...","[urinari, bladder, capsaicin-g, ion, channel, ...",True,"[alter, urinari, bladder, function, mice, lack...","[neurons afferent, acetic acid, muscle contrac...","[urinari, bladder, capsaicin-g, ion, channel, ...","[neurons afferent, trpv1-- urotheli, ion chann...",alter urinari bladder function mice lack vanil...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,Renal epithelial cells can release ATP by vesi...,0008,"[renal, epitheli, cell, releas, atp, vesicular...",[],[],Renal epithelial cells have the ability to rel...,"[renal, epitheli, cell, abil, releas, nucleoti...",True,"[renal, epitheli, cell, releas, atp, vesicular...",[],"[renal, epitheli, cell, abil, releas, nucleoti...","[intercal cell, releas atp, cell collect, dire...",renal epitheli cell releas atp vesicular fusio...,"[1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, ..."
8,Extracellular osmolarity modulates G protein-c...,0009,"[extracellular, osmolar, modul, g, protein-cou...","[*Adenosine Triphosphate/me [Metabolism], Anio...","[hypotonic solutions, thrombin, humans, astroc...",We previously reported that ATP release from 1...,"[previous, report, atp, releas, 1321n1, human,...",True,"[extracellular, osmolar, modul, g, protein-cou...","[hypotonic solutions, thrombin, humans, astroc...","[previous, report, atp, releas, 1321n1, human,...","[potenc, notabl par1-sensit, condit strong, ta...",extracellular osmolar modul g protein-coupl re...,"[1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, ...","[1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, ..."
9,Synergistic effect of acute hypoxia on flow-in...,0010,"[synergist, effect, acut, hypoxia, flow-induc,...","[Acute hypoxia, atp, Cultures, Increased flow,...","[acute hypoxia, controlled study, cell secreti...",Human umbilical vein endothelial cells (HUVECs...,"[human, umbil, vein, endotheli, cell, huvec, p...",True,"[synergist, effect, acut, hypoxia, flow-induc,...","[acute hypoxia, controlled study, cell secreti...","[human, umbil, vein, endotheli, cell, huvec, p...","[normox hypox, cell significantli, conclud, ce...",synergist effect acut hypoxia flow-induc relea...,"[1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, ..."
