## PARSE SCIENTIFIC JOURNAL PDF FILES AND EXTRACT SPECIFIC KEYWORDS SUCH AS: TAXONOMY, NSC (UNIQUE IDENTIFIER), AUTHORS, COMPOUND NAMES

In [52]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from IPython.display import display
from spacy import displacy
from nltk.stem import WordNetLemmatizer 
from nltk.stem import *
import spacy
import re
import pickle
import os
from sklearn.preprocessing import LabelEncoder
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from nltk.classify import ClassifierI
from statistics import mode, StatisticsError
from spacy.lang.en.stop_words import STOP_WORDS
# General English pipeline; filter_text_stopw reads the stop-word flags from its vocabulary.
nlp = spacy.load("en_core_web_sm") #for stop words
# Second pipeline, used only by dplcy to render entities.
nlp_sci = spacy.load("en_core_sci_sm") #only for displaying using displacy render
# Flag these punctuation marks as stop words so the stop-word filter drops them too.
nlp.vocab["."].is_stop = True
nlp.vocab["/"].is_stop = True
nlp.vocab["("].is_stop = True
nlp.vocab[")"].is_stop = True
# Base directory for the relative paths built later (models/ and raw_pdfs/).
cwd = os.getcwd()

#### Helper functions for feature extraction

In [2]:
def last_char(word):
    '''
    Return the last two characters of the word.

    Slicing never raises, so a word shorter than two characters is returned
    unchanged (an empty string stays empty).
    '''
    return word[-2:]
    
def first_char(word):
    '''
    Return the first two characters of the word.

    Slicing never raises, so a word shorter than two characters is returned
    unchanged (an empty string stays empty).
    '''
    return word[:2]

def find_features(word):
    '''
    Build the feature dictionary for a single word.

    The dictionary is what the trained classifiers receive in
    VoteClassifier.classify / VoteClassifier.confidence.

    word: the string to describe

    returns a dict with the keys 'word_length', 'last_letters',
    'first_letters', 'lemma', 'stem' and 'convert_chr'
    '''
    # No backslash continuations: the braces already continue the expression,
    # and a comment after a backslash is a syntax error.
    return {
        'word_length': len(word),            # numeric
        'last_letters': last_char(word),     # letters
        'first_letters': first_char(word),   # letters
        'lemma': lemma_binary(word),         # float, share of matching positions
        'stem': stem_compute(word)['output'],  # numeric 0 or 1
        'convert_chr': convert_chr(word),    # numeric, sum of code points
    }

# Module-level instances shared by lemma_binary (lemmatiser) and stem_compute (stemmer).
lemmatiser = WordNetLemmatizer()
stemmer = PorterStemmer()

def convert_chr(name):
    '''
    Collapse a word into one integer: the sum of the code points of its
    lower-cased characters (0 for an empty string)
    '''
    total = 0
    for letter in name:
        total += ord(letter.lower())
    return total

def lemma_binary(word):
    '''
    Measure how much of the word survives lemmatisation.

    The word and its lemma are compared position by position, and the share
    of positions holding the same character is returned. Positions beyond the
    end of a shorter lemma count as mismatches.

    word: the string to compare against its lemma

    returns a float between 0 and 1; 0.0 for an empty word
    '''
    if not word:
        # Nothing to compare; avoids dividing by zero below.
        return 0.0
    l_word = lemmatiser.lemmatize(word)
    # zip stops at the shorter string, so characters past the end of the lemma
    # are never counted as matches (no padding character that could collide
    # with a real character of the word).
    matches = sum(1 for w, lw in zip(word, l_word) if w == lw)
    return matches / len(word)

def stem_compute(word):
    '''
    Compare the length of the stemmed word with the length of the original.

    Author's heuristic: if the stem accounts for more than 75% of the original
    word, it most likely is a name or species name.

    word: the string to stem

    returns a dict with the keys 's_word' (the stem), 'word_length',
    'stem_length', 'ratio' (stem length / word length) and 'output'
    (1 if ratio > 0.75, else 0). An empty word yields ratio 0.0 and output 0.
    '''
    s_word = stemmer.stem(word)
    word_length = len(word)
    stem_length = len(s_word)
    # Guard the division so an empty word does not raise ZeroDivisionError.
    ratio = stem_length / word_length if word_length else 0.0
    return {
        's_word': s_word,
        'word_length': word_length,
        'stem_length': stem_length,
        'ratio': ratio,
        'output': 1 if ratio > .75 else 0,
    }

In [50]:
class VoteClassifier(ClassifierI):
    '''
    Majority-vote ensemble over a dictionary of trained classifiers.

    Every classifier votes on the feature dictionary produced by
    find_features for the lower-cased input string. Votes are tallied
    explicitly instead of through statistics.mode, because mode only raises
    StatisticsError on a tie before Python 3.8; the tie rule here is the same
    on every Python version.
    '''
    def __init__(self, classifiers): #dict: classifier name -> trained classifier
        self._classifiers_dict = classifiers

    def clf_name(self): #list all the classifier names
        return list(self._classifiers_dict.keys())

    def _tally(self, string_features):
        '''
        Collect one vote per classifier and return {label: number of votes}
        '''
        # The features do not depend on the classifier, so compute them once.
        features = find_features(string_features.lower())
        counts = {}
        for clf in self._classifiers_dict.values():
            label = clf.classify(features)
            counts[label] = counts.get(label, 0) + 1
        return counts

    def classify(self, string_features):
        '''
        Return the label chosen by most classifiers.

        A tie between labels, or having no classifiers at all, returns True.
        '''
        counts = self._tally(string_features)
        if not counts:
            return True
        best = max(counts.values())
        winners = [label for label, n_votes in counts.items() if n_votes == best]
        if len(winners) > 1:
            return True #if equal number of votes, just set to True
        return winners[0]

    def confidence(self, string_features):
        '''
        Return the share of classifiers that voted for the most common label
        (0.5 for an even two-way split, 0.0 when there are no classifiers).
        '''
        counts = self._tally(string_features)
        total = sum(counts.values())
        if not total:
            return 0.0
        return max(counts.values()) / total
        
def vote_clf():
    '''
    Load the trained or pre-trained pickled algorithm

    Uses the module-level name classifier_models when it exists; otherwise
    every classifier is read from '<cwd>/models/<name>.pickle'.

    returns the voted classifier (a VoteClassifier instance)
    '''
    try:
        voted_classifier = VoteClassifier(classifier_models)
        print('loaded from environmental variable ...')
    except NameError as e:
        print(f"'{e}' error - attempting to load classifiers from pickle file")
        # Use a new local name: assigning to cwd here would make cwd local to
        # the whole function and raise UnboundLocalError on this very line.
        models_dir = cwd + '/models/'
        clfs = ['K Nearest Neighbors',
                'Decision Tree',
                'Random Forest',
                'Logistic Regression',
                'SGD Classifier',
                'Multinomial',
                'Bernoulli',
                'LinearSVC',
                'ComplementNB']
        classifiers_models = {}
        for clf_n in clfs:
            # NOTE: pickle.load can execute arbitrary code; only load model
            # files that come from a trusted source.
            with open(f'{models_dir}{clf_n}.pickle', 'rb') as clf_file:
                classifiers_models[clf_n] = pickle.load(clf_file)
        voted_classifier = VoteClassifier(classifiers_models)
    return voted_classifier

In [4]:
def check_if_name_related(q_word,verbose=False):
    '''
    Check if related to a name based on the voter classification class (using a ensemble vote - mode)

    q_word: the string to classify
    verbose: when True, also print the classification and its confidence (in percent)

    returns the ensemble's classification (True or False)
    '''
    classification = voted_classifier.classify(q_word)
    if verbose:
        # Confidence runs every classifier a second time, so it is only
        # computed when it is actually going to be printed.
        confidence = voted_classifier.confidence(q_word)*100
        print(f"Classification: {classification} Confidence: {confidence}")
    return classification

In [51]:
# Build the ensemble once; check_if_name_related reads this module-level name.
voted_classifier = vote_clf()

'name 'classifier_models' is not defined' error - attempting to load classifiers from pickle file


### Load the text extracted from the PDF files for testing

In [56]:
def loadtxtf(file):
    '''
    Read '<cwd>/raw_pdfs/<file>' and return its content as a single line:
    surrounding whitespace is stripped and every newline is removed
    '''
    path = f'{cwd}/raw_pdfs/{file}'
    with open(path, 'r') as handle:
        content = handle.read()
    return content.strip().replace('\n','')

In [57]:
# Load the first text extract; the bare name echoes it as the cell output.
pdf_file_1 = loadtxtf('jm00094a012.pdf_OUTPUT.txt')
pdf_file_1

'J. Med. Chem. 1992,35,3007-3011 3007 A Pentahalogenated Monoterpene from the Red Alga Portieria hornemannii Produces a Novel Cytotoxicity Profile against a Diverse Panel of Human Tumor Cell Linest Richard W.  Fuller:  John H. Cardellina 11:  Yoko Kato:  Linda S. Brinen,* Jon Clardy,s Kenneth M.  Snader,ll and Michael R. Boyd*?* Laboratory of  Drug Discovery Research and Development, Developmental  Therapeutics Program, National Cancer Institute, Frederick  Cancer Research & Development  Center, Bldg  1052, Room 121, Frederick,  Maryland  21 702-1201, Department  of Chemistry, Baker Laboratory, Cornell  University, Ithaca, New  York, 14853-1301,  and Natural Products Branch, Developmental Therapeutics Program, National  Cancer Institute, Frederick  Cancer Research  & Development  Center, Bldg  1052, Room  109, Frederick, Maryland  21 702-1201.  Received  January 27, 1992 A plyhalogenated acyclic monoterpene, 6(R)-bromo-3(S)-(bromomethyl)-7-methyl-2,3,%trichloro-l-octene (1) was obtaine

In [8]:
# Load the second text extract; the bare name echoes it as the cell output.
pdf_file_2 = loadtxtf('np50121a025.pdf_OUTPUT.txt')
pdf_file_2

'1120 Journal of  Natural Praducts Val. 58, NO. 7, pp.  1120-1 125, July  1995 MYCALOLIDES D AND E, N E W  CYTOTOXIC MACROLIDES FROM A COLLECTION OF THE STONY CORAL TUBASTREA FAULMVERl MOHMIM~D A. RASHID, KIRK R. GUSTAFSON, JOHN H. CARDELLINA 11,  and MICHAEL R. BOYD* Lcrbwatwy of Drug Discwey Resurcb and DeveIopnt, Developmntal Thapeuticrs Program, Division of Cancer Treatment, National Cancer Institute, Building 1052, Rwm 121, Frederick, Maryland 21 702-1201 ABSTRACT.-Fractionation of  a  cytotoxic  extract  of  the  stony  coral  Tubastru faulkneri yielded a series of cytotoxic polyoxazole macrolides and several noncytotoxic indole derivatives. Two new macrolides, mycalolides D 111 and E 121, were isolated and identified, in addition to the known compound  mycalolide C [31. The macrolide structures were elucidated  by  detailed analysis of their spectroscopic data and by comparison with related compounds. Stony  (scleractinian)  corals  in  the genus Tubastreu (Dendrophylliidae) hav

In [9]:
# Load the third text extract; the bare name echoes it as the cell output.
pdf_file_3 = loadtxtf('11809069.pdf_OUTPUT.txt')
pdf_file_3

'J. Nat. Prod. 2002, 65, 65-6865A New Triterpene Saponin from Pittosporum viridiflorum from the MadagascarRainforest1Youngwan Seo,†,‡ John M. Berger,† Jeannine Hoch,† Kim M. Neddermann,§ Isia Bursuker,§Steven W. Mamber,§ and David G. I. Kingston*,†Department of Chemistry, M/C 0212, Virginia Polytechnic Institute and State University, Blacksburg, Virginia 24061, andBristol-Myers Squibb Pharmaceutical Research Institute, 5 Research Parkway, Wallingford, Connecticut 06492-7660Received June 29, 2001A novel triterpenoid saponin, pittoviridoside (1), which possesses an unusual 2,3,4-trisubstituted glycosidiclinkage, has been isolated from Pittosporum viridiflorum using the engineered yeast strains 1138, 1140,1353, and Sc-7 for bioactivity-guided fractionation. The structure of this compound was determined to be3-O-[(cid:226)-D-glucopyranosyl(1f2)]-[R-D-arabinopyranosyl(1f3)],[R-l-arabinofuranosyl(1f4)-(cid:226)-D-glucuronopy-ranosyl-21-angeloyl-22-senecioylolean-12-en-3(cid:226),15R,16R,21(c

In [10]:
# Run the general English pipeline over each document's text.
pdf_o1 = nlp(pdf_file_1)
pdf_o2 = nlp(pdf_file_2)
pdf_o3 = nlp(pdf_file_3)

In [11]:
# (position, token, token.tag_) triples; this is the tuple layout filter_text_pos indexes into.
pdf_pt_1 = [(n, i, i.tag_) for n,i in enumerate(pdf_o1)]
pdf_pt_2 = [(n, i, i.tag_) for n,i in enumerate(pdf_o2)]
pdf_pt_3 = [(n, i, i.tag_) for n,i in enumerate(pdf_o3)]

In [41]:
def filter_text_stopw(pdf_text):
    '''
    Drop stop words from a parsed document.

    pdf_text: iterable of tokens exposing a .text attribute

    returns the token texts, in order, whose entry in nlp.vocab is not flagged as a stop word
    '''
    words = [token.text for token in pdf_text]
    return [word for word in words if not nlp.vocab[word].is_stop]

def filter_text_pos(pdf_pt):
    '''
    Keep the entries tagged as proper nouns.

    pdf_pt: iterable of (position, token, tag) tuples

    An entry is kept when its tag is "NNP" and the token's text has no '-'
    and is longer than 5 characters. An IndexError (an entry with too few
    fields) ends the scan early; whatever was collected so far is returned.
    '''
    kept = []
    try:
        for entry in pdf_pt:
            text = str(entry[1])
            if entry[2] != "NNP":
                continue
            if '-' in text or len(text) <= 5:
                continue
            kept.append(entry)
    except IndexError:
        pass

    return kept

def extract_nsc(filtered_nnp):
    '''
    Pull identifier-like strings out of the filtered tokens.

    filtered_nnp: iterable of tuples whose second field is the token

    The token texts are joined with spaces and searched for one of the
    letters C, N, Q, F or M followed by at least two digits and at least one
    more word character. Matches are kept when their length is between 5 and
    9 characters.

    returns the list of matches in order of appearance
    '''
    joined = ' '.join(str(x[1]) for x in filtered_nnp)
    # Inside a character class '|' is a literal, so the prefix letters are
    # listed without separators.
    vouch_nsc = re.findall(r'([CNQFM]\d{2,}\w+)', joined)
    return [r for r in vouch_nsc if 4 < len(r) < 10]

def output_display(filtered_nnp):
    '''
    Collect the filtered tokens that the ensemble classifies as name related.

    filtered_nnp: iterable of tuples whose second field is the token

    Each token is lower-cased and classified with check_if_name_related.
    Accepted tokens that contain no parenthesis and no "abc.d"-like pattern
    are kept, with every non-alphanumeric character replaced by a space.
    A TypeError ends the scan early and keeps what was collected so far.

    returns (set of keywords, the keywords as one space separated string)
    '''
    keywords = []
    try:
        for entry in filtered_nnp:
            candidate = str(entry[1]).lower()
            if not check_if_name_related(candidate):
                continue
            if re.search(r'[\()]|\w{3,}[.]\w', candidate):
                continue
            cleaned = ''.join(ch if ch.isalnum() else ' ' for ch in candidate.strip())
            keywords.append(cleaned)
    except TypeError:
        pass
    display_text = ''.join(f' {kw}' for kw in keywords).strip()
    return set(keywords), display_text

def output_display2(pdf_o):
    '''
    Collect the named entities that the ensemble classifies as name related.

    pdf_o: parsed document exposing .ents, each entity having a .label_

    Only entities labelled GPE, PERSON, ORG or NORP are considered. Each one
    is lower-cased and classified with check_if_name_related. Accepted
    entities are kept unless they start with digits or with one to three word
    characters, a dash and digits. Every non-alphanumeric character is
    replaced by a space. An entity that raises TypeError is skipped.

    returns (set of keywords, the keywords as one space separated string)
    '''
    wanted_labels = ['GPE', 'PERSON', 'ORG', 'NORP']
    output = []
    text_parts = []
    #filter output based on entity type
    for ent in pdf_o.ents:
        if ent.label_ not in wanted_labels:
            continue
        try:
            string_word = str(ent).lower()
            if not check_if_name_related(string_word):
                continue
            # Raw strings keep \w and \d as regex escapes rather than
            # (invalid) string escapes.
            if re.search(r'^\w{1,3}[-]\d+', string_word):
                continue
            if re.search(r'^\d+', string_word):
                continue
            #remove special characters from string_word
            string_word = ''.join(' ' if not e.isalnum() else e for e in string_word.strip())
            #output as one large string for displacy render
            text_parts.append(f' {string_word}')
            #create a list of all the keywords
            output.append(string_word.strip())
        except TypeError:
            pass
    return set(output), ''.join(text_parts).strip()

def dplcy(output_dply):
    '''
    Parse the text with nlp_sci and render its entities with displacy (style 'ent')
    '''
    parsed = nlp_sci(output_dply)
    displacy.render(parsed, style='ent')

In [13]:
# Reduce each document's (position, token, tag) triples to the entries that pass filter_text_pos.
f_pdf1 = filter_text_pos(pdf_pt_1)
f_pdf2 = filter_text_pos(pdf_pt_2)
f_pdf3 = filter_text_pos(pdf_pt_3)

#### First algorithm results

In [42]:
# Each result is a (keyword set, display string) pair; the trailing comment is the title of the source paper.
out1 = output_display(f_pdf1) # A Pentahalogenated Monoterpene from the Red Alga Portieria hornemannii Produces a Novel Cytotoxicity Profile against a Diverse Panel of Human Tumor Cell Lines
out2 = output_display(f_pdf2) # MYCALOLIDES D AND E, NEW CYTOTOXIC MACROLIDES FROM A COLLECTION OF THE STONY CORAL TUBASTREA FAULMVERl
out3 = output_display(f_pdf3) # A New Triterpene Saponin from Pittosporum viridiflorum from the Madagascar Rainforest

#### Second algorithm results

In [44]:
# Same (keyword set, display string) pairs, this time built from the documents' named entities.
out1_ = output_display2(pdf_o1)
out2_ = output_display2(pdf_o2)
out3_ = output_display2(pdf_o3)

In [43]:
# FIRST ALGORITHM OUTPUT
# Render the display string (second element of the pair) for the third document.
dplcy(out3[1])

In [45]:
# SECOND ALGORITHM OUTPUT
# Render the display string (second element of the pair) for the third document.
dplcy(out3_[1])

In [28]:
# Extract NSC unique identifier candidates from the filtered tokens of the third document.
extract_nsc(f_pdf3)

['C62H96O27', 'N056221', 'C12500']