## PARSE SCIENTIFIC JOURNAL PDF FILES AND EXTRACT SPECIFIC KEYWORDS SUCH AS: TAXONOMY, NSC (UNIQUE IDENTIFIER), AUTHORS, COMPOUND NAMES

In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from IPython.display import display
from spacy import displacy
from nltk.stem import WordNetLemmatizer 
from nltk.stem import *
import spacy
import re
import random
import pickle
import os
#import en_core_web_sm
#nlp = en_core_web_sm.load()

ModuleNotFoundError: No module named 'en_core_web_sm'

In [11]:
stopwords = nltk.corpus.stopwords.words('english')

nlp = spacy.load("en_core_web_sm")
from spacy.lang.en.stop_words import STOP_WORDS
nlp.vocab["."].is_stop = True
nlp.vocab["/"].is_stop = True
nlp.vocab["("].is_stop = True
nlp.vocab[")"].is_stop = True
#could add more stop words

### brown, inaugural, and reuters corpus words

In [15]:
# run `python -m nltk.downloader corpora` in console
from nltk.corpus import brown
from nltk.corpus import inaugural
from nltk.corpus import reuters

inaugural_words = inaugural.words()
brown_words = brown.words()
reuters_words = reuters.words()
print(f"brown: {len(brown_words)}; inaugural: {len(inaugural_words)}; reuters: {len(reuters_words)}")

brown: 1161192; inaugural: 152901; reuters: 1720901


In [4]:
#brown_words = [w for w in brown_words if not re.search('\d+', w) \
#              and not re.search(r"[.,`@_!#$%^&*()<>?/\|}{~:-\\']",w)]

In [5]:
#inaugural_words = [w for w in inaugural_words if not re.search('\d+', w) \
#              and not re.search(r"[.,`@_!#$%^&*()<>?/\|}{~:-\\']",w)]

In [16]:
comb_words = set(inaugural_words).union(brown_words).union(reuters_words)
len(comb_words)

83219

In [17]:
# sort by most common
comb_words_sort = list(nltk.FreqDist(comb_words).keys())

In [18]:
comb_words_sort = [x.lower() for x in comb_words_sort]

In [7]:
comb_words_sort[:100]

['49ers',
 'liu',
 'sublimed',
 'flick',
 'thirty-year',
 'submissions',
 'acme',
 'puertollano',
 'sure-sure',
 'agucha',
 'entrepreneurs',
 '600-mile',
 'hexcel',
 'salivary',
 'denunciations',
 "lawyer's",
 'bugging',
 'foods',
 'bees',
 'cji',
 'interplanetary',
 'reformed',
 'epics',
 'photochemical',
 'placement',
 'guin',
 'charlayne',
 'bestes',
 'cacex',
 "university's",
 'end',
 'cavaliere',
 'single-minded',
 'postponing',
 'spiralis',
 'bat',
 'adherence',
 'escorts',
 'dvh',
 'resilience',
 'boy-furiendo',
 'anti-french',
 'rehearse',
 'heilman',
 'christine',
 'pavlovsky',
 'cosmetics',
 'inferences',
 'aspires',
 '2,200,000',
 'superposed',
 'banner',
 'meg',
 'virgil',
 'unnecessary',
 'skeoch',
 'flood-ravaged',
 'sacredly',
 'everywhere',
 'silently',
 'olympics',
 'knife-edge',
 'hydraulic',
 'cardinals',
 'reverence',
 'marcile',
 'criticized',
 'metallgesellschaft',
 'fourteen-team',
 'twentieth',
 "forbes's",
 'cancelled',
 'roof',
 'jules',
 'hnsn',
 'plocek',
 '

### taxonomy names from csv

In [20]:
import pandas as pd
taxon_file = 'taxon_names.csv'
taxon_name_df = pd.read_csv(taxon_file)
taxon_name_df.head(150)
# unit_name1 + unit_name2 is the full taxonomy name; can just make a set of each column

Unnamed: 0,unit_name1,unit_name2
0,Bacteria,
1,Schizomycetes,
2,Archangiaceae,
3,Pseudomonadales,
4,Rhodobacteriineae,
...,...,...
145,Flavobacterium,proteus
146,Flavobacterium,rigense
147,Flavobacterium,solare
148,Flavobacterium,tabidum


In [21]:
uniq_taxon_names = list(set(taxon_name_df.unit_name1).union(set(taxon_name_df.unit_name2)))

In [22]:
uniq_taxon_names = [n for n in uniq_taxon_names if str(n) != 'nan' and not re.search(r"[.,`@_!#$%^&*()<>?/\|}{~:-\\']",n)]
random.shuffle(uniq_taxon_names)

In [23]:
len(uniq_taxon_names)

175523

In [24]:
len(comb_words_sort)

83219

#### make training sets

In [25]:
def last_char(word):
    try:
        return word[-2:]
    except IndexError:
        return word[-1]
    
def first_char(word):
    try:
        return word[:2]
    except IndexError:
        return word[0]

def find_features(word):
    return {'word_length':len(word),\
            'last_letters':last_char(word),\
            'first_letters':first_char(word),\
            'lemma':lemma_binary(word),\
            'stem':stem_compute(word)['output'],\
            'convert_chr':convert_chr(word)
        }

In [26]:
lemmatiser = WordNetLemmatizer()
stemmer = PorterStemmer()

def convert_chr(name):
	chr_list = []
	for l in name:
		chr_list.append(ord(l.lower()))
	return sum(chr_list)

def lemma_binary(word):
    '''
    Output overlapping words from a lemmatised word compared to the original word
    '''
    l_word = lemmatiser.lemmatize(word)
    word_lemma_dict = {'word_length' : len(word), 'lw_length' : len(l_word)}
    max_length = max(word_lemma_dict.values())
    longer_length_word = [k for k,v in word_lemma_dict.items() if v == max_length][0]
    for _ in range(max_length - word_lemma_dict['lw_length']):
        l_word += '0'
    binary_output = []
    for w,lw in zip(word,l_word):
        if w == lw:
            binary_output.append(1)
        else:
            binary_output.append(0)

    return binary_output.count(1)/len(binary_output)

def stem_compute(word):
    '''
    If the stem word account for more than 75% of the original word, it most likely is a name or species name
    '''
    stem_dict = {}
    stem_dict['s_word'] = stemmer.stem(word)
    stem_dict['word_length'] = len(word)
    stem_dict['stem_length'] = len(stem_dict['s_word'])
    stem_dict['ratio'] = stem_dict['stem_length'] / stem_dict['word_length']
    if stem_dict['ratio'] > .75:
        stem_dict['output'] = 1
    else:
        stem_dict['output'] = 0
    return stem_dict

In [27]:
stem_compute('parienus')

{'s_word': 'parienu',
 'word_length': 8,
 'stem_length': 7,
 'ratio': 0.875,
 'output': 1}

In [29]:
tup_taxon_words = [(f, find_features(f),True) for f in uniq_taxon_names[:len(comb_words_sort)]] #bc brown words sort is shorter than uniq_taxon_names
tup_comb_words = [(f, find_features(f), False) for f in comb_words_sort]

LookupError: 
**********************************************************************
  Resource [93momw-1.4[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('omw-1.4')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/omw-1.4[0m

  Searched in:
    - 'C:\\Users\\thiag/nltk_data'
    - 'C:\\Users\\thiag\\AppData\\Local\\Programs\\Python\\Python310\\nltk_data'
    - 'C:\\Users\\thiag\\AppData\\Local\\Programs\\Python\\Python310\\share\\nltk_data'
    - 'C:\\Users\\thiag\\AppData\\Local\\Programs\\Python\\Python310\\lib\\nltk_data'
    - 'C:\\Users\\thiag\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
tup_taxon_words[:50]

In [105]:
master_tup_set = tup_taxon_words + tup_comb_words
random.shuffle(master_tup_set)

In [106]:
master_tup_set[:100]

[('brunnoides',
  {'word_length': 10,
   'last_letters': 'es',
   'first_letters': 'br',
   'lemma': 1.0,
   'stem': 1,
   'convert_chr': 1081},
  True),
 ('kaswabilengae',
  {'word_length': 13,
   'last_letters': 'ae',
   'first_letters': 'ka',
   'lemma': 1.0,
   'stem': 1,
   'convert_chr': 1358},
  True),
 ('fusatus',
  {'word_length': 7,
   'last_letters': 'us',
   'first_letters': 'fu',
   'lemma': 1.0,
   'stem': 1,
   'convert_chr': 779},
  True),
 ('espagnol',
  {'word_length': 8,
   'last_letters': 'ol',
   'first_letters': 'es',
   'lemma': 1.0,
   'stem': 1,
   'convert_chr': 857},
  False),
 ('elongatula',
  {'word_length': 10,
   'last_letters': 'la',
   'first_letters': 'el',
   'lemma': 1.0,
   'stem': 1,
   'convert_chr': 1068},
  True),
 ('gascae',
  {'word_length': 6,
   'last_letters': 'ae',
   'first_letters': 'ga',
   'lemma': 1.0,
   'stem': 1,
   'convert_chr': 612},
  True),
 ('livingwell',
  {'word_length': 10,
   'last_letters': 'll',
   'first_letters': 'li'

In [107]:
len(master_tup_set)

166370

In [108]:
num_set = int(0.7*len(master_tup_set)) #testing set will be 70% of the corpus (dataset)
devt_num_set = int(len(master_tup_set) - (0.2*len(master_tup_set))) #development set will be 20%
training_set = master_tup_set[:num_set]
devtest_set = master_tup_set[num_set:devt_num_set]
testing_set = master_tup_set[(devt_num_set):] #testing will be remaining 10% of datset

In [109]:
len(devtest_set) + len(training_set) + len(testing_set) == len(master_tup_set)

True

In [23]:
len(master_tup_set)

166370

In [24]:
len(devtest_set)

16638

In [25]:
len(training_set)

116458

In [26]:
len(testing_set)

33274

In [110]:
training_set_ = [(k,v) for (n,k,v) in training_set]
devtest_set_ = [(k,v) for (n,k,v) in devtest_set]

### Naive Bayes

In [111]:
NB_classifier = nltk.NaiveBayesClassifier.train(training_set_)

In [112]:
print(nltk.classify.accuracy(NB_classifier, devtest_set_))

0.9225868493809352


In [113]:
NB_classifier.show_most_informative_features(50)

Most Informative Features
            last_letters = 'ly'            False : True   =    932.0 : 1.0
            last_letters = 'ts'            False : True   =    766.1 : 1.0
            last_letters = 'ed'            False : True   =    640.3 : 1.0
            last_letters = 'ic'            False : True   =    383.2 : 1.0
            last_letters = 'ds'            False : True   =    271.1 : 1.0
            last_letters = 'ss'            False : True   =    237.9 : 1.0
            last_letters = 'rd'            False : True   =    210.7 : 1.0
            last_letters = 'nt'            False : True   =    207.1 : 1.0
            last_letters = 'st'            False : True   =    184.4 : 1.0
                   lemma = 0.9166666666666666  False : True   =    178.2 : 1.0
            last_letters = 'rs'            False : True   =    152.2 : 1.0
            last_letters = 'll'            False : True   =    149.9 : 1.0
            last_letters = 'ii'             True : False  =     96.0 :

### 10 different algorithms to classify

In [8]:
from sklearn.preprocessing import LabelEncoder
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

#from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [115]:
# Define models to train
names = ["K Nearest Neighbors", #1
         "Decision Tree", #2
         "Random Forest", #3
         "Logistic Regression", #4
         "SGD Classifier",#5
         "Multinomial", #6
         #"SVC Linear", #7 stalls; doesn't converge
         "Bernoulli", #8
         "LinearSVC",#9
         "ComplementNB"]#10
        # "NuSVC"]#11 stalls; doesn't converge

classifiers = [
    KNeighborsClassifier(),#1
    DecisionTreeClassifier(),#2
    RandomForestClassifier(),#3
    LogisticRegression(),#4
    SGDClassifier(),#max_iter = 500),#5
    MultinomialNB(),#6
    #SVC(kernel = 'linear'),#7
    BernoulliNB(),#8
    LinearSVC(),#9
    ComplementNB()
   # NuSVC()#10
]

models = zip(names, classifiers)
classifier_models = {}

for name, model in models:
    nltk_model = SklearnClassifier(model)
    classifier_models[name] = nltk_model.train(training_set_)
    accuracy = nltk.classify.accuracy(nltk_model, devtest_set_)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 85.01021757422768
Decision Tree Accuracy: 92.0543334535401
Random Forest Accuracy: 92.81764635172497


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression Accuracy: 90.79216251953359
SGD Classifier Accuracy: 65.71703329727131
Multinomial Accuracy: 91.33910325760309
Bernoulli Accuracy: 90.88832792402933




LinearSVC Accuracy: 88.0574588291862
ComplementNB Accuracy: 91.30304123091719


In [34]:
from nltk.classify import ClassifierI
from statistics import mode, StatisticsError

class VoteClassifier(ClassifierI):
    def __init__(self, classifiers): #list of classifiers
        self._classifiers_dict = classifiers
        
    
    def clf_name(self):
        return list(self._classifiers_dict.keys())
    
    def classify(self, string_features):
        votes = []
        for c in self._classifiers_dict.values(): #the 8 diff algorithms
            v = c.classify(find_features(string_features.lower())) #either True or False
            votes.append(v) 
        try:
            res = mode(votes)
        except StatisticsError as e:
            res = True #if equal number of True's and False, just set to True
        return res #return value (boolean)
    
    def confidence(self, string_features):
        #count how many were in 'True' using the 8 different algorithms
        votes = []
        for c in self._classifiers_dict.values():
            v = c.classify(find_features(string_features.lower()))
            votes.append(v)
        try:
            choice_votes = votes.count(mode(votes)) #count the amt of True's or False's
        except StatisticsError as e:
            choice_votes = len(votes)/2 #should be 4 since half
        conf = choice_votes / len(votes) #how many out of the 8 were True or False
        return conf
        
def vote_clf():
    try:
        voted_classifier = VoteClassifier(classifier_models)
        print('ok')
    except NameError as e:
        print(f'{e} - attempting to load classifiers from pickle file')
        clfs = ['K Nearest Neighbors',
                'Decision Tree',
                'Random Forest',
                'Logistic Regression',
                'SGD Classifier',
                'Multinomial',
                'Bernoulli',
                'LinearSVC',
                'ComplementNB']
        classifiers_models = {}
        for clf_n in clfs:
            with open(f'{clf_n}.pickle', 'rb') as clf_file:
                classifiers_models[clf_n] = pickle.load(clf_file)
        voted_classifier = VoteClassifier(classifiers_models)
    return voted_classifier

In [36]:
voted_classifier = vote_clf()

name 'classifier_models' is not defined - attempting to load classifiers from pickle file


In [37]:
def check_if_taxon_related(q_word,verbose=False):
    return_result = {}
    return_result['Classification'] = voted_classifier.classify(q_word)
    return_result['Confidence'] = voted_classifier.confidence(q_word)*100
    if verbose:
        print(f"Classification: {return_result['Classification']} Confidence: {return_result['Confidence']}", )
    return return_result['Classification']

In [186]:
def save_classifier(clf,clf_name,par_path):
    save_path = f'{par_path}/{clf_name}.pickle'
    with open(save_path,"wb") as clf_file:
        pickle.dump(clf, clf_file)
    print(f'... saving {save_path}')

In [187]:
#dump pickle algorithms
cwd = os.getcwd()
for k,clf in classifier_models.items():
    save_classifier(clf,k,cwd)

... saving /Users/trinhsk/Documents/GitRepos/nlp_pdf/K Nearest Neighbors.pickle
... saving /Users/trinhsk/Documents/GitRepos/nlp_pdf/Decision Tree.pickle
... saving /Users/trinhsk/Documents/GitRepos/nlp_pdf/Random Forest.pickle
... saving /Users/trinhsk/Documents/GitRepos/nlp_pdf/Logistic Regression.pickle
... saving /Users/trinhsk/Documents/GitRepos/nlp_pdf/SGD Classifier.pickle
... saving /Users/trinhsk/Documents/GitRepos/nlp_pdf/Multinomial.pickle
... saving /Users/trinhsk/Documents/GitRepos/nlp_pdf/Bernoulli.pickle
... saving /Users/trinhsk/Documents/GitRepos/nlp_pdf/LinearSVC.pickle
... saving /Users/trinhsk/Documents/GitRepos/nlp_pdf/ComplementNB.pickle


In [45]:
check_if_taxon_related('alternatives',True)

Classification: False Confidence: 55.55555555555556


False

In [129]:
errors = []
for (name,feat,tag) in devtest_set:
    guess = voted_classifier.classify(name)
    if guess != tag:
        errors.append((tag,guess,name))

In [130]:
for (tag, guess, name) in sorted(errors):
    print(f'correct={tag}\tguess={guess}\tname={name}')

correct=False	guess=True	name=abra
correct=False	guess=True	name=absentia
correct=False	guess=True	name=aca
correct=False	guess=True	name=acknowledges
correct=False	guess=True	name=acquirees
correct=False	guess=True	name=actualities
correct=False	guess=True	name=adana
correct=False	guess=True	name=addis
correct=False	guess=True	name=adelos
correct=False	guess=True	name=administrations
correct=False	guess=True	name=administrations
correct=False	guess=True	name=aegis
correct=False	guess=True	name=albania
correct=False	guess=True	name=albanians
correct=False	guess=True	name=albicans
correct=False	guess=True	name=alexandria
correct=False	guess=True	name=algae
correct=False	guess=True	name=algeciras
correct=False	guess=True	name=algoma
correct=False	guess=True	name=alicia
correct=False	guess=True	name=allons
correct=False	guess=True	name=alum
correct=False	guess=True	name=aluminio
correct=False	guess=True	name=alumnae
correct=False	guess=True	name=alundum
correct=False	guess=True	name=alusu

### open pdf file for testing

In [47]:
with open('np50121a025.pdf_OUTPUT.txt', 'r') as f:
    read_output = f.readlines()
read_output = '\n'.join(read_output).strip().replace('\n','')
read_output

'1120 Journal of  Natural Praducts Val. 58, NO. 7, pp.  1120-1 125, July  1995 MYCALOLIDES D AND E, N E W  CYTOTOXIC MACROLIDES FROM A COLLECTION OF THE STONY CORAL TUBASTREA FAULMVERl MOHMIM~D A. RASHID, KIRK R. GUSTAFSON, JOHN H. CARDELLINA 11,  and MICHAEL R. BOYD* Lcrbwatwy of Drug Discwey Resurcb and DeveIopnt, Developmntal Thapeuticrs Program, Division of Cancer Treatment, National Cancer Institute, Building 1052, Rwm 121, Frederick, Maryland 21 702-1201 ABSTRACT.-Fractionation of  a  cytotoxic  extract  of  the  stony  coral  Tubastru faulkneri yielded a series of cytotoxic polyoxazole macrolides and several noncytotoxic indole derivatives. Two new macrolides, mycalolides D 111 and E 121, were isolated and identified, in addition to the known compound  mycalolide C [31. The macrolide structures were elucidated  by  detailed analysis of their spectroscopic data and by comparison with related compounds. Stony  (scleractinian)  corals  in  the genus Tubastreu (Dendrophylliidae) hav

In [48]:
pdf_o = nlp(read_output)

In [49]:
pdf_pt = [(n, i, i.tag_) for n,i in enumerate(pdf_o)]
#part of speech tags

In [11]:
pdf_pt

[(0, 1120, 'CD'),
 (1, Journal, 'NNP'),
 (2, of, 'IN'),
 (3,  , '_SP'),
 (4, Natural, 'JJ'),
 (5, Praducts, 'NNS'),
 (6, Val, 'NNP'),
 (7, ., '.'),
 (8, 58, 'CD'),
 (9, ,, ','),
 (10, NO, 'NNP'),
 (11, ., 'NN'),
 (12, 7, 'CD'),
 (13, ,, ','),
 (14, pp, 'NNP'),
 (15, ., '.'),
 (16,  , '_SP'),
 (17, 1120, 'CD'),
 (18, -, 'SYM'),
 (19, 1, 'CD'),
 (20, 125, 'CD'),
 (21, ,, ','),
 (22, July, 'NNP'),
 (23,  , '_SP'),
 (24, 1995, 'CD'),
 (25, MYCALOLIDES, 'NNP'),
 (26, D, 'NNP'),
 (27, AND, 'CC'),
 (28, E, 'NN'),
 (29, ,, ','),
 (30, N, 'NN'),
 (31, E, 'NN'),
 (32, W, 'NN'),
 (33,  , '_SP'),
 (34, CYTOTOXIC, 'NNP'),
 (35, MACROLIDES, 'NNP'),
 (36, FROM, 'IN'),
 (37, A, 'DT'),
 (38, COLLECTION, 'NN'),
 (39, OF, 'IN'),
 (40, THE, 'DT'),
 (41, STONY, 'NN'),
 (42, CORAL, 'NNP'),
 (43, TUBASTREA, 'VBZ'),
 (44, FAULMVERl, 'NNP'),
 (45, MOHMIM, 'NNP'),
 (46, ~, 'NFP'),
 (47, D, 'NNP'),
 (48, A., 'NN'),
 (49, RASHID, 'NNP'),
 (50, ,, ','),
 (51, KIRK, 'NNP'),
 (52, R., 'NNP'),
 (53, GUSTAFSON, 'NNP')

In [50]:
token_list = []
for token in pdf_o:
    token_list.append(token.text)

filtered_pdf =[] 
for word in token_list:
    lexeme = nlp.vocab[word]
    if lexeme.is_stop == False:
        filtered_pdf.append(word) 

In [51]:
filtered_pdf[:100]

['1120',
 'Journal',
 ' ',
 'Natural',
 'Praducts',
 'Val',
 '58',
 ',',
 '7',
 ',',
 'pp',
 ' ',
 '1120',
 '-',
 '1',
 '125',
 ',',
 'July',
 ' ',
 '1995',
 'MYCALOLIDES',
 'D',
 'E',
 ',',
 'N',
 'E',
 'W',
 ' ',
 'CYTOTOXIC',
 'MACROLIDES',
 'COLLECTION',
 'STONY',
 'CORAL',
 'TUBASTREA',
 'FAULMVERl',
 'MOHMIM',
 '~',
 'D',
 'A.',
 'RASHID',
 ',',
 'KIRK',
 'R.',
 'GUSTAFSON',
 ',',
 'JOHN',
 'H.',
 'CARDELLINA',
 '11',
 ',',
 ' ',
 'MICHAEL',
 'R.',
 'BOYD',
 '*',
 'Lcrbwatwy',
 'Drug',
 'Discwey',
 'Resurcb',
 'DeveIopnt',
 ',',
 'Developmntal',
 'Thapeuticrs',
 'Program',
 ',',
 'Division',
 'Cancer',
 'Treatment',
 ',',
 'National',
 'Cancer',
 'Institute',
 ',',
 'Building',
 '1052',
 ',',
 'Rwm',
 '121',
 ',',
 'Frederick',
 ',',
 'Maryland',
 '21',
 '702',
 '-',
 '1201',
 'ABSTRACT.-Fractionation',
 ' ',
 ' ',
 'cytotoxic',
 ' ',
 'extract',
 ' ',
 ' ',
 ' ',
 'stony',
 ' ',
 'coral',
 ' ',
 'Tubastru']

In [52]:
#pt[0].nbor().tag_ == "NN" and --> use this to only get 5 matches but too greedy
try:
    filtered_nnp = []
    for pt in pdf_pt:
        q_str = str(pt[1])
        if pt[2] == "NNP" and '-' not in q_str and len(q_str) > 5:
            #if not re.search('\d+', q_str) and re.search('^!?[A-Z]{1}', q_str):
                print(f"{pt[1]}-{pt[1].nbor()}-{pt[2]}")
                filtered_nnp.append(pt)

except IndexError as e:
    pass #last index doesn't have a neighbour

Journal-of-NNP
MYCALOLIDES-D-NNP
CYTOTOXIC-MACROLIDES-NNP
MACROLIDES-FROM-NNP
FAULMVERl-MOHMIM-NNP
MOHMIM-~-NNP
RASHID-,-NNP
GUSTAFSON-,-NNP
CARDELLINA-11-NNP
MICHAEL-R.-NNP
Lcrbwatwy-of-NNP
Discwey-Resurcb-NNP
Resurcb-and-NNP
DeveIopnt-,-NNP
Developmntal-Thapeuticrs-NNP
Thapeuticrs-Program-NNP
Program-,-NNP
Division-of-NNP
Cancer-Treatment-NNP
Treatment-,-NNP
National-Cancer-NNP
Cancer-Institute-NNP
Institute-,-NNP
Building-1052-NNP
Frederick-,-NNP
Maryland-21-NNP
Tubastru-faulkneri-NNP
cytotoxic-polyoxazole-NNP
polyoxazole-macrolides-NNP
noncytotoxic-indole-NNP
Tubastreu-(-NNP
Dendrophylliidae-)-NNP
aplysinopsin-(-NNP
Tubastreu-faulkneri-NNP
Bioassay---NNP
scribed-herein-NNP
herein-is-NNP
Tubastreu-faulkneri-NNP
Barrier-Reef-NNP
Australia-,-NNP
protocol-(-NNP
Sephadex-LH-20-NNP
Rashid-et-NNP
Cytotoxic-Macrolides-NNP
Macrolides-from-NNP
Tubastreu-1121-NNP
C5,H,,N40-,-NNP
kabiramides- -NNP
Journal-of-NNP
Natural-Products-NNP
Mycalolides-D-NNP
Mycalolide-D-NNP
Mycalolide-E-NNP
1.19brs-C

In [54]:
vouch_nsc = re.findall(r'([C|N|Q|F|M]{1}\d{2,}\w+)',' '.join([str(x[1]) for x in filtered_nnp]))
vouch_nsc = [r for r in vouch_nsc if len(r) < 10]
vouch_nsc

['Q66C1269']

In [55]:
filtered_nnp

[(1, Journal, 'NNP'),
 (25, MYCALOLIDES, 'NNP'),
 (34, CYTOTOXIC, 'NNP'),
 (35, MACROLIDES, 'NNP'),
 (44, FAULMVERl, 'NNP'),
 (45, MOHMIM, 'NNP'),
 (49, RASHID, 'NNP'),
 (53, GUSTAFSON, 'NNP'),
 (57, CARDELLINA, 'NNP'),
 (62, MICHAEL, 'NNP'),
 (66, Lcrbwatwy, 'NNP'),
 (69, Discwey, 'NNP'),
 (70, Resurcb, 'NNP'),
 (72, DeveIopnt, 'NNP'),
 (74, Developmntal, 'NNP'),
 (75, Thapeuticrs, 'NNP'),
 (76, Program, 'NNP'),
 (78, Division, 'NNP'),
 (80, Cancer, 'NNP'),
 (81, Treatment, 'NNP'),
 (83, National, 'NNP'),
 (84, Cancer, 'NNP'),
 (85, Institute, 'NNP'),
 (87, Building, 'NNP'),
 (93, Frederick, 'NNP'),
 (95, Maryland, 'NNP'),
 (117, Tubastru, 'NNP'),
 (123, cytotoxic, 'NNP'),
 (124, polyoxazole, 'NNP'),
 (128, noncytotoxic, 'NNP'),
 (193, Tubastreu, 'NNP'),
 (195, Dendrophylliidae, 'NNP'),
 (220, aplysinopsin, 'NNP'),
 (264, Tubastreu, 'NNP'),
 (293, Bioassay, 'NNP'),
 (326, scribed, 'NNP'),
 (327, herein, 'NNP'),
 (372, Tubastreu, 'NNP'),
 (395, Barrier, 'NNP'),
 (398, Australia, 'NNP')

In [56]:
for i in filtered_nnp:
    print(i[1])

Journal
MYCALOLIDES
CYTOTOXIC
MACROLIDES
FAULMVERl
MOHMIM
RASHID
GUSTAFSON
CARDELLINA
MICHAEL
Lcrbwatwy
Discwey
Resurcb
DeveIopnt
Developmntal
Thapeuticrs
Program
Division
Cancer
Treatment
National
Cancer
Institute
Building
Frederick
Maryland
Tubastru
cytotoxic
polyoxazole
noncytotoxic
Tubastreu
Dendrophylliidae
aplysinopsin
Tubastreu
Bioassay
scribed
herein
Tubastreu
Barrier
Australia
protocol
Sephadex
Rashid
Cytotoxic
Macrolides
Tubastreu
C5,H,,N40
kabiramides
Journal
Natural
Mycalolides
Mycalolide
Mycalolide
1.19brs
Rashid
Cytotoxic
Macrolides
Tubastreu
proton
A34331
gletsat63.37and3.46
Compound
proton
carbonyl
methine
proton
Halichondria
Mycalolide
spectra
C46H62N1013
methine
proton
methineproton(62.75,m
mamide
Journal
Natural
methine
Halicbondria
Jaspis
Mycule
Tubastrea
EXPERIMENTAL
ANIMAL
Salamander
Cleveland
Barrier
Australia
Murphy
Institute
Marine
Science
Q66C1269
Queensland
Museum
Brisbane
EXTRACTION
CH2C12
Sephadex
hexane
fraction(l30
hexane
hexane
C,,,H,,N40
999([M
priate
C

In [65]:
output_display_text_ = ''
output=[]
cnt = 1
try:
    for i in filtered_nnp:
        #res = classifier.classify(find_features(str(i[1]).lower()))
        string_word = str(i[1])
        res = check_if_taxon_related(string_word)
        if res and not re.search(r'[\()]|\w{3,}[.]\w',string_word):
            output_display_text_ += f' {string_word}'
            output.append(string_word)
            print(f'{cnt}: {string_word.lower()}')
            cnt += 1
except TypeError as e:
    pass

0 - mycalolides
1 - macrolides
2 - cardellina
3 - tubastru
4 - tubastreu
5 - dendrophylliidae
6 - tubastreu
7 - tubastreu
8 - australia
9 - macrolides
10 - tubastreu
11 - kabiramides
12 - mycalolides
13 - macrolides
14 - tubastreu
15 - halichondria
16 - spectra
17 - halicbondria
18 - jaspis
19 - tubastrea
20 - australia
21 - museum
22 - macrolides
23 - tubastreu
24 - antinunor
25 - melanoma
26 - tubastrea
27 - sanduja
28 - heterocycles
29 - fusetani
30 - matsunaga
31 - guella
32 - mancini
33 - zibrowius
34 - pietra
35 - philadelphia
36 - cardellina
37 - cafieri
38 - fattorusso
39 - mangoni
40 - fusetani
41 - yasumuro
42 - matsunaga
43 - matsunaga
44 - fusetani
45 - koseki
46 - matsunaga
47 - fusetani
48 - koseki
49 - noguchi
50 - sankawa
51 - molinski
52 - kobayashi
53 - murata
54 - shigemori
55 - cardellina
56 - cardellina


In [58]:
set(output)

{'Australia',
 'CARDELLINA',
 'Cafieri',
 'Cardellina',
 'Dendrophylliidae',
 'Fattorusso',
 'Fusetani',
 'Guella',
 'Halicbondria',
 'Halichondria',
 'Heterocycles',
 'Jaspis',
 'Kobayashi',
 'Koseki',
 'MACROLIDES',
 'MYCALOLIDES',
 'Macrolides',
 'Mancini',
 'Mangoni',
 'Matsunaga',
 'Melanoma',
 'Molinski',
 'Murata',
 'Museum',
 'Mycalolides',
 'Noguchi',
 'Philadelphia',
 'Pietra',
 'Sanduja',
 'Sankawa',
 'Shigemori',
 'Tubastrea',
 'Tubastreu',
 'Tubastru',
 'Yasumuro',
 'Zibrowius',
 'antinunor',
 'kabiramides',
 'spectra'}

In [60]:
nlp = spacy.load("en_core_sci_sm")
text = """
Myeloid derived suppressor cells (MDSC) are immature 
myeloid cells with immunosuppressive activity. 
They accumulate in tumor-bearing mice and humans 
with different types of cancer, including hepatocellular 
carcinoma (HCC).
"""
doc = nlp(text)

In [61]:
output_display_text_.strip()

'MYCALOLIDES MACROLIDES CARDELLINA Tubastru Tubastreu Dendrophylliidae Tubastreu Tubastreu Australia Macrolides Tubastreu kabiramides Mycalolides Macrolides Tubastreu Halichondria spectra Halicbondria Jaspis Tubastrea Australia Museum Macrolides Tubastreu antinunor Melanoma Tubastrea Sanduja Heterocycles Fusetani Matsunaga Guella Mancini Zibrowius Pietra Philadelphia Cardellina Cafieri Fattorusso Mangoni Fusetani Yasumuro Matsunaga Matsunaga Fusetani Koseki Matsunaga Fusetani Koseki Noguchi Sankawa Molinski Kobayashi Murata Shigemori Cardellina Cardellina'

In [62]:
displacy.render(nlp(output_display_text_), style='ent')

In [63]:
#entry recognition
er_output = [(i, i.label_, i.label) for i in pdf_o.ents if i.label_ in ['GPE', 'PERSON', 'ORG', 'NORP']]
er_output

[(CYTOTOXIC, 'ORG', 383),
 (MOHMIM, 'ORG', 383),
 (D A. RASHID, 'PERSON', 380),
 (KIRK R. GUSTAFSON, 'PERSON', 380),
 (JOHN H. CARDELLINA 11, 'PERSON', 380),
 (MICHAEL R. BOYD, 'PERSON', 380),
 (Discwey Resurcb, 'PERSON', 380),
 (DeveIopnt, 'ORG', 383),
 (Developmntal Thapeuticrs Program, 'ORG', 383),
 (Division of Cancer Treatment, 'ORG', 383),
 (National Cancer Institute, 'ORG', 383),
 (Frederick, 'GPE', 384),
 (Maryland, 'GPE', 384),
 (Tubastru, 'GPE', 384),
 (scleractinian, 'NORP', 381),
 (Tubastreu (Dendrophylliidae, 'ORG', 383),
 (1,2, 'PERSON', 380),
 (Australian, 'NORP', 381),
 (Tubastreu, 'NORP', 381),
 (faulkneri Wells, 'PERSON', 380),
 (NCI, 'ORG', 383),
 (Bioassay, 'PERSON', 380),
 (scribed herein, 'PERSON', 380),
 (Sephadex, 'ORG', 383),
 (hplc, 'GPE', 384),
 (Rashid, 'PERSON', 380),
 (ZIE, 'ORG', 383),
 (C5,H,,N40, 'ORG', 383),
 (fabms, 'ORG', 383),
 (max, 'PERSON', 380),
 (C-N, 'ORG', 383),
 (Journal of Natural Products, 'ORG', 383),
 (Nmr Data, 'ORG', 383),
 (Mycalolide

In [64]:
cnt = 1
output_display_text = ''
output2=[]
for i in er_output:
    try:
        #res = classifier.classify(find_features(string.lower()))
        string_word = str(i[0]).lower()
        res = check_if_taxon_related(string_word)
        if res and not re.search('^\w{1,3}[-]\d+', string_word):
            if not re.search('^\d+',string_word):
                print(f'{cnt}: {string_word}')
                output_display_text += f' {string_word.strip()}'
                output2.append(string_word.strip())
                cnt+=1
    except TypeError as e:
        pass

1: tubastru
2: tubastreu (dendrophylliidae
3: tubastreu
4: nmr data
5: cytotoxic macrolides
6: halichondria
7: mycalolides c
8: tubastrea
9: australia
10: the queensland museum
11: cytotoxic macrolides
12: data
13: lox imvi
14: tubastrea
15: sakai
16: higa
17: heterocycles
18: n. fusetani
19: s.  matsunaga
20: g. guella
21: i. mancini
22: h. zibrowius
23: f. pietra
24: acta
25: philadelphia
26: j.h. cardellina
27: c. cafieri
28: a.  mangoni
29: n. fusetani
30: s.  matsunaga
31: s. matsunaga
32: n. fusetani
33: k. koseki
34: s.  matsunaga
35: n. fusetani
36: k. koseki
37: h. noguchi
38: kobayashi
39: murata
40: h. shigemori
41: j.-r. dai


In [66]:
output_display_text.strip()

'tubastru tubastreu (dendrophylliidae tubastreu nmr data cytotoxic macrolides halichondria mycalolides c tubastrea australia the queensland museum cytotoxic macrolides data lox imvi tubastrea sakai higa heterocycles n. fusetani s.  matsunaga g. guella i. mancini h. zibrowius f. pietra acta philadelphia j.h. cardellina c. cafieri a.  mangoni n. fusetani s.  matsunaga s. matsunaga n. fusetani k. koseki s.  matsunaga n. fusetani k. koseki h. noguchi kobayashi murata h. shigemori j.-r. dai'

In [67]:
displacy.render(nlp(output_display_text), style='ent')