## PARSE SCIENTIFIC JOURNAL PDF FILES AND EXTRACT SPECIFIC KEYWORDS SUCH AS: TAXONOMY, NSC (UNIQUE IDENTIFIER), AUTHORS, COMPOUND NAMES

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from IPython.display import display
from spacy import displacy
from nltk.stem import WordNetLemmatizer 
from nltk.stem import *
import spacy
import re
import random
import pickle
import os
#import en_core_web_sm
#nlp = en_core_web_sm.load()

In [3]:
stopwords = nltk.corpus.stopwords.words('english')

nlp = spacy.load("en_core_web_sm")
from spacy.lang.en.stop_words import STOP_WORDS
nlp.vocab["."].is_stop = True
nlp.vocab["/"].is_stop = True
nlp.vocab["("].is_stop = True
nlp.vocab[")"].is_stop = True
#could add more stop words

### brown, inaugural, and reuters corpus words

In [4]:
# run `python -m nltk.downloader corpora` in console
from nltk.corpus import brown
from nltk.corpus import inaugural
from nltk.corpus import reuters

inaugural_words = inaugural.words()
brown_words = brown.words()
reuters_words = reuters.words()
print(f"brown: {len(brown_words)}; inaugural: {len(inaugural_words)}; reuters: {len(reuters_words)}")

brown: 1161192; inaugural: 152901; reuters: 1720901


In [5]:
#brown_words = [w for w in brown_words if not re.search('\d+', w) \
#              and not re.search(r"[.,`@_!#$%^&*()<>?/\|}{~:-\\']",w)]

In [6]:
#inaugural_words = [w for w in inaugural_words if not re.search('\d+', w) \
#              and not re.search(r"[.,`@_!#$%^&*()<>?/\|}{~:-\\']",w)]

In [7]:
comb_words = set(inaugural_words).union(brown_words).union(reuters_words)
len(comb_words)

83219

In [8]:
# sort by most common
comb_words_sort = list(nltk.FreqDist(comb_words).keys())

In [9]:
comb_words_sort = [x.lower() for x in comb_words_sort]

In [10]:
comb_words_sort[:100]

['7070',
 'ethicists',
 'speaking',
 'bowing',
 '6-5',
 'ses',
 'slaves',
 'prompting',
 '-.10',
 'earley',
 'bravado',
 'safer',
 'thing',
 'twice',
 'resumption',
 'crocked',
 'plume',
 'eve',
 'gv',
 'beverly',
 'glocester',
 'armco',
 'avant-garde',
 'discontents',
 'february',
 'ccfp',
 'imc',
 'humanness',
 'kemp',
 'jamestown',
 'softest',
 'must',
 'hlco',
 'life',
 '7,484,268',
 'frontal',
 "adams'",
 'unfailingly',
 'footstool',
 '768',
 'hotrod',
 'esther',
 'teen',
 '489',
 'desolate',
 'cons',
 'explosive',
 'metallgesellschaft',
 'stoneware',
 'cal.',
 '080',
 'mavis',
 'mincing',
 'coke',
 'deep-eyed',
 'coupon',
 'fiddlesticks',
 'moody',
 'drumlin',
 '531',
 'stabilise',
 'wine',
 'ethylene',
 'callicoon',
 'harms',
 'radically',
 'warships',
 'whence',
 'erudition',
 'unreclaimed',
 'euromarket',
 'lubricating',
 'cytoplasm',
 'shall',
 'oyajima',
 'pdq',
 'ethylene',
 'dancer',
 'mineral-rich',
 'gig',
 'doers',
 'growing',
 'twelve-year',
 '010',
 'oatnut',
 'dot',


### taxonomy names from csv

In [11]:
import pandas as pd
taxon_file = 'taxon_names.csv'
taxon_name_df = pd.read_csv(taxon_file)
taxon_name_df.head(150)
# unit_name1 + unit_name2 is the full taxonomy name; can just make a set of each column

Unnamed: 0,unit_name1,unit_name2
0,Bacteria,
1,Schizomycetes,
2,Archangiaceae,
3,Pseudomonadales,
4,Rhodobacteriineae,
...,...,...
145,Flavobacterium,proteus
146,Flavobacterium,rigense
147,Flavobacterium,solare
148,Flavobacterium,tabidum


In [12]:
uniq_taxon_names = list(set(taxon_name_df.unit_name1).union(set(taxon_name_df.unit_name2)))

In [13]:
uniq_taxon_names = [n for n in uniq_taxon_names if str(n) != 'nan' and not re.search(r"[.,`@_!#$%^&*()<>?/\|}{~:-\\']",n)]
random.shuffle(uniq_taxon_names)

In [14]:
len(uniq_taxon_names)

175523

In [15]:
len(comb_words_sort)

83219

#### make training sets

In [16]:
def last_char(word):
    try:
        return word[-2:]
    except IndexError:
        return word[-1]
    
def first_char(word):
    try:
        return word[:2]
    except IndexError:
        return word[0]

def find_features(word):
    return {'word_length':len(word),\
            'last_letters':last_char(word),\
            'first_letters':first_char(word),\
            'lemma':lemma_binary(word),\
            'stem':stem_compute(word)['output'],\
            'convert_chr':convert_chr(word)
        }

In [17]:
lemmatiser = WordNetLemmatizer()
stemmer = PorterStemmer()

def convert_chr(name):
	chr_list = []
	for l in name:
		chr_list.append(ord(l.lower()))
	return sum(chr_list)

def lemma_binary(word):
    '''
    Output overlapping words from a lemmatised word compared to the original word
    '''
    l_word = lemmatiser.lemmatize(word)
    word_lemma_dict = {'word_length' : len(word), 'lw_length' : len(l_word)}
    max_length = max(word_lemma_dict.values())
    longer_length_word = [k for k,v in word_lemma_dict.items() if v == max_length][0]
    for _ in range(max_length - word_lemma_dict['lw_length']):
        l_word += '0'
    binary_output = []
    for w,lw in zip(word,l_word):
        if w == lw:
            binary_output.append(1)
        else:
            binary_output.append(0)

    return binary_output.count(1)/len(binary_output)

def stem_compute(word):
    '''
    If the stem word account for more than 75% of the original word, it most likely is a name or species name
    '''
    stem_dict = {}
    stem_dict['s_word'] = stemmer.stem(word)
    stem_dict['word_length'] = len(word)
    stem_dict['stem_length'] = len(stem_dict['s_word'])
    stem_dict['ratio'] = stem_dict['stem_length'] / stem_dict['word_length']
    if stem_dict['ratio'] > .75:
        stem_dict['output'] = 1
    else:
        stem_dict['output'] = 0
    return stem_dict

In [18]:
stem_compute('parienus')

{'s_word': 'parienu',
 'word_length': 8,
 'stem_length': 7,
 'ratio': 0.875,
 'output': 1}

In [19]:
tup_taxon_words = [(f, find_features(f),True) for f in uniq_taxon_names[:len(comb_words_sort)]] #bc brown words sort is shorter than uniq_taxon_names
tup_comb_words = [(f, find_features(f), False) for f in comb_words_sort]

In [20]:
tup_taxon_words[:50]

[('guyotensis',
  {'word_length': 10,
   'last_letters': 'is',
   'first_letters': 'gu',
   'lemma': 1.0,
   'stem': 1,
   'convert_chr': 1114},
  True),
 ('ilamica',
  {'word_length': 7,
   'last_letters': 'ca',
   'first_letters': 'il',
   'lemma': 1.0,
   'stem': 1,
   'convert_chr': 720},
  True),
 ('gulestanica',
  {'word_length': 11,
   'last_letters': 'ca',
   'first_letters': 'gu',
   'lemma': 1.0,
   'stem': 1,
   'convert_chr': 1168},
  True),
 ('antigana',
  {'word_length': 8,
   'last_letters': 'na',
   'first_letters': 'an',
   'lemma': 1.0,
   'stem': 1,
   'convert_chr': 835},
  True),
 ('balliana',
  {'word_length': 8,
   'last_letters': 'na',
   'first_letters': 'ba',
   'lemma': 1.0,
   'stem': 1,
   'convert_chr': 820},
  True),
 ('shumardii',
  {'word_length': 9,
   'last_letters': 'ii',
   'first_letters': 'sh',
   'lemma': 1.0,
   'stem': 1,
   'convert_chr': 966},
  True),
 ('sissonii',
  {'word_length': 8,
   'last_letters': 'ii',
   'first_letters': 'si',
   'l

In [21]:
master_tup_set = tup_taxon_words + tup_comb_words
random.shuffle(master_tup_set)

In [22]:
master_tup_set[:100]

[('atrofuscum',
  {'word_length': 10,
   'last_letters': 'um',
   'first_letters': 'at',
   'lemma': 1.0,
   'stem': 1,
   'convert_chr': 1097},
  True),
 ('scaldiana',
  {'word_length': 9,
   'last_letters': 'na',
   'first_letters': 'sc',
   'lemma': 1.0,
   'stem': 1,
   'convert_chr': 928},
  True),
 ('agiles',
  {'word_length': 6,
   'last_letters': 'es',
   'first_letters': 'ag',
   'lemma': 1.0,
   'stem': 0,
   'convert_chr': 629},
  True),
 ('swinging',
  {'word_length': 8,
   'last_letters': 'ng',
   'first_letters': 'sw',
   'lemma': 1.0,
   'stem': 0,
   'convert_chr': 870},
  False),
 ('rend',
  {'word_length': 4,
   'last_letters': 'nd',
   'first_letters': 're',
   'lemma': 1.0,
   'stem': 1,
   'convert_chr': 425},
  False),
 ('dishes',
  {'word_length': 6,
   'last_letters': 'es',
   'first_letters': 'di',
   'lemma': 0.6666666666666666,
   'stem': 0,
   'convert_chr': 640},
  False),
 ('emergency',
  {'word_length': 9,
   'last_letters': 'cy',
   'first_letters': 'em'

In [23]:
len(master_tup_set)

166438

In [24]:
num_set = int(0.7*len(master_tup_set)) #testing set will be 70% of the corpus (dataset)
devt_num_set = int(len(master_tup_set) - (0.2*len(master_tup_set))) #development set will be 20%
training_set = master_tup_set[:num_set]
devtest_set = master_tup_set[num_set:devt_num_set]
testing_set = master_tup_set[(devt_num_set):] #testing will be remaining 10% of datset

In [25]:
len(devtest_set) + len(training_set) + len(testing_set) == len(master_tup_set)

True

In [26]:
len(master_tup_set)

166438

In [27]:
len(devtest_set)

16644

In [28]:
len(training_set)

116506

In [29]:
len(testing_set)

33288

In [30]:
training_set_ = [(k,v) for (n,k,v) in training_set]
devtest_set_ = [(k,v) for (n,k,v) in devtest_set]

### Naive Bayes

In [31]:
NB_classifier = nltk.NaiveBayesClassifier.train(training_set_)

In [32]:
print(nltk.classify.accuracy(NB_classifier, devtest_set_))

0.9257390050468637


In [33]:
NB_classifier.show_most_informative_features(50)

Most Informative Features
            last_letters = 'ed'            False : True   =   1183.2 : 1.0
                   lemma = 0.9090909090909091  False : True   =    285.6 : 1.0
            last_letters = 'ls'            False : True   =    272.2 : 1.0
            last_letters = 'ds'            False : True   =    269.6 : 1.0
            last_letters = 'ly'            False : True   =    260.4 : 1.0
            last_letters = 'st'            False : True   =    254.7 : 1.0
            last_letters = 'nt'            False : True   =    202.3 : 1.0
            last_letters = 'ty'            False : True   =    179.1 : 1.0
            last_letters = 'ss'            False : True   =    172.5 : 1.0
                   lemma = 0.9166666666666666  False : True   =    171.4 : 1.0
            last_letters = 'rs'            False : True   =    166.3 : 1.0
            last_letters = 'ks'            False : True   =    152.0 : 1.0
            last_letters = 'rt'            False : True   =    133

### 10 different algorithms to classify

In [34]:
from sklearn.preprocessing import LabelEncoder
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

#from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [35]:
# Define models to train
names = ["K Nearest Neighbors", #1
         "Decision Tree", #2
         "Random Forest", #3
         "Logistic Regression", #4
         "SGD Classifier",#5
         "Multinomial", #6
         #"SVC Linear", #7 stalls; doesn't converge
         "Bernoulli", #8
         "LinearSVC",#9
         "ComplementNB"]#10
        # "NuSVC"]#11 stalls; doesn't converge

classifiers = [
    KNeighborsClassifier(),#1
    DecisionTreeClassifier(),#2
    RandomForestClassifier(),#3
    LogisticRegression(),#4
    SGDClassifier(),#max_iter = 500),#5
    MultinomialNB(),#6
    #SVC(kernel = 'linear'),#7
    BernoulliNB(),#8
    LinearSVC(),#9
    ComplementNB()
   # NuSVC()#10
]

models = zip(names, classifiers)
classifier_models = {}

for name, model in models:
    nltk_model = SklearnClassifier(model)
    classifier_models[name] = nltk_model.train(training_set_)
    accuracy = nltk.classify.accuracy(nltk_model, devtest_set_)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 84.9014659937515
Decision Tree Accuracy: 92.00312424897861
Random Forest Accuracy: 92.59192501802451


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 90.20067291516463
SGD Classifier Accuracy: 87.49098774333093
Multinomial Accuracy: 91.18000480653689
Bernoulli Accuracy: 90.86157173756308




LinearSVC Accuracy: 57.071617399663545
ComplementNB Accuracy: 91.19202114876231


In [36]:
from nltk.classify import ClassifierI
from statistics import mode, StatisticsError

class VoteClassifier(ClassifierI):
    def __init__(self, classifiers): #list of classifiers
        self._classifiers_dict = classifiers
        
    
    def clf_name(self):
        return list(self._classifiers_dict.keys())
    
    def classify(self, string_features):
        votes = []
        for c in self._classifiers_dict.values(): #the 8 diff algorithms
            v = c.classify(find_features(string_features.lower())) #either True or False
            votes.append(v) 
        try:
            res = mode(votes)
        except StatisticsError as e:
            res = True #if equal number of True's and False, just set to True
        return res #return value (boolean)
    
    def confidence(self, string_features):
        #count how many were in 'True' using the 8 different algorithms
        votes = []
        for c in self._classifiers_dict.values():
            v = c.classify(find_features(string_features.lower()))
            votes.append(v)
        try:
            choice_votes = votes.count(mode(votes)) #count the amt of True's or False's
        except StatisticsError as e:
            choice_votes = len(votes)/2 #should be 4 since half
        conf = choice_votes / len(votes) #how many out of the 8 were True or False
        return conf
        
def vote_clf():
    try:
        voted_classifier = VoteClassifier(classifier_models)
        print('ok')
    except NameError as e:
        print(f'{e} - attempting to load classifiers from pickle file')
        clfs = ['K Nearest Neighbors',
                'Decision Tree',
                'Random Forest',
                'Logistic Regression',
                'SGD Classifier',
                'Multinomial',
                'Bernoulli',
                'LinearSVC',
                'ComplementNB']
        classifiers_models = {}
        for clf_n in clfs:
            with open(f'{clf_n}.pickle', 'rb') as clf_file:
                classifiers_models[clf_n] = pickle.load(clf_file)
        voted_classifier = VoteClassifier(classifiers_models)
    return voted_classifier

In [37]:
voted_classifier = vote_clf()

ok


In [38]:
def check_if_taxon_related(q_word,verbose=False):
    return_result = {}
    return_result['Classification'] = voted_classifier.classify(q_word)
    return_result['Confidence'] = voted_classifier.confidence(q_word)*100
    if verbose:
        print(f"Classification: {return_result['Classification']} Confidence: {return_result['Confidence']}", )
    return return_result['Classification']

In [39]:
def save_classifier(clf,clf_name,par_path):
    save_path = f'{par_path}/{clf_name}.pickle'
    with open(save_path,"wb") as clf_file:
        pickle.dump(clf, clf_file)
    print(f'... saving {save_path}')

In [40]:
#dump pickle algorithms
cwd = os.getcwd()
for k,clf in classifier_models.items():
    save_classifier(clf,k,cwd)

... saving C:\Users\thiag\OneDrive\Documentos\GitHub\sigtic/K Nearest Neighbors.pickle
... saving C:\Users\thiag\OneDrive\Documentos\GitHub\sigtic/Decision Tree.pickle
... saving C:\Users\thiag\OneDrive\Documentos\GitHub\sigtic/Random Forest.pickle
... saving C:\Users\thiag\OneDrive\Documentos\GitHub\sigtic/Logistic Regression.pickle
... saving C:\Users\thiag\OneDrive\Documentos\GitHub\sigtic/SGD Classifier.pickle
... saving C:\Users\thiag\OneDrive\Documentos\GitHub\sigtic/Multinomial.pickle
... saving C:\Users\thiag\OneDrive\Documentos\GitHub\sigtic/Bernoulli.pickle
... saving C:\Users\thiag\OneDrive\Documentos\GitHub\sigtic/LinearSVC.pickle
... saving C:\Users\thiag\OneDrive\Documentos\GitHub\sigtic/ComplementNB.pickle


In [41]:
check_if_taxon_related('alternatives',True)

Classification: False Confidence: 55.55555555555556


False

In [42]:
errors = []
for (name,feat,tag) in devtest_set:
    guess = voted_classifier.classify(name)
    if guess != tag:
        errors.append((tag,guess,name))

In [43]:
for (tag, guess, name) in sorted(errors):
    print(f'correct={tag}\tguess={guess}\tname={name}')

correct=False	guess=True	name=acadia
correct=False	guess=True	name=accelerator
correct=False	guess=True	name=acquah
correct=False	guess=True	name=actualities
correct=False	guess=True	name=ada
correct=False	guess=True	name=adulterous
correct=False	guess=True	name=advertise
correct=False	guess=True	name=aegis
correct=False	guess=True	name=aerates
correct=False	guess=True	name=africa
correct=False	guess=True	name=agenda
correct=False	guess=True	name=agrianalysis
correct=False	guess=True	name=agrippa
correct=False	guess=True	name=ais
correct=False	guess=True	name=alameda
correct=False	guess=True	name=albanians
correct=False	guess=True	name=albicans
correct=False	guess=True	name=alcoma
correct=False	guess=True	name=algae
correct=False	guess=True	name=algeciras
correct=False	guess=True	name=alibi
correct=False	guess=True	name=alienus
correct=False	guess=True	name=allegis
correct=False	guess=True	name=allegretti
correct=False	guess=True	name=aluminio
correct=False	guess=True	name=alundum
corr

### open pdf file for testing

In [44]:
with open('raw_pdfs/np50121a025.pdf_OUTPUT.txt', 'r', encoding="utf8") as f:
    read_output = f.readlines()
read_output = '\n'.join(read_output).strip().replace('\n','')
read_output

'1120 Journal of  Natural Praducts Val. 58, NO. 7, pp.  1120-1 125, July  1995 MYCALOLIDES D AND E, N E W  CYTOTOXIC MACROLIDES FROM A COLLECTION OF THE STONY CORAL TUBASTREA FAULMVERl MOHMIM~D A. RASHID, KIRK R. GUSTAFSON, JOHN H. CARDELLINA 11,  and MICHAEL R. BOYD* Lcrbwatwy of Drug Discwey Resurcb and DeveIopnt, Developmntal Thapeuticrs Program, Division of Cancer Treatment, National Cancer Institute, Building 1052, Rwm 121, Frederick, Maryland 21 702-1201 ABSTRACT.-Fractionation of  a  cytotoxic  extract  of  the  stony  coral  Tubastru faulkneri yielded a series of cytotoxic polyoxazole macrolides and several noncytotoxic indole derivatives. Two new macrolides, mycalolides D 111 and E 121, were isolated and identified, in addition to the known compound  mycalolide C [31. The macrolide structures were elucidated  by  detailed analysis of their spectroscopic data and by comparison with related compounds. Stony  (scleractinian)  corals  in  the genus Tubastreu (Dendrophylliidae) hav

In [45]:
pdf_o = nlp(read_output)

In [46]:
pdf_pt = [(n, i, i.tag_) for n,i in enumerate(pdf_o)]
#part of speech tags

In [47]:
pdf_pt

[(0, 1120, 'CD'),
 (1, Journal, 'NNP'),
 (2, of, 'IN'),
 (3,  , '_SP'),
 (4, Natural, 'NNP'),
 (5, Praducts, 'NNPS'),
 (6, Val, 'NNP'),
 (7, ., '.'),
 (8, 58, 'CD'),
 (9, ,, ','),
 (10, NO, 'NNP'),
 (11, ., 'NNP'),
 (12, 7, 'CD'),
 (13, ,, ','),
 (14, pp, 'NNP'),
 (15, ., '.'),
 (16,  , '_SP'),
 (17, 1120, 'CD'),
 (18, -, 'SYM'),
 (19, 1, 'CD'),
 (20, 125, 'CD'),
 (21, ,, ','),
 (22, July, 'NNP'),
 (23,  , '_SP'),
 (24, 1995, 'CD'),
 (25, MYCALOLIDES, 'NNP'),
 (26, D, 'NNP'),
 (27, AND, 'CC'),
 (28, E, 'NN'),
 (29, ,, ','),
 (30, N, 'CD'),
 (31, E, 'NN'),
 (32, W, 'NN'),
 (33,  , '_SP'),
 (34, CYTOTOXIC, 'NNP'),
 (35, MACROLIDES, 'NNS'),
 (36, FROM, 'IN'),
 (37, A, 'DT'),
 (38, COLLECTION, 'NN'),
 (39, OF, 'IN'),
 (40, THE, 'DT'),
 (41, STONY, 'NNP'),
 (42, CORAL, 'NNP'),
 (43, TUBASTREA, 'JJ'),
 (44, FAULMVERl, 'NNS'),
 (45, MOHMIM, 'NN'),
 (46, ~, 'XX'),
 (47, D, 'NNP'),
 (48, A., 'NN'),
 (49, RASHID, 'NN'),
 (50, ,, ','),
 (51, KIRK, 'NNP'),
 (52, R., 'NNP'),
 (53, GUSTAFSON, 'NNP')

In [48]:
token_list = []
for token in pdf_o:
    token_list.append(token.text)

filtered_pdf =[] 
for word in token_list:
    lexeme = nlp.vocab[word]
    if lexeme.is_stop == False:
        filtered_pdf.append(word) 

In [49]:
filtered_pdf[:100]

['1120',
 'Journal',
 ' ',
 'Natural',
 'Praducts',
 'Val',
 '58',
 ',',
 '7',
 ',',
 'pp',
 ' ',
 '1120',
 '-',
 '1',
 '125',
 ',',
 'July',
 ' ',
 '1995',
 'MYCALOLIDES',
 'D',
 'E',
 ',',
 'N',
 'E',
 'W',
 ' ',
 'CYTOTOXIC',
 'MACROLIDES',
 'COLLECTION',
 'STONY',
 'CORAL',
 'TUBASTREA',
 'FAULMVERl',
 'MOHMIM',
 '~',
 'D',
 'A.',
 'RASHID',
 ',',
 'KIRK',
 'R.',
 'GUSTAFSON',
 ',',
 'JOHN',
 'H.',
 'CARDELLINA',
 '11',
 ',',
 ' ',
 'MICHAEL',
 'R.',
 'BOYD',
 '*',
 'Lcrbwatwy',
 'Drug',
 'Discwey',
 'Resurcb',
 'DeveIopnt',
 ',',
 'Developmntal',
 'Thapeuticrs',
 'Program',
 ',',
 'Division',
 'Cancer',
 'Treatment',
 ',',
 'National',
 'Cancer',
 'Institute',
 ',',
 'Building',
 '1052',
 ',',
 'Rwm',
 '121',
 ',',
 'Frederick',
 ',',
 'Maryland',
 '21',
 '702',
 '-',
 '1201',
 'ABSTRACT.-Fractionation',
 ' ',
 ' ',
 'cytotoxic',
 ' ',
 'extract',
 ' ',
 ' ',
 ' ',
 'stony',
 ' ',
 'coral',
 ' ',
 'Tubastru']

In [50]:
#pt[0].nbor().tag_ == "NN" and --> use this to only get 5 matches but too greedy
try:
    filtered_nnp = []
    for pt in pdf_pt:
        q_str = str(pt[1])
        if pt[2] == "NNP" and '-' not in q_str and len(q_str) > 5:
            #if not re.search('\d+', q_str) and re.search('^!?[A-Z]{1}', q_str):
                print(f"{pt[1]}-{pt[1].nbor()}-{pt[2]}")
                filtered_nnp.append(pt)

except IndexError as e:
    pass #last index doesn't have a neighbour

Journal-of-NNP
Natural-Praducts-NNP
MYCALOLIDES-D-NNP
CYTOTOXIC-MACROLIDES-NNP
GUSTAFSON-,-NNP
CARDELLINA-11-NNP
MICHAEL-R.-NNP
Lcrbwatwy-of-NNP
Discwey-Resurcb-NNP
Resurcb-and-NNP
DeveIopnt-,-NNP
Developmntal-Thapeuticrs-NNP
Thapeuticrs-Program-NNP
Program-,-NNP
Division-of-NNP
Cancer-Treatment-NNP
Treatment-,-NNP
National-Cancer-NNP
Cancer-Institute-NNP
Institute-,-NNP
Building-1052-NNP
Frederick-,-NNP
Maryland-21-NNP
cytotoxic- -NNP
Tubastru-faulkneri-NNP
faulkneri-yielded-NNP
cytotoxic-polyoxazole-NNP
scleractinian-)-NNP
Tubastreu-(-NNP
Dendrophylliidae-)-NNP
Australian- -NNP
Tubastreu-faulkneri-NNP
faulkneri-Wells-NNP
cytotoxic-frac--NNP
Tubastreu-faulkneri-NNP
faulkneri-.-NNP
faulkneri-,-NNP
Barrier-Reef-NNP
Australia-,-NNP
permeation- -NNP
Sephadex-LH-20-NNP
mycalolides-C-NNP
Cytotoxic-Macrolides-NNP
Tubastreu-1121-NNP
C51H72N4016-.-NNP
C5,H,,N40-,-NNP
1700,1652-cm--NNP
ulapualides-(-NNP
Journal-of-NNP
Natural-Products-NNP
Mycalolides-D-NNP
Mycalolide-D-NNP
Mycalolide-E-NNP
Corr

In [51]:
vouch_nsc = re.findall(r'([C|N|Q|F|M]{1}\d{2,}\w+)',' '.join([str(x[1]) for x in filtered_nnp]))
vouch_nsc = [r for r in vouch_nsc if len(r) < 10]
vouch_nsc

['Q66C1269']

In [52]:
filtered_nnp

[(1, Journal, 'NNP'),
 (4, Natural, 'NNP'),
 (25, MYCALOLIDES, 'NNP'),
 (34, CYTOTOXIC, 'NNP'),
 (53, GUSTAFSON, 'NNP'),
 (57, CARDELLINA, 'NNP'),
 (62, MICHAEL, 'NNP'),
 (66, Lcrbwatwy, 'NNP'),
 (69, Discwey, 'NNP'),
 (70, Resurcb, 'NNP'),
 (72, DeveIopnt, 'NNP'),
 (74, Developmntal, 'NNP'),
 (75, Thapeuticrs, 'NNP'),
 (76, Program, 'NNP'),
 (78, Division, 'NNP'),
 (80, Cancer, 'NNP'),
 (81, Treatment, 'NNP'),
 (83, National, 'NNP'),
 (84, Cancer, 'NNP'),
 (85, Institute, 'NNP'),
 (87, Building, 'NNP'),
 (93, Frederick, 'NNP'),
 (95, Maryland, 'NNP'),
 (105, cytotoxic, 'NNP'),
 (117, Tubastru, 'NNP'),
 (118, faulkneri, 'NNP'),
 (123, cytotoxic, 'NNP'),
 (184, scleractinian, 'NNP'),
 (193, Tubastreu, 'NNP'),
 (195, Dendrophylliidae, 'NNP'),
 (258, Australian, 'NNP'),
 (264, Tubastreu, 'NNP'),
 (265, faulkneri, 'NNP'),
 (319, cytotoxic, 'NNP'),
 (374, Tubastreu, 'NNP'),
 (375, faulkneri, 'NNP'),
 (392, faulkneri, 'NNP'),
 (397, Barrier, 'NNP'),
 (400, Australia, 'NNP'),
 (439, permeatio

In [53]:
for i in filtered_nnp:
    print(i[1])

Journal
Natural
MYCALOLIDES
CYTOTOXIC
GUSTAFSON
CARDELLINA
MICHAEL
Lcrbwatwy
Discwey
Resurcb
DeveIopnt
Developmntal
Thapeuticrs
Program
Division
Cancer
Treatment
National
Cancer
Institute
Building
Frederick
Maryland
cytotoxic
Tubastru
faulkneri
cytotoxic
scleractinian
Tubastreu
Dendrophylliidae
Australian
Tubastreu
faulkneri
cytotoxic
Tubastreu
faulkneri
faulkneri
Barrier
Australia
permeation
Sephadex
mycalolides
Cytotoxic
Tubastreu
C51H72N4016
C5,H,,N40
1700,1652
ulapualides
Journal
Natural
Mycalolides
Mycalolide
Mycalolide
Correlations
Rashid
Cytotoxic
Tubastreu
A34331
acetate
methyl
gletsat63.37and3.46
Compound
methine
proton
Halichondria
Mycalolide
C46H62N1013
methyl
Mycalolide
trisoxazole
methine
proton
terminal
Journal
Natural
proton
Mycalolides
Mycalolide
Halicbondria
Jaspis
Mycule
faulkneri
Tubastrea
EXPERIMENTAL
GENERAL
EXPERIMENTAL
Tubastm
fuulkkneri
Salamander
Cleveland
Barrier
northeast
Australia
Murphy
Australian
Institute
Marine
Science
Q66C1269
Queensland
Museum
Brisbane

In [54]:
output_display_text_ = ''
output=[]
cnt = 1
try:
    for i in filtered_nnp:
        #res = classifier.classify(find_features(str(i[1]).lower()))
        string_word = str(i[1])
        res = check_if_taxon_related(string_word)
        if res and not re.search(r'[\()]|\w{3,}[.]\w',string_word):
            output_display_text_ += f' {string_word}'
            output.append(string_word)
            print(f'{cnt}: {string_word.lower()}')
            cnt += 1
except TypeError as e:
    pass

1: mycalolides
2: cardellina
3: tubastru
4: faulkneri
5: tubastreu
6: dendrophylliidae
7: tubastreu
8: faulkneri
9: tubastreu
10: faulkneri
11: faulkneri
12: australia
13: mycalolides
14: tubastreu
15: ulapualides
16: mycalolides
17: tubastreu
18: halichondria
19: mycalolides
20: halicbondria
21: jaspis
22: faulkneri
23: tubastrea
24: fuulkkneri
25: australia
26: museum
27: mycalolides
28: tubastreu
29: melanoma
30: tubastrea
31: sanduja
32: heterocycles
33: fusetani
34: matsunaga
35: guella
36: mancini
37: zibrowius
38: pietra
39: philadelphia
40: cardellina
41: cafieri
42: fattorusso
43: mahajanah
44: mangoni
45: fusetani
46: yasumuro
47: matsunaga
48: matsunaga
49: fusetani
50: koseki
51: matsunaga
52: fusetani
53: koseki
54: noguchi
55: sankawa
56: molinski
57: kobayashi
58: murata
59: shigemori
60: cardellina
61: cardellina


In [55]:
set(output)

{'Australia',
 'CARDELLINA',
 'Cafieri',
 'Cardellina',
 'Dendrophylliidae',
 'Fattorusso',
 'Fusetani',
 'Guella',
 'Halicbondria',
 'Halichondria',
 'Heterocycles',
 'Jaspis',
 'Kobayashi',
 'Koseki',
 'MYCALOLIDES',
 'Mahajanah',
 'Mancini',
 'Mangoni',
 'Matsunaga',
 'Melanoma',
 'Molinski',
 'Murata',
 'Museum',
 'Mycalolides',
 'Noguchi',
 'Philadelphia',
 'Pietra',
 'Sanduja',
 'Sankawa',
 'Shigemori',
 'Tubastrea',
 'Tubastreu',
 'Tubastru',
 'Yasumuro',
 'Zibrowius',
 'faulkneri',
 'fuulkkneri',
 'mycalolides',
 'ulapualides'}

In [57]:
# nlp = spacy.load("en_core_sci_sm")
# text = """
# Myeloid derived suppressor cells (MDSC) are immature 
# myeloid cells with immunosuppressive activity. 
# They accumulate in tumor-bearing mice and humans 
# with different types of cancer, including hepatocellular 
# carcinoma (HCC).
# """
# doc = nlp(text)

In [58]:
output_display_text_.strip()

'MYCALOLIDES CARDELLINA Tubastru faulkneri Tubastreu Dendrophylliidae Tubastreu faulkneri Tubastreu faulkneri faulkneri Australia mycalolides Tubastreu ulapualides Mycalolides Tubastreu Halichondria Mycalolides Halicbondria Jaspis faulkneri Tubastrea fuulkkneri Australia Museum mycalolides Tubastreu Melanoma Tubastrea Sanduja Heterocycles Fusetani Matsunaga Guella Mancini Zibrowius Pietra Philadelphia Cardellina Cafieri Fattorusso Mahajanah Mangoni Fusetani Yasumuro Matsunaga Matsunaga Fusetani Koseki Matsunaga Fusetani Koseki Noguchi Sankawa Molinski Kobayashi Murata Shigemori Cardellina Cardellina'

In [59]:
displacy.render(nlp(output_display_text_), style='ent')

In [60]:
#entry recognition
er_output = [(i, i.label_, i.label) for i in pdf_o.ents if i.label_ in ['GPE', 'PERSON', 'ORG', 'NORP']]
er_output

[(Journal of  Natural Praducts Val, 'ORG', 383),
 (D A. RASHID, 'PERSON', 380),
 (KIRK R. GUSTAFSON, 'PERSON', 380),
 (JOHN H. CARDELLINA, 'PERSON', 380),
 (MICHAEL R. BOYD, 'PERSON', 380),
 (Discwey Resurcb, 'PERSON', 380),
 (DeveIopnt, 'ORG', 383),
 (Developmntal Thapeuticrs Program, 'ORG', 383),
 (Division of Cancer Treatment, 'ORG', 383),
 (National Cancer Institute, 'ORG', 383),
 (Frederick, 'GPE', 384),
 (Maryland, 'GPE', 384),
 (Tubastru, 'PERSON', 380),
 (Australian, 'NORP', 381),
 (Tubastreu, 'PERSON', 380),
 (Wells, 'ORG', 383),
 (NCI, 'ORG', 383),
 (Bioassay, 'PERSON', 380),
 (Tubastreu, 'ORG', 383),
 (CC1,-, 'ORG', 383),
 (CC1,-, 'ORG', 383),
 (CHC1, 'ORG', 383),
 (vlc, 'ORG', 383),
 (hplc, 'PERSON', 380),
 (hplc, 'GPE', 384),
 (hplc, 'GPE', 384),
 (Cytotoxic Macrolides, 'PERSON', 380),
 (Journal of Natural Products, 'ORG', 383),
 (Mycalolides D 111, 'ORG', 383),
 (Me-27, 'GPE', 384),
 (c-5 c-3, 'ORG', 383),
 (c-4, 'ORG', 383),
 (C-9, 'ORG', 383),
 (C-9-Me, 'ORG', 383),
 (6

In [61]:
cnt = 1
output_display_text = ''
output2=[]
for i in er_output:
    try:
        #res = classifier.classify(find_features(string.lower()))
        string_word = str(i[0]).lower()
        res = check_if_taxon_related(string_word)
        if res and not re.search('^\w{1,3}[-]\d+', string_word):
            if not re.search('^\d+',string_word):
                print(f'{cnt}: {string_word}')
                output_display_text += f' {string_word.strip()}'
                output2.append(string_word.strip())
                cnt+=1
    except TypeError as e:
        pass

1: john h. cardellina
2: developmntal thapeuticrs program
3: tubastru
4: tubastreu
5: tubastreu
6: cytotoxic macrolides
7: halichondria
8: halicbondria
9: jaspis
10: t. faulkneri
11: tubastrea
12: australia
13: the queensland museum
14: u  max
15: lox imvi
16: niddk
17: r.  sakai
18: heterocycles
19: n. fusetani
20: s.  matsunaga
21: g. guella
22: i. mancini
23: h. zibrowius
24: f. pietra
25: acta
26: philadelphia
27: c. cafieri
28: a.  mangoni
29: n. fusetani
30: s. matsunaga
31: n. fusetani
32: k. koseki
33: m. noma
34: s.  matsunaga
35: n. fusetani
36: k. koseki
37: m. noma
38: h. noguchi
39: kobayashi
40: murata
41: j.-r. dai


In [62]:
output_display_text.strip()

'john h. cardellina developmntal thapeuticrs program tubastru tubastreu tubastreu cytotoxic macrolides halichondria halicbondria jaspis t. faulkneri tubastrea australia the queensland museum u  max lox imvi niddk r.  sakai heterocycles n. fusetani s.  matsunaga g. guella i. mancini h. zibrowius f. pietra acta philadelphia c. cafieri a.  mangoni n. fusetani s. matsunaga n. fusetani k. koseki m. noma s.  matsunaga n. fusetani k. koseki m. noma h. noguchi kobayashi murata j.-r. dai'

In [63]:
displacy.render(nlp(output_display_text), style='ent')