# 4. Named Entity Recognition

In [1]:
%run __init__.py

In [3]:
import os
import pandas as pd



## Agriculture

In [4]:
# Paths for the agriculture dataset. DATA_DIR is not defined in this cell;
# presumably it is provided by the `%run __init__.py` setup -- TODO confirm.
AGRICULTURE_DATASET_DIR = os.path.join(DATA_DIR, 'agriculture')
PMC_FILE_PATH = os.path.join(
    AGRICULTURE_DATASET_DIR,
    'pmc_dataframe.pkl',
)

# Load the pickled frame, then pull the 'text_cleaned' column out as an array
# so individual publications can be addressed by position.
pmc_df = pd.read_pickle(PMC_FILE_PATH)
publications = pmc_df.loc[:, 'text_cleaned'].values

In [38]:
from collections import Counter

import en_core_sci_md
import spacy
from spacy import displacy

# Build the pipeline once from the installed model package; every later cell
# reuses this single `nlp` object.
nlp = en_core_sci_md.load()

In [31]:
from pprint import pprint

# Entity labels (numeric / temporal kinds) that should be dropped from results.
disallowed_types = ['CARDINAL', 'DATE', 'MONEY', 'ORDINAL', 'PERCENT', 'QUANTITY', 'TIME']

# Run the pipeline on the last publication only.
doc = nlp(publications[-1])

# Collect (text, label) pairs for every entity whose text is longer than two
# characters and whose label is not in the disallowed list.
ents = [
    (span.text, span.label_)
    for span in doc.ents
    if len(span.text) > 2 and span.label_ not in disallowed_types
]

# Preview the first ten pairs.
ents[:10]

[('Introduction Ectomycorrhizal fungi', 'ENTITY'),
 ('live', 'ENTITY'),
 ('symbiosis', 'ENTITY'),
 ('tree', 'ENTITY'),
 ('shrubs', 'ENTITY'),
 ('forest functioning', 'ENTITY'),
 ('biogeochemical cycles 1', 'ENTITY'),
 ('boreal', 'ENTITY'),
 ('forests', 'ENTITY'),
 ('carbon', 'ENTITY')]

In [37]:
# Same filter as used for `ents`, but keeping the entity span objects themselves
# rather than (text, label) tuples.
[
    span
    for span in doc.ents
    if len(span.text) > 2 and span.label_ not in disallowed_types
]

[Introduction Ectomycorrhizal fungi,
 live,
 symbiosis,
 tree,
 shrubs,
 forest functioning,
 biogeochemical cycles 1,
 boreal,
 forests,
 carbon,
 stored,
 soil,
 roots,
 root-associated microorganisms,
 ectomycorrhizal fungi 2,
 forest ecosystems,
 ectomycorrhizal trees,
 implanted,
 agroforestry,
 ecosystems,
 orchards,
 non-wood products,
 edible fungi,
 inoculation,
 tree,
 seedlings,
 ectomycorrhizal fungi,
 nurseries,
 controlled mycorrhization,
 years,
 technique,
 grow truffles,
 True truffles,
 Tuber spp,
 ectomycorrhizal,
 Ascomycetes,
 hypogeous fruiting bodies,
 genus Tuber,
 temperate areas,
 species,
 species,
 Europe,
 organoleptic,
 properties,
 social value,
 species,
 Tuber melanosporum Vittad,
 Périgord black,
 Tuber magnatum Pico,
 white Italian,
 truffle,
 Tuber aestivum Vittad,
 summer,
 Burgundy truffle,
 commercialization,
 seedlings,
 inoculated,
 aestivum,
 melanosporum,
 progress,
 improve,
 quality,
 inoculated plants 3,
 melanosporum plantations,
 Mediterr

In [32]:
# Each item of `ents` is a (text, label) pair, so position 0 is the entity's
# surface text -- this cell counts how often each distinct text occurs.
entity_texts = [text for text, _label in ents]

# Backward-compatible alias: this list was historically bound to `labels` even
# though it holds entity texts. Prefer `entity_texts` in new code.
labels = entity_texts

Counter(entity_texts)

Counter({'Introduction Ectomycorrhizal fungi': 1,
         'live': 1,
         'symbiosis': 1,
         'tree': 28,
         'shrubs': 1,
         'forest functioning': 1,
         'biogeochemical cycles 1': 1,
         'boreal': 1,
         'forests': 1,
         'carbon': 2,
         'stored': 3,
         'soil': 61,
         'roots': 2,
         'root-associated microorganisms': 1,
         'ectomycorrhizal fungi 2': 1,
         'forest ecosystems': 1,
         'ectomycorrhizal trees': 1,
         'implanted': 6,
         'agroforestry': 1,
         'ecosystems': 1,
         'orchards': 4,
         'non-wood products': 1,
         'edible fungi': 1,
         'inoculation': 1,
         'seedlings': 5,
         'ectomycorrhizal fungi': 3,
         'nurseries': 1,
         'controlled mycorrhization': 1,
         'years': 8,
         'technique': 1,
         'grow truffles': 1,
         'True truffles': 1,
         'Tuber spp': 2,
         'ectomycorrhizal': 2,
         'Ascomycetes': 

In [33]:
# Each item of `ents` is a (text, label) pair, so position 1 is the entity
# label -- this cell reports the three most frequent labels.
entity_labels = [label for _text, label in ents]

# Backward-compatible alias: this list was historically bound to `texts` even
# though it holds entity labels. Prefer `entity_labels` in new code.
texts = entity_labels

Counter(entity_labels).most_common(3)

[('ENTITY', 1426)]

In [34]:
# Render entity highlighting inline for the first 500 positions of `doc` only,
# so the visualisation stays short.
displacy.render(
    doc[:500],
    style='ent',
    jupyter=True,
)

In [None]:
class NamedEntityRecognizer():
    def __init__(self, spacy_model):
        self.nlp = spacy_model.load()
    
    def get_entities()