# 4. Named Entity Recognition

In [1]:
%run __init__.py

In [2]:
import pandas as pd

# Location of the pickled dataframe inside the results directory of notebook 1.
# NOTE(review): `os` and NOTEBOOK_1_RESULTS_DIR are not defined in this cell —
# presumably they are injected by the `%run __init__.py` cell above; confirm.
GIT_FILE_PATH = os.path.join(NOTEBOOK_1_RESULTS_DIR, 'git_dataframe.pkl')

# NOTE: unpickling can execute arbitrary code, so only load files you trust.
git_df = pd.read_pickle(GIT_FILE_PATH)



In [3]:
# Extract the 'full_text_cleaned' column as an array with one entry per row.
git_repositories = git_df['full_text_cleaned'].values

In [4]:
# Upper bound on the number of characters kept from each repository text.
MAX_CHAR_LENGTH = 100000

# Truncate every text to that bound; the result is a plain Python list.
git_repositories = [r[:MAX_CHAR_LENGTH] for r in git_repositories]

In [17]:
# Display one sample text (index 21); the same entry is used for the NER demo below.
git_repositories[21]

'Translate neo4j query output to YAML file suitable for importing to NCATS reasoner. neo4j-to-reasoner. Translate neo4j query output to YAML file suitable for importing to NCATS reasoner Authentication NOTES README cypher to reasoner driver q1 driver q2 import command lowercase nodes orphadata2cui chlorcyclizine asthma 10000 imatinib asthma full2 neo4j output snippet print edges q1 neo4j to reasoner q2 neo4j to reasoner reset database q1 disease list cui q2 drugandcondition list cui search umls sum pmids data index q1 results q2 results'

In [10]:
from collections import Counter

from sklearn.base import TransformerMixin, BaseEstimator
from spacy import displacy


class NamedEntityRecognizer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that extracts named entities with spaCy.

    Parameters
    ----------
    spacy_model : object exposing ``load()``
        ``spacy_model.load()`` is called once in the constructor and the
        returned pipeline is kept as ``self.nlp``.
    disable : iterable of str, optional
        Entity labels (``ent.label_``) whose entities are dropped from the
        results. Defaults to an empty list.
    min_entity_counts : int, optional
        Minimum number of occurrences an entity text needs within a document
        to be kept by ``transform``.
    max_entities : int, optional
        Maximum number of distinct entity texts kept per document by
        ``transform`` (the most frequent ones are kept).

    Notes
    -----
    NOTE(review): ``spacy_model`` itself is not stored on the instance, which
    scikit-learn's ``get_params()``/``clone()`` normally expect for every
    constructor argument — confirm before using this estimator in code that
    clones it (e.g. a grid search).
    """

    def __init__(self, spacy_model, disable=None, min_entity_counts=None,
                 max_entities=None):
        self.nlp = spacy_model.load()
        self.disable = disable if disable is not None else []
        self.min_entity_counts = min_entity_counts
        self.max_entities = max_entities

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X, *args, **kwargs):
        """Return, for every text in ``X``, the list of its entity texts.

        When neither ``min_entity_counts`` nor ``max_entities`` is set, every
        entity occurrence is returned in document order (duplicates kept).
        Otherwise the entities of each document are counted, limited to the
        ``max_entities`` most frequent ones, filtered by ``min_entity_counts``
        and returned as distinct texts ordered from most to least frequent.
        """
        entities_texts = [self.get_entities(text) for text in X]
        # Only skip the counting step when no filter at all was requested;
        # checking min_entity_counts alone would silently ignore max_entities.
        if self.min_entity_counts is None and self.max_entities is None:
            return entities_texts

        # Without an explicit threshold every counted entity passes.
        min_count = (0 if self.min_entity_counts is None
                     else self.min_entity_counts)
        return [[entity_label
                 for entity_label, entity_count
                 in Counter(entities_text).most_common(self.max_entities)
                 if entity_count >= min_count]
                for entities_text in entities_texts]

    def get_entities(self, text):
        """Return the texts of the entities spaCy finds in ``text``.

        Entities whose label is listed in ``self.disable`` and entities whose
        text has two characters or fewer are left out.
        """
        doc = self.nlp(text)
        return [x.text for x in doc.ents
                if x.label_ not in self.disable
                and len(x.text) > 2]

    def get_most_common_entities(self, text, n=10):
        """Return the ``n`` most frequent ``(entity_text, count)`` pairs."""
        entities = self.get_entities(text)
        return Counter(entities).most_common(n)

    def visualize_entities(self, text, jupyter=True):
        """Render the entities of ``text`` with displaCy's ``'ent'`` style.

        The value returned by ``displacy.render`` is passed back to the caller
        so that the generated markup is not lost when ``jupyter=False``.
        """
        doc = self.nlp(text)
        return displacy.render(doc, jupyter=jupyter, style='ent')

In [11]:
import en_core_web_md

# First recognizer: the constructor calls en_core_web_md.load() to build its pipeline.
ner = NamedEntityRecognizer(en_core_web_md)

In [12]:
import en_core_sci_lg

# Second recognizer, built the same way from en_core_sci_lg
# (presumably the scispaCy scientific-text model — confirm).
ner_2 = NamedEntityRecognizer(en_core_sci_lg)

In [18]:
# Sample text used to compare both recognizers.
text = git_repositories[21]

# transform takes an iterable of texts, hence the single-element list.
ents = ner.transform([text])
ents

[['NCATS', 'NCATS', 'Authentication', 'orphadata2cui', '10000', 'umls']]

In [19]:
# Render the entities found by the first recognizer inline in the notebook.
ner.visualize_entities(text)

In [20]:
# Same sample text through the second recognizer, for comparison.
ner_2.transform([text])

[['Translate',
  'YAML',
  'importing',
  'NCATS reasoner',
  'Translate neo4j query',
  'YAML',
  'importing',
  'NCATS reasoner Authentication NOTES README cypher',
  'reasoner',
  'lowercase nodes',
  'chlorcyclizine asthma',
  'imatinib',
  'asthma',
  'full2']]

In [21]:
# Render the entities found by the second recognizer inline in the notebook.
ner_2.visualize_entities(text)

## Saving results
Finally, we save the configured named entity recognizer instance (`ner_2`, built on the `en_core_sci_lg` model) so that it can be reused in the following phases.

In [15]:
from herc_common.utils import save_object

# Store the second recognizer (ner_2) in this notebook's results directory.
# NOTE(review): `os` and NOTEBOOK_4_RESULTS_DIR are not defined in this cell —
# presumably provided by the `%run __init__.py` setup cell; confirm.
output_filename = "ner_system.pkl"
save_object(ner_2, os.path.join(NOTEBOOK_4_RESULTS_DIR, output_filename))