## NLTK NE Chunker
In this notebook we explore NLTK ne_chunk for Named Entity Recognition. We go on to implement our own Music Instruments recognizer.

In [1]:
from nltk import word_tokenize, pos_tag, ne_chunk
import re

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Test a sentence
sentence = "Barack stands at the Oval inside White House"

In [3]:
# NLTK ne_chunk works on part-of-speech tags
chunked = ne_chunk(pos_tag(word_tokenize(sentence)))
print(chunked)

(S
  (GPE Barack/NNP)
  stands/VBZ
  at/IN
  the/DT
  (FACILITY Oval/NNP)
  inside/IN
  (FACILITY White/NNP House/NNP))


In [4]:
# conll tags can be generated directly from the ne_tree returned by ne_chunk. we may use this functionality in future.
from nltk.chunk import conlltags2tree, tree2conlltags
ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))
iob_tagged = tree2conlltags(ne_tree)

In [5]:
print(iob_tagged)

[('Barack', 'NNP', 'B-GPE'), ('stands', 'VBZ', 'O'), ('at', 'IN', 'O'), ('the', 'DT', 'O'), ('Oval', 'NNP', 'B-FACILITY'), ('inside', 'IN', 'O'), ('White', 'NNP', 'B-FACILITY'), ('House', 'NNP', 'I-FACILITY')]


In [6]:
ne_tree = conlltags2tree(iob_tagged)
print(ne_tree)

(S
  (GPE Barack/NNP)
  stands/VBZ
  at/IN
  the/DT
  (FACILITY Oval/NNP)
  inside/IN
  (FACILITY White/NNP House/NNP))


## Train custom NER 

In [7]:
# load the wiki extracted data
parent_dir = '/Users/saurabh/workspace/datasets/wikimusic/'

# load instruments_line.csv
mi_f_name = parent_dir + "instruments_line1.csv"
mi_df = pd.read_csv(mi_f_name, delimiter='|')
mi_df.head(2)
mi_df.count()

title    604
line1    604
dtype: int64

In [8]:
# unused
# adapted from https://nlpforhackers.io/named-entity-extraction/
# to created conll type tags for our custom tag "MUSIC"
def to_conll_iob(annotated_sentence):
    """
    `annotated_sentence` = list of triplets [(w1, t1, iob1), ...]
    Transform a pseudo-IOB notation: O, PERSON, PERSON, O, O, LOCATION, O
    to proper IOB notation: O, B-PERSON, I-PERSON, O, O, B-LOCATION, O
    """
    proper_iob_tokens = []
    for idx, annotated_token in enumerate(annotated_sentence):
        (tag, word), ner = annotated_token
 
        if ner != 'O':
            if idx == 0:
                ner = "B-" + ner
            elif annotated_sentence[idx - 1][2] == ner:
                ner = "I-" + ner
            else:
                ner = "B-" + ner
        proper_iob_tokens.append(((tag, word), ner))
    return proper_iob_tokens

In [9]:
# for annotating wiki extracted data in a format that ne_chunk would understand
def annotate_data(row):
    retval = []
    title, line1 = row
    title_tkns = word_tokenize(title.lower())
    tkns = word_tokenize(line1)
    indices = []
    for idx, tkn in enumerate(tkns):
        if tkn.lower() in title_tkns:
            indices.append(idx)
    tagged = pos_tag(tkns)
    for idx, tagged_tokens in enumerate(tagged):
        if idx not in indices:
            retval.append((tagged_tokens, 'O'))
        else:
            retval.append((tagged_tokens, 'MUSIC'))
    return retval

In [10]:
# pos tagging
mi_df["annotated"] = mi_df.apply(annotate_data, axis=1)
mi_df.head(2)

Unnamed: 0,title,line1,annotated
0,Agung a tamlang,The Agung a Tamlang is a type of Philippine sl...,"[((The, DT), O), ((Agung, NNP), MUSIC), ((a, D..."
1,Slit drum,A slit drum is a hollow percussion instrument,"[((A, DT), O), ((slit, NN), MUSIC), ((drum, NN..."


In [11]:
# mi_df["annotated"] = mi_df["annotated"].apply(to_conll_iob)
# mi_df.head(2)

In [12]:
# split the data into train and test set
msk = np.random.rand(len(mi_df)) < 0.8
train = mi_df[msk]
test = mi_df[~msk]
print(len(train))
print(len(test))

489
115


In [13]:
# adapted from https://nlpforhackers.io/named-entity-extraction/
# This is the custom trainer built on top of ChunkParserI provided by NLTK
import pickle
from collections import Iterable
from nltk.tag import ClassifierBasedTagger
from nltk.chunk import ChunkParserI
 
 
class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)
 
        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=features,
            **kwargs)
 
    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
        return chunks

In [14]:
# adapted from https://nlpforhackers.io/named-entity-extraction/
# The features that are extracted by custom trainer
import string
from nltk.stem.snowball import SnowballStemmer
 
 
def features(tokens, index, history):
    """
    `tokens`  = a POS-tagged sentence [(w1, t1), ...]
    `index`   = the index of the token we want to extract features for
    `history` = the previous predicted IOB tags
    """
 
    # init the stemmer
    stemmer = SnowballStemmer('english')
 
    # Pad the sequence with placeholders
    tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
    history = ['[START2]', '[START1]'] + list(history)
 
    # shift the index with 2, to accommodate the padding
    index += 2
 
    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]
    previob = history[index - 1]
    contains_dash = '-' in word
    contains_dot = '.' in word
    allascii = all([True for c in word if c in string.ascii_lowercase])
 
    allcaps = word == word.capitalize()
    capitalized = word[0] in string.ascii_uppercase
 
    prevallcaps = prevword == prevword.capitalize()
    prevcapitalized = prevword[0] in string.ascii_uppercase
 
    nextallcaps = prevword == prevword.capitalize()
    nextcapitalized = prevword[0] in string.ascii_uppercase
 
    return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
        'all-ascii': allascii,
 
        'next-word': nextword,
        'next-lemma': stemmer.stem(nextword),
        'next-pos': nextpos,
 
        'next-next-word': nextnextword,
        'nextnextpos': nextnextpos,
 
        'prev-word': prevword,
        'prev-lemma': stemmer.stem(prevword),
        'prev-pos': prevpos,
 
        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,
 
        'prev-iob': previob,
 
        'contains-dash': contains_dash,
        'contains-dot': contains_dot,
 
        'all-caps': allcaps,
        'capitalized': capitalized,
 
        'prev-all-caps': prevallcaps,
        'prev-capitalized': prevcapitalized,
 
        'next-all-caps': nextallcaps,
        'next-capitalized': nextcapitalized,
    }

In [15]:
# train cutome NER
chunker = NamedEntityChunker(train["annotated"].values.tolist())

In [16]:
# test one sentence
chunker.parse(pos_tag(word_tokenize("Is it not a slit drum?")))

[(('Is', 'VBZ'), 'O'),
 (('it', 'PRP'), 'MUSIC'),
 (('not', 'RB'), 'O'),
 (('a', 'DT'), 'O'),
 (('slit', 'NN'), 'MUSIC'),
 (('drum', 'NN'), 'MUSIC'),
 (('?', '.'), 'O')]

In [17]:
test_truth = test["annotated"].values.tolist()
test_pred = [chunker.parse(pos_tag(word_tokenize(sent))) for sent in test["line1"].values.tolist()]

In [18]:
# compute accuracy of custom NER Tagger
count = 0
for i in range(len(test)):
    tags = test_pred[i]
    true_tags = test_truth[i]
    accurate = True
    for tag, true_tag in zip(tags, true_tags):
        if true_tag[1] == "MUSIC" and tag[1] != "MUSIC":
            accurate = False
    if accurate:
        count += 1
print(count)
print(count/len(test))

88
0.7652173913043478


## Relationship Extraction
under construction

In [19]:
from nltk.sem.relextract import tree2semi_rel, semi_rel2reldict

In [20]:
pairs = tree2semi_rel(chunked)
print(pairs)

[[[], Tree('GPE', [('Barack', 'NNP')])], [[('stands', 'VBZ'), ('at', 'IN'), ('the', 'DT')], Tree('FACILITY', [('Oval', 'NNP')])], [[('inside', 'IN')], Tree('FACILITY', [('White', 'NNP'), ('House', 'NNP')])]]


In [21]:
reldicts = semi_rel2reldict(pairs)
print(reldicts)

[defaultdict(<class 'str'>, {'lcon': '', 'subjclass': 'GPE', 'subjtext': 'Barack/NNP', 'subjsym': 'barack', 'filler': 'stands/VBZ at/IN the/DT', 'untagged_filler': 'stands at the', 'objclass': 'FACILITY', 'objtext': 'Oval/NNP', 'objsym': 'oval', 'rcon': 'inside/IN'})]


In [22]:
from nltk.sem.relextract import rtuple

In [23]:
pattern = re.compile(r'.*\bat\b.*')
relfilter = lambda x: (x['subjclass'] == subjclass and
                       len(x['filler'].split()) <= window and
                       pattern.match(x['filler']) and
                       x['objclass'] == objclass)

In [24]:
subjclass = 'GPE'
objclass = 'FACILITY'
window = 5
rels = list(filter(relfilter, reldicts))

In [25]:
print(rels)

[defaultdict(<class 'str'>, {'lcon': '', 'subjclass': 'GPE', 'subjtext': 'Barack/NNP', 'subjsym': 'barack', 'filler': 'stands/VBZ at/IN the/DT', 'untagged_filler': 'stands at the', 'objclass': 'FACILITY', 'objtext': 'Oval/NNP', 'objsym': 'oval', 'rcon': 'inside/IN'})]


In [26]:
for rel in rels:
    print(rtuple(rel))

[GPE: 'Barack/NNP'] 'stands/VBZ at/IN the/DT' [FACILITY: 'Oval/NNP']
