## NLTK and StanforNLP

Nltk has built in class for Named Entity Recognition for Stanford NER Tagger. For this purpose first we need to download Stanford Named Entity recognizer from https://nlp.stanford.edu/software/CRF-NER.shtml. We will demo using english language NER and then go on to built our own custom Music Instruments Recognizer.

In [1]:
import nltk
from nltk.tag.stanford import StanfordNERTagger
from nltk import ngrams
from nltk.tokenize import WordPunctTokenizer
import re

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
# load nltk corpus stopwords. if not present, download it.
try:
    from nltk.corpus import stopwords
    stopset = set(stopwords.words('english'))
except:
    nltk.download('stopwords')
    stopset = set(stopwords.words('english'))

In [3]:
# create ner tagger based on stanford ner tagger
jar = './stanford-ner-tagger/stanford-ner.jar'
model = './stanford-ner-tagger/english.all.3class.distsim.crf.ser.gz'
ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

In [92]:
# Test a sentence
sentence = "Barack Obama stands at the Oval, addressing the nation."
words = nltk.word_tokenize(sentence)
print(ner_tagger.tag(words))

[('Barack', 'PERSON'), ('Obama', 'PERSON'), ('stands', 'O'), ('at', 'O'), ('the', 'O'), ('Oval', 'LOCATION'), (',', 'O'), ('addressing', 'O'), ('the', 'O'), ('nation', 'O'), ('.', 'O')]


## Train Music Instruments NER
Load the music data as scraped from Wikipedia page of list of Instruments. Use that to create custom NER tagged ("MUSIC") data. We use this data to train Stanford NER.

In [6]:
parent_dir = '/Users/saurabh/workspace/datasets/wikimusic/'

In [7]:
# load instruments_line.csv
mi_f_name = parent_dir + "instruments_line1.csv"
mi_df = pd.read_csv(mi_f_name, delimiter='|')
mi_df.head(2)
mi_df.count()

title    604
line1    604
dtype: int64

In [85]:
# split the data into train and test set
msk = np.random.rand(len(tagged_data)) < 0.8
train = mi_df[msk]
test = mi_df[~msk]
print(len(train))
print(len(test))

491
113


In [8]:
# regex to match any title such as 'Clave (rhythm)' to remove the bracket and the words inside.
in_brackets = r'\([^)]*\)'

In [9]:
# lambda function to to get a flattened list from list of lists.
flatten = lambda l: [item for sublist in l for item in sublist]

In [19]:
# create combinations of title, such as, 'agung a tamlang' should also encompass its synonyms or 'agung' and 'tamlang'. This method is still under creation.
def title_combinations(title):
    title = title.lower()
    title = re.sub(in_brackets, '', title).strip()
    nn_grams = [title]
    t_split = title.split(r'\s+')
    # hack to handle ngrams generated that start with stopwords
    if any(word in stopset for word in t_split):
        return nn_grams
    for i in range(2, len(title)-1):
        the_grams = ngrams(t_split, i)
        str_grams = [" ".join(words) for words in the_grams]
        nn_grams.extend(str_grams)
    return list(set(nn_grams))

In [20]:
# test title combinations
t = "aung a lung (musical instrument)"
title_combinations(t)

['aung a lung']

In [12]:
wtkz = WordPunctTokenizer()

In [29]:
def append_to_data(text, ret_data, tag=None, use_ner=False):
    tkns = wtkz.tokenize(text)
    if tag is None:
        if use_ner:
            ret_data.extend(ner_tagger.tag(tkns))
        else:
            for tkn in tkns:
                ret_data.append((tkn, 'O'))
    else:
        for tkn in tkns:
            ret_data.append((tkn, tag))
    return ret_data

In [30]:
print(append_to_data("A slit drum is a hollow percussion instrument", []))
print(append_to_data("aung a lung", [], "MUSIC"))

[('A', 'O'), ('slit', 'O'), ('drum', 'O'), ('is', 'O'), ('a', 'O'), ('hollow', 'O'), ('percussion', 'O'), ('instrument', 'O')]
[('aung', 'MUSIC'), ('a', 'MUSIC'), ('lung', 'MUSIC')]


In [43]:
def split_and_mark(row, use_ner=False):
    ret_data = []
    title, line1 = row
    lline1 = line1.lower()
    ltitle = title.lower()
    ltitle_combs = title_combinations(title)
    idx = line1.lower().find(title.lower())
    if idx == -1:
        ret_data = append_to_data(line1, ret_data)
    else:
        end_idx = idx+len(title)
        sd = line1[:idx]
        ret_data = append_to_data(sd, ret_data, use_ner=use_ner)
        match = line1[idx:end_idx]
        ret_data = append_to_data(match, ret_data, tag="MUSIC")
        sd = line1[end_idx:]
        ret_data = append_to_data(sd, ret_data, use_ner=use_ner)
    return ret_data

In [44]:
split_and_mark(("slit drum", "A slit drum is a hollow percussion instrument"))

[('A', 'O'),
 ('slit', 'MUSIC'),
 ('drum', 'MUSIC'),
 ('is', 'O'),
 ('a', 'O'),
 ('hollow', 'O'),
 ('percussion', 'O'),
 ('instrument', 'O')]

In [86]:
tagged_data = train.apply(split_and_mark, axis=1)
tagged_data.count()

491

In [55]:
# all_sents = mi_df.loc[:, "line1"].values.tolist()
# tkzd_sents = [wtkz.tokenize(sent) for sent in all_sents]
# tagged_sents = ner_tagger.tag_sents(tkzd_sents)

In [78]:
write_data = ''
for dlist in train:
    for dtup in dlist:
        write_data += dtup[0] + '\t' + dtup[1] + '\n'
    write_data += '\n'

with open("./stanford-ner-tagger/train/mi_tagged_split.txt", "w") as f:
    f.write(write_data)

In [63]:
# !cd stanford-ner-tagger/
# !java -cp "stanford-ner.jar:lib/*" -mx4g edu.stanford.nlp.ie.crf.CRFClassifier -prop train/prop.txt

In [79]:
# load custom ner model
m_jar = './stanford-ner-tagger/stanford-ner.jar'
m_model = './stanford-ner-tagger/dummy-ner-model-music.ser.gz'
m_ner_tagger = StanfordNERTagger(m_model, m_jar, encoding='utf8')

In [80]:
# test custom NER model.
sentence = "Shiva's damru is an instrument that is hollow located at New York."
words = wtkz.tokenize(sentence)
print(m_ner_tagger.tag(words))

[('Shiva', 'MUSIC'), ("'", 'MUSIC'), ('s', 'O'), ('damru', 'O'), ('is', 'O'), ('an', 'O'), ('instrument', 'O'), ('that', 'O'), ('is', 'O'), ('hollow', 'O'), ('located', 'O'), ('at', 'O'), ('New', 'O'), ('York', 'O'), ('.', 'O')]


In [89]:
# custom NER tagged test data
all_sents = test.loc[:, "line1"].values.tolist()
tkzd_sents = [wtkz.tokenize(sent) for sent in all_sents]
tagged_sents = m_ner_tagger.tag_sents(tkzd_sents)

In [91]:
tagged_sents[:1]

[[('A', 'O'),
  ('wood', 'MUSIC'),
  ('block', 'MUSIC'),
  ('(', 'O'),
  ('also', 'O'),
  ('spelled', 'O'),
  ('as', 'O'),
  ('a', 'O'),
  ('single', 'O'),
  ('word', 'O'),
  (',', 'O'),
  ('woodblock', 'O'),
  (')', 'O'),
  ('is', 'O'),
  ('a', 'O'),
  ('small', 'O'),
  ('slit', 'O'),
  ('drum', 'O'),
  ('made', 'O'),
  ('from', 'O'),
  ('a', 'O'),
  ('single', 'O'),
  ('piece', 'O'),
  ('of', 'O'),
  ('wood', 'O'),
  ('and', 'O'),
  ('used', 'O'),
  ('as', 'O'),
  ('a', 'O'),
  ('percussion', 'O'),
  ('instrument', 'O')]]

In [97]:
# truth values of test data
test_truth = test.apply(split_and_mark, axis=1).values.tolist()

In [101]:
# compute accuracy of custom NER Tagger
count = 0
for i in range(len(test)):
    tags = tagged_sents[i]
    true_tags = test_truth[i]
    accurate = True
    for tag, true_tag in zip(tags, true_tags):
        if tag[1] == "MUSIC" and true_tags[1] != "MUSIC":
            accurate = False
    if accurate:
        count += 1
print(count)
print(count/len(test))

49
0.4336283185840708
