# Task1 Named Entity Recognition

In [1]:
# imports
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tag import pos_tag
from nltk import ne_chunk

# Directory holding the plain-text corpus files; passed to PlaintextCorpusReader
# as the corpus root when the corpus is loaded.
corpus_root = "./corpus1/"

In [2]:
# Load every file under the corpus root (the ".*" pattern matches all file ids;
# files are decoded as latin-1) and POS-tag the whole corpus as one flat token
# stream. Later cells index into this stream by token position.
newcorpus = PlaintextCorpusReader(corpus_root, ".*", encoding="latin-1")
corpus1_words = newcorpus.words()
pos_tags = pos_tag(corpus1_words)

## NLTK NER

In [3]:
# Run NLTK's named-entity chunker over the POS-tagged tokens in both modes.
# Default mode: entities carry category labels (e.g. ORGANIZATION).
ner_with_labels = ne_chunk(pos_tags)
# binary=True: entities are marked without category labels.
ner_without_labels = ne_chunk(pos_tags, binary=True)

In [5]:
# Save the category-labelled NER tree to nltk_ner.txt rather than printing it
# in the cell output; the cell only prints a pointer to the file.
print("NER with category labels:")
print("Open nltk_ner.txt to check results")
# The corpus is decoded as latin-1, so tokens may contain non-ASCII characters.
# Write with an explicit encoding so the result does not depend on the
# platform's default locale encoding (which can raise UnicodeEncodeError).
with open('nltk_ner.txt', 'w', encoding='utf-8') as f:
    f.write(str(ner_with_labels))

NER with category labels:
Open nltk_ner.txt to check results


### All words/phrases that are regarded as ORGANIZATION (NLTK)

In [22]:
# print ORGANIZATION (duplicates removed)
# Each ORGANIZATION subtree is turned into one phrase by joining the word part
# (element 0) of its leaves with spaces; the set drops repeated phrases.
nltk_org = list({
    ' '.join(leaf[0] for leaf in subtree.leaves())
    for subtree in ner_with_labels.subtrees()
    if subtree.label() == 'ORGANIZATION'
})
print(len(nltk_org))
print(nltk_org)

223
['Bureau', 'Confederation', 'Articles', 'Association', 'OUR', 'Administrations', 'Fellow Citizens', 'Dingley Act', 'Houses', 'Fort Sumter', 'Senate', 'Negro', 'Reflecting', 'Information Age', 'Parents', 'Communist', 'Federal Establishment', 'Massachusetts Congress', 'Panama Canal', 'Army', 'Invisible Hand', 'Great Society', 'FAILURE OF', 'Santo Domingo', 'Christians', 'Democratic Party', 'Moment', 'Social', 'Lord', 'Bill of Rights', 'Administration', 'Forty', 'Cabinet', 'Almighty God', 'National Government', 'Constitutional Convention', 'Mississippi', 'Commonwealth', 'Democrats', 'Almighty Ruler', 'WORLD', 'Republican Party', 'Century', 'Parchment', 'General', 'American Dream', 'Helvetic Confederacy', 'Treasury Department', 'Beneficent Creator', 'African Americans', 'Congress', 'State', 'Helvetic', 'Board', 'Human Race', 'Divine Being', 'Executive Departments', 'National Union', 'Mediterranean', 'Federal Union', 'Golden Rule', 'Author', 'Homestead Act', 'Congressional', 'Representa

In [57]:
# print ORGANIZATION (with indexes)
from nltk import tree2conlltags

# Flatten the chunk tree into a list of triples whose third element is an IOB
# tag ('B-ORGANIZATION' opens an entity, 'I-ORGANIZATION' continues it).
# List positions are used as token indexes for the spans collected below.
nltk_ner_list = tree2conlltags(ner_with_labels)
token_count = len(nltk_ner_list)

# Each entry is (phrase, start_index, end_index), both indexes inclusive.
nltk_org_index = []
index = 0
while index < token_count:
    if nltk_ner_list[index][2] != 'B-ORGANIZATION':
        index += 1
        continue

    start = index
    phrase = [nltk_ner_list[index][0]]
    index += 1
    # The bounds check matters: without it, an ORGANIZATION that runs up to the
    # very last token would index past the end of the list (IndexError).
    while index < token_count and nltk_ner_list[index][2] == 'I-ORGANIZATION':
        phrase.append(nltk_ner_list[index][0])
        index += 1
    end = index - 1
    nltk_org_index.append((' '.join(phrase), start, end))

print(len(nltk_org_index))
print(nltk_org_index)

895
[('Senate', 5, 5), ('House', 9, 9), ('Great Author', 472, 473), ('Invisible Hand', 517, 518), ('Constitution', 1104, 1104), ('House', 1272, 1272), ('Representatives', 1274, 1274), ('Parent', 1454, 1454), ('Human Race', 1457, 1458), ('Constitution', 1614, 1614), ('Helvetic', 1916, 1916), ('Congress', 1989, 1989), ('Executive', 2398, 2398), ('Senate', 2400, 2400), ('Congress', 2448, 2448), ('Congress', 2668, 2668), ('Legislature', 2689, 2689), ('Houses', 3310, 3310), ('Congress', 3312, 3312), ('State', 3457, 3457), ('Union', 3482, 3482), ('Houses', 3762, 3762), ('Congress', 3764, 3764), ('States', 3772, 3772), ('Congress', 3785, 3785), ('Legislature', 3915, 3915), ('Christianity', 4095, 4095), ('Patron', 4223, 4223), ('Fountain', 4228, 4228), ('Justice', 4230, 4230), ('Fellow Citizens', 4272, 4273), ('Republican', 5154, 5154), ('State', 5526, 5526), ('General', 5554, 5554), ('Constitution', 6215, 6215), ('State', 6562, 6562), ('States', 6723, 6723), ('Mississippi', 6984, 6984), ('Gen

## Stanford NER

In [60]:
from nltk.tag import StanfordNERTagger

# Tag the same token list that was fed to the NLTK tagger, so that token
# positions in both results refer to the same words.
st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz', 'stanford-ner.jar')
stanford_ner = st.tag(corpus1_words)
print("Open stanford_ner.txt to check results")
# The corpus is decoded as latin-1, so tokens may contain non-ASCII characters.
# Write with an explicit encoding so the result does not depend on the
# platform's default locale encoding (which can raise UnicodeEncodeError).
with open('stanford_ner.txt', 'w', encoding='utf-8') as f:
    f.write(str(stanford_ner))

Open stanford_ner.txt to check results


In [75]:
# (word, token_position) for every token Stanford tagged as ORGANIZATION.
stanford_org = [
    (pair[0], position)
    for position, pair in enumerate(stanford_ner)
    if pair[1] == 'ORGANIZATION'
]

# Merge tokens at consecutive positions into multi-word spans.
# Each working entry is [words, start, end] with inclusive indexes.
org_spans = []
for word, position in stanford_org:
    if org_spans and org_spans[-1][2] + 1 == position:
        # Directly follows the previous ORGANIZATION token: extend that span.
        org_spans[-1][0].append(word)
        org_spans[-1][2] = position
    else:
        org_spans.append([[word], position, position])

stanford_org_index = [
    (' '.join(words), first, last) for words, first, last in org_spans
]
stanford_org_phrases = [span[0] for span in stanford_org_index]

### All words/phrases that are regarded as ORGANIZATION (Stanford)

In [76]:
# print ORGANIZATION (duplicates removed)
# Going through a set keeps one copy of each distinct phrase.
unique_phrases = set(stanford_org_phrases)
stanford_org_phrases = list(unique_phrases)
print(len(stanford_org_phrases))
print(stanford_org_phrases)

72
['State', 'National Congress', 'Federal Government', 'Commerce and Labor', 'National Union', 'Permanent Court of International Justice', 'Federal Union', 'Board of Engineers and Naval Commissioners', 'Pork Chop Hill', 'Chamber of Congress', 'Forty - ninth Congress', 'Nebraska - Kansas', 'Republican Administration', 'Barbary Powers', 'Christianity', 'Congressional', 'United Nations', 'Parliament', 'Democratic republic', 'Senate', 'Treasury', 'General Government', 'Union', 'Change Liberty', 'Massachusetts Congress', 'Army', 'Bureau of Corporations', 'League of Nations', 'Liberty Bell', 'Fountain of Justice', 'Peabody', 'Santo Domingo', 'Navy', 'Tribunal', 'Department of Commerce and Labor', 'Democratic Party', 'St', 'Government', 'interior administration', 'Department of Justice', 'United Stages', 'Articles of Association', 'Medicare', 'General Government of the Union', 'Social Security', 'Administration', 'Department of Agriculture', 'Executive Magistrate', 'Executive', 'Mormon Churc

In [77]:
# print ORGANIZATION (with indexes)
# First line: number of spans; second line: the (phrase, start, end) tuples.
print(len(stanford_org_index), stanford_org_index, sep='\n')

302
[('Senate', 5, 5), ('House of Representatives', 1272, 1274), ('Congress', 1989, 1989), ('Senate', 2400, 2400), ('Congress', 2448, 2448), ('Chamber of Congress', 2666, 2668), ('Legislature', 2689, 2689), ('Congress', 3312, 3312), ('interior administration', 3659, 3660), ('Congress', 3764, 3764), ('Congress', 3785, 3785), ('Legislature', 3915, 3915), ('Fountain of Justice', 4228, 4230), ('State', 5526, 5526), ('General Government', 5554, 5555), ('National Government', 11608, 11609), ('Army', 11956, 11956), ('Navy', 11959, 11959), ('Government', 11997, 11997), ('Army', 12936, 12936), ('Navy', 12938, 12938), ('Treasury', 13883, 13883), ('Legislature', 14049, 14049), ('Legislature', 14198, 14198), ('Board of Engineers and Naval Commissioners', 15622, 15627), ('Congress', 15631, 15631), ('Navy', 15880, 15880), ('St', 15900, 15900), ('Congress', 16398, 16398), ('Barbary Powers', 17529, 17530), ('Congress', 17914, 17914), ('Treasury', 17989, 17989), ('Congress', 18080, 18080), ('Congress',

## Analyse Results

1. Compare the results (with indexes) from the NLTK and Stanford NER taggers.
2. If an ORGANIZATION span has the same start and end indexes in both the NLTK and the Stanford results, it is an exact match.
3. If an ORGANIZATION span from NLTK and one from Stanford overlap but do not have identical start and end indexes, it is a partial overlap.
4. Print out the statistics.

In [78]:
# For each Stanford span, scan the NLTK spans in order and classify the first
# one that touches it: identical bounds -> exact match, any other overlap ->
# partial overlap. Each Stanford span is counted at most once.
exact_match = 0
partial_overlap = 0
for _, s_start, s_end in stanford_org_index:
    for _, n_start, n_end in nltk_org_index:
        if (s_start, s_end) == (n_start, n_end):
            exact_match += 1
            break
        # Two inclusive ranges overlap when each starts no later than the
        # other one ends.
        if s_start <= n_end and n_start <= s_end:
            partial_overlap += 1
            break

print("exact match: " + str(exact_match))
print("partial overlap: " + str(partial_overlap))

exact match: 227
partial overlap: 36
