# Analysis: Named People
## Post Annotation and Aggregation

A comparison of automated Named Entity Recognition and manual annotation

***

**Table of Contents**

  [I. Loading](#load)

  [II. Named Entity Recognition with SpaCy](#ner)
  
  [III. Manual Annotation of People's Names](#annot)
  
  [IV. Comparison](#comp)
  
***

<a id="load"></a>
### I. Loading

In [1]:
# To use custom functions
import utils

# To work with CSV data
import pandas as pd

# To work with TXT data
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
# nltk.download('punkt')
from nltk.corpus import PlaintextCorpusReader
# nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
# nltk.download('stopwords')
from nltk.tag import pos_tag

# For named entity recognition (NER)
import spacy
from spacy import displacy
from collections import Counter
try:
    import en_core_web_sm
except ImportError:
    print("Downlading en_core_web_sm model")
    import sys
    !{sys.executable} -m spacy download en_core_web_sm
import en_core_web_sm
nlp = en_core_web_sm.load()

# For fuzzy string matching
# https://github.com/seatgeek/thefuzz
from thefuzz import fuzz, process

# For statistical calculations
import numpy as np

# To export JSON data
import json

Load the Plaintext files of archival catalog metadata descriptions:

In [20]:
datadir = "../data/doc_clf_data/model_input/all_model_input/"
descs = PlaintextCorpusReader(datadir, ".+_docs\.txt")

In [21]:
tokens = descs.words()
print(tokens[0:20])

['Thomas', 'Jaffrey', 'McNair', 'was', 'born', 'on', '1', 'March', '1927', '.', 'He', 'was', 'educated', 'at', 'George', 'Watson', "'", 's', 'College', 'in']


In [22]:
sentences = descs.sents()
print(sentences[0:5])

[['Thomas', 'Jaffrey', 'McNair', 'was', 'born', 'on', '1', 'March', '1927', '.'], ['He', 'was', 'educated', 'at', 'George', 'Watson', "'", 's', 'College', 'in', 'Edinburgh', ',', 'and', 'he', 'studied', 'at', 'Edinburgh', 'University', ',', 'graduating', 'MB', ',', 'CHB', 'in', '1949', ',', 'and', 'being', 'awarded', 'later', 'on', 'with', 'the', 'degree', 'of', 'MD', '(', '1960', ')', 'for', 'his', 'thesis', 'Observations', 'on', 'visceral', 'pain', 'with', 'special', 'reference', 'to', 'pain', 'originating', 'in', 'the', 'testis', '.'], ['In', '1951', 'he', 'married', 'Dr', '.', 'Sybil', 'Monteith', 'Dick', 'Wood', ',', 'and', 'between', '1950', 'and', '1952', 'he', 'served', 'as', 'a', 'Flight', '-', 'Lieutenant', 'in', 'the', 'RAF', '.'], ['Also', 'in', '1950', 'he', 'was', 'a', 'Medical', 'Officer', 'at', 'Marlu', 'in', 'Ghana', '(', 'then', 'called', 'the', 'Gold', 'Coast', ').'], ['McNair', 'became', 'House', 'Surgeon', ',', 'then', 'Registrar', ',', 'and', 'Clinical', 'Tutor', 

<a id="ner"></a>
## II. Name Entity Recognition with spaCy
Run named entity recognition (NER) to estimate the names in the dataset and get a sense for the value in manually labeling names during the annotation process. 

In [23]:
fileids = descs.fileids()

In [24]:
sentences = []
for fileid in fileids:
    file = descs.raw(fileid)
    sentences += nltk.sent_tokenize(file)

In [25]:
person_list = []
for s in sentences:
    s_ne = nlp(s)
    for entity in s_ne.ents:
        if entity.label_ == 'PERSON':
            person_list += [entity.text] 

In [26]:
unique_persons = list(set(person_list))
print(len(unique_persons))

9366


In [27]:
print(unique_persons[100:150])

['Szilard', 'Gardiner', 'Brigitta Lasky', 'Robert Lanza', 'von Thomsen', 'Brown', 'Hawthorn', 'S.K.', 'Pringle', "Daniel Stewart's", 'Ka', 'Bertaux', 'Lady', 'Laidlaw', 'Leitung von', 'Economic Adviser', 'Stefan Kopeć', 'James Lumsden', 'Howard Orsmond', 'James Drever', 'CV', 'Coggan', 'Eric Ashby', 'Sofie Szecsi', "Crum Brown's", 'Giovanni Battista', 'Charles de\n|', 'Kaikorai Woollen Factory', 'Alex J D Porteous', 'Arthur Koestler] / L.C.', 'B. Pallante', 'Clerks', 'Weidenfeld', 'Ken Pierce Butler', 'Cutting Hemp', 'Lurline Bay', 'John Bailliecorrepondence', 'Raphael Falk', 'William Calder', 'Barrett Hamilton', 'Ian Gilmour', 'Kincaid Mackenzie', "Scottish Gaelic'Proofs", 'Dr W. Siller', 'Maureen', 'Ascher', 'Barkin', 'Arthur C. ClarkeAlvarez', 'Lady Luck', 'HB van Dyke']


Not perfect...some non-person entities labeled such as `JerusalemPalestineSelzer` and `Arithmetique`.

In [28]:
print(len(person_list))

30548


## III. Manual Annotation of People's Names

In [31]:
df = pd.read_csv("../data/aggregated_data/aggregated_final.csv", index_col=0)
df.head()

Unnamed: 0,agg_ann_id,file,text,ann_offsets,label,category,associated_genders,description_id
0,0,Coll-1157_00100.ann,knighted,"(1407, 1415)",Gendered-Role,Linguistic,Unclear,2364
1,1,Coll-1310_02300.ann,knighthood,"(9625, 9635)",Gendered-Role,Linguistic,Unclear,4542
2,2,Coll-1281_00100.ann,Prince Regent,"(2426, 2439)",Gendered-Role,Linguistic,Unclear,3660
3,3,Coll-1310_02700.ann,knighthood,"(9993, 10003)",Gendered-Role,Linguistic,Unclear,4678
4,4,Coll-1310_02900.ann,Sir,"(7192, 7195)",Gendered-Role,Linguistic,Unclear,4732


In [43]:
df_ppl = df.loc[df.category == "Person-Name"]
df_ppl = df_ppl.drop_duplicates()
df_ppl.head()

Unnamed: 0,ann_id,file,text,ann_offsets,label,category,associated_genders,description_id
7,7,Coll-1036_00500.ann,Mrs Norman Macleod,"(36375, 36393)",Feminine,Person-Name,Unclear,1082
14,14,Coll-1010_00100.ann,Dr. Nelly Renee Deme,"(40, 60)",Unknown,Person-Name,Unclear,855
15,15,Coll-1036_00300.ann,Marjory Kennedy-Fraser,"(14570, 14592)",Unknown,Person-Name,Unclear,1038
16,16,Coll-1036_00300.ann,Marjory Kennedy Fraser,"(14698, 14720)",Unknown,Person-Name,Unclear,1038
17,17,Coll-1036_00300.ann,Marjory Kennedy-Fraser,"(14924, 14946)",Unknown,Person-Name,Unclear,1038


In [44]:
total_ppl = df_ppl.shape[0]
df_mas = df_ppl.loc[df_ppl.label == "Masculine"]
df_fem = df_ppl.loc[df_ppl.label == "Feminine"]
df_unk = df_ppl.loc[df_ppl.label == "Unknown"]
total_mas = df_mas.shape[0]
total_fem = df_fem.shape[0]
total_unk = df_unk.shape[0]
print("Total people:", total_ppl)
print("Total Masculine:", total_mas)
print("Total Feminine:", total_fem)
print("Total Unknown:", total_unk)

Total people: 31157
Total Masculine: 6087
Total Feminine: 1836
Total Unknown: 23234


In [45]:
unique_ppl = set(list(df_ppl.text))
unique_mas = set(list(df_mas.text))
unique_fem = set(list(df_fem.text))
unique_unk = set(list(df_unk.text))
print("Unique people names:", len(unique_ppl))
print("Unique masculine-labeled names:", len(unique_mas))
print("Unique feminine-labeled names:", len(unique_fem))
print("Unique unknown-labeled names:", len(unique_unk))

Unique people names: 10288
Unique masculine-labeled names: 2121
Unique feminine-labeled names: 655
Unique unknown-labeled names: 8316


### IV. Automated Annotation of People's Names
Compare the Person Name annotations of the highest performing Person Name and Occupation classifier (with Linguistic labels as features) to the manual and spaCy annotation of Person Names.

First, join the original text data, from the aggregated dataset, to the classifier's prediction data:

In [46]:
f = "../data/aggregated_data/aggregated_final.csv"
df = pd.read_csv(f, index_col=0)
df = df.loc[df.category == "Person-Name"]
df.head()

Unnamed: 0,agg_ann_id,file,text,ann_offsets,label,category,associated_genders,description_id
7,7,Coll-1036_00500.ann,Mrs Norman Macleod,"(36375, 36393)",Feminine,Person-Name,Unclear,1082
14,14,Coll-1010_00100.ann,Dr. Nelly Renee Deme,"(40, 60)",Unknown,Person-Name,Unclear,855
15,15,Coll-1036_00300.ann,Marjory Kennedy-Fraser,"(14570, 14592)",Unknown,Person-Name,Unclear,1038
16,16,Coll-1036_00300.ann,Marjory Kennedy Fraser,"(14698, 14720)",Unknown,Person-Name,Unclear,1038
17,17,Coll-1036_00300.ann,Marjory Kennedy-Fraser,"(14924, 14946)",Unknown,Person-Name,Unclear,1038


In [55]:
pnoc_pred1 = "../data/token_clf_data/experiment1/5fold/output/crf-arow_pers_o_baseline_fastText100_annot_evaluation.csv"
df_pnoc1 = pd.read_csv(pnoc_pred1, index_col=0, low_memory=False)
df_pnoc1 = df_pnoc1.drop_duplicates()
df_pnoc1.head()

Unnamed: 0,description_id,sentence_id,ann_id,pred_ling_label,expected_label,predicted_label,_merge,agreement
69508,1082,2590,7.0,Gendered-Role,,Unknown,right_only,false positive
7063,1082,2590,7.0,Gendered-Role,Feminine,,left_only,false negative
66942,855,1097,14.0,O,,Feminine,right_only,false positive
2814,855,1097,14.0,O,Unknown,Unknown,both,true positive
67675,1038,1485,15.0,Gendered-Role,,Feminine,right_only,false positive


In [56]:
df_pnoc1.shape

(103049, 8)

In [57]:
df = df.rename(columns={"agg_ann_id":"ann_id"})
subdf = df[["ann_id", "text"]]
df_pnoc1 = subdf.join(df_pnoc1.set_index("ann_id"), how="outer", on="ann_id")
df_pnoc1.head()

Unnamed: 0,ann_id,text,description_id,sentence_id,pred_ling_label,expected_label,predicted_label,_merge,agreement
7.0,7,Mrs Norman Macleod,1082.0,2590.0,Gendered-Role,,Unknown,right_only,false positive
7.0,7,Mrs Norman Macleod,1082.0,2590.0,Gendered-Role,Feminine,,left_only,false negative
14.0,14,Dr. Nelly Renee Deme,855.0,1097.0,O,,Feminine,right_only,false positive
14.0,14,Dr. Nelly Renee Deme,855.0,1097.0,O,Unknown,Unknown,both,true positive
15.0,15,Marjory Kennedy-Fraser,1038.0,1485.0,Gendered-Role,,Feminine,right_only,false positive


Count the total predicted person names, as well as total predicted feminine, masculine, and unknown names:

In [58]:
clf_df_ppl = df_pnoc1.loc[~df_pnoc1.predicted_label.isna()]
clf_df_fem = clf_df_ppl.loc[clf_df_ppl.predicted_label == "Feminine"]
clf_df_mas = clf_df_ppl.loc[clf_df_ppl.predicted_label == "Masculine"]
clf_df_unk = clf_df_ppl.loc[clf_df_ppl.predicted_label == "Unknown"]

clf_total_ppl = clf_df_ppl.shape[0]
clf_total_fem = clf_df_fem.shape[0]
clf_total_mas = clf_df_mas.shape[0]
clf_total_unk = clf_df_unk.shape[0]

print("Total people:", clf_total_ppl)
print("Total Masculine:", clf_total_mas)
print("Total Feminine:", clf_total_fem)
print("Total Unknown:", clf_total_unk)

Total people: 15836
Total Masculine: 5175
Total Feminine: 1725
Total Unknown: 7098


In [59]:
clf_unique_ppl = set(list(clf_df_ppl.text))
clf_unique_mas = set(list(clf_df_mas.text))
clf_unique_fem = set(list(clf_df_fem.text))
clf_unique_unk = set(list(clf_df_unk.text))
print("Unique people names:", len(clf_unique_ppl))
print("Unique masculine-labeled names:", len(clf_unique_mas))
print("Unique feminine-labeled names:", len(clf_unique_fem))
print("Unique unknown-labeled names:", len(clf_unique_unk))

Unique people names: 4188
Unique masculine-labeled names: 1657
Unique feminine-labeled names: 537
Unique unknown-labeled names: 3013


<a id="comp"></a>
## IV. Comparison

Compare the number of total and unique names found in with the classifier to the manual annotation process.

In [64]:
clf_df_pnoc1_tp = clf_df_ppl.loc[clf_df_ppl.agreement == "true positive"]
clf_df_pnoc1_fp = clf_df_ppl.loc[clf_df_ppl.agreement == "false positive"]
print("Total correct names:",clf_df_pnoc1_tp.shape[0])
print("Total names classified:",clf_df_pnoc1_tp.shape[0]+clf_df_pnoc1_fp.shape[0])

Total correct names: 11024
Total names classified: 15836


Compare the number of unique and total people spaCy found to those the annotators and classifier found with exact and fuzzy string matching.

In [65]:
print("Total people names in spaCy: ", len(person_list))
print("Total people names annotated:", total_ppl)
print("Total people names classified:", clf_total_ppl)
print("\nUnique people names in spaCy:  ", len(unique_persons))
print("Unique people names annotated:", len(unique_ppl))
print("Unique people names classified:", len(clf_unique_ppl))

Total people names in spaCy:  30548
Total people names annotated: 31157
Total people names classified: 15836

Unique people names in spaCy:   9366
Unique people names annotated: 10288
Unique people names classified: 4188


More names of people were labeled during annotation than with spaCy, but...

In [75]:
exact_match_ann = [person_name for person_name in unique_ppl if person_name in unique_persons]
exact_match_clf = [person_name for person_name in clf_unique_ppl if person_name in unique_persons]
print("Annotation names in spaCy names:", len(exact_match_ann))
print("Classifier names in spaCy", len(exact_match_clf))

Annotation names in spaCy names: 3174
Classifier names in spaCy 1811


In [70]:
fem_match = [n for n in unique_fem if n in unique_persons]
mas_match = [n for n in unique_mas if n in unique_persons]
unk_match = [n for n in unique_unk if n in unique_persons]
print("Feminine-labeled names found by spaCy:", len(fem_match))
print("Masculine-labeled names found by spaCy:", len(mas_match))
print("Unknown-labeled names found by spaCy:", len(unk_match))

Feminine-labeled names found by spaCy: 250
Masculine-labeled names found by spaCy: 747
Unknown-labeled names found by spaCy: 2702


In [74]:
fem_match = [n for n in unique_fem if n in clf_unique_ppl] #clf_unique_fem] 305 (count where grammatical gender matched)
mas_match = [n for n in unique_mas if n in clf_unique_ppl] #clf_unique_mas] 958 (count where grammatical gender matched)
unk_match = [n for n in unique_unk if n in clf_unique_ppl] #clf_unique_unk] 2492 (count where grammatical gender matched)
print("Feminine-labeled names found by own classifier:", len(fem_match))
print("Masculine-labeled names found by own classifier:", len(mas_match))
print("Unknown-labeled names found by own classifier:", len(unk_match))

Feminine-labeled names found by own classifier: 431
Masculine-labeled names found by own classifier: 1363
Unknown-labeled names found by own classifier: 3064


#### Fuzzy String Matching
Evaluate overlaps more loosely using fuzzy string matching.

In [82]:
# Compare each manually annotated person name to all spaCy-labeled person names
def getAnnotFuzzyMatches(score_method, min_score):
    all_fuzzy_matches = []
    no_fuzzy_match = 0
    for n in unique_ppl:
        fuzzy_matches = process.extractBests(n, unique_persons, scorer=score_method, score_cutoff=min_score)
        if len(fuzzy_matches) == 0:
            no_fuzzy_match += 1
        else:
            all_fuzzy_matches = all_fuzzy_matches + fuzzy_matches
    return no_fuzzy_match, all_fuzzy_matches

# Compare each classified person name to all spaCy-labeled person names
def getClfFuzzyMatches(score_method, min_score):
    all_fuzzy_matches = []
    no_fuzzy_match = 0
    for n in clf_unique_ppl:
        fuzzy_matches = process.extractBests(n, unique_persons, scorer=score_method, score_cutoff=min_score)
        if len(fuzzy_matches) == 0:
            no_fuzzy_match += 1
        else:
            all_fuzzy_matches = all_fuzzy_matches + fuzzy_matches
    return no_fuzzy_match, all_fuzzy_matches

# Compare each spaCy-labeled person name to all manually annotated person names
def getSpacyFuzzyMatches(score_method, min_score):
    all_fuzzy_matches = []
    no_fuzzy_match = 0
    for n in unique_persons:
        fuzzy_matches = process.extractBests(n, unique_ppl, scorer=score_method, score_cutoff=min_score)
        if len(fuzzy_matches) == 0:
            no_fuzzy_match += 1
        else:
            all_fuzzy_matches = all_fuzzy_matches + fuzzy_matches
    return no_fuzzy_match, all_fuzzy_matches

# Compare each spaCy-labeled person name to all classified person names
def getSpacyFuzzyMatchesWithClf(score_method, min_score):
    all_fuzzy_matches = []
    no_fuzzy_match = 0
    for n in unique_persons:
        fuzzy_matches = process.extractBests(n, clf_unique_ppl, scorer=score_method, score_cutoff=min_score)
        if len(fuzzy_matches) == 0:
            no_fuzzy_match += 1
        else:
            all_fuzzy_matches = all_fuzzy_matches + fuzzy_matches
    return no_fuzzy_match, all_fuzzy_matches

In [83]:
score_method = fuzz.ratio
min_score = 90
no_fuzzy_match, all_fuzzy_matches = getAnnotFuzzyMatches(score_method, min_score)
print("Count of annotated person names without spaCy fuzzy matching ratios of at least {s}: {m}".format(s=min_score,m=no_fuzzy_match))

Count of annotated person names without spaCy fuzzy matching ratios of at least 90: 6209


In [90]:
score_method = fuzz.ratio
min_score = 75
no_fuzzy_match, all_fuzzy_matches = getAnnotFuzzyMatches(score_method, min_score)
print("Count of annotated person names without SpaCy fuzzy matching ratios of at least {s}: {m}".format(s=min_score,m=no_fuzzy_match))

Count of annotated person names without SpaCy fuzzy matching ratios of at least 75: 4107


In [89]:
# score_method = fuzz.ratio
# min_score = 75
# clf_no_fuzzy_match, clf_all_fuzzy_matches = getClfFuzzyMatches(score_method, min_score)
# print("Count of classified person names without SpaCy fuzzy matching ratios of at least {s}: {m}".format(s=min_score,m=clf_no_fuzzy_match))

Let's calculate the minimum, maximum, and average fuzzy matching ratios of manually annotated person names to spaCy person names: 

In [91]:
score_method = fuzz.ratio
scores = []
for n in unique_ppl:
    fuzzy_matches = process.extractOne(n, unique_persons, scorer=score_method)  # use default score_cutoff, which is 0
    scores += [fuzzy_matches[1]]  # first position in tuple is match string, second position in tuple is score

In [92]:
min_score = np.min(scores)
max_score = np.max(scores)
mean_score = np.mean(scores)
median_score = np.median(scores)

In [93]:
# Get the counts (occurrences) of each score
unique_scores, score_counts = np.unique(scores, return_counts=True)
score_counts = dict(zip(unique_scores, score_counts))
print("Mean score: "+str(mean_score))
print("Matches with minimum score of "+str(min_score)+":", score_counts[min_score])
print("Matches with maximum score of "+str(max_score)+":", score_counts[max_score])
print("Matches with median score of "+str(median_score)+":", score_counts[median_score])

Mean score: 82.16115863141525
Matches with minimum score of 38: 1
Matches with maximum score of 100: 3599
Matches with median score of 81.0: 68


Now let's do the reverse... 

#### How many of the spaCy person names appear in the manually annotated and classified person names?

In [87]:
ann_exact_match = [person_name for person_name in unique_persons if person_name in unique_ppl]
clf_exact_match = [person_name for person_name in unique_persons if person_name in clf_unique_ppl]
print(len(ann_exact_match))  # same - looks good
print(len(clf_exact_match))  # same - looks good

3174
1811


In [94]:
score_method = fuzz.ratio
min_score = 90
no_fuzzy_match, all_fuzzy_matches = getSpacyFuzzyMatches(score_method, min_score)
print("Count of spaCy names without annotated person name fuzzy matching ratios of at least {s}: {m}".format(s=min_score,m=no_fuzzy_match))

Count of spaCy names without annotated person name fuzzy matching ratios of at least 90: 5677


In [95]:
score_method = fuzz.ratio
min_score = 75
no_fuzzy_match, all_fuzzy_matches = getSpacyFuzzyMatches(score_method, min_score)
print("Count of spaCy names without annotated person name fuzzy matching ratios of at least {s}: {m}".format(s=min_score,m=no_fuzzy_match))

Count of spaCy names without annotated person name fuzzy matching ratios of at least 75: 3701


In [96]:
# score_method = fuzz.ratio
# min_score = 75
# clf_no_fuzzy_match, clf_all_fuzzy_matches = getSpacyFuzzyMatchesWithClf(score_method, min_score)
# print("Count of spaCy names without classified person name fuzzy matching ratios of at least {s}: {m}".format(s=min_score,m=clf_no_fuzzy_match))

Let's calculate the minimum, maximum, and average fuzzy matching ratios of spaCy person names to manually annotated person names: 

In [97]:
score_method = fuzz.ratio
scores = []
for n in unique_persons:
    fuzzy_matches = process.extractOne(n, unique_ppl, scorer=score_method)  # use default score_cutoff, which is 0
    scores += [fuzzy_matches[1]]  # first position in tuple is match string, second position in tuple is score

In [98]:
min_score = np.min(scores)
max_score = np.max(scores)
mean_score = np.mean(scores)
median_score = np.median(scores)

In [99]:
# Get the counts (occurrences) of each score
unique_scores, score_counts = np.unique(scores, return_counts=True)
score_counts = dict(zip(unique_scores, score_counts))
print("Mean score: "+str(mean_score))
print("Matches with minimum score of "+str(min_score)+":", score_counts[min_score])
print("Matches with maximum score of "+str(max_score)+":", score_counts[max_score])
print("Matches with median score of "+str(median_score)+":", score_counts[median_score])

Mean score: 81.56993380311766
Matches with minimum score of 14: 1
Matches with maximum score of 100: 3336
Matches with median score of 80.0: 315


In [None]:
# # Convert numpty ints to python ints for JSON file writing
# unique_scores = [int(s) for s in unique_scores]
# score_counts = [int(c) for c in score_counts]

# d_array = []
# i, maxI = 0, len(unique_scores)
# while i < maxI:
#     d = dict()
#     d["unique_score"] = unique_scores[i]
#     d["count"] = score_counts[i]
#     d_array = d_array + [d]
#     i += 1

# print(d_array)

In [None]:
# score_counts_json = json.dumps(d_array)
# f = open("analysis_data/spacy_to_annot_ppl_fuzzy_ratios.json", "w")
# f.write(score_counts_json)
# f.close()