In [None]:
import json

import numpy as np
from datasets import load_from_disk
from joblib import load

# NOTE(review): the star import stays last so that anything it exports keeps
# shadowing the names above exactly as before; consider importing explicit names.
from util.local_data_handler import *

### Load Model and Embeddings

In [None]:
# Locations of the trained classifier and of the embedded encyclopedia dataset
CLF_MODEL_FILENAME = "logistic_regression.joblib"
EMBEDDINGS_PATH = "data/json/classification/encyclopedia_embeds.hf"

# NOTE(review): assuming `load` is joblib's loader (imported at the top of the
# notebook), this unpickles the file - only load model files you trust.
clf = load(CLF_MODEL_FILENAME)

# Dataset saved with `save_to_disk`; indexed by edition name in the cells below
editions_embedded = load_from_disk(EMBEDDINGS_PATH)

### Apply Classifier to Encyclopedia

In [None]:
# Pull the two edition splits out of the loaded dataset
X_first_ed, X_fourth_ed = editions_embedded['first_ed'], editions_embedded['fourth_ed']

# Bare tuple as the cell's last expression so the notebook displays both splits
(X_first_ed, X_fourth_ed)

In [None]:
# Run the loaded classifier over the 'CLS_embed' column of each edition.
# NOTE(review): the filtering cell below treats a truthy prediction as 'Person'
# and a falsy one as non-person - presumably the labels are 1/0; confirm
# against how the classifier was trained.
pred_first_ed = clf.predict(X_first_ed['CLS_embed'])
pred_fourth_ed = clf.predict(X_fourth_ed['CLS_embed'])

#### Filter for 'Person' Entries

In [None]:
# Turn the predictions into boolean lists (1 -> True, 0 -> False) that can be
# used as masks when indexing the numpy arrays built from the dataset columns
people_filter_first_ed = list(map(bool, pred_first_ed))
people_filter_fourth_ed = list(map(bool, pred_fourth_ed))

# The non-person masks are the element-wise negation of the person masks
non_people_filter_first_ed = [not is_person for is_person in people_filter_first_ed]
non_people_filter_fourth_ed = [not is_person for is_person in people_filter_fourth_ed]

In [None]:
def _split_column(edition, column, people_mask, non_people_mask):
    """Split one dataset column into its 'person' and 'non-person' parts.

    The column is read from the dataset and converted to a numpy array once,
    then indexed with both boolean masks.

    Args:
        edition: dataset split to read from (indexed by column name).
        column: name of the column to split.
        people_mask: boolean list selecting the entries classified as person.
        non_people_mask: boolean list selecting the remaining entries.

    Returns:
        Tuple ``(people_values, non_people_values)`` of numpy arrays.
    """
    values = np.array(edition[column])
    return values[people_mask], values[non_people_mask]


# Entry text, split by predicted class, for each edition
text_1_people, text_1_non_people = _split_column(
    X_first_ed, 'text', people_filter_first_ed, non_people_filter_first_ed)
text_4_people, text_4_non_people = _split_column(
    X_fourth_ed, 'text', people_filter_fourth_ed, non_people_filter_fourth_ed)

# Headwords, split the same way
headword_1_people, headword_1_non_people = _split_column(
    X_first_ed, 'headword', people_filter_first_ed, non_people_filter_first_ed)
headword_4_people, headword_4_non_people = _split_column(
    X_fourth_ed, 'headword', people_filter_fourth_ed, non_people_filter_fourth_ed)

# Entry ids, split the same way
entryId_1_people, entryId_1_non_people = _split_column(
    X_first_ed, 'entryId', people_filter_first_ed, non_people_filter_first_ed)
entryId_4_people, entryId_4_non_people = _split_column(
    X_fourth_ed, 'entryId', people_filter_fourth_ed, non_people_filter_fourth_ed)

In [None]:
# Report, per edition, how many entries were classified as 'Person' and what
# share of the edition that is (percentage rounded to two decimals)
n_people_first_ed, n_total_first_ed = len(text_1_people), len(X_first_ed)
share_first_ed = round((n_people_first_ed / n_total_first_ed) * 100, 2)
print(f"FIRST ED:  {n_people_first_ed} entries classified as 'Person' out of {n_total_first_ed} entries ({share_first_ed}%)")

n_people_fourth_ed, n_total_fourth_ed = len(text_4_people), len(X_fourth_ed)
share_fourth_ed = round((n_people_fourth_ed / n_total_fourth_ed) * 100, 2)
print(f"FOURTH ED: {n_people_fourth_ed} entries classified as 'Person' out of {n_total_fourth_ed} entries ({share_fourth_ed}%)")

In [None]:
import matplotlib.pyplot as plt


# Per-class counts for each edition (the chart below only draws the totals)
first_ed_people_sum = len(text_1_people)
first_ed_non_people_sum = len(X_first_ed) - first_ed_people_sum
fourth_ed_people_sum = len(text_4_people)
fourth_ed_non_people_sum = len(X_fourth_ed) - fourth_ed_people_sum

# X-axis positions: one bar per edition
x = np.arange(2)
edition_totals = [len(X_first_ed), len(X_fourth_ed)]

fig, ax = plt.subplots()

# Draw both bars in a single call so they share one legend entry; this avoids
# having to de-duplicate identical labels after the fact
ax.bar(x, edition_totals, label='Entries', color='gray')

# Labels and title
ax.set_xticks(x)
ax.set_xticklabels(['1st edition', '4th edition'])
ax.set_ylabel('No. of Entries')
ax.set_title('Total no. of Entries Per Edition')
ax.legend()

# Display the plot
plt.show()

### Save and Collect Person Entries

In [None]:
# Switch for the final cell: the JSON files below are only written when True
SAVE_RESULTS = False

# Output files for the first edition (person / non-person entries)
FIRST_ED_PEOPLE_LOCATION, FIRST_ED_NON_PEOPLE_LOCATION = (
    "data/json/classification/first_ed_people.json",
    "data/json/classification/first_ed_non_people.json",
)

# Output files for the fourth edition (person / non-person entries)
FOURTH_ED_PEOPLE_LOCATION, FOURTH_ED_NON_PEOPLE_LOCATION = (
    "data/json/classification/fourth_ed_people.json",
    "data/json/classification/fourth_ed_non_people.json",
)

In [None]:
import random

In [None]:
# Collect all first-edition entries labelled as 'person'.
# `.tolist()` converts the numpy arrays to plain Python values so the records
# hold built-in types: json.dump cannot serialise numpy scalars such as
# np.int64, and the sampled preview below stays readable.
person_entries_first_ed = [
    {
        "headword": headword,
        "entryId": entry_id,
        "text": text,
        "person": 1,
        "qid": ""
    }
    for headword, entry_id, text in zip(
        headword_1_people.tolist(),
        entryId_1_people.tolist(),
        text_1_people.tolist(),
    )
]

# Same record layout for the entries labelled as non-person
non_person_entries_first_ed = [
    {
        "headword": headword,
        "entryId": entry_id,
        "text": text,
        "person": 0,
        "qid": ""
    }
    for headword, entry_id, text in zip(
        headword_1_non_people.tolist(),
        entryId_1_non_people.tolist(),
        text_1_non_people.tolist(),
    )
]

# Preview up to three random person entries; the sample size is capped so the
# cell does not raise ValueError when fewer than three entries were found
random.sample(person_entries_first_ed, min(3, len(person_entries_first_ed)))

In [None]:
# Collect all fourth-edition entries labelled as 'person'.
# `.tolist()` converts the numpy arrays to plain Python values so the records
# hold built-in types: json.dump cannot serialise numpy scalars such as
# np.int64, and the sampled preview below stays readable.
person_entries_fourth_ed = [
    {
        "headword": headword,
        "entryId": entry_id,
        "text": text,
        "person": 1,
        "qid": ""
    }
    for headword, entry_id, text in zip(
        headword_4_people.tolist(),
        entryId_4_people.tolist(),
        text_4_people.tolist(),
    )
]

# Same record layout for the entries labelled as non-person
non_person_entries_fourth_ed = [
    {
        "headword": headword,
        "entryId": entry_id,
        "text": text,
        "person": 0,
        "qid": ""
    }
    for headword, entry_id, text in zip(
        headword_4_non_people.tolist(),
        entryId_4_non_people.tolist(),
        text_4_non_people.tolist(),
    )
]

# Preview up to three random person entries; the sample size is capped so the
# cell does not raise ValueError when fewer than three entries were found
random.sample(person_entries_fourth_ed, min(3, len(person_entries_fourth_ed)))

In [None]:
if SAVE_RESULTS:
    # (destination file, entries to write) pairs, in the order they are saved
    outputs = (
        (FIRST_ED_PEOPLE_LOCATION, person_entries_first_ed),
        (FIRST_ED_NON_PEOPLE_LOCATION, non_person_entries_first_ed),
        (FOURTH_ED_PEOPLE_LOCATION, person_entries_fourth_ed),
        (FOURTH_ED_NON_PEOPLE_LOCATION, non_person_entries_fourth_ed),
    )

    for location, entries in outputs:
        # 'w' overwrites any existing file; ensure_ascii=False keeps non-ASCII
        # characters readable in the UTF-8 encoded output
        with open(location, 'w', encoding='utf-8') as outfile:
            json.dump(entries, outfile, ensure_ascii=False, indent=2)