In [1]:
import os
import json
from collections import defaultdict, Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
# Fetch the NLTK data packages named below; nltk.download reports
# "already up-to-date" when a package is present locally.
# NOTE(review): presumably required by the stopwords / word_tokenize /
# WordNetLemmatizer imports above — confirm they are used later in the notebook.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /home/yc833/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/yc833/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yc833/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def parse_infobox_line(line):
    """Extract the occupation text from one tab-separated infobox line.

    Each tab-separated token is expected to have the shape ``key:value``.
    The values of all tokens whose *key* contains ``occupation_`` are stripped
    of wiki markup (``[[``, ``]]``, ``*``) and joined, in order of appearance,
    with single spaces.

    Args:
        line: One infobox line (tab-separated ``key:value`` tokens).

    Returns:
        A set holding the single concatenated occupation string, or an empty
        set when the line contains no occupation tokens.
    """
    occupation_parts = []
    for token in line.split('\t'):
        key, separator, value = token.partition(':')
        # Only the key decides whether this is an occupation field; a value
        # that merely mentions "occupation_" must not be picked up. Tokens
        # without a "key:value" shape are skipped instead of raising IndexError.
        if not separator or 'occupation_' not in key:
            continue
        for markup in ('[[', ']]', '*'):
            value = value.replace(markup, '')
        occupation_parts.append(value.strip())

    if not occupation_parts:
        return set()
    # All parts form one string so multi-token occupations stay contiguous.
    return {" ".join(occupation_parts)}


def remove_substrings(occupation_list):
    """Return the entries that are not contained in any different entry.

    An entry is dropped when it occurs as a substring of another, non-equal
    entry of ``occupation_list``. Surviving entries keep their original order;
    equal duplicates never eliminate each other.
    """
    def contained_elsewhere(candidate):
        # Compare only against entries that differ from the candidate itself.
        for other in occupation_list:
            if other != candidate and candidate in other:
                return True
        return False

    return [entry for entry in occupation_list if not contained_elsewhere(entry)]


def get_occupation_summaries(dataset_dir, occupations_set, char_threshold, num_samples=None):
    """Collect summaries of entries whose infobox occupation matches.

    For each subset (``train``, ``valid``, ``test``) the ``.title``, ``.box``,
    ``.sent`` and ``.nb`` files under ``dataset_dir`` are read side by side.
    Row ``i`` of the ``.nb`` file gives the number of consecutive ``.sent``
    lines that belong to row ``i`` of the ``.box`` and ``.title`` files.

    Args:
        dataset_dir: Directory containing the ``train``, ``valid`` and
            ``test`` sub-directories.
        occupations_set: Occupation strings to look for. They are matched as
            substrings of the lower-cased occupation text, so they should be
            lower-case themselves.
        char_threshold: A summary is kept only when it is strictly longer
            than this many characters.
        num_samples: Optional cap on the number of collected articles;
            ``None`` (default) collects everything.

    Returns:
        A ``defaultdict`` mapping the entry title to its summary string. When
        a title occurs more than once, the last qualifying summary wins.
    """
    # Kept as a defaultdict so the type returned to callers is unchanged.
    articles = defaultdict(list)
    subsets = ['train/train', 'valid/valid', 'test/test']

    def limit_reached():
        # The original wrapped this comparison in all(), which raises
        # TypeError ('bool' object is not iterable) for any non-None cap.
        return num_samples is not None and len(articles) >= num_samples

    for subset in tqdm(subsets, desc="Processing files"):
        # Stop before opening (and fully reading) the remaining subsets.
        if limit_reached():
            break

        prefix = os.path.join(dataset_dir, subset)
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(f"{prefix}.title", 'r', encoding='utf-8') as tfile, \
                open(f"{prefix}.box", 'r', encoding='utf-8') as bfile, \
                open(f"{prefix}.sent", 'r', encoding='utf-8') as sfile, \
                open(f"{prefix}.nb", 'r', encoding='utf-8') as nfile:
            title_lines = tfile.readlines()
            sent_lines = sfile.readlines()
            nb_lines = [int(line.strip()) for line in nfile]

            start_index = 0
            for i, (bline, num_sentences) in enumerate(zip(bfile, nb_lines)):
                if limit_reached():
                    break

                end_index = start_index + num_sentences
                occupation_texts = [occ.lower() for occ in parse_infobox_line(bline)]
                # Only the existence of a match matters here. Pruning matches
                # that are substrings of longer ones can never empty a
                # non-empty match list, so that step is not needed.
                if any(o in text for text in occupation_texts for o in occupations_set):
                    summary = ' '.join(sent_lines[start_index:end_index]).strip()
                    if len(summary) > char_threshold:
                        articles[title_lines[i].strip()] = summary

                # Always advance, even for skipped entries, to stay aligned
                # with the sentence file.
                start_index = end_index

    return articles


In [None]:
import json
from collections import defaultdict

# Occupation keywords (lower-case) matched as substrings of each entry's occupation text.
occupations_set = {'organist', 'narrator', 'pastor', 'musician', 'narration', 'author', 'arranger', 'educator', 'military officer', 'soloist', 'model', 'naval surgeon', 'fitness instructor', 'broadcasting', 'composer', 'data scientist', 'economist', 'journalist', 'politician', 'host', 'film director', 'guitarist', 'environmentalist', 'songwriter', 'lawyer', 'radio broadcaster', 'screenwriter', 'athlete', 'coach', 'revolutionary', 'essayist', 'comedian', 'locksmith', 'writer', 'record producer', 'entertainer', 'dancer', 'stage', 'media executive', 'actress', 'parliamentarian', 'poet', 'businessman', 'model', 'actor', 'tv personality', 'songwriter', 'professor', 'mountaineer', 'radio host', 'travel writer', 'sportsperson', 'producer', 'film actress', 'philanthropist', 'businesswoman', 'voice actor', 'geographer', 'director', 'architect', 'teacher', 'television host', 'playwright', 'animal-rights activist', 'singer', 'translator', 'novelist', 'rapper', 'deejay', 'film producer', 'entrepreneur', 'stuntman', 'sportsman', 'columnist'}

dataset_dir = 'wikipedia-biography-dataset/wikipedia-biography-dataset'
char_threshold = 200  # keep only summaries longer than this many characters
num_samples = None  # None = no cap on the number of collected articles

# Convert the returned defaultdict to a regular dictionary before saving to JSON
articles_dict = dict(get_occupation_summaries(dataset_dir, occupations_set, char_threshold, num_samples))

# Make sure the target directory exists so the long extraction run is not
# lost to a FileNotFoundError at the very last step.
os.makedirs('output', exist_ok=True)

# Save to a JSON file
with open('output/occupation_summaries_200.json', 'w') as json_file:
    json.dump(articles_dict, json_file, indent=4)