In [2]:
# Occupation labels searched for in infobox text. Entries keep their original
# insertion order; labels that were listed twice appear once (a set holds each
# value only once anyway).
occupations_set = {
    'organist', 'narrator', 'pastor', 'musician', 'narration', 'author',
    'arranger', 'educator', 'military officer', 'soloist', 'model',
    'naval surgeon', 'fitness instructor', 'broadcasting', 'composer',
    'data scientist', 'economist', 'journalist', 'politician', 'host',
    'film director', 'guitarist', 'environmentalist', 'songwriter', 'lawyer',
    'radio broadcaster', 'screenwriter', 'athlete', 'coach', 'revolutionary',
    'essayist', 'comedian', 'locksmith', 'writer', 'record producer',
    'entertainer', 'dancer', 'stage', 'media executive', 'actress',
    'parliamentarian', 'poet', 'businessman', 'actor', 'tv personality',
    'professor', 'mountaineer', 'radio host', 'travel writer', 'sportsperson',
    'producer', 'film actress', 'philanthropist', 'businesswoman',
    'voice actor', 'geographer', 'director', 'architect', 'teacher',
    'television host', 'playwright', 'animal-rights activist', 'singer',
    'translator', 'novelist', 'rapper', 'deejay', 'film producer',
    'entrepreneur', 'stuntman', 'sportsman', 'columnist',
}

In [3]:
# Sanity check: number of distinct occupation labels held by the set.
len(occupations_set)

72

In [11]:
import os
import json
from collections import defaultdict, Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
# Fetch the NLTK data packages used later in the notebook; packages that are
# already installed are reported as up-to-date in the log output.
nltk.download('stopwords')  # word list read through stopwords.words('english')
# presumably the tokenizer data behind word_tokenize.
# NOTE(review): recent NLTK releases may need 'punkt_tab' instead — confirm for the installed version.
nltk.download('punkt')
nltk.download('wordnet')  # presumably the lexical data WordNetLemmatizer relies on — confirm


[nltk_data] Downloading package stopwords to /home/yc833/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/yc833/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yc833/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
def parse_infobox_line(line):
    """Extract the occupation text from one tab-separated infobox line.

    Every tab-separated token containing ``occupation_`` contributes the text
    after its first ``:``. Wiki markup (``[[``, ``]]`` and ``*``) is removed
    from each piece and the pieces are joined, in order, with single spaces.

    Args:
        line: One infobox record made of tab-separated ``key:value`` tokens.

    Returns:
        A set containing the single joined occupation string, or an empty set
        when the line has no usable occupation token.
    """
    occupation_parts = []
    for token in line.split('\t'):
        if 'occupation_' not in token:
            continue
        # partition() never raises: a malformed token without ':' produces an
        # empty separator and is skipped rather than failing with IndexError.
        _, separator, value = token.partition(':')
        if not separator:
            continue
        cleaned = value.strip().replace('[[', '').replace(']]', '').replace('*', '').strip()
        occupation_parts.append(cleaned)

    if not occupation_parts:
        return set()
    # All pieces form one occupation description, hence a one-element set.
    return {" ".join(occupation_parts)}


def remove_substrings(occupation_list):
    """Drop every entry that is a substring of a different entry in the list.

    The surviving entries keep their original relative order. Entries that are
    equal to each other never eliminate one another.
    """
    def contained_in_another(candidate):
        # True when some other, non-identical entry contains the candidate.
        for other in occupation_list:
            if other != candidate and candidate in other:
                return True
        return False

    return [entry for entry in occupation_list if not contained_in_another(entry)]


def filter_articles_by_occupation(dataset_dir, occupations_set, char_threshold, num_samples):
    """Collect up to ``num_samples`` article summaries for each occupation.

    The ``train/train``, ``valid/valid`` and ``test/test`` subsets below
    ``dataset_dir`` are read in that order. For each subset four files are
    used: ``.title`` (one article name per line), ``.box`` (one infobox per
    line), ``.nb`` (sentence count per article) and ``.sent`` (all sentences,
    one per line, sliced per article using the counts from ``.nb``).

    An article is assigned to every occupation from ``occupations_set`` that
    occurs as a substring of its lower-cased infobox occupation text, after
    labels contained in a longer matching label have been removed. Each
    matched occupation is filled independently, so an occupation that reaches
    its quota does not stop the same article from being stored under the
    article's other matches.

    Args:
        dataset_dir: Directory containing the subset folders.
        occupations_set: Occupation labels to search for.
        char_threshold: A summary is kept only when it is strictly longer
            than this many characters.
        num_samples: Maximum number of articles stored per occupation.

    Returns:
        A ``defaultdict(list)`` mapping occupation -> list of
        ``{'name': ..., 'summary': ...}`` dicts. Only occupations with at
        least one stored article appear as keys.
    """
    articles = defaultdict(list)

    def all_quotas_met():
        # .get() instead of articles[occ]: indexing a defaultdict inserts an
        # empty list, which would leak article-less occupations into the result.
        return all(len(articles.get(occ, ())) >= num_samples for occ in occupations_set)

    for subset in ('train/train', 'valid/valid', 'test/test'):
        prefix = os.path.join(dataset_dir, subset)
        # Explicit encoding so reads do not depend on the platform's locale
        # default; assumes the dataset files are UTF-8 — TODO confirm.
        with open(f"{prefix}.title", 'r', encoding='utf-8') as tfile, \
                open(f"{prefix}.box", 'r', encoding='utf-8') as bfile, \
                open(f"{prefix}.sent", 'r', encoding='utf-8') as sfile, \
                open(f"{prefix}.nb", 'r', encoding='utf-8') as nfile:
            title_lines = tfile.readlines()
            sent_lines = sfile.readlines()
            sentence_counts = [int(count_line.strip()) for count_line in nfile]
            # Offset of the current article's first sentence inside sent_lines.
            start_index = 0
            for i, (bline, num_sentences) in enumerate(zip(bfile, sentence_counts)):
                if all_quotas_met():
                    # Nothing left to collect, in this subset or any later one.
                    return articles
                entry_occupations = parse_infobox_line(bline)
                matched_occupations = remove_substrings([
                    o for o in occupations_set
                    if any(o in occ.lower() for occ in entry_occupations)
                ])
                if matched_occupations:
                    summary = ' '.join(sent_lines[start_index:start_index + num_sentences]).strip()
                    name = title_lines[i].strip()
                    if len(summary) > char_threshold:
                        for occ in matched_occupations:
                            # No early exit here: a full occupation must not
                            # block the article's remaining matches.
                            if len(articles.get(occ, ())) < num_samples:
                                articles[occ].append({'name': name, 'summary': summary})
                # Always advance, matched or not, to stay aligned with .sent.
                start_index += num_sentences

    return articles


In [13]:
def save_data_and_report(articles, data_output_path, report_output_path, attribute_output_path):
    """Write the collected articles and per-occupation word statistics to JSON.

    Three files are produced:

    * ``data_output_path``: per occupation, the entry count and the entries.
    * ``report_output_path``: per occupation, the entry count and the 20 most
      frequent lemmatized words as ``{word: count}`` items.
    * ``attribute_output_path``: per occupation, just those 20 words.

    Words are counted only when they are alphabetic and are not stop words
    (English NLTK stop words plus a few custom ones). The stop-word test is
    applied both before and after lemmatization so that inflected forms
    (e.g. "names" -> "name") cannot re-enter the counts as a stop word.

    Missing parent directories of the output paths are created.

    Args:
        articles: Mapping occupation -> list of dicts with a ``'summary'`` key.
        data_output_path: Destination of the full data JSON.
        report_output_path: Destination of the word-frequency report JSON.
        attribute_output_path: Destination of the attribute-list JSON.
    """
    # Stop words: the standard English list plus notebook-specific additions.
    stop_words = set(stopwords.words('english')) | {"born", "known", "also", "became", "name", "one"}
    lemmatizer = WordNetLemmatizer()

    data = {}
    report_data = {}
    attribute_data = {}

    for occupation, entries in articles.items():
        word_counter = Counter()
        for entry in entries:
            for word in word_tokenize(entry['summary']):
                lowered = word.lower()
                if lowered in stop_words or not word.isalpha():
                    continue
                lemma = lemmatizer.lemmatize(lowered)
                # Re-check after lemmatizing: the lemma itself may be a stop word.
                if lemma in stop_words:
                    continue
                word_counter[lemma] += 1

        most_common_words = word_counter.most_common(20)
        data[occupation] = {
            'count': len(entries),
            'entries': entries
        }
        report_data[occupation] = {
            'count': len(entries),
            'frequent_words': [{word: count} for word, count in most_common_words]
        }
        attribute_data[occupation] = {
            'attribute': [word for word, _ in most_common_words]
        }

    # Print a short overview, then save everything
    print(len(report_data))
    print(report_data.keys())

    outputs = (
        (data_output_path, data),
        (report_output_path, report_data),
        (attribute_output_path, attribute_data),
    )
    for path, payload in outputs:
        # Create the target directory up front so the write cannot fail on a
        # missing folder; a bare filename has no parent to create.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, indent=4)



In [14]:
# Root of the dataset; filter_articles_by_occupation reads
# <subset>/<subset>.{title,box,sent,nb} for train, valid and test beneath it.
dataset_dir = 'wikipedia-biography-dataset/wikipedia-biography-dataset'
# A summary is kept only when it is strictly longer than this many characters.
char_threshold = 200
# Upper bound on the number of articles collected per occupation.
num_samples = 100
# Output JSON files; the "_100" suffix appears to mirror num_samples, so
# update the names when changing it.
data_output_path = 'output/occupation_data_100.json'
report_output_path = 'output/occupation_report_100.json'
attribute_output_path = 'output/occupation_attribute_100.json'

# Collect the per-occupation articles, then write the data, word-frequency
# report and attribute files.
articles_by_occupation = filter_articles_by_occupation(dataset_dir, occupations_set, char_threshold, num_samples)
save_data_and_report(articles_by_occupation, data_output_path, report_output_path, attribute_output_path)

71
dict_keys(['author', 'actress', 'businesswoman', 'politician', 'composer', 'musician', 'arranger', 'film director', 'actor', 'essayist', 'writer', 'poet', 'playwright', 'songwriter', 'singer', 'travel writer', 'director', 'teacher', 'lawyer', 'screenwriter', 'record producer', 'film producer', 'architect', 'model', 'comedian', 'novelist', 'producer', 'professor', 'journalist', 'translator', 'coach', 'host', 'organist', 'rapper', 'radio host', 'entrepreneur', 'athlete', 'philanthropist', 'businessman', 'economist', 'revolutionary', 'environmentalist', 'stage', 'tv personality', 'dancer', 'educator', 'guitarist', 'entertainer', 'voice actor', 'columnist', 'narrator', 'television host', 'media executive', 'sportsman', 'broadcasting', 'film actress', 'military officer', 'pastor', 'stuntman', 'radio broadcaster', 'deejay', 'geographer', 'mountaineer', 'soloist', 'sportsperson', 'fitness instructor', 'data scientist', 'parliamentarian', 'locksmith', 'narration', 'naval surgeon'])
