In [2]:
import json
import random
from collections import OrderedDict

with open('data.json', 'r') as file:
    data = json.load(file)

# store documents grouped by 'genre' value
genre_documents = {}
# iterate through each dict
for item in data:
    # retreive genre value
    genre = item.get('genre')
    if genre:
        if genre not in genre_documents:
            # initalize empty dict
            genre_documents[genre] = OrderedDict()
        # get doc_id value 
        document_id = item.get('document_id')
        # create empty list for each new doc_id value
        if document_id not in genre_documents[genre]:
            genre_documents[genre][document_id] = []
        # append current dict to list w/in genre
        genre_documents[genre][document_id].append(item)

# Randomly select 25 documents for each genre (while preserving internal ordering)
selected_documents = {}
for genre in ['general', 'wiki', 'transcript']:
    if genre in genre_documents:
        # randomly sample 25 docs (lists) from each genre
        selected_documents[genre] = random.sample(list(genre_documents[genre].values()), 25)

for genre, documents in selected_documents.items():
    if len(documents) != 25:
        print(f"warning: less than 25 docs!")

# create flat list
selected_documents_list = []
for genre_list in selected_documents.values():
    for document_list in genre_list:
        for item in document_list:
            selected_documents_list.append(item)





In [6]:
# export to json
import json

json_file_path = 'balanced_data.json'
with open(json_file_path, 'w') as json_file:
    json.dump(selected_documents_list, json_file)  

In [1]:
# get legal data 
with open('data.json', 'r') as file:
    data = json.load(file)

# Store dictionaries for genres other than 'general', 'wiki', 'transcript'
selected_documents = OrderedDict()

# Store document_ids for genres other than 'general', 'wiki', 'transcript'
document_ids_set = set()

# Iterate through each dict
for item in data:
    # Retrieve genre value
    genre = item.get('genre')
    if genre and genre not in ['general', 'wiki', 'transcript']:
        # Get doc_id value and add to the set
        document_id = item.get('document_id')
        document_ids_set.add(document_id)
        selected_documents[document_id] = item

# random sample 250 unique sets of document_ids
document_ids_list = list(document_ids_set)
selected_document_id_sets = random.sample(document_ids_list, min(250, len(document_ids_list)))

# Retrieve the corresponding dictionaries for the selected document_ids, preserving order
selected_documents_list = [selected_documents[doc_id] for doc_id in selected_document_id_sets]

# Print the selected documents
for document in selected_documents_list:
    print(document)




{'Unnamed: 0': 6455032, 'genre': 11, 'document_id': '“Business User”', 'sentence': 'shall mean all Users of the Business web site(s) and services.', 'boundary': 'yes'}
{'Unnamed: 0': 3609944, 'genre': 377, 'document_id': 'BBQ Haiku Sweepstakes', 'sentence': 'Employees and officers of Friendly’s and its franchisees, their parents, subsidiaries, and affiliates, and their immediate families and persons living in the same household are also ineligible to participate.', 'boundary': 'yes'}
{'Unnamed: 0': 6170846, 'genre': 311, 'document_id': 'Passports and Visas: Responsibility of Passenger', 'sentence': "The Carrier may apply towards such payment or expenditure the value of any carriage unused by the Passenger, or any funds in the Carrier's possession.", 'boundary': 'yes'}
{'Unnamed: 0': 2347630, 'genre': 112, 'document_id': 'DISCLAIMER OF SERVICES AND WEBSITE', 'sentence': 'The Company may discontinue coverage of any security, sector, industry, company, topic, market or event at any time, 

In [3]:
# export to json
json_file_path = 'balanced_data_legal.json'
with open(json_file_path, 'w') as json_file:
    json.dump(selected_documents_list, json_file)  

In [6]:
# modify format of transcript + general + wiki data 
with open('balanced_data.json', 'r') as file:
    data = json.load(file)

for dictionary in data:
    dictionary.pop("Unnamed: 0", None)

json_file_path = 'balanced_data_new.json'
with open(json_file_path, 'w') as json_file:
    json.dump(data, json_file) 


In [9]:
# modify format of legal data to match that of the transcript + general + wiki data 
with open('balanced_data_legal.json', 'r') as file:
    data = json.load(file)

# mapping between original IDs and new numbers --> creating the string (legal + new #)
id_mapping = {original_id: 'legal' + str(index + 1) for index, original_id in enumerate(set(entry["document_id"] for entry in data))}

for dictionary in data:
    # delete key-value pair
    dictionary.pop("Unnamed: 0", None)
    # change genre to legal
    dictionary["genre"] = "legal"
    # convert document_id
    dictionary["document_id"] = id_mapping[dictionary["document_id"]]


json_file_path = 'balanced_data_legal_new.json'
with open(json_file_path, 'w') as json_file:
    json.dump(data, json_file) 

In [12]:
# combine legal and transcript + general + wiki data into one json file with unfiform format 
import json

with open('balanced_data_new.json', 'r') as file:
    data1 = json.load(file)
with open('balanced_data_legal_new.json', 'r') as file:
    data2 = json.load(file)

all_data = data1 + data2  

with open('all_balanced_data.json', 'w') as file:
    json.dump(all_data, file)
