In [25]:
import json
import os
from collections import defaultdict

# Configuration for the top-N aggregation step.
# Root directory; the loop below reads one sub-directory per language code
# (input_directory + "/" + lang).
input_directory = "/scratch/project_462000353/tlundber/tld-aggregator/tld-aggregator-results"
# Directory that receives one "<lang>.json" summary file per language.
top_output_dir = "/scratch/project_462000353/tlundber/tld-aggregator/selected_results"
# Language codes to process; each is used both as an input sub-directory name
# and as the stem of the output file name.
languages = ['spa_Latn', 'fra_Latn', 'hin_Deva', 'tha_Thai', 'swe_Latn']
top_n = 10  # Number of top TLDs to include
# Create the output directory up front; exist_ok=True makes re-running this cell safe.
os.makedirs(top_output_dir, exist_ok=True)

def sum_and_sort_distribution(input_directory, output_file, top_n):
    """Sum per-key counts across JSON files and write the top-N keys as JSON.

    Every file under ``input_directory`` (searched recursively) whose name ends
    with ".json" is loaded, except files whose name contains "checkpoint"
    (case-insensitive; presumably Jupyter autosave copies). Each file must hold
    a JSON object mapping keys to numeric counts. Counts for the same key are
    added together across all files.

    Args:
        input_directory: Directory to search. A missing or unreadable directory
            silently produces an empty result, because ``os.walk`` ignores
            listing errors by default.
        output_file: Path of the JSON file to write; overwritten if it exists.
        top_n: Number of highest-count keys to keep. ``None`` keeps every key.

    Returns:
        None. The result is written to ``output_file`` as a JSON object whose
        keys appear in descending order of summed count.
    """
    master_table = defaultdict(int)  # key -> summed count over all files

    for root, _dirs, files in os.walk(input_directory):
        for file_name in files:
            # Guard clause: only real result files, never checkpoint copies.
            if not file_name.endswith(".json") or "checkpoint" in file_name.lower():
                continue
            file_path = os.path.join(root, file_name)
            # JSON text is UTF-8; do not depend on the platform's locale encoding.
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            for key, value in data.items():
                master_table[key] += value

    # Rank by descending count and break ties by key. Without the tie-break the
    # keys that survive the top-N cut would depend on the order in which
    # os.walk happened to list the files, which is not guaranteed.
    ranked = sorted(master_table.items(), key=lambda item: (-item[1], item[0]))
    sorted_master_table = dict(ranked[:top_n])

    # Dicts preserve insertion order, so the file lists keys from most to least frequent.
    with open(output_file, 'w', encoding='utf-8') as output:
        json.dump(sorted_master_table, output, indent=4)

# Build one top-N summary file per configured language
for lang in languages:
    # Destination: "<top_output_dir>/<lang>.json"
    output_file = os.path.join(top_output_dir, lang + ".json")
    # Source: the language's own sub-directory below the results root
    lang_input_dir = f"{input_directory}/{lang}"
    sum_and_sort_distribution(lang_input_dir, output_file, top_n)


In [26]:
# Configuration for the relative-frequency step.
# This reassigns input_directory, replacing the value set in the previous cell.
# NOTE(review): this reads "selected_results-top100", whereas the previous cell
# writes to "selected_results" — presumably produced by a separate run; confirm.
input_directory = "/scratch/project_462000353/tlundber/tld-aggregator/selected_results-top100"
# Directory that receives one relative-frequency JSON file per input file.
output_directory = "/scratch/project_462000353/tlundber/tld-aggregator/relative_frequencies"

# Ensure the output directory exists (exist_ok=True makes re-running this cell safe)
os.makedirs(output_directory, exist_ok=True)

def calculate_relative_frequencies(input_directory, output_directory):
    """Convert each JSON count table under a directory into relative frequencies.

    Every file under ``input_directory`` (searched recursively) whose name ends
    with ".json" is processed, except files whose name contains "checkpoint"
    (case-insensitive). Each file must hold a JSON object mapping keys to
    numeric counts. Each count is divided by the sum of all counts in the same
    file, and the result is written to ``output_directory`` under the same file
    name. One "Processed ..." line is printed per file.

    Output is flat: only the base file name is kept, so two input files with
    the same name in different sub-directories overwrite one another.

    Args:
        input_directory: Directory to search for count files.
        output_directory: Existing directory that receives the result files.

    Returns:
        None.
    """
    for root, _dirs, files in os.walk(input_directory):
        for file_name in files:
            # Guard clause: only real result files, never checkpoint copies.
            if not file_name.endswith(".json") or "checkpoint" in file_name.lower():
                continue

            input_file_path = os.path.join(root, file_name)
            output_file_path = os.path.join(output_directory, file_name)

            # JSON text is UTF-8; do not depend on the platform's locale encoding.
            with open(input_file_path, 'r', encoding='utf-8') as input_file:
                frequency_data = json.load(input_file)

            total_frequency = sum(frequency_data.values())

            if total_frequency:
                relative_frequencies = {
                    key: value / total_frequency
                    for key, value in frequency_data.items()
                }
            else:
                # All counts are zero (or the table is empty): report 0.0 for
                # every key instead of raising ZeroDivisionError.
                relative_frequencies = {key: 0.0 for key in frequency_data}

            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                json.dump(relative_frequencies, output_file, indent=4)

            print(f"Processed {file_name} -> {output_file_path}")

# Write a relative-frequency file into output_directory for every JSON file under input_directory
calculate_relative_frequencies(input_directory, output_directory)

Processed hin_Deva.json -> /scratch/project_462000353/tlundber/tld-aggregator/relative_frequencies/hin_Deva.json
Processed tha_Thai.json -> /scratch/project_462000353/tlundber/tld-aggregator/relative_frequencies/tha_Thai.json
Processed swe_Latn.json -> /scratch/project_462000353/tlundber/tld-aggregator/relative_frequencies/swe_Latn.json
Processed spa_Latn.json -> /scratch/project_462000353/tlundber/tld-aggregator/relative_frequencies/spa_Latn.json
Processed fra_Latn.json -> /scratch/project_462000353/tlundber/tld-aggregator/relative_frequencies/fra_Latn.json
