### calculate number of tokens to use for each domain in our subset

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
#arguments - total token budget, domain of interest, and the share of the budget it receives
num_tokens_total = 20_000_000_000
doi = "mathematics" #domain of interest
doi_fraction = 0.1 #fraction of the total budget reserved for the domain of interest

if not 0 <= doi_fraction <= 1:
    raise ValueError(f"doi_fraction must be in [0, 1], got {doi_fraction}")

#split the budget: the other domains get the remainder, derived from the same fraction
#rather than from a second hard-coded number that has to be kept in sync by hand
num_tokens_doi_in_subset = doi_fraction * num_tokens_total #number of tokens for domain of interest
num_tokens_other_in_subset = num_tokens_total - num_tokens_doi_in_subset #number of tokens for other domains


#### load domain data

In [3]:
#read the per-domain info table; lines=True parses the file as one JSON record per line
domain_info_path = "evolm/common/scripts/ffw_sample_domain_info.json"
domain_data = pd.read_json(domain_info_path, lines=True)

In [4]:
#sum of the n_tokens_llama2 column over every row of the domain table, reported in billions
n_tokens_total_in_ffwsample_dataset = domain_data.loc[:, 'n_tokens_llama2'].sum()
print(f"Number of tokens in FineFineWeb-sample dataset: {n_tokens_total_in_ffwsample_dataset / 1e9} billion")

Number of tokens in FineFineWeb-sample dataset: 226.371480036 billion


#### calculate number of tokens for each domain -- to use in our subset

In [5]:
### record number of tokens for each domain
num_tokens_dict = {}

#fail fast if the domain of interest is not in the table: otherwise it would still be given
#a budget below while every row of the table is treated as a non-DOI domain
if not (domain_data['domain'] == doi).any():
    raise ValueError(f"domain of interest {doi!r} not found in domain_data['domain']")

#number of tokens for domain of interest -- based on arguments
num_tokens_dict[doi] = int(num_tokens_doi_in_subset)

#compute total number of tokens in FFW dataset for non-DOI domains
domain_data_nondoi = domain_data[domain_data['domain'] != doi]
num_tokens_other_in_ffw = domain_data_nondoi['n_tokens_llama2'].sum()

#number of tokens for each non-DOI domain
#   = num_tokens_other_in_subset * (this domain's tokens / total non-DOI tokens in FFW),
#rounded up to the nearest integer; computed for all rows at once instead of row by row.
#Because every domain is rounded up, the non-DOI total can exceed num_tokens_other_in_subset
#by a few tokens (less than one per domain).
num_tokens_nondoi = np.ceil(
    num_tokens_other_in_subset * domain_data_nondoi['n_tokens_llama2'] / num_tokens_other_in_ffw
)

#record info (converted to plain Python ints so the dict can later be written with json.dump)
for domain, num_tokens_domain in zip(domain_data_nondoi['domain'], num_tokens_nondoi):
    num_tokens_dict[domain] = int(num_tokens_domain)

In [6]:
#check tokens: recompute the DOI / non-DOI split from what was recorded in num_tokens_dict
num_tokens_doi_check = num_tokens_dict[doi]
num_tokens_other_check = sum(
    n_tokens for name, n_tokens in num_tokens_dict.items() if name != doi
)

#totals actually recorded (non-DOI counts were rounded up per domain)
for label, n_tokens in (
    ('num_tokens_doi, check: ', num_tokens_doi_check),
    ('num_tokens_other, check: ', num_tokens_other_check),
    ('num_tokens_total, check: ', num_tokens_doi_check+num_tokens_other_check),
):
    print(label, n_tokens / 1e9, 'B')

print('')

#totals requested by the arguments, for comparison
for label, n_tokens in (
    ('num_tokens_doi, desired: ', num_tokens_doi_in_subset),
    ('num_tokens_other, desired: ', num_tokens_other_in_subset),
    ('num_tokens_total, desired: ', num_tokens_doi_in_subset+num_tokens_other_in_subset),
):
    print(label, n_tokens / 1e9, 'B')

num_tokens_doi, check:  2.0 B
num_tokens_other, check:  18.000000028 B
num_tokens_total, check:  20.000000028 B

num_tokens_doi, desired:  2.0 B
num_tokens_other, desired:  18.0 B
num_tokens_total, desired:  20.0 B


In [7]:
#check that all domains have enough tokens
    #e.g. especially small domains and domain of interest

# small_domains = ['atmospheric_science', 'landscape_architecture', 'ocean_science', 'petroleum_and_natural_gas_engineering', 'topicality', 'weapons_science']

#check every domain that was given a budget in num_tokens_dict, rather than a hand-maintained
#list of names, so no budgeted domain can be left unchecked
domains_all = sorted(num_tokens_dict)

#tokens available per domain according to the domain table (as plain Python numbers)
n_tokens_available = dict(zip(domain_data['domain'].tolist(), domain_data['n_tokens_llama2'].tolist()))

print("DOMAINS WITH INSUFFICIENT TOKENS:")
for domain in domains_all:
    #a budgeted domain with no row in the table has nothing to sample from: count it as 0 available
    n_tokens_in_domain = n_tokens_available.get(domain, 0)
    n_tokens_desired = num_tokens_dict[domain]

    if n_tokens_desired > n_tokens_in_domain:
        print(domain)
        print('  # tokens in FFW-sample dataset: ', n_tokens_in_domain / 1e9, 'B')
        print('  # tokens desired: ', n_tokens_desired / 1e9, 'B')
        print('')

#note in first run, mathematics from finefineweb-sample did not have sufficient tokens
#needed to download mathematics from finefineweb (full dataset), put in finefineweb-sample/mathematics
#then re-ran this script

DOMAINS WITH INSUFFICIENT TOKENS:


In [8]:
#total budget in whole billions of tokens (int() drops any fractional part)
TOKENS_PER_BILLION = 1e9
num_tokens_total_B = int(num_tokens_total / TOKENS_PER_BILLION)

In [9]:
#write the per-domain token budgets to a JSON file whose name carries the total budget in billions
num_tokens_json_path = f"evolm/common/scripts/ffw_sample_num_tokens_llama2_for_mysubset_{num_tokens_total_B}BT.json"
with open(num_tokens_json_path, "w") as f:
    json.dump(num_tokens_dict, f, indent=4)  # indent=4 pretty-prints the file so it is readable