Part 1

In [2]:
import requests, random
import gzip, json
import pandas as pd
from io import BytesIO

In [None]:
#2 step randomization (option1 discussed inclass)
#   2.1 - download the full list of file urls published for dolma
#   2.2 - filter out urls containing unsafe files (refinedweb ones)
#   2.3 - true random sampling of 15 urls to extract tokens from (next cell)

#full list of Dolma dataset files is here...
dolma_url_list = "https://huggingface.co/datasets/allenai/dolma/raw/main/urls/v1_7.txt"

#download url list via python request (timeout so a stalled connection cannot hang the cell)
resp = requests.get(dolma_url_list, timeout=30)
resp.raise_for_status() #raise on 404/500 so an error page is never mistaken for a list of urls

#break list into separate urls like [ 'https://url1', 'https://url2', ..., 'https://urln'],
#dropping blank lines so they can never be picked by the sampler
all_urls = [line.strip() for line in resp.text.splitlines() if line.strip()]

#filter out unsafe or unwanted sources
safe_urls = [u for u in all_urls if "falcon-refinedweb" not in u]
if not safe_urls:
    raise ValueError("no usable Dolma urls were found in the downloaded list")

#randomly sample 15 (or every remaining url when fewer than 15 survive the filter)
sampled_urls = random.sample(safe_urls, min(15, len(safe_urls)))
# print("Selected urls:\n")
# for u in sampled_urls:
#     print(u)


In [None]:
#2 step randomization (option1 discussed inclass)
#   2.5 - reading the same number of tokens from each file akin to RR (round robin)
#   2.6 - tag to record reading progress per file
#   2.7 - loop runs until the token_target is reached or every file is used up
token_target = 300000
token_count = 0
samples = []

if not sampled_urls:
    raise ValueError("sampled_urls is empty - nothing to read tokens from")

#same amount of tokens to be grabbed from each sampled url
#(at least 1, so a target smaller than the number of files cannot give a zero share)
tokens_per_url = max(1, token_target // len(sampled_urls))

#tag to record reading progress per file (number of lines already consumed)
progress_tracker = {url: 0 for url in sampled_urls}

#files that may still hold unread lines; duplicates dropped, original order kept
active_urls = list(dict.fromkeys(sampled_urls))

#share of tokens each file contributes during the current pass
pass_share = tokens_per_url

#loop runs until the token_target is reached or no file has lines left
while token_count < token_target and active_urls:
    pass_start = token_count
    remaining_urls = []

    for position, url in enumerate(active_urls):
        #cumulative limit: the file at `position` stops once the running total reaches
        #its slot, so an overshoot by one file is absorbed by the next one
        pass_limit = pass_start + (position + 1) * pass_share
        file_exhausted = False

        print(f"\nReading from: {url}")
        #stream the archive so only the part actually needed is downloaded;
        #the `with` closes the connection even when we stop early
        with requests.get(url, stream=True, timeout=60) as resp:
            resp.raise_for_status() #raise server side errors such as 404/500!
            resp.raw.decode_content = True  #undo any transport-level compression first

            with gzip.open(resp.raw, "rt", encoding="utf-8") as f:
                for i, line in enumerate(f):
                    if i < progress_tracker[url]:
                        continue  #skip lines already processed in an earlier pass

                    progress_tracker[url] = i + 1  #update position (we read upto) IN FILE

                    try:
                        obj = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    if not isinstance(obj, dict):
                        continue  #a row that is not a json object has no text field

                    text = obj.get("text") or ""  #a null text counts as empty
                    metadata = obj.get("metadata", {})

                    metadata_url = None

                    #pull out the url field from the metadata dictionary
                    if isinstance(metadata, dict):
                        metadata_url = metadata.get("url", None)

                    tokens = len(text.split())
                    token_count += tokens

                    #grab only the text + url fields from each row
                    samples.append({
                        "text": text,
                        "url": metadata_url
                    })

                    #stop reading this file once the overall target or its share is reached
                    #- 1 file cannot dominate the token sampling
                    if token_count >= token_target or token_count >= pass_limit:
                        break
                else:
                    #the loop ran off the end of the file: nothing left to read here
                    file_exhausted = True

        if not file_exhausted:
            remaining_urls.append(url)

        if token_count >= token_target:
            break

    #only files that still have unread lines take part in the next pass, and the
    #tokens still missing are split evenly (rounded up) between them
    active_urls = remaining_urls
    if active_urls and token_count < token_target:
        pass_share = max(1, -(-(token_target - token_count) // len(active_urls)))

if token_count < token_target:
    print(f"\nAll sampled files were used up before reaching {token_target} tokens.")

print(f"\nCollected {len(samples)} text samples from {token_count} tokens!")

Testing

In [None]:
#dump every collected record, numbered from 1, for a quick visual check
print("\nExtracted text samples\n")
sample_number = 0
for record in samples:
    sample_number += 1
    print(f"sample #{sample_number}:\n{record}\n\n")

Part 2

In [11]:
#sampled dolma text - use to answer Part 2 questions
#one row per collected record, written to a spreadsheet without the index column
excel_path = "dolma_samples_part2.xlsx"
df = pd.DataFrame(data=samples)
df.to_excel(excel_writer=excel_path, index=False)