In [1]:
import pandas as pd
import os
import requests
import tarfile

# set paths for dataset files
DATA_DIR = "../../data"
FLORES_DIR = os.path.join(DATA_DIR, "flores200_dataset")
DOWNLOAD_URL = "https://tinyurl.com/flores200dataset"
TAR_PATH = os.path.join(DATA_DIR, "flores200.tar.gz")

# check if flores is already downloaded, otherwise download and extract it
def check_flores_exists():
    
    if not os.path.exists(FLORES_DIR):
        print("FLORES dataset not found. Downloading...")
        
        # create data directory if it doesn't exist
        os.makedirs(DATA_DIR, exist_ok=True)
        
        # download and extract flores dataset
        response = requests.get(DOWNLOAD_URL, stream=True)
        if response.status_code == 200:
            with open(TAR_PATH, 'wb') as f:
                f.write(response.raw.read())
            print("Download complete. Extracting...")
            
            with tarfile.open(TAR_PATH, "r:gz") as tar:
                tar.extractall(path=DATA_DIR)
            
            # remove tar file
            os.remove(TAR_PATH)
            print("Extraction finished.")
        else:
            raise Exception(f"Failed to download dataset. Status code: {response.status_code}")

check_flores_exists()

# load english-dutch sentence pairs into a dataframe
eng_path = os.path.join(FLORES_DIR, "dev", "eng_Latn.dev")
nld_path = os.path.join(FLORES_DIR, "dev", "nld_Latn.dev")

with open(eng_path, "r", encoding="utf-8") as f:
    english_sentences = [line.strip() for line in f]

with open(nld_path, "r", encoding="utf-8") as f:
    dutch_sentences = [line.strip() for line in f]

flores_df = pd.DataFrame({
    'english': english_sentences,
    'dutch': dutch_sentences
})

print(f"Success! Loaded {len(flores_df)} sentence pairs.")
print(flores_df.head())

Success! Loaded 997 sentence pairs.
                                             english  \
0  On Monday, scientists from the Stanford Univer...   
1  Lead researchers say this may bring early dete...   
2  The JAS 39C Gripen crashed onto a runway at ar...   
3  The pilot was identified as Squadron Leader Di...   
4  Local media reports an airport fire vehicle ro...   

                                               dutch  
0  Op maandag kondigden wetenschappers van de Sta...  
1  Hoofdonderzoekers zeggen dat dit kan leiden to...  
2  De JAS 39C Gripen stortte rond 09.30 uur lokal...  
3  De piloot werd geïdentificeerd als majoor Dilo...  
4  De lokale media meldt dat er tijdens een actie...  


In [2]:
flores_df['english'].iloc[0:5]

0    On Monday, scientists from the Stanford Univer...
1    Lead researchers say this may bring early dete...
2    The JAS 39C Gripen crashed onto a runway at ar...
3    The pilot was identified as Squadron Leader Di...
4    Local media reports an airport fire vehicle ro...
Name: english, dtype: object

In [3]:
from datasets import load_dataset
from dotenv import load_dotenv
access_token = os.getenv("HF_TOKEN")
load_dotenv()
access_token = os.getenv("HF_TOKEN")

# load dutch-english sentence pairs from bouquet dataset
bouquet = load_dataset("facebook/bouquet", "nld_Latn", token=access_token)

# convert to pandas df
bouquet_df = bouquet["dev"].to_pandas()

# only keep relevant columns
bouquet_df = bouquet_df[['src_text', 'tgt_text']].rename(
    columns={
        'src_text': 'dutch',
        'tgt_text': 'english'
    }
)

print(bouquet_df.head())

  from .autonotebook import tqdm as notebook_tqdm
Resolving data files: 100%|██████████| 108/108 [00:00<00:00, 289.65it/s]
Resolving data files: 100%|██████████| 108/108 [00:00<00:00, 222.23it/s]


                                               dutch  \
0  Het recept voor mahshi dolma varieert van land...   
1  In Egypte wordt het meestal gevuld met rijst e...   
2          In de Levant vullen ze het ook met vlees.   
3     Manieren om van negatieve energie af te komen.   
4  Buiten wandelen, al is het maar 10 minuten per...   

                                             english  
0  Mahshi "dolma" recipe varies a lot from one co...  
1  In Egypt usually it's stuffed with rice and ve...  
2     In the levant they stuff it with meat as well.  
3                Ways to get rid of negative energy.  
4  Walking in the open air, even if it's for 10 m...  


In [4]:
bouquet_df['english'].iloc[0:5]

0    Mahshi "dolma" recipe varies a lot from one co...
1    In Egypt usually it's stuffed with rice and ve...
2       In the levant they stuff it with meat as well.
3                  Ways to get rid of negative energy.
4    Walking in the open air, even if it's for 10 m...
Name: english, dtype: object

In [5]:
from datasets import load_dataset
ds = load_dataset("allenai/madlad-400", "nl", split="clean", streaming=True)
limit=100
sources = []
for i, example in enumerate(ds):
    if i >= limit:
        break
    sources.append(example['text'])
    
targets = ["Monolingual - No Target"] * len(sources)

KeyboardInterrupt: 

In [9]:
from datasets import load_dataset

# We point directly to the parquet files for the Dutch (nl) clean split
# This is much more stable than relying on the community script
ds = load_dataset(
    "allenai/madlad-400", 
    data_files="data/nl/clean/*.parquet", 
    split="train", 
    streaming=True
)

# Take the first 100 rows
sources = [x["text"] for x in ds.take(100)]
targets = ["Monolingual - No Target"] * len(sources)

print(f"Success! Loaded {len(sources)} sources.")
print("Sample text:", sources[0][:100], "...")

DataFilesNotFoundError: No (supported) data files found in allenai/madlad-400