In [None]:
import pandas as pd
import os
import requests
import tarfile

# Download source and on-disk locations used for the FLORES dataset.
DOWNLOAD_URL = "https://tinyurl.com/flores200dataset"
DATA_DIR = "../../data"
# temporary archive written during download
TAR_PATH = os.path.join(DATA_DIR, "flores200.tar.gz")
# directory whose presence marks the dataset as already downloaded
FLORES_DIR = os.path.join(DATA_DIR, "flores200_dataset")

# check if flores is already downloaded, otherwise download and extract it
def check_flores_exists():
    """Make sure the FLORES dataset directory exists, downloading it if needed.

    If ``FLORES_DIR`` already exists nothing happens. Otherwise the archive at
    ``DOWNLOAD_URL`` is streamed to ``TAR_PATH``, extracted into ``DATA_DIR``
    and the temporary archive is deleted again (also when a step fails).

    Raises:
        Exception: if the server answers with a status code other than 200.
    """
    if os.path.exists(FLORES_DIR):
        return

    print("FLORES dataset not found. Downloading...")

    # create data directory if it doesn't exist
    os.makedirs(DATA_DIR, exist_ok=True)

    try:
        # (connect, read) timeouts so a stalled server cannot hang the notebook;
        # the context manager releases the connection when the block is left
        with requests.get(DOWNLOAD_URL, stream=True, timeout=(10, 60)) as response:
            if response.status_code != 200:
                raise Exception(f"Failed to download dataset. Status code: {response.status_code}")

            # write the body chunk by chunk instead of holding the whole
            # archive in memory
            with open(TAR_PATH, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        print("Download complete. Extracting...")

        with tarfile.open(TAR_PATH, "r:gz") as tar:
            # The archive comes from the network: use the "data" extraction
            # filter where this Python version provides it, so members cannot
            # be written outside DATA_DIR.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(path=DATA_DIR, filter="data")
            else:
                tar.extractall(path=DATA_DIR)
    finally:
        # remove tar file, even if the download or extraction failed
        if os.path.exists(TAR_PATH):
            os.remove(TAR_PATH)
    print("Extraction finished.")

# make sure the dataset is on disk before reading from it
check_flores_exists()

# load english-dutch sentence pairs into a dataframe
dev_dir = os.path.join(FLORES_DIR, "dev")
eng_path = os.path.join(dev_dir, "eng_Latn.dev")
nld_path = os.path.join(dev_dir, "nld_Latn.dev")

# each line of a file becomes one entry, with surrounding whitespace stripped
with open(eng_path, "r", encoding="utf-8") as eng_file:
    english_sentences = list(map(str.strip, eng_file))

with open(nld_path, "r", encoding="utf-8") as nld_file:
    dutch_sentences = list(map(str.strip, nld_file))

# rows are paired by position: line i of one file sits next to line i of the other
sentence_columns = {'english': english_sentences, 'dutch': dutch_sentences}
flores_df = pd.DataFrame(sentence_columns)

print(f"Success! Loaded {len(flores_df)} sentence pairs.")
print(flores_df.head())

In [None]:
# preview the first five English sentences
flores_df["english"].iloc[:5]

In [None]:
import datasets
from dotenv import load_dotenv
# load variables from a .env file first, then read the Hugging Face token
# from the environment (None if HF_TOKEN is not set)
load_dotenv()
access_token = os.getenv("HF_TOKEN")

# load dutch-english sentence pairs from bouquet dataset
# (only the `datasets` module itself is imported, so the loader is called through it)
bouquet = datasets.load_dataset("facebook/bouquet", "nld_Latn", token=access_token)

# convert to pandas df
bouquet_df = bouquet["dev"].to_pandas()

# only keep relevant columns
bouquet_df = bouquet_df[['src_text', 'tgt_text']].rename(
    columns={
        'src_text': 'dutch',
        'tgt_text': 'english'
    }
)

print(bouquet_df.head())

In [None]:
# preview the first five English sentences
bouquet_df["english"].iloc[:5]