In [None]:
import os
import numpy as np
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from datasets import load_dataset
from tqdm import tqdm

# Path to the Google service-account key, resolved two directories above the
# current working directory.
service_account_json_file = os.path.join('../../', 'thesis_service_account.json')

# Fail fast: add_record() needs this key file to authenticate.
if not os.path.exists(service_account_json_file):
    raise FileNotFoundError(
        f"[!] service account JSON file not found: {service_account_json_file}"
    )

# Dataset keys processed by the publishing loop at the bottom of the file.
datasets = ["clerc", "cuad", "echr_qa", "oal_qa", "obli_qa"]

def compute_length_percentiles(hf_dataset, split_name="test", column_name="citations"):
    """Summarise whitespace-delimited word counts for one column of a split.

    Each example's ``column_name`` value is handled according to its type:

    * ``str`` -- the string is a single document.
    * ``list`` -- every element contributes one document. A bare string
      element is the document itself; any other element is treated as a
      sequence whose second item (index 1) is the document text, and is
      skipped when it has fewer than two items.
    * anything else -- the example is ignored.

    Args:
        hf_dataset: Mapping from split name to an iterable of examples, where
            each example is indexable by ``column_name``.
        split_name: Split to read from ``hf_dataset``.
        column_name: Column whose documents are measured.

    Returns:
        ``None`` (after printing a notice) when no document was found.
        Otherwise a dict of builtin floats with keys ``mean``, ``p25``,
        ``p50``, ``p75``, ``p90``, ``p95`` and ``p99`` (word counts per
        document) and ``count`` (mean number of entries per example: 1 for a
        string value, the list length for a list value, including entries
        that were skipped for being too short).
    """
    word_counts = []
    entries_per_example = []
    for example in hf_dataset[split_name]:
        value = example[column_name]
        if isinstance(value, str):
            word_counts.append(len(value.split()))
            entries_per_example.append(1)
        elif isinstance(value, list):
            entries_per_example.append(len(value))
            for entry in value:
                if isinstance(entry, str):
                    # A bare string is the document itself; indexing it with
                    # [1] would pick out a single character instead.
                    word_counts.append(len(entry.split()))
                elif len(entry) > 1:
                    # The second item is the "document" text.
                    word_counts.append(len(entry[1].split()))

    if not word_counts:
        print(f"[!] No documents found in split={split_name}, column={column_name}.")
        return None

    counts = np.array(word_counts)
    percentile_points = (25, 50, 75, 90, 95, 99)
    percentile_values = np.percentile(counts, percentile_points)

    stats = {"mean": float(np.mean(counts))}
    for point, result in zip(percentile_points, percentile_values):
        stats[f"p{point}"] = float(result)
    # Cast so every value is a builtin float, like the other statistics.
    stats["count"] = float(np.mean(entries_per_example))
    return stats


def add_record(dataset, data):
    """
    Append one row of statistics for `dataset` to the "Dataset — Lengths" worksheet.

    The row starts with the upper-cased dataset name, followed by one cell per
    dotted path listed below. Each path is resolved against `data` with
    successive `.get()` calls; a missing key (or a `None` along the way)
    produces an empty (`None`) cell.

    Authorizes with the service-account key at the module-level
    `service_account_json_file`, opens the spreadsheet by URL and appends the
    row with `value_input_option="USER_ENTERED"`.
    """
    column_paths = [
        "citations.mean", "citations.p25", "citations.p50", "citations.p75", "citations.p99", "citations.count",
        "gold_text.mean", "gold_text.p25", "gold_text.p50", "gold_text.p75", "gold_text.p99",
        "count"
    ]

    def resolve(path):
        # Walk the nested mapping one path segment at a time, stopping at the first None.
        node = data
        for segment in path.split('.'):
            if node is None:
                return None
            node = node.get(segment, None)
        return node

    new_row = [dataset] + [resolve(path) for path in column_paths]
    new_row[0] = new_row[0].upper()  # e.g., OAL_QA

    scope = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive.file",
        "https://www.googleapis.com/auth/drive"
    ]
    credentials = ServiceAccountCredentials.from_json_keyfile_name(service_account_json_file, scope)
    gs_client = gspread.authorize(credentials)

    spreadsheet = gs_client.open_by_url(
        "https://docs.google.com/spreadsheets/d/1bE5AbY1hrqlR-_v-ohLCgHA6hFvRCTpH4lrRPqXm9UU/edit?usp=sharing"
    )
    target_tab = spreadsheet.worksheet("Dataset — Lengths")
    target_tab.append_row(new_row, value_input_option="USER_ENTERED")
    
# Hugging Face repository name (under the "ylkhayat" account) for each dataset key.
workshop_hf_names = {
    "clerc": "CLERC-generation-workshop",
    "echr_qa": "ECHR_QA-generation-workshop",
    "obli_qa": "OBLI_QA-generation-workshop",
    "cuad": "CUAD-generation-workshop",
    "oal_qa": "OAL_QA-generation-workshop",
}

# Statistics copied into the published record for each column.
citations_stat_keys = ("mean", "p25", "p50", "p75", "p99", "count")
gold_text_stat_keys = ("mean", "p25", "p50", "p75", "p99")

# For every dataset: load it, measure both text columns on the test split,
# and append the resulting statistics as one spreadsheet row.
for dataset in tqdm(datasets):
    if dataset not in workshop_hf_names:
        raise ValueError(f"Invalid dataset {dataset}")
    workshop_hf_name = workshop_hf_names[dataset]

    current_chosen_dataset = load_dataset(f"ylkhayat/{workshop_hf_name}", data_dir="data")
    split_name = "test"

    citations_percentiles = compute_length_percentiles(
        current_chosen_dataset, split_name=split_name, column_name="citations"
    )
    gold_text_percentiles = compute_length_percentiles(
        current_chosen_dataset, split_name=split_name, column_name="gold_text"
    )

    # A falsy result (no documents found) yields None for every statistic.
    data_to_publish = {
        "mean": citations_percentiles["mean"] if citations_percentiles else None,
        "citations": {
            stat: (citations_percentiles[stat] if citations_percentiles else None)
            for stat in citations_stat_keys
        },
        "gold_text": {
            stat: (gold_text_percentiles[stat] if gold_text_percentiles else None)
            for stat in gold_text_stat_keys
        },
        "count": len(current_chosen_dataset[split_name]),
    }
    add_record(dataset, data_to_publish)


