<a href="https://colab.research.google.com/github/ymoslem/datasets/blob/main/LongContextQE/Long-Context-MT-QE-WMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip3 install datasets polars -q

In [None]:
from datasets import load_dataset

# Fetch the sentence-level WMT DA human-evaluation corpus (train and test splits)
dataset = load_dataset("ymoslem/wmt-da-human-evaluation")

# Show the splits, their columns and their row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['lp', 'src', 'mt', 'ref', 'score', 'raw', 'annotators', 'domain', 'year'],
        num_rows: 1275190
    })
    test: Dataset({
        features: ['lp', 'src', 'mt', 'ref', 'score', 'raw', 'annotators', 'domain', 'year'],
        num_rows: 10000
    })
})

In [None]:
# Convert both splits to Polars DataFrames
df_train, df_test = (dataset[split].to_polars() for split in ("train", "test"))

# Report the split sizes and the distinct domains present in the training split
print(f"Train: {df_train.shape[0]:,} Test: {df_test.shape[0]:,}")
print(f"Domains: {df_train['domain'].unique().to_list()}")

Train: 1,275,190 Test: 10,000
Domains: ['ecommerce', 'social', 'other', 'wiki', 'conversation', 'news']


In [None]:
import polars as pl
from random import randrange
from tqdm.auto import tqdm


def pair_sentences(df, seed=None):
  """Pair every row with a random partner from the same language pair and domain.

  The frame is shuffled, the original and the shuffled copies are each collected
  into one list per (lp, domain) group, and the two sets of lists are exploded
  side by side so that the i-th row of a group is matched with the i-th row of
  the shuffled group. Rows matched with themselves are dropped, so the result
  can contain fewer rows than the input.

  Args:
    df: Polars DataFrame with at least the columns lp, domain, src, mt, ref,
      raw and year. Any other columns are ignored in the output.
    seed: Optional integer seed for the shuffle. None (default) keeps the
      previous behaviour of a different random pairing on every call.

  Returns:
    Polars DataFrame with the columns lp, src, mt, ref, raw, domain, year.
    src/mt/ref hold the two texts of the pair joined by a single space, raw is
    the average of the two raw scores weighted by segment length in characters
    (source + translation), and year comes from the first row of the pair.
  """
  group_columns = ["lp", "domain"]

  # The row index lets us recognise (and later discard) self-pairs
  df1 = df.with_row_index()
  df2 = df1.sample(fraction=1, shuffle=True, seed=seed)

  # Collect every column into one list per (language pair, domain) group
  df1_groups = df1.group_by(group_columns).agg(pl.all()).sort(group_columns)
  df2_groups = df2.group_by(group_columns).agg(pl.all()).sort(group_columns)

  df_joined = df1_groups.join(df2_groups,
                              on=group_columns,
                              how="left",
                              suffix="_pair")

  # Turn the parallel lists back into rows: original row next to its partner
  df_paired = df_joined.explode(pl.exclude(group_columns))

  # Drop rows that were paired with themselves
  df_paired = df_paired.filter(pl.col("index") != pl.col("index_pair"))

  # Combine the sentences in each pair
  df_paired = df_paired.with_columns([
      (pl.col("src") + " " + pl.col("src_pair")).alias("src_doc"),
      (pl.col("ref") + " " + pl.col("ref_pair")).alias("ref_doc"),
      (pl.col("mt") + " " + pl.col("mt_pair")).alias("mt_doc"),
  ])

  char_count_1 = pl.col("char_count_1")
  char_count_2 = pl.col("char_count_2")
  total_chars = char_count_1 + char_count_2

  df_scored = df_paired.with_columns(
      # Segment length in characters (source + translation) is the weight
      (
          pl.col("src").str.len_chars() + pl.col("mt").str.len_chars()
      ).alias("char_count_1"),
      (
          pl.col("src_pair").str.len_chars() + pl.col("mt_pair").str.len_chars()
      ).alias("char_count_2")
  ).with_columns(
      # Length-weighted average of the two scores. When the combined length is
      # zero the weighted form would be 0/0 (NaN), so fall back to the plain mean.
      pl.when(total_chars > 0)
      .then(
          (char_count_1 * pl.col("raw") + char_count_2 * pl.col("raw_pair"))
          / total_chars
      )
      .otherwise((pl.col("raw") + pl.col("raw_pair")) / 2)
      .alias("raw_doc")
  )

  # Keep only the combined columns and give them back their original names
  columns_to_keep = ['lp','src_doc', 'mt_doc', 'ref_doc', 'raw_doc', 'domain', 'year']
  df_scored = df_scored.select(columns_to_keep)
  df_scored = df_scored.rename({"src_doc": "src",
                                "mt_doc": "mt",
                                "ref_doc": "ref",
                                "raw_doc": "raw"})

  return df_scored


def process(df_scored, iterations, seed=None):
  """Build a multi-length dataset by repeatedly pairing rows with `pair_sentences`.

  The input rows are kept as the 1-sentence portion. Each round pairs the rows
  produced by the previous round, doubling the number of sentences per row
  (2, 4, 8, ...), and the outputs of all rounds are concatenated.

  Args:
    df_scored: Sentence-level Polars DataFrame. Its "score" and "annotators"
      columns are dropped; the remaining columns must satisfy `pair_sentences`.
    iterations: Number of pairing rounds to run.
    seed: Optional integer seed. When given, round k shuffles with seed + k so
      the whole run is repeatable while each round still gets its own shuffle.
      None (default) keeps the previous unseeded behaviour.

  Returns:
    Polars DataFrame holding the original rows plus the rows of every round,
    with an extra "sents" column giving the number of sentences in each row.
  """
  df_current = df_scored.with_columns(sents = 1)
  df_current = df_current.drop("score", "annotators")

  dfs = [df_current]
  count = 1

  for step in tqdm(range(iterations)):
    step_seed = None if seed is None else seed + step
    df_current = pair_sentences(df_current, seed=step_seed)
    # Each round joins two rows of `count` sentences into one row
    count *= 2
    df_current = df_current.with_columns(sents = count)
    dfs.append(df_current)

  dfs_scored_concatenated = pl.concat(dfs)

  return dfs_scored_concatenated

In [None]:
# Build the long-context training set: the original rows plus 5 pairing rounds
df_scored_train = process(df_train, iterations=5)

In [None]:
# Build the long-context test set: the original rows plus 5 pairing rounds
df_scored_test = process(df_test, iterations=5)

# Preview three random rows, showing up to 300 characters of each string column
with pl.Config(set_fmt_str_lengths=300):
  display(df_scored_test.sample(n=3))

  0%|          | 0/5 [00:00<?, ?it/s]

lp,src,mt,ref,raw,domain,year,sents
str,str,str,str,f64,str,i64,i32
"""de-en""","""Flüchtlingsboot gekentert: Baby und Kleinkinder ertrinken in Ägäis Inzwischen ist diese Veranstaltung ein fester Bestandteil im musikalischen Veranstaltungskalender in Nottuln geworden"", freut sich Hanns Moormann, Vorsitzender des Nottulner Heimatvereins. Der 57-Jährige trainiert den mexikanischen C…","""Refugee boat capsized: Baby and toddlers drown in Aegean Sea Meanwhile, this event has become an integral part of the musical event calendar in Nottuln, ""is pleased to be the chairman of the Nottulner Home Association. The 57-year-old has been coaching the Mexican club since early September. Accordi…","""Refugee Boat Capsized: Baby and Small Children Drown in the Aegean This event has now become a fixture in the musical event calendar in Nottuln,” says Hanns Moormann, chairman of the Nottuln Home Association, pleased. Since the beginning of September the 57-year-old has been training the Mexican clu…",0.547435,"""news""",2020,16
"""de-en""","""Spannung vor Ratssitzung: City-Ost und Lärmschutz stehen noch vor großer Sommerpause zur Diskussion Nach Angaben der Feuerwehr war am Mittwoch (Ortszeit) noch ein Verletzter in kritischem Zustand. Neues Überschallflugzeug soll 2030 fliegen: In viereinhalb Stunden von London nach Sydney Ein Zuspiel v…","""Tension before council meeting: City-Ost and noise protection are still up for discussion before the big summer break According to the fire service, an injured person was still in critical condition on Wednesday (local time). New supersonic plane to fly in 2030: From London to Sydney in four and a h…","""Tensions before the council meeting: City-East and noise protection are still up for debate before the long summer break. According to information from the fire department, one injured person was still in a critical condition on Wednesday (local time). New Supersonic Aircraft to fly in 2030: Four an…",0.71529,"""news""",2021,16
"""zh-en""","""巴姆表示：“这是一次文化、心态和实质上的改变。” 摩洛哥司法部门正在对此案展开进一步调查。 正因如此，法国媒体评价他是“法国在世界舞台上的杰出代表 ” 。 遭到驱逐后，这家人一直在搬家，据说目前住在私人住宅 完成该项作业需把北部高架引桥南移700毫米。 他提及，香港今年第二季度经济状况仍十分严峻，但收缩的态势似乎略为放缓。 刘长宝说，为避免索头端位置出现重叠和“打架”等问题，还将创新采用“双叉耳”索头结构，通过特制的固定插件确保“打架”的索头收束在正确的位置。 为确保通航安全，重庆海事部门决定，对长江界石盘至观音滩水域实施临时交通管制。。 自去年以来，国电集团共计有163套风电项目陆续从连云港…","""""This is a cultural, psychological and substantive change,"" said Bam. The Moroccan judicial department is launching a further investigation into the case. For this reason, the French media rated him as ""an outstanding representative of France on the world stage"". After being exported, the family had…","""""A cultural change, a mental change, a physical change,"" Bam said. The judicial department of Morocco is conducting further investigations into this case. For this reason, the French media hailed him as ""France's outstanding representative in the international arena."" The family have now been moved …",0.736477,"""news""",2017,32


# Create the Hugging Face Dataset

In [None]:
# Wrap the Polars DataFrames as Hugging Face Datasets
from datasets import Dataset, DatasetDict

dataset_train = Dataset.from_polars(df_scored_train)
dataset_test = Dataset.from_polars(df_scored_test)

# NOTE: this rebinds `dataset`, replacing the sentence-level DatasetDict loaded
# at the top of the notebook with the long-context one.
dataset = DatasetDict({"train": dataset_train, "test": dataset_test})

dataset

DatasetDict({
    train: Dataset({
        features: ['lp', 'src', 'mt', 'ref', 'raw', 'domain', 'year', 'sents'],
        num_rows: 7650371
    })
    test: Dataset({
        features: ['lp', 'src', 'mt', 'ref', 'raw', 'domain', 'year', 'sents'],
        num_rows: 59176
    })
})

In [None]:
user = "user_name"  # change

commit_message = "WMT long-context DA eval"
commit_description = "WMT DA human evaluation dataset augmented into 2 to 32 sentences"

# Upload both splits to a private repository under the chosen account
dataset.push_to_hub(
    f"{user}/wmt-da-human-evaluation-long-context",
    data_dir="data",
    private=True,
    commit_message=commit_message,
    commit_description=commit_description,
)