# Compression counts
Display the number of original samples, compared to the reduced ones.

In [1]:
import numpy as np

def compress(df, lang1, lang2, MAX_SIZE=400_000):
    compressions = {}

    src_len = df[lang1].str.len()
    tgt_len = df[lang2].str.len()
    mean_len = (tgt_len / src_len).mean()

    compressions["length_ratio"] = mean_len
    print(f"Length ratio between {lang1} and {lang2}: {mean_len}")

    for comp_rate in np.arange(0.5, 1.05, 0.1):
        comp_rate = round(comp_rate, 1)
        print(f"Compression rate: {comp_rate}")
        compressed = df[
            df[lang2].str.len() < mean_len * comp_rate * df[lang1].str.len()
        ]
        print(compressed.describe())
        reduced = compressed.sample(
            n=min(MAX_SIZE, compressed.shape[0]), random_state=42
        )
        compressions[comp_rate] = {
            "compression_rate": comp_rate,
            "count": len(compressed),
            f"unique_{lang1}": compressed[lang1].unique().shape[0],
            f"unique_{lang2}": compressed[lang2].unique().shape[0],
            "count_reduced": len(reduced),
            f"unique_{lang1}_reduced": reduced[lang1].unique().shape[0],
            f"unique_{lang2}_reduced": reduced[lang2].unique().shape[0],
        }

    return compressions

In [2]:
import os
import pandas as pd

path = "../data/opensubtitles/datasets"

all_compressions = {}
for filename in os.listdir(path):
    if filename.endswith(".csv"):
        print(filename)
        lang1, lang2 = filename.split(".")[0].split("-")
        print(lang1, lang2)
        df = pd.read_csv(os.path.join(path, filename))
        all_compressions[filename] = compress(df, lang1, lang2)



en-no.csv
en no
Length ratio between en and no: 0.9621212692882832
Compression rate: 0.5
                en      no
count       415797  415797
unique      383220  255689
top     All right.     Ja.
freq          1107    7964
Compression rate: 0.6
                en      no
count       749826  749826
unique      676004  491008
top     Thank you.   Takk.
freq          9250   12353
Compression rate: 0.7
             en       no
count   1224125  1224125
unique  1063829   819239
top       Yeah.      Ja.
freq      12507    21771
Compression rate: 0.8
             en       no
count   1812989  1812989
unique  1555327  1257908
top       Yeah.      Ja.
freq      12507    32970
Compression rate: 0.9
             en       no
count   2514408  2514408
unique  2126521  1787814
top       Yeah.      Ja.
freq      12806    32970
Compression rate: 1.0
             en       no
count   3261374  3261374
unique  2745492  2377950
top       Yeah.      Ja.
freq      12806    32970
en-is.csv
en is
Length ratio be

In [4]:
all_compressions

{'en-no.csv': {'length_ratio': 0.9621212692882832,
  0.5: {'compression_rate': 0.5,
   'count': 415797,
   'unique_en': 383220,
   'unique_no': 255689,
   'count_reduced': 400000,
   'unique_en_reduced': 368921,
   'unique_no_reduced': 246698},
  0.6: {'compression_rate': 0.6,
   'count': 749826,
   'unique_en': 676004,
   'unique_no': 491008,
   'count_reduced': 400000,
   'unique_en_reduced': 365459,
   'unique_no_reduced': 273546},
  0.7: {'compression_rate': 0.7,
   'count': 1224125,
   'unique_en': 1063829,
   'unique_no': 819239,
   'count_reduced': 400000,
   'unique_en_reduced': 356458,
   'unique_no_reduced': 287246},
  0.8: {'compression_rate': 0.8,
   'count': 1812989,
   'unique_en': 1555327,
   'unique_no': 1257908,
   'count_reduced': 400000,
   'unique_en_reduced': 356176,
   'unique_no_reduced': 302424},
  0.9: {'compression_rate': 0.9,
   'count': 2514408,
   'unique_en': 2126521,
   'unique_no': 1787814,
   'count_reduced': 400000,
   'unique_en_reduced': 355374,
   '