In [24]:
import os
import json

opensub_path = os.path.join("..", "data", "opensubtitles", "compressed")

infos = {}
for f in os.listdir(opensub_path):
    if ".info" in f:
        _, target_lang = f.split(".")[0].split("-")
        # open .info file, which is a json file
        with open(os.path.join(opensub_path, f), "r") as file:
            info = json.load(file)
            infos[target_lang] = {}
            for _key in info.keys():
                if isinstance(info[_key], dict):
                    infos[target_lang][_key] = int(info[_key]["count"])
                else:
                    infos[target_lang][_key] = round(float(info[_key]), 2)

# > 1 means that the target language is longer than the source language
infos

{'de': {'length_ratio': 1.19,
  '0.5': 719020,
  '0.6': 1414345,
  '0.7': 2525508,
  '0.8': 4146918,
  '0.9': 6627334,
  '1.0': 8697959},
 'eu': {'length_ratio': 1.09,
  '0.5': 7768,
  '0.6': 15606,
  '0.7': 28228,
  '0.8': 49363,
  '0.9': 72849,
  '1.0': 105692},
 'is': {'length_ratio': 1.03,
  '0.5': 75880,
  '0.6': 150171,
  '0.7': 260511,
  '0.8': 419985,
  '0.9': 614276,
  '1.0': 838767},
 'fr': {'length_ratio': 1.11,
  '0.5': 1806772,
  '0.6': 3548104,
  '0.7': 6300807,
  '0.8': 10141933,
  '0.9': 14264796,
  '1.0': 20502881},
 'da': {'length_ratio': 0.99,
  '0.5': 790191,
  '0.6': 1435860,
  '0.7': 2303276,
  '0.8': 3450454,
  '0.9': 4899023,
  '1.0': 6256848},
 'hu': {'length_ratio': 1.01,
  '0.5': 1560313,
  '0.6': 3175010,
  '0.7': 5937986,
  '0.8': 10178195,
  '0.9': 15289547,
  '1.0': 21522518},
 'ms': {'length_ratio': 1.12,
  '0.5': 37351,
  '0.6': 74119,
  '0.7': 128568,
  '0.8': 211236,
  '0.9': 364438,
  '1.0': 460469},
 'pl': {'length_ratio': 1.02,
  '0.5': 2105542,
  

In [59]:
import pandas as pd

df = pd.DataFrame(infos)
df

Unnamed: 0,de,eu,is,fr,da,hu,ms,pl,no,lt,sq
length_ratio,1.19,1.09,1.03,1.11,0.99,1.01,1.12,1.02,0.96,0.99,1.05
0.5,719020.0,7768.0,75880.0,1806772.0,790191.0,1560313.0,37351.0,2105542.0,415797.0,69190.0,102869.0
0.6,1414345.0,15606.0,150171.0,3548104.0,1435860.0,3175010.0,74119.0,3810521.0,749826.0,134416.0,196339.0
0.7,2525508.0,28228.0,260511.0,6300807.0,2303276.0,5937986.0,128568.0,6275552.0,1224125.0,240922.0,330680.0
0.8,4146918.0,49363.0,419985.0,10141933.0,3450454.0,10178195.0,211236.0,9399322.0,1812989.0,372663.0,527887.0
0.9,6627334.0,72849.0,614276.0,14264796.0,4899023.0,15289547.0,364438.0,13174343.0,2514408.0,532621.0,769454.0
1.0,8697959.0,105692.0,838767.0,20502881.0,6256848.0,21522518.0,460469.0,17491329.0,3261374.0,664991.0,1076166.0


In [60]:
df = df.reindex(df.loc["1.0"].sort_values(ascending=False).index, axis=1)
def human_readable_num(x):
    # round to nearest k:
    x = round(x, -3)
    # from e.g. 1560313 -> 1,560 (k)
    return f"{int(x/1000):,}" if x > 1000 else f"{int(x):,}"

# df = df.map(lambda x: str(int(x)) if x.is_integer() else str(x))
df = df.map(lambda x: human_readable_num(x) if x.is_integer() else str(x))
df

Unnamed: 0,hu,fr,pl,de,da,no,sq,is,lt,ms,eu
length_ratio,1.01,1.11,1.02,1.19,0.99,0.96,1.05,1.03,0.99,1.12,1.09
0.5,1560.0,1807.0,2106.0,719.0,790.0,416.0,103.0,76.0,69.0,37.0,8.0
0.6,3175.0,3548.0,3811.0,1414.0,1436.0,750.0,196.0,150.0,134.0,74.0,16.0
0.7,5938.0,6301.0,6276.0,2526.0,2303.0,1224.0,331.0,261.0,241.0,129.0,28.0
0.8,10178.0,10142.0,9399.0,4147.0,3450.0,1813.0,528.0,420.0,373.0,211.0,49.0
0.9,15290.0,14265.0,13174.0,6627.0,4899.0,2514.0,769.0,614.0,533.0,364.0,73.0
1.0,21523.0,20503.0,17491.0,8698.0,6257.0,3261.0,1076.0,839.0,665.0,460.0,106.0


In [61]:
print(df.T.to_latex(float_format="{:0.2f}".format))

\begin{tabular}{llllllll}
\toprule
 & length_ratio & 0.5 & 0.6 & 0.7 & 0.8 & 0.9 & 1.0 \\
\midrule
hu & 1.01 & 1,560 & 3,175 & 5,938 & 10,178 & 15,290 & 21,523 \\
fr & 1.11 & 1,807 & 3,548 & 6,301 & 10,142 & 14,265 & 20,503 \\
pl & 1.02 & 2,106 & 3,811 & 6,276 & 9,399 & 13,174 & 17,491 \\
de & 1.19 & 719 & 1,414 & 2,526 & 4,147 & 6,627 & 8,698 \\
da & 0.99 & 790 & 1,436 & 2,303 & 3,450 & 4,899 & 6,257 \\
no & 0.96 & 416 & 750 & 1,224 & 1,813 & 2,514 & 3,261 \\
sq & 1.05 & 103 & 196 & 331 & 528 & 769 & 1,076 \\
is & 1.03 & 76 & 150 & 261 & 420 & 614 & 839 \\
lt & 0.99 & 69 & 134 & 241 & 373 & 533 & 665 \\
ms & 1.12 & 37 & 74 & 129 & 211 & 364 & 460 \\
eu & 1.09 & 8 & 16 & 28 & 49 & 73 & 106 \\
\bottomrule
\end{tabular}

