# I. Definition of the metrics

In [None]:
import numpy as np
import pandas as pd

In [None]:

def standardize_sentence(sentence):
  """Normalize a sentence for n-gram comparison.

  Every whitespace-separated word is lower-cased and, if it ends with a
  period, exactly one trailing period is removed. Other punctuation is
  left untouched.

  Args:
    sentence: the raw sentence string.

  Returns:
    The normalized words joined by single spaces.
  """
  normalized = []
  for token in sentence.split():
    token = token.lower()
    # Strip a single final "." only; a token that is just "." becomes "".
    if token.endswith("."):
      token = token[:-1]
    normalized.append(token)
  return " ".join(normalized)

def n_grams(n,sentence):
  """Return the word n-grams of a sentence as a list of strings.

  Words are normalized the same way as in `standardize_sentence`
  (lower-cased, one trailing period removed).

  Args:
    n: number of words per n-gram.
    sentence: the raw sentence string.

  Returns:
    A list of n-grams, each being `n` normalized words joined by a space,
    in order of appearance (duplicates are kept). When the sentence has
    fewer than `n` words, the whole normalized sentence is returned as the
    single element of the list (an empty list for an empty sentence).
  """
  words = []
  for w in sentence.split():
    w = w.lower()
    if w.endswith("."):
      w = w[:-1]
    words.append(w)

  if n > len(words):
    # Always return a list: returning the bare string here made callers
    # iterate over its characters as if they were n-grams.
    return [" ".join(words)] if words else []

  return [" ".join(words[i:i+n]) for i in range(len(words)-n+1)]


def n_gram_substring_count(ngram,sentence):
  """Count how often each n-gram occurs in a sentence, on whole words.

  The sentence is normalized with `standardize_sentence` and both the
  sentence and the n-grams are compared token by token, so that e.g. the
  unigram "a" is not counted inside the word "cat" (which a plain
  character-level substring search would do).

  Args:
    ngram: iterable of n-gram strings (words separated by single spaces).
    sentence: the raw sentence string to search in.

  Returns:
    A list of integer counts, one per element of `ngram`, in the same
    order. Overlapping occurrences are all counted.
  """
  sent_standard = standardize_sentence(sentence)
  # Split on single spaces so tokens line up with how the n-grams were joined.
  sent_tokens = sent_standard.split(" ") if sent_standard else []

  counts = []
  for substr in ngram:
    gram_tokens = substr.split(" ") if substr else []
    width = len(gram_tokens)
    # An empty n-gram matches at every position, so its count stays
    # non-zero (as with the character-level search).
    count = sum(
        sent_tokens[pos:pos+width] == gram_tokens
        for pos in range(len(sent_tokens)-width+1)
    )
    counts.append(count)
  return counts

def modified_ngram_precision_1_cand(n,candidate,references):
  """Modified (clipped) n-gram precision of one candidate sentence.

  For every distinct n-gram of the candidate, its count in the candidate
  is clipped by the largest count observed in any single reference; the
  clipped counts are summed and divided by the total number of candidate
  n-gram occurrences.

  Args:
    n: n-gram order.
    candidate: candidate sentence string.
    references: list of reference sentence strings.

  Returns:
    The clipped precision as a float in [0, 1]. Returns 0.0 when there is
    no reference or the candidate yields no n-gram.
  """
  # Keep each distinct n-gram once (first-appearance order). Counting a
  # repeated n-gram once per occurrence would weight it by its frequency
  # in both the numerator and the denominator.
  n_gram_cand = list(dict.fromkeys(n_grams(n,candidate)))
  if not n_gram_cand or not references:
    return 0.0

  counts_cand_cand = n_gram_substring_count(n_gram_cand,candidate)
  counts_cand_ref = [n_gram_substring_count(n_gram_cand,ref)
                     for ref in references]
  # For each n-gram, the maximum count over all references.
  counts_cand_ref_max = [max(per_gram) for per_gram in zip(*counts_cand_ref)]

  total = sum(counts_cand_cand)
  if total == 0:
    return 0.0

  clipped = sum(min(c, r) for c, r in zip(counts_cand_cand, counts_cand_ref_max))
  return clipped/total


def brevity_penalty(candidate,references):
  """Sentence-level BLEU brevity penalty.

  The penalty is normally defined over a whole corpus; since the score is
  computed at sentence level here, each sentence is treated as a
  "corpus of one sentence".

  Args:
    candidate: candidate sentence string.
    references: non-empty list of reference sentence strings.

  Returns:
    exp(-max(r/c - 1, 0)) where c is the candidate length in words and r
    is the length of the reference closest in length to the candidate
    (the first one in case of a tie). Returns 0.0 for an empty candidate.
  """
  c = len(candidate.split())
  if c == 0:
    # r/c is undefined; an empty candidate gets the maximal penalty.
    return 0.0

  ref_len = [len(ref.split()) for ref in references]
  r = ref_len[int(np.argmin([abs(length-c) for length in ref_len]))]

  return np.exp(-max(r/c-1,0))

def BLEU(candidate,references,max_n=4):
  """Sentence-level BLEU score of a candidate against its references.

  As recommended in the original BLEU paper, uniform weights
  w_1 = ... = w_max_n are used: the score is the brevity penalty times the
  geometric mean of the modified n-gram precisions for n = 1..max_n.

  Args:
    candidate: candidate sentence string.
    references: list of reference sentence strings.
    max_n: highest n-gram order taken into account (default 4).

  Returns:
    The BLEU score, or 0 as soon as one of the precisions is 0.
  """
  # n-gram orders start at 1 (range(4) would use n = 0..3).
  precisions = [modified_ngram_precision_1_cand(n,candidate,references)
                for n in range(1,max_n+1)]
  BP = brevity_penalty(candidate,references)

  if min(precisions)==0:
    # log(0) is undefined; the geometric mean is 0 in that case.
    return 0.0

  return BP*np.exp(sum(np.log(p) for p in precisions)/max_n)


## Code for BERTScore

In [None]:
!pip install bert_score
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 KB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=3.0.0
  Downloading transformers-4.27.1-py3-none-any.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
Instal

In [None]:
# hide the loading messages
# Raise the level of three transformers module loggers to ERROR so that
# only errors are printed by them.
import logging
import transformers
transformers.tokenization_utils.logger.setLevel(logging.ERROR)
transformers.configuration_utils.logger.setLevel(logging.ERROR)
transformers.modeling_utils.logger.setLevel(logging.ERROR)

In [None]:
# Import bert_score (used by BertScore below) and set the same three
# transformers loggers to ERROR again.
import bert_score
# hide the loading messages
import logging
import transformers
transformers.tokenization_utils.logger.setLevel(logging.ERROR)
transformers.configuration_utils.logger.setLevel(logging.ERROR)
transformers.modeling_utils.logger.setLevel(logging.ERROR)

In [None]:
from bert_score import score

In [None]:
def BertScore(candidate,references):
  """BERTScore F1 of one candidate against each reference sentence.

  The candidate is repeated so that it is paired with every element of
  `references` in a single call to `bert_score.score` (French model
  selection via lang='fr').

  Args:
    candidate: candidate sentence string.
    references: list of sentence strings to compare the candidate with.

  Returns:
    The F1 output of `bert_score.score`, unchanged. No maximum over the
    references is taken here; the caller gets every pairwise score.
  """
  repeated_candidate = [candidate] * len(references)

  # Precision and recall are returned as well but only F1 is used.
  _, _, f1 = bert_score.score(repeated_candidate, references, lang='fr', verbose=False)

  return f1

## Code for MoverScore

In [None]:
!git clone https://github.com/AIPHES/emnlp19-moverscore

Cloning into 'emnlp19-moverscore'...
remote: Enumerating objects: 459, done.[K
remote: Counting objects: 100% (59/59), done.[K
remote: Compressing objects: 100% (56/56), done.[K
remote: Total 459 (delta 31), reused 14 (delta 3), pack-reused 400[K
Receiving objects: 100% (459/459), 7.07 MiB | 17.70 MiB/s, done.
Resolving deltas: 100% (214/214), done.


In [None]:
!pip install pyemd
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyemd
  Downloading pyemd-1.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (675 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m675.0/675.0 KB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyemd
Successfully installed pyemd-1.0.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import sys
sys.path.insert(0,"/content/emnlp19-moverscore")
from moverscore_v2 import get_idf_dict, word_mover_score 
from collections import defaultdict
import moverscore_v2 as mv

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
def MoverScore(cand,references):
    """MoverScore of one candidate against each reference sentence.

    Uniform IDF weights (1.0 for every token) are used on both sides, with
    unigrams, no stop words and sub-words kept.

    Args:
      cand: candidate sentence string.
      references: list of sentence strings to compare the candidate with.

    Returns:
      The output of `word_mover_score` for the (reference, candidate)
      pairs, unchanged. The scores are not aggregated here (neither mean
      nor max).
    """
    # Every token gets the same weight.
    uniform_idf_ref = defaultdict(lambda: 1.)
    uniform_idf_hyp = defaultdict(lambda: 1.)

    # Pair the single candidate with every reference.
    repeated_cand = [cand] * len(references)

    return word_mover_score(references, repeated_cand, uniform_idf_ref, uniform_idf_hyp, stop_words=[], n_gram=1, remove_subwords=False)

## Code for DepthScore


In [None]:
!git clone https://github.com/PierreColombo/nlg_eval_via_simi_measures

Cloning into 'nlg_eval_via_simi_measures'...
remote: Enumerating objects: 338, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 338 (delta 0), reused 0 (delta 0), pack-reused 329[K
Receiving objects: 100% (338/338), 8.78 MiB | 18.28 MiB/s, done.
Resolving deltas: 100% (194/194), done.


In [None]:
import sys
sys.path.insert(0,"/content/nlg_eval_via_simi_measures/nlg_eval_via_simi_measures")

In [None]:
!pip install POT

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting POT
  Downloading POT-0.8.2-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (670 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 KB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: POT
Successfully installed POT-0.8.2


In [None]:
!pip install geomloss

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting geomloss
  Downloading geomloss-0.2.5.tar.gz (26 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: geomloss
  Building wheel for geomloss (setup.py) ... [?25l[?25hdone
  Created wheel for geomloss: filename=geomloss-0.2.5-py3-none-any.whl size=32068 sha256=8169a2cae2cf6301baf3d39a62d5b13b1cc345255da24e3725130720f6308b88
  Stored in directory: /root/.cache/pip/wheels/91/5f/d7/0dbc6074929fc09f1280db596bfe0e2b59e5790bdfbaefe017
Successfully built geomloss
Installing collected packages: geomloss
Successfully installed geomloss-0.2.5


In [None]:
from depth_score import DepthScoreMetric

In [None]:
import numpy as np
def DepthScore(cand,refs):
  """DepthScore-based similarity of one candidate against each sentence in `refs`.

  Args:
    cand: candidate sentence string.
    refs: list of sentence strings to compare the candidate with.

  Returns:
    A numpy array equal to 1 minus the "depth_score" values returned by
    `DepthScoreMetric.evaluate_batch`, one entry per element of `refs`.
    All entries are returned; no selection of a best reference is done here.
  """
  # Pair the single candidate with every reference.
  repeated_cand = [cand for _ in refs]

  scorer = DepthScoreMetric()
  scorer.prepare_idfs(refs, repeated_cand)
  batch_output = scorer.evaluate_batch(repeated_cand,refs)

  depth_values = np.array(batch_output["depth_score"])
  return 1 - depth_values

## Code for BaryScore

In [None]:
from bary_score import BaryScoreMetric

In [None]:
import numpy as np

def BaryScore(cand,refs):
  """BaryScore-based similarity of one candidate against each sentence in `refs`.

  Args:
    cand: candidate sentence string.
    refs: list of sentence strings to compare the candidate with.

  Returns:
    A numpy array equal to 1 minus the 'baryscore_W' values returned by
    `BaryScoreMetric.evaluate_batch`, one entry per element of `refs`.
  """
  # Pair the single candidate with every reference.
  repeated_cand = [cand for _ in refs]

  scorer = BaryScoreMetric()
  scorer.prepare_idfs(refs, repeated_cand)
  batch_output = scorer.evaluate_batch(repeated_cand,refs)

  bary_values = np.array(batch_output['baryscore_W'])
  return 1 - bary_values

## Code for InfoLM

In [None]:
from infolm import InfoLM

In [None]:
import numpy as np
def InfoLM_metric(cand,refs):
  """InfoLM-based similarity of one candidate against each sentence in `refs`.

  Args:
    cand: candidate sentence string.
    refs: list of sentence strings to compare the candidate with.

  Returns:
    A numpy array equal to 1 minus the 'fisher_rao' values returned by
    `InfoLM.evaluate_batch`, one entry per element of `refs`. All entries
    are returned; no selection of a best reference is done here.
  """
  # Pair the single candidate with every reference.
  hypothesis = [cand for _ in refs]

  metric_call = InfoLM()

  metric_call.prepare_idfs(refs, hypothesis)
  # Use the `refs` argument: the former `ref` was not defined in this
  # function and silently resolved to a notebook-level variable.
  final_preds = metric_call.evaluate_batch(hypothesis,refs)
  return 1 - np.array(final_preds['fisher_rao'])

In [None]:
# Quick check of DepthScoreMetric on two hypothesis/reference pairs.
# Note: this cell binds the notebook-level names `hypothesis`, `refs`,
# `metric_call` and `final_preds`.
hypothesis = ["I am a smurf","I have smurfed a smurf in Smurfland"]
refs = ["I fear the wizard Gargamel","I ate a cake in Smurfland"]

metric_call = DepthScoreMetric()

metric_call.prepare_idfs(refs, hypothesis)
final_preds = metric_call.evaluate_batch(hypothesis,refs)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Depth Score Progress: 100%|██████████| 2/2 [00:06<00:00,  3.29s/it]


In [None]:
# Display the raw output of evaluate_batch for the two pairs.
final_preds

{'depth_score': [0.13857271441506605, 0.12556717284599872]}

In [None]:
# Same check with a third pair whose hypothesis and reference are the
# identical string 'Smurf'.
hypothesis = ["I am a smurf","I have smurfed a smurf in Smurfland",'Smurf']
refs = ["I fear the wizard Gargamel","I ate a cake in Smurfland",'Smurf']

metric_call = DepthScoreMetric()

metric_call.prepare_idfs(refs, hypothesis)
final_preds = metric_call.evaluate_batch(hypothesis,refs)

Depth Score Progress: 100%|██████████| 3/3 [00:08<00:00,  3.00s/it]


In [None]:
# Display the raw output of evaluate_batch for the three pairs.
final_preds

{'depth_score': [0.13857270211761533,
  0.1255671723450445,
  1.0296297697530636e-07]}

# II. Calculation of the metrics

**Preprocessing of the WMT15 data and computation of the scores for one metric (here: BLEU)**

In [None]:
import csv
import pandas as pd
import numpy as np

# Output files of the seven en-fr systems, one translated sentence per line.
model_names = ['newsdiscusstest2015.CIMS-FR.4056.en-fr',
               'newsdiscusstest2015.LIMSI-CNRS-mosesSoulMoreFeatures.4000.en-fr',
               'newsdiscusstest2015.online-A.0.en-fr',
               'newsdiscusstest2015.online-B.0.en-fr',
               'newsdiscusstest2015.online-E.0.en-fr',
               'newsdiscusstest2015.online-F.0.en-fr',
               'newsdiscusstest2015.uedin-jhu-phrase.4104.en-fr']

# Reference translations, one sentence per line.
orig_sentences = 'newsdiscusstest2015-enfr-ref.fr'

wmt_csv = pd.read_csv('/content/wmt15engfre.csv', sep=',')

# Only the segments that appear in the human-judgement CSV are scored.
seg_IDs = wmt_csv.segmentId.unique()


# Load the reference sentences from the original sentences file
with open(orig_sentences, 'r', encoding='utf-8') as f:
    refs = f.readlines()

# Load the candidate sentences from the model names files
# (cands[j] holds every line produced by model_names[j])
cands = []
for model_name in model_names:
    with open(model_name, 'r', encoding='utf-8') as f:
        cands.append(f.readlines())

# Compute the BLEU score of every system for each selected segment.
# The list keeps the name `bary_scores` and the nesting
# bary_scores[i][0][j] (segment i, system j) used by the rest of this cell.
bary_scores = []
for i, seg_id in enumerate(seg_IDs):
    print(str(i) + '/' + str(len(seg_IDs)))
    # NOTE(review): segmentId is used directly as a 0-based line index into
    # the sentence files -- confirm against the CSV's numbering.
    ref = refs[seg_id]
    cand_sentences = [cand[seg_id] for cand in cands]
    # Score each system's sentence against the single reference. BLEU
    # expects (candidate, list of references); passing the reference as
    # candidate and a system's whole output file as references is wrong.
    score = [BLEU(cand_sentence, [ref]) for cand_sentence in cand_sentences]
    print(score)
    bary_scores.append([score])

# Write results to a CSV file
with open('/content/BLEU_scores.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    header = ['ID'] + model_names
    writer.writerow(header)
    for seg_id, cand_scores in zip(seg_IDs, bary_scores):
        writer.writerow([int(seg_id)] + list(cand_scores[0]))

0/313
[0.7016879391277372, 0.7016879391277372, 0.5706745777055999, 0.7997513045108656, 0.516742619606005, 0.7016879391277372, 0.7419446627365011]
1/313
[0.5388504329728031, 0.5428870478804044, 0.5134308701693979, 0.5682046610324758, 0.5723373308266564, 0.5261627090001443, 0.5548257650383028]
2/313
[0.5591863165189648, 0.6381273821668191, 0.4961112855774751, 0.48971601315899604, 0.4961112855774751, 0.3806774291089103, 0.612099359848327]
3/313
[0.5365235519610349, 0.5593558070690728, 0.5331675363405771, 0.5833483201978661, 0.554002504141802, 0.4920818016259463, 0.6253788041078522]
4/313
[0.4063798282013443, 0.7419446627365011, 0.44833867003844585, 0.4063798282013443, 0.4063798282013443, 0.5331675363405771, 0.41722614486115056]
5/313
[0.48549177170732344, 0.6756000774035172, 0.4400558683966967, 0.48549177170732344, 0.45180100180492244, 0.5133450480401704, 0.6756000774035172]
6/313
[0.6816620731700941, 0.626667427972578, 0.7745176489222466, 0.7190476054973329, 0.691262979963243, 0.58910574

**InfoLM**


In [None]:
import csv
import pandas as pd
import numpy as np

# Output files of the seven en-fr systems, one translated sentence per line.
model_names = ['newsdiscusstest2015.CIMS-FR.4056.en-fr',
               'newsdiscusstest2015.LIMSI-CNRS-mosesSoulMoreFeatures.4000.en-fr',
               'newsdiscusstest2015.online-A.0.en-fr',
               'newsdiscusstest2015.online-B.0.en-fr',
               'newsdiscusstest2015.online-E.0.en-fr',
               'newsdiscusstest2015.online-F.0.en-fr',
               'newsdiscusstest2015.uedin-jhu-phrase.4104.en-fr']

# Reference translations, one sentence per line.
orig_sentences = 'newsdiscusstest2015-enfr-ref.fr'

wmt_csv = pd.read_csv('/content/wmt15engfre.csv', sep=',')

# Only the segments that appear in the human-judgement CSV are scored.
seg_IDs = wmt_csv.segmentId.unique()


# Load the reference sentences from the original sentences file
with open(orig_sentences, 'r', encoding='utf-8') as f:
    refs = f.readlines()

# Load the candidate sentences from the model names files
# (cands[j] holds every line produced by model_names[j])
cands = [[] for _ in range(len(model_names))]
for j, model_name in enumerate(model_names):
    with open(model_name, 'r', encoding='utf-8') as f:
        cands[j] = f.readlines()

# Compute the InfoLM scores of every system for each selected segment.
# InfoLM_metric is called with the reference sentence as its first argument
# (`cand`) and the list of system outputs as its second (`refs`), so one
# call yields one score per system.
# NOTE(review): segmentId is used directly as a 0-based line index into the
# sentence files -- confirm against the CSV's numbering.
# The loop variables (`i`, `id`, `ref`, `score`, ...) remain defined at
# notebook level after the loop; `id` shadows the builtin of the same name.
InfoLM_scores = []
for i, id in enumerate(seg_IDs):
    print(str(i) + '/' + str(len(seg_IDs)))
    ref = refs[id]
    cand_scores = []
    cand_sentences = [cand[id] for cand in cands]
    score = InfoLM_metric(ref, cand_sentences)
    cand_scores.append(score)
    InfoLM_scores.append(cand_scores)

# Write results to a CSV file
with open('/content/InfoLM_scores.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    header = ['ID'] + model_names
    writer.writerow(header)
    for i in range(len(seg_IDs)):
        # np.insert prepends the segment id to the score array, so the id is
        # written with the same numeric dtype as the scores.
        row = np.insert(InfoLM_scores[i][0], 0,int(seg_IDs[i]))
        writer.writerow(row)

0/313


  dict_logits_distribution[str(self.temperature)] = torch.nn.Softmax()(


1/313
2/313
3/313
4/313
5/313
6/313
7/313
8/313
9/313
10/313
11/313
12/313
13/313
14/313
15/313
16/313
17/313
18/313
19/313
20/313
21/313
22/313
23/313
24/313
25/313
26/313
27/313
28/313
29/313
30/313
31/313
32/313
33/313
34/313
35/313
36/313
37/313
38/313
39/313
40/313
41/313
42/313
43/313
44/313
45/313
46/313
47/313
48/313
49/313
50/313
51/313
52/313
53/313
54/313
55/313
56/313
57/313
58/313
59/313
60/313
61/313
62/313
63/313
64/313
65/313
66/313
67/313
68/313
69/313
70/313
71/313
72/313
73/313
74/313
75/313
76/313
77/313
78/313
79/313
80/313
81/313
82/313
83/313
84/313
85/313
86/313
87/313
88/313
89/313
90/313
91/313
92/313
93/313
94/313
95/313
96/313
97/313
98/313
99/313
100/313
101/313
102/313
103/313
104/313
105/313
106/313
107/313
108/313
109/313
110/313
111/313
112/313
113/313
114/313
115/313
116/313
117/313
118/313
119/313
120/313
121/313
122/313
123/313
124/313
125/313
126/313
127/313
128/313
129/313
130/313
131/313
132/313
133/313
134/313
135/313
136/313
137/313
138/313
139/

# III. Ranking aggregation and human correlation evaluation

**Condorcet ranking**

In [None]:
import pandas as pd

# Load the pairwise human judgements (one row per compared pair of systems)
df = pd.read_csv('/content/wmt15engfre.csv', sep=',')

# One {system: Condorcet rank} dictionary per segmentId, in order of first appearance
results = []

for segmentId in df.segmentId.unique():
    segment_rows = df[df.segmentId == segmentId]

    # Number of pairwise comparisons won by each system in this segment.
    # The system with the strictly smaller rank value wins the pair; a tie
    # gives no point to either side.
    system_scores = {}
    for _, row in segment_rows.iterrows():
        system1, system2 = row['system1Id'], row['system2Id']
        rank1, rank2 = row['system1rank'], row['system2rank']
        # Register both systems (first, then second) even when they score nothing
        system_scores.setdefault(system1, 0)
        system_scores.setdefault(system2, 0)
        if rank1 < rank2:
            system_scores[system1] += 1
        elif rank2 < rank1:
            system_scores[system2] += 1

    # Copeland score: number of systems with strictly fewer wins
    copeland_scores = {
        system: sum(1 for other_wins in system_scores.values() if other_wins < wins)
        for system, wins in system_scores.items()
    }

    # Condorcet rank: 1 + number of systems with a strictly higher Copeland score
    condorcet_ranks = {
        system: 1 + sum(1 for other in copeland_scores.values() if other > copeland)
        for system, copeland in copeland_scores.items()
    }

    results.append(condorcet_ranks)

# One row per segment, one column per system (missing systems are left empty)
result_df = pd.DataFrame(results)
result_df.to_csv('pivoted_wmt15engfre.csv', index=False)

**Ranking according to the different metrics**

In [None]:
import csv
import re

# Score files to convert into rank files (one '<name>.csv' per metric)
models = ['BERT_scores', 'BLEU_scores', 'bary_scores', 'depth_scores', 'mover_scores']
for model in models:
  is_bert = model == 'BERT_scores'

  # Load the score table and drop its header line
  with open('/content/' + model + '.csv', 'r') as f:
      rows = list(csv.reader(f))[1:]

  for row in rows:
      # First column is the segment id; the remaining ones are the scores.
      if is_bert:
        # BERT score cells are not plain numbers: pull the first number out of the text
        values = [float(re.search(r'[-+]?\d*\.\d+|\d+', val).group()) for val in row[1:]]
      else:
        values = [float(val) for val in row[1:]]
      # Rank 1 goes to the highest score; equal scores share the best rank
      descending = sorted(values, reverse=True)
      row[1:] = [str(descending.index(val) + 1) for val in values]

  # Save the ranks (same layout as the input, without header line)
  with open('/content/' + model + '_ranks.csv', 'w', newline='') as f:
      csv.writer(f).writerows(rows)

**Spearman correlation score displayed in a boxplot for each model**

In [None]:
import csv
import numpy as np
from scipy.stats import pearsonr, spearmanr, kendalltau
import math
import matplotlib.pyplot as plt

def correlations(model):
  """Per-segment Spearman correlation between human and metric rankings.

  Reads the Condorcet ranks from '/content/pivoted_wmt15engfre.csv'
  (with header) and the metric ranks from '/content/<model>_ranks.csv'
  (no header, first column = segment id). Rows are matched by position.

  Args:
    model: base name of the metric file, e.g. 'BLEU_scores'.

  Returns:
    A tuple (mean, scores) where `scores` is the list of Spearman
    coefficients of the segments for which it is defined and `mean` is
    their average (NaN when the list is empty).
  """
  # Local import so the function does not depend on `re` having been
  # imported by another cell.
  import re

  # Human (Condorcet) ranks; empty cells mean the system was not judged.
  condorcet_ranks = []
  with open('/content/pivoted_wmt15engfre.csv', 'r') as f:
      reader = csv.reader(f)
      next(reader, None)  # skip the header line
      for row in reader:
          condorcet_ranks.append([int(float(x)) if x != '' else None for x in row])

  # Metric ranks, one row per segment.
  metric_ranks = []
  with open('/content/' + model + '_ranks.csv', 'r') as f:
      for row in csv.reader(f):
          if model=='BERT_scores':
              metric_ranks.append([int(float(re.search(r'[-+]?\d*\.\d+|\d+', x).group())) for x in row])
          else:
              metric_ranks.append([int(float(x)) for x in row])

  if len(metric_ranks) < len(condorcet_ranks):
      raise ValueError('fewer metric rows than human-rank rows for ' + model)

  # Compute the correlation scores row by row
  spearman_scores = []
  for human_row, metric_row in zip(condorcet_ranks, metric_ranks):
      known = [x for x in human_row if x is not None]
      if not known:
          # No human judgement at all for this segment: nothing to correlate.
          continue
      # Fill in missing human ranks with the average of the known ranks.
      avg_rank = sum(known) / len(known)
      filled = [avg_rank if x is None else x for x in human_row]

      # metric_row[0] is the segment id, the ranks start at index 1.
      spearman, _ = spearmanr(filled, metric_row[1:])
      # spearmanr yields NaN when one of the rankings is constant.
      if not math.isnan(spearman):
          spearman_scores.append(spearman)

  mean_score = np.mean(spearman_scores) if spearman_scores else float('nan')
  return mean_score, spearman_scores

# Compute the per-segment Spearman correlations of every metric in `models`
# (defined in the ranking cell), print each mean and draw one box per metric.
fig, ax = plt.subplots()
scores_array = []
for model in models:
  mean, spearman_scores = correlations(model)
  print('Spearman for ' + model + ':' + str(mean))
  # Keep the full distribution for the boxplot, in the same order as `models`
  scores_array.append(spearman_scores)
# create boxplot
ax.boxplot(scores_array)
ax.set_xticklabels(models)
ax.set_ylabel('Correlation Coefficient')
ax.set_title('Spearman Correlations for All Models')
plt.show()