In [None]:
!pip -q install transformers
!pip -q install sentencepiece
!pip -q install sentence_transformers
!pip -q install textstat
!pip -q install rouge-score
!pip -q install evaluate
!pip -q install sacremoses
!pip -q install sacrebleu
from rouge_score import rouge_scorer
from evaluate import load
import pandas as pd
import numpy as np
import re
import textstat
import pickle

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.7/119.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from google.colab import drive
drive.mount("/content/drive/")
%cd "./drive/My Drive"

Mounted at /content/drive/
/content/drive/My Drive


### Load data

In [None]:
# Read the preprocessed dev and test splits; the evaluation cells read their
# "Expert" and "Simple" columns.
dev_csv_path = "./processed_dev_data.csv"
test_csv_path = "./processed_test_data.csv"

X_dev = pd.read_csv(dev_csv_path)
X_test = pd.read_csv(test_csv_path)

In [None]:
def _read_pickle(path):
  """Return the object deserialized from the pickle file at `path`."""
  # NOTE(review): unpickling can execute arbitrary code; only load files you trust.
  with open(path, "rb") as handle:
    return pickle.load(handle)

# Pickled student / teacher outputs for the dev and test splits.
y_student_dev = _read_pickle("student_dev_output_no_prompts.pkl")
y_student_test = _read_pickle("student_test_output_no_prompts.pkl")
y_teacher_dev = _read_pickle("teacher_dev_output.pkl")
y_teacher_test = _read_pickle("teacher_test_output.pkl")

### Metrics

In [None]:
# Load the "sari" metric via `evaluate.load`; compute_sari() calls `.compute`
# on this object.
sari = load("sari")

In [None]:
# Shared scorer instance: built lazily on the first rouge() call and reused
# afterwards, instead of constructing a new RougeScorer for every example.
_ROUGE_SCORER = None

def rouge(output, gen):
  """Score `gen` against `output` with ROUGE-1 and ROUGE-L (stemming enabled).

  Args:
    output: text handed to the scorer as its first argument
      (the target / reference side — see RougeScorer.score).
    gen: text handed to the scorer as its second argument (the prediction).

  Returns:
    The result of `RougeScorer.score`, indexed by "rouge1" and "rougeL";
    callers in this notebook read the `.fmeasure` of each entry.
  """
  global _ROUGE_SCORER
  if _ROUGE_SCORER is None:
    _ROUGE_SCORER = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
  return _ROUGE_SCORER.score(output, gen)

In [None]:
def flesch(text):
  """Compute the Flesch Reading Ease and Flesch-Kincaid Grade Level of `text`.

  Words and syllables are counted on a copy of the text stripped of everything
  except ASCII letters and whitespace; sentences are counted on the raw text.

  Args:
    text: the passage to score. Non-string values (e.g. None or NaN) and
      passages without any alphabetic word are treated as unscorable.

  Returns:
    A `(fre_score, fkgl_score)` tuple, or `(0, 0)` when the text is unscorable.
  """
  # Missing values and blank strings have nothing to score.
  if not isinstance(text, str) or not text.strip():
    return 0, 0

  # Remove non-alphabetic characters and digits
  clean_text = re.sub(r'[^a-zA-Z\s]', '', text)

  # Digits/punctuation-only input leaves no words behind; return the same
  # sentinel as for empty text instead of dividing by zero below.
  if not clean_text.strip():
    return 0, 0

  word_count = textstat.lexicon_count(clean_text)
  sentence_count = textstat.sentence_count(text)
  if word_count == 0 or sentence_count == 0:
    return 0, 0

  # Calculate average number of words per sentence
  words_per_sentence = word_count / sentence_count

  # Calculate average number of syllables per word
  syllables_per_word = textstat.syllable_count(clean_text) / word_count

  # Calculate Flesch scores
  fre_score = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
  fkgl_score = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59

  return fre_score, fkgl_score

In [None]:
def compute_sari(X, y, ref):
  """Return the "sari" value for one source / prediction / reference triple.

  Each argument is wrapped into the single-item batch passed to `sari.compute`:
  one source, one prediction, and one reference list holding a single reference.
  """
  result = sari.compute(
      sources=[X],
      predictions=[y],
      references=[[ref]],
  )
  return result["sari"]

### Validation scores

In [None]:
# Student dev: mean ROUGE F-measure of the student outputs scored against the Expert column
dev_experts = X_dev["Expert"].values
n_dev = len(X_dev)
rouge1_f_dev, rougeL_f_dev = 0.0, 0.0

for i, expert_text in enumerate(dev_experts):
  scores = rouge(expert_text, y_student_dev[i])
  rouge1_f_dev += scores["rouge1"].fmeasure
  rougeL_f_dev += scores["rougeL"].fmeasure

rouge1_f_dev = rouge1_f_dev / n_dev
rougeL_f_dev = rougeL_f_dev / n_dev

print(f"ROUGE-1 F-measure: {rouge1_f_dev}")
print(f"ROUGE-L F-measure: {rougeL_f_dev}")

ROUGE-1 F-measure: 0.8066125130779112
ROUGE-L F-measure: 0.8008735147445086


In [None]:
# Teacher dev: mean ROUGE F-measure of the teacher outputs scored against the Expert column
dev_experts = X_dev["Expert"].values
n_dev = len(X_dev)
rouge1_f_dev, rougeL_f_dev = 0.0, 0.0

for i, expert_text in enumerate(dev_experts):
  scores = rouge(expert_text, y_teacher_dev[i])
  rouge1_f_dev += scores["rouge1"].fmeasure
  rougeL_f_dev += scores["rougeL"].fmeasure

rouge1_f_dev = rouge1_f_dev / n_dev
rougeL_f_dev = rougeL_f_dev / n_dev

print(f"ROUGE-1 F-measure: {rouge1_f_dev}")
print(f"ROUGE-L F-measure: {rougeL_f_dev}")

ROUGE-1 F-measure: 0.6851174307198339
ROUGE-L F-measure: 0.660336441935362


In [None]:
# Original Simple: mean ROUGE F-measure of the dataset's Simple column scored against the Expert column
dev_experts = X_dev["Expert"].values
dev_simples = X_dev["Simple"].values
n_dev = len(X_dev)
rouge1_f_dev, rougeL_f_dev = 0.0, 0.0

for i, expert_text in enumerate(dev_experts):
  scores = rouge(expert_text, dev_simples[i])
  rouge1_f_dev += scores["rouge1"].fmeasure
  rougeL_f_dev += scores["rougeL"].fmeasure

rouge1_f_dev = rouge1_f_dev / n_dev
rougeL_f_dev = rougeL_f_dev / n_dev

print(f"ROUGE-1 F-measure: {rouge1_f_dev}")
print(f"ROUGE-L F-measure: {rougeL_f_dev}")

ROUGE-1 F-measure: 0.5439899701984097
ROUGE-L F-measure: 0.4990515538425681


In [None]:
# Original Expert: mean Flesch scores over the dev Expert column
n_dev = len(X_dev)
fre_dev, fkgl_dev = 0.0, 0.0

for expert_text in X_dev["Expert"].values:
  fre, fkgl = flesch(expert_text)
  fre_dev += fre
  fkgl_dev += fkgl

fre_dev = fre_dev / n_dev
fkgl_dev = fkgl_dev / n_dev

print(f"Flesch Reading Ease Score: {fre_dev}")
print(f"Flesch Kincaid Grade Level: {fkgl_dev}")

Flesch Reading Ease Score: 36.826087809038576
Flesch Kincaid Grade Level: 13.026700713356318


In [None]:
# Original Simple: mean Flesch scores over the dev Simple column
n_dev = len(X_dev)
fre_dev, fkgl_dev = 0.0, 0.0

for simple_text in X_dev["Simple"].values:
  fre, fkgl = flesch(simple_text)
  fre_dev += fre
  fkgl_dev += fkgl

fre_dev = fre_dev / n_dev
fkgl_dev = fkgl_dev / n_dev

print(f"Flesch Reading Ease Score: {fre_dev}")
print(f"Flesch Kincaid Grade Level: {fkgl_dev}")

Flesch Reading Ease Score: 49.10630040968796
Flesch Kincaid Grade Level: 11.5716440127897


In [None]:
# Student: mean Flesch scores over the student model's dev outputs
fre_dev = 0.0
fkgl_dev = 0.0

for y in y_student_dev:
  fre, fkgl = flesch(y)
  fre_dev += fre
  fkgl_dev += fkgl

# Divide by the size of the collection that was actually iterated, so the mean
# stays correct even if the number of outputs differs from len(X_dev).
n_student_dev = len(y_student_dev)
fre_dev /= n_student_dev
fkgl_dev /= n_student_dev

print(f"Flesch Reading Ease Score: {fre_dev}")
print(f"Flesch Kincaid Grade Level: {fkgl_dev}")

Flesch Reading Ease Score: 31.318027892589058
Flesch Kincaid Grade Level: 13.211549544275194


In [None]:
# Teacher: mean Flesch scores over the teacher model's dev outputs
fre_dev = 0.0
fkgl_dev = 0.0

for y in y_teacher_dev:
  fre, fkgl = flesch(y)
  fre_dev += fre
  fkgl_dev += fkgl

# Divide by the size of the collection that was actually iterated, so the mean
# stays correct even if the number of outputs differs from len(X_dev).
n_teacher_dev = len(y_teacher_dev)
fre_dev /= n_teacher_dev
fkgl_dev /= n_teacher_dev

print(f"Flesch Reading Ease Score: {fre_dev}")
print(f"Flesch Kincaid Grade Level: {fkgl_dev}")

Flesch Reading Ease Score: 48.936040195240615
Flesch Kincaid Grade Level: 10.715099471891381


In [None]:
# Student: mean SARI of the student dev outputs (Expert column as source, Simple column as reference)
dev_sources = X_dev["Expert"].values
dev_references = X_dev["Simple"].values
sari_score_dev = 0.0

for i, source_text in enumerate(dev_sources):
  sari_score_dev += compute_sari(source_text, y_student_dev[i], dev_references[i])

sari_score_dev = sari_score_dev / len(X_dev)

print(f"Sari Score: {sari_score_dev}")

Sari Score: 40.961821885002955


In [None]:
# Teacher: mean SARI of the teacher dev outputs (Expert column as source, Simple column as reference)
dev_sources = X_dev["Expert"].values
dev_references = X_dev["Simple"].values
sari_score_dev = 0.0

for i, source_text in enumerate(dev_sources):
  sari_score_dev += compute_sari(source_text, y_teacher_dev[i], dev_references[i])

sari_score_dev = sari_score_dev / len(X_dev)

print(f"Sari Score: {sari_score_dev}")

Sari Score: 43.909374453978636


### Test scores

In [None]:
# Student: mean ROUGE F-measure of the student test outputs scored against the Expert column
test_experts = X_test["Expert"].values
n_test = len(X_test)
rouge1_f_test, rougeL_f_test = 0.0, 0.0

for i, expert_text in enumerate(test_experts):
  scores = rouge(expert_text, y_student_test[i])
  rouge1_f_test += scores["rouge1"].fmeasure
  rougeL_f_test += scores["rougeL"].fmeasure

rouge1_f_test = rouge1_f_test / n_test
rougeL_f_test = rougeL_f_test / n_test

print(f"ROUGE-1 F-measure: {rouge1_f_test}")
print(f"ROUGE-L F-measure: {rougeL_f_test}")

ROUGE-1 F-measure: 0.8229392908677232
ROUGE-L F-measure: 0.8175084154689236


In [None]:
# Teacher: mean ROUGE F-measure of the teacher test outputs scored against the Expert column
test_experts = X_test["Expert"].values
n_test = len(X_test)
rouge1_f_test, rougeL_f_test = 0.0, 0.0

for i, expert_text in enumerate(test_experts):
  scores = rouge(expert_text, y_teacher_test[i])
  rouge1_f_test += scores["rouge1"].fmeasure
  rougeL_f_test += scores["rougeL"].fmeasure

rouge1_f_test = rouge1_f_test / n_test
rougeL_f_test = rougeL_f_test / n_test

print(f"ROUGE-1 F-measure: {rouge1_f_test}")
print(f"ROUGE-L F-measure: {rougeL_f_test}")

ROUGE-1 F-measure: 0.7006819399606844
ROUGE-L F-measure: 0.6774642757408771


In [None]:
# Dataset simple: mean ROUGE F-measure of the test Simple column scored against the Expert column
test_experts = X_test["Expert"].values
test_simples = X_test["Simple"].values
n_test = len(X_test)
rouge1_f_test, rougeL_f_test = 0.0, 0.0

for i, expert_text in enumerate(test_experts):
  scores = rouge(expert_text, test_simples[i])
  rouge1_f_test += scores["rouge1"].fmeasure
  rougeL_f_test += scores["rougeL"].fmeasure

rouge1_f_test = rouge1_f_test / n_test
rougeL_f_test = rougeL_f_test / n_test

print(f"ROUGE-1 F-measure: {rouge1_f_test}")
print(f"ROUGE-L F-measure: {rougeL_f_test}")

ROUGE-1 F-measure: 0.5259428633457133
ROUGE-L F-measure: 0.4830311462346027


In [None]:
# Original medical text: mean Flesch scores over the test Expert column
n_test = len(X_test)
fre_test, fkgl_test = 0.0, 0.0

for expert_text in X_test["Expert"].values:
  fre, fkgl = flesch(expert_text)
  fre_test += fre
  fkgl_test += fkgl

fre_test = fre_test / n_test
fkgl_test = fkgl_test / n_test

print(f"Flesch Reading Ease Score: {fre_test}")
print(f"Flesch Kincaid Grade Level: {fkgl_test}")

Flesch Reading Ease Score: 39.646411957824604
Flesch Kincaid Grade Level: 12.258081970815644


In [None]:
# Simple medical text: mean Flesch scores over the test Simple column
n_test = len(X_test)
fre_test, fkgl_test = 0.0, 0.0

for simple_text in X_test["Simple"].values:
  fre, fkgl = flesch(simple_text)
  fre_test += fre
  fkgl_test += fkgl

fre_test = fre_test / n_test
fkgl_test = fkgl_test / n_test

print(f"Flesch Reading Ease Score: {fre_test}")
print(f"Flesch Kincaid Grade Level: {fkgl_test}")

Flesch Reading Ease Score: 53.100896209940565
Flesch Kincaid Grade Level: 10.239037530776054


In [None]:
# Student: mean Flesch scores over the student model's test outputs
fre_test = 0.0
fkgl_test = 0.0

for y in y_student_test:
  fre, fkgl = flesch(y)
  fre_test += fre
  fkgl_test += fkgl

# Divide by the size of the collection that was actually iterated, so the mean
# stays correct even if the number of outputs differs from len(X_test).
n_student_test = len(y_student_test)
fre_test /= n_student_test
fkgl_test /= n_student_test

print(f"Flesch Reading Ease Score: {fre_test}")
print(f"Flesch Kincaid Grade Level: {fkgl_test}")

Flesch Reading Ease Score: 39.28209262859046
Flesch Kincaid Grade Level: 11.763749652408329


In [None]:
# Teacher: mean Flesch scores over the teacher model's test outputs
fre_test = 0.0
fkgl_test = 0.0

for y in y_teacher_test:
  fre, fkgl = flesch(y)
  fre_test += fre
  fkgl_test += fkgl

# Divide by the size of the collection that was actually iterated, so the mean
# stays correct even if the number of outputs differs from len(X_test).
n_teacher_test = len(y_teacher_test)
fre_test /= n_teacher_test
fkgl_test /= n_teacher_test

print(f"Flesch Reading Ease Score: {fre_test}")
print(f"Flesch Kincaid Grade Level: {fkgl_test}")

Flesch Reading Ease Score: 50.7915697290646
Flesch Kincaid Grade Level: 10.43214894243939


In [None]:
# Student: mean SARI of the student test outputs (Expert column as source, Simple column as reference)
test_sources = X_test["Expert"].values
test_references = X_test["Simple"].values
sari_score_test = 0.0

for i, source_text in enumerate(test_sources):
  sari_score_test += compute_sari(source_text, y_student_test[i], test_references[i])

sari_score_test = sari_score_test / len(X_test)

print(f"Sari Score: {sari_score_test}")

Sari Score: 41.45003621398768


In [None]:
# Teacher: mean SARI of the teacher test outputs (Expert column as source, Simple column as reference)
test_sources = X_test["Expert"].values
test_references = X_test["Simple"].values
sari_score_test = 0.0

for i, source_text in enumerate(test_sources):
  sari_score_test += compute_sari(source_text, y_teacher_test[i], test_references[i])

sari_score_test = sari_score_test / len(X_test)

print(f"Sari Score: {sari_score_test}")

Sari Score: 45.76244494468783


In [None]:
# Print every dev example side by side: source, dataset simplification,
# teacher output and student output, followed by a separator line.
dev_experts = X_dev['Expert'].values
dev_simples = X_dev['Simple'].values

for i, expert_text in enumerate(dev_experts):
  print(f"Expert: {expert_text}")
  print(f"Simple: {dev_simples[i]}")
  print(f"Teacher: {y_teacher_dev[i]}")
  print(f"Student: {y_student_dev[i]}")
  print("============================================")

Expert: Nonsurgical treatment fails in about 40 to 70 % of patients, necessitating surgical excision.
Simple: In about 40 to 70 % of people, surgical removal may be necessary.
Teacher: Nonsurgical treatment fails in about 40 to 70 % of people, requiring surgical excision.
Student: Nonsurgical treatment fails in about 40 to 70 % of patients, necessitating surgical excision.
Expert: In about 10 to 20 % of patients, discrete pink, blanching lesions (rose spots) appear in crops on the chest and abdomen during the 2nd wk and resolve in 2 to 5 days.
Simple: During the second week, a rash of flat, rose-colored spots develops on the chest and abdomen of about 10 to 20 % of people.
Teacher: During the 2nd week, small, raised sores (rose spots) appear on the chest and abdomen. They resolve in about 2 to 5 days, and the sores usually subside within a few days.
Student: In about 10 % of patients, discrete pink, blanching lesions (rose spots) appear in crops on the chest and abdomen during the 2nd 