In [22]:
import pandas as pd
from readability import Readability
from collections import defaultdict
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Short run name -> model name used as the prefix of the output files.
model_mapping = {
    "llama1b": "Llama-3.2-1B-Instruct",
    "llama8b": "Llama-3.1-8B-Instruct",
    "llama70b": "Llama-3.3-70B-Instruct-Turbo"
}

# Score columns added to each output table. Every name is also the name of the
# `Readability` method that produces the score.
#   flesch: lower = more complicated
#   all others: higher = more complicated
# "smog" was disabled in the original cell and is still not computed.
readability_metrics = [
    "ari",
    "coleman_liau",
    "flesch",
    "flesch_kincaid",
    "gunning_fog",
    "linsear_write",
    "dale_chall",
]

# Responses with this many whitespace-separated words or fewer are skipped.
# The metrics need at least 100 words; the cut-off of 106 is kept from the
# original code -- presumably a safety margin because the library counts words
# differently from str.split(). TODO confirm.
max_skipped_word_count = 106

for model in ["llama70b"]:
    for length in ["cutoff", "full"]:
        for persona in ["persona", "np"]:
            # "cutoff" runs live in "<model>-cutoff-<persona>", full-length
            # runs in "<model>-<persona>".
            if length == "cutoff":
                run_dir = f"../output/{model}-{length}-{persona}"
            else:
                run_dir = f"../output/{model}-{persona}"
            file_prefix = f"{run_dir}/{model_mapping[model]}"

            df = pd.read_csv(f"{file_prefix}_dolly_output.tsv", sep="\t")
            for metric in readability_metrics:
                df[metric] = None

            for i, row in df.iterrows():
                print("{}/{}".format(i, len(df)))
                text = row["response"]
                # Empty cells are read back as NaN (a float), which has no
                # .split(); leave the scores of such rows empty.
                if not isinstance(text, str):
                    continue
                word_count = len(text.split())
                if word_count <= max_skipped_word_count:
                    continue
                print(word_count)
                r = Readability(text)
                for metric in readability_metrics:
                    df.at[i, metric] = getattr(r, metric)().score

            df.to_csv(f"{file_prefix}_readability.tsv", sep="\t", index=False)
            

In [52]:
# Short run name -> model name used as the prefix of the output files.
model_mapping = {
    "llama1b": "Llama-3.2-1B-Instruct",
    "llama8b": "Llama-3.1-8B-Instruct",
    "llama70b": "Llama-3.3-70B-Instruct-Turbo"
}

# (label stored in the "metric" column, score column in *_readability.tsv)
metric_columns = [
    ("ARI", "ari"),
    ("Coleman-Liau", "coleman_liau"),
    ("Flesch", "flesch"),
    ("Flesch-Kincaid", "flesch_kincaid"),
    ("Gunning Fog", "gunning_fog"),
    ("Linsear Write", "linsear_write"),
    ("Dale Chall", "dale_chall"),
]

# Collect one row per (run, metric) and build the frame once at the end,
# instead of enlarging a DataFrame row by row.
summary_rows = []
for model in ["llama1b", "llama8b", "llama70b"]:
    for length in ["cutoff", "full"]:
        for persona in ["persona", "np"]:
            # "cutoff" runs live in "<model>-cutoff-<persona>", full-length
            # runs in "<model>-<persona>".
            if length == "cutoff":
                run_dir = f"../output/{model}-{length}-{persona}"
            else:
                run_dir = f"../output/{model}-{persona}"
            df = pd.read_csv(f"{run_dir}/{model_mapping[model]}_readability.tsv", sep="\t")
            print(f"Model: {model}, Length: {length}, Persona: {persona}")

            # Mean and standard deviation of every readability score.
            # Note: the "model" column holds the full run name
            # (model-length-persona), not just the model.
            run_name = f"{model}-{length}-{persona}"
            for label, column in metric_columns:
                summary_rows.append([run_name, label, df[column].mean(), df[column].std()])

averages = pd.DataFrame(summary_rows, columns=["model", "metric", "value", "std"])
averages.to_csv("../output/readability_averages.tsv", sep="\t", index=False)

Model: llama1b, Length: cutoff, Persona: persona
Model: llama1b, Length: cutoff, Persona: np
Model: llama1b, Length: full, Persona: persona
Model: llama1b, Length: full, Persona: np
Model: llama8b, Length: cutoff, Persona: persona
Model: llama8b, Length: cutoff, Persona: np
Model: llama8b, Length: full, Persona: persona
Model: llama8b, Length: full, Persona: np
Model: llama70b, Length: cutoff, Persona: persona
Model: llama70b, Length: cutoff, Persona: np
Model: llama70b, Length: full, Persona: persona
Model: llama70b, Length: full, Persona: np


In [None]:
# Grouped bar chart of every metric except Flesch, one group per run.
#
# The summary is reloaded from disk and stored under a new name, so this cell
# neither depends on nor modifies the in-memory `averages`; re-running it
# always produces the same figure.
grouped_scores = (
    pd.read_csv("../output/readability_averages.tsv", sep="\t")
    .loc[lambda frame: frame["metric"] != "Flesch"]
    # error bars should be standard deviation / 2
    .assign(half_std=lambda frame: frame["std"] / 2)
)

# `color="metric"` creates one trace per metric. Passing the error column to
# px.bar lets plotly split it per trace; setting a single array through
# update_traces would give every trace the same values, taken from the start
# of the table instead of from its own rows.
fig = px.bar(
    grouped_scores,
    x="model",
    y="value",
    color="metric",
    barmode="group",
    error_y="half_std",
    title="Readability Metrics",
)
# add tick marks on y-axis
fig.update_yaxes(tickvals=np.arange(0, 20, 2))
# add values to bars
fig.update_traces(texttemplate='%{value:.2f}', textposition='outside')
fig.show()


0      5.277316
1      1.638908
3      4.040528
4      4.198162
5      5.980986
        ...    
78     1.603431
80     6.977651
81     7.372367
82    11.188726
83     1.112460
Name: std, Length: 72, dtype: float64


In [None]:
# Bar chart of the Flesch score alone, one bar per run.
averages = (
    pd.read_csv("../output/readability_averages.tsv", sep="\t")
    .loc[lambda table: table["metric"] == "Flesch"]
    # The error bars show half of the standard deviation.
    .assign(std=lambda table: table["std"] / 2)
)

fig = px.bar(averages, x="model", y="value", title="Flesch Score")
# Print each value inside its bar and attach the error bars in one call.
fig.update_traces(
    texttemplate='%{value:.2f}',
    textposition='inside',
    error_y={"type": 'data', "array": averages["std"]},
)
fig.show()