In [51]:
import numpy as np
import pandas as pd

#### The first part of the form has 3 questions for each pair of stanzas.  
#### The second part of the form has 2 questions for each stanza

# Collect annotations for pairwise comparisons

In [2]:
def get_pairwise_annotation_df(form_csv_path, source_tsv_path):
    """Attach the pairwise (side-by-side) form answers to the source stanzas.

    The source TSV lists the stanzas in presentation order: rows ``2k`` and
    ``2k + 1`` form pair ``k``, and exactly one of the two must have
    ``type == "orig"``.  For every pair the form CSV holds three questions:
    which verse was written by a human, and a rhyme rating for verse 1 and
    for verse 2.  Repeated question headers are looked up with the ``.1``,
    ``.2``, ... suffixes that ``pandas.read_csv`` appends to duplicate column
    names.

    Parameters
    ----------
    form_csv_path : str or path-like
        CSV export of the form answers, one row per response.
    source_tsv_path : str or path-like
        Tab-separated file with (at least) a ``type`` column.

    Returns
    -------
    pandas.DataFrame
        The source table with three added columns:
        ``order_in_pair`` (1 or 2), ``rated_human`` (one bool per response,
        True when that response picked this stanza as the human-written one)
        and ``rhyme_rating`` (one int per response).

    Raises
    ------
    ValueError
        If the source has an odd number of rows, or if any pair does not
        contain exactly one ``orig`` stanza.  Without this check such input
        could be processed with answers attached to the wrong pairs.
    """
    # Transpose so every question header becomes a row label whose values
    # are the answers given by each response.
    answers = pd.read_csv(form_csv_path).T
    source_df = pd.read_csv(source_tsv_path, sep="\t")

    n_rows = len(source_df)
    if n_rows % 2:
        raise ValueError(
            f"{source_tsv_path}: expected an even number of rows "
            f"(two stanzas per pair), got {n_rows}"
        )
    n_pairs = n_rows // 2
    source_df["order_in_pair"] = [1, 2] * n_pairs

    is_orig = list(source_df["type"] == "orig")
    bad_pairs = [p for p in range(n_pairs) if is_orig[2 * p] == is_orig[2 * p + 1]]
    if bad_pairs:
        raise ValueError(
            f"{source_tsv_path}: every pair must contain exactly one 'orig' "
            f"stanza; offending pair numbers: {bad_pairs}"
        )

    choose_human_prompt = "Hvilket vers er skrevet av et menneske?"
    rhymes_v1 = "Er vers 1 skrevet på rim?"
    rhymes_v2 = "Er vers 2 skrevet på rim?"

    rated_human = []
    rhyme_rating = []

    for pair in range(n_pairs):
        # The first occurrence of a header is unsuffixed; later ones get ".<n>".
        suffix = f".{pair}" if pair else ""
        orig_is_first = is_orig[2 * pair]
        gold = 1 if orig_is_first else 2

        # Answers look like "<word> <number>"; the number is the chosen verse.
        picked_orig = [
            int(answer.split()[1]) == gold
            for answer in answers.loc[choose_human_prompt + suffix]
        ]
        picked_generated = [not picked for picked in picked_orig]

        # Keep the per-stanza lists in the same order as the source rows.
        if orig_is_first:
            rated_human.extend([picked_orig, picked_generated])
        else:
            rated_human.extend([picked_generated, picked_orig])

        rhyme_rating.append(list(answers.loc[rhymes_v1 + suffix].apply(int)))
        rhyme_rating.append(list(answers.loc[rhymes_v2 + suffix].apply(int)))

    source_df["rated_human"] = rated_human
    source_df["rhyme_rating"] = rhyme_rating
    return source_df

# Collect annotations for standalone verses 

In [3]:
def get_standalone_annotation_df(form_csv_path, source_tsv_path):
    """Attach the standalone-verse form answers to the source stanzas.

    For the n-th row of the source TSV the form CSV is expected to contain
    the n-th occurrence of two questions: whether the verse rhymes (answered
    with something ``int`` can parse) and whether it was written by a human
    (counted as True only when the answer is exactly "Ja").  Repeated headers
    are addressed through the ``.1``, ``.2``, ... suffixes that
    ``pandas.read_csv`` gives to duplicate column names.

    Returns the source table with two added columns, ``rated_human`` (list of
    bools, one per response) and ``rhyme_rating`` (list of ints, one per
    response).
    """
    # Questions become row labels; each row holds one answer per response.
    answers = pd.read_csv(form_csv_path).T
    source_df = pd.read_csv(source_tsv_path, sep="\t")

    rhymes_q = "Er verset skrevet på rim?"
    human_q = "Er verset skrevet av et menneske?"

    def nth_header(question, n):
        # First occurrence is unsuffixed, later ones are "<question>.<n>".
        return f"{question}.{n}" if n else question

    human_votes = []
    rhyme_scores = []
    for n in range(len(source_df)):
        human_votes.append(
            [answer == "Ja" for answer in answers.loc[nth_header(human_q, n)]]
        )
        rhyme_scores.append(list(answers.loc[nth_header(rhymes_q, n)].apply(int)))

    source_df["rated_human"] = human_votes
    source_df["rhyme_rating"] = rhyme_scores
    return source_df

# Summary 

In [94]:
def get_avg_rhyme_rating(df):
    """Return the mean of all individual rhyme ratings in ``df``, rounded to 2 decimals.

    ``df["rhyme_rating"]`` holds one list of ratings per row; the lists are
    pooled, so every single rating carries equal weight.
    """
    pooled_ratings = []
    for ratings in df["rhyme_rating"]:
        pooled_ratings.extend(ratings)
    return round(np.mean(pooled_ratings), 2)

def get_rated_human_percentage(df, not_human=False):
    """Return the percentage of individual votes that rated a stanza as human-written.

    ``df["rated_human"]`` holds one list of booleans per row; all votes are
    pooled before averaging.  With ``not_human=True`` the votes are negated
    first, giving the percentage of "not written by a human" votes instead.
    The result is rounded to 2 decimals.
    """
    votes = []
    for per_stanza in df["rated_human"]:
        votes.extend(per_stanza)
    if not_human:
        votes = [not vote for vote in votes]
    share = np.mean(votes)
    return round(share * 100, 2)

# Look at results

In [76]:
# Load the four numbered form exports.  Each export covers both parts of the
# form, so the same CSV is paired with its side-by-side TSV and its
# standalone ("rest") TSV.
pairwise_parts = []
standalone_parts = []

for i in range(1, 5):
    form_csv = f"annotations/Evaluering av AI-genererte dikt og rim {i}.csv"
    pairwise_parts.append(
        get_pairwise_annotation_df(form_csv, f"side_by_side_form_{i}.tsv")
    )
    standalone_parts.append(
        get_standalone_annotation_df(form_csv, f"rest_{i}.tsv")
    )

# Concatenate once instead of growing a frame inside the loop: this avoids
# repeated copying and does not seed the result with an empty DataFrame.
# Row labels are kept as read from each file, so index values repeat across forms.
side_by_side_df = pd.concat(pairwise_parts)
standalone_df = pd.concat(standalone_parts)

In [77]:
# Display the combined standalone annotations (all forms concatenated).
standalone_df

Unnamed: 0.1,Unnamed: 0,type,scheme,stanza,rated_human,rhyme_rating
0,5,line,ABCB,med yndig majestet han går \n som gjorde sinn...,"[False, True, True, False, False, False]","[3, 3, 3, 3, 3, 2]"
1,3,orig,AAB,hva er det for et mektig lys\ndet farer som et...,"[True, True, False, True, True, True]","[2, 3, 3, 2, 2, 2]"
2,4,orig,AABB,skurken saulus løfter armen\natter dette stikk...,"[True, True, False, False, True, True]","[3, 3, 3, 3, 3, 3]"
3,0,orig,ABCCAB,frysende skinner det store og sanne\nvarme har...,"[True, False, False, False, True, True]","[2, 3, 3, 3, 3, 2]"
4,15,base,-,og til han far elsket sønn så fordum lystig v...,"[True, False, False, False, False, False]","[3, 3, 3, 2, 2, 1]"
...,...,...,...,...,...,...
15,9,line,ABABCC,skjønt hver av dem er kraftig som en hest \n ...,"[False, True, False, False, False]","[2, 2, 2, 2, 2]"
16,19,base,-,hvor vred ble det kryp i sne \n som bølger i ...,"[True, False, False, True, True]","[1, 1, 1, 1, 3]"
17,18,base,-,ta dem dog av meg selv \n over denne dag jeg ...,"[False, False, False, False, False]","[0, 0, 0, 0, 0]"
18,7,line,ABABCC,og enda ser a ut som a var bare skinn og bein...,"[False, True, False, True, True]","[3, 3, 3, 3, 3]"


In [78]:
# Display the combined pairwise (side-by-side) annotations (all forms concatenated).
side_by_side_df

Unnamed: 0.1,Unnamed: 0,type,scheme,stanza,pair_number,order_in_pair,rated_human,rhyme_rating
0,20,stanza,ABAB,her hørte dalen her så det seg svang \n runde...,0,1,"[False, False, True, False, False, False]","[3, 3, 2, 3, 3, 1]"
1,0,orig,AABCCB,sjo-i sjo-i hyss\nsang jeg om et kyss\nnei det...,0,2,"[True, True, False, True, True, True]","[3, 3, 3, 3, 3, 3]"
2,1,orig,ABAB,du den høyeste blandt høye\ndu den viseste bla...,1,1,"[True, True, False, False, True, True]","[3, 3, 3, 3, 3, 3]"
3,18,line,ABAB,inn i fjellets flammesky \n en lovsangs brus ...,1,2,"[False, False, True, True, False, False]","[3, 3, 3, 3, 3, 0]"
4,16,line,ABAB,landets fiender og dro \n la min unge søsters...,2,1,"[False, True, True, False, False, False]","[3, 3, 3, 3, 3, 1]"
...,...,...,...,...,...,...,...,...
25,27,base,-,jeg er død som en byrde i mørket lyser \n og ...,12,2,"[True, False, False, False, True]","[0, 0, 0, 0, 0]"
26,13,orig,ABAB,for henne selv ble livet nu\nen natt hvor inge...,13,1,"[True, True, True, True, True]","[3, 3, 3, 3, 3]"
27,28,base,-,ti falt ei het og vill \n av byens siste natt...,13,2,"[False, False, False, False, False]","[0, 0, 0, 0, 0]"
28,14,orig,AABB,å nå forstår jeg det var ham\njeg drømte om de...,14,1,"[True, True, True, True, True]","[3, 3, 3, 3, 3]"


In [95]:
# Split the pairwise annotations by how each stanza was produced.
pairwise_kind = side_by_side_df["type"]
base = side_by_side_df.loc[pairwise_kind == "base"]
line = side_by_side_df.loc[pairwise_kind == "line"]
stanza = side_by_side_df.loc[pairwise_kind == "stanza"]
orig = side_by_side_df.loc[pairwise_kind == "orig"]

# For every subset report the mean rhyme rating and the share of
# "written by a human" votes.
pairwise_report = f"""
PAIRWISE ratings results:
    Original stanzas:
        Average rhyme rating                   : {get_avg_rhyme_rating(orig)}
        % of times rated 'written by a human'  : {get_rated_human_percentage(orig)}
    
    Baseline generation:
        Average rhyme rating                   : {get_avg_rhyme_rating(base)}
        % of times rated 'written by a human'  : {get_rated_human_percentage(base)}
    
    Line-level generation:
        Average rhyme rating                   : {get_avg_rhyme_rating(line)}
        % of times rated 'written by a human'  : {get_rated_human_percentage(line)}
        
    Stanza-level generation:
        Average rhyme rating                   : {get_avg_rhyme_rating(stanza)}
        % of times rated 'written by a human'  : {get_rated_human_percentage(stanza)}
"""
print(pairwise_report)


PAIRWISE ratings results:
    Original stanzas:
        Average rhyme rating                   : 2.6
        % of times rated 'written by a human'  : 85.76
    
    Baseline generation:
        Average rhyme rating                   : 0.54
        % of times rated 'written by a human'  : 16.36
    
    Line-level generation:
        Average rhyme rating                   : 2.32
        % of times rated 'written by a human'  : 18.18
        
    Stanza-level generation:
        Average rhyme rating                   : 2.0
        % of times rated 'written by a human'  : 8.18



In [96]:
# Split the standalone annotations by how each stanza was produced.
standalone_kind = standalone_df["type"]
base = standalone_df.loc[standalone_kind == "base"]
line = standalone_df.loc[standalone_kind == "line"]
stanza = standalone_df.loc[standalone_kind == "stanza"]
orig = standalone_df.loc[standalone_kind == "orig"]

# For every subset report the mean rhyme rating and the share of
# "written by a human" votes.
standalone_report = f"""
STANDALONE ratings results:
    Original stanzas:
        Average rhyme rating                   : {get_avg_rhyme_rating(orig)}
        % of times rated 'written by a human'  : {get_rated_human_percentage(orig)}
    
    Baseline generation:
        Average rhyme rating                   : {get_avg_rhyme_rating(base)}
        % of times rated 'written by a human'  : {get_rated_human_percentage(base)}
    
    Line-level generation:
        Average rhyme rating                   : {get_avg_rhyme_rating(line)}
        % of times rated 'written by a human'  : {get_rated_human_percentage(line)}
        
    Stanza-level generation:
        Average rhyme rating                   : {get_avg_rhyme_rating(stanza)}
        % of times rated 'written by a human'  : {get_rated_human_percentage(stanza)}
"""
print(standalone_report)


STANDALONE ratings results:
    Original stanzas:
        Average rhyme rating                   : 2.67
        % of times rated 'written by a human'  : 83.64
    
    Baseline generation:
        Average rhyme rating                   : 0.88
        % of times rated 'written by a human'  : 24.55
    
    Line-level generation:
        Average rhyme rating                   : 2.22
        % of times rated 'written by a human'  : 46.36
        
    Stanza-level generation:
        Average rhyme rating                   : 1.8
        % of times rated 'written by a human'  : 8.18

