# T5 Transformer

In [1]:
import numpy as np;

T5_cs_ev = np.array([0.4112535185409239, 0.5122027395630958, 0.26269611892003886, 0.33008781791570163, 0.23009986874192218, 0.29469123164276817, 0.4481797561735765, 0.2571025100822311, 0.35660521600328693, 0.11149659942457762])
T5_bleu = np.array([0.14954147455501682, 0.05133318333112993, 0.054195673596052416, 0.04462849525866479, 5.435732264276073e-79, 0.0634647120918607, 0.0425183192120761, 0.08494809541027601, 0.05674540410389525, 2.5696368610675503e-155])
T5_METEOR = np.array([0.6117765845151223, 0.29129591612924605, 0.2814659260262837, 0.387681035563304, 0.22057745198241066, 0.33737678232297363, 0.31996697080378006, 0.5012290418025945, 0.41801122299547716, 0.19950124688279303])

print(f"Average T5 Scores: cosine similarity score {np.mean(T5_cs_ev)} , average of BLEU score {np.mean(T5_bleu)} , average of METEOR {np.mean(T5_METEOR)}")

Average T5 Scores: cosine similarity score 0.3214415377008123 , average of BLEU score 0.0547375357558972 , average of METEOR 0.35688821790239855


In [2]:
from typing import List, Dict
from collections import namedtuple

# Define a namedtuple for the ROUGE scores
Score = namedtuple('Score', 'precision recall fmeasure')

# ROUGE scores for different evaluations
rouge_scores = [
    {'rouge1': Score(precision=0.2246376811594203, recall=0.7948717948717948, fmeasure=0.3502824858757062),
     'rouge2': Score(precision=0.17518248175182483, recall=0.631578947368421, fmeasure=0.2742857142857143),
     'rougeL': Score(precision=0.21739130434782608, recall=0.7692307692307693, fmeasure=0.3389830508474576)},
    {'rouge1': Score(precision=0.17647058823529413, recall=0.7346938775510204, fmeasure=0.2845849802371542),
     'rouge2': Score(precision=0.06896551724137931, recall=0.2916666666666667, fmeasure=0.11155378486055777),
     'rougeL': Score(precision=0.12254901960784313, recall=0.5102040816326531, fmeasure=0.19762845849802368)},
    {'rouge1': Score(precision=0.11822660098522167, recall=0.5853658536585366, fmeasure=0.19672131147540983),
     'rouge2': Score(precision=0.0594059405940594, recall=0.3, fmeasure=0.09917355371900825),
     'rougeL': Score(precision=0.06403940886699508, recall=0.3170731707317073, fmeasure=0.10655737704918035)},
    {'rouge1': Score(precision=0.15151515151515152, recall=0.625, fmeasure=0.24390243902439024),
     'rouge2': Score(precision=0.07142857142857142, recall=0.30434782608695654, fmeasure=0.11570247933884298),
     'rougeL': Score(precision=0.10101010101010101, recall=0.4166666666666667, fmeasure=0.16260162601626016)},
    {'rouge1': Score(precision=0.08544303797468354, recall=0.6585365853658537, fmeasure=0.15126050420168066),
     'rouge2': Score(precision=0.012698412698412698, recall=0.1, fmeasure=0.022535211267605635),
     'rougeL': Score(precision=0.056962025316455694, recall=0.43902439024390244, fmeasure=0.10084033613445377)},
    {'rouge1': Score(precision=0.13705583756345177, recall=0.627906976744186, fmeasure=0.22499999999999998),
     'rouge2': Score(precision=0.05102040816326531, recall=0.23809523809523808, fmeasure=0.08403361344537814),
     'rougeL': Score(precision=0.09137055837563451, recall=0.4186046511627907, fmeasure=0.15)},
    {'rouge1': Score(precision=0.13147410358565736, recall=0.7021276595744681, fmeasure=0.22147651006711408),
     'rouge2': Score(precision=0.044, recall=0.2391304347826087, fmeasure=0.07432432432432431),
     'rougeL': Score(precision=0.05976095617529881, recall=0.3191489361702128, fmeasure=0.10067114093959731)},
    {'rouge1': Score(precision=0.2621359223300971, recall=0.6428571428571429, fmeasure=0.3724137931034483),
     'rouge2': Score(precision=0.09803921568627451, recall=0.24390243902439024, fmeasure=0.13986013986013987),
     'rougeL': Score(precision=0.1650485436893204, recall=0.40476190476190477, fmeasure=0.23448275862068965)},
    {'rouge1': Score(precision=0.1891891891891892, recall=0.8484848484848485, fmeasure=0.3093922651933702),
     'rouge2': Score(precision=0.08163265306122448, recall=0.375, fmeasure=0.1340782122905028),
     'rougeL': Score(precision=0.10135135135135136, recall=0.45454545454545453, fmeasure=0.16574585635359115)},
    {'rouge1': Score(precision=0.08695652173913043, recall=0.35714285714285715, fmeasure=0.13986013986013987),
     'rouge2': Score(precision=0.008771929824561403, recall=0.037037037037037035, fmeasure=0.014184397163120567),
     'rougeL': Score(precision=0.05217391304347826, recall=0.21428571428571427, fmeasure=0.0839160839160839)}
]

# Function to calculate average scores
def calculate_average_scores(rouge_scores: List[Dict[str, Score]]) -> Dict[str, Score]:
    # Initialize sums
    sum_scores = {key: Score(precision=0, recall=0, fmeasure=0) for key in rouge_scores[0].keys()}

    # Sum up all scores
    for scores in rouge_scores:
        for key, score in scores.items():
            sum_scores[key] = Score(
                precision=sum_scores[key].precision + score.precision,
                recall=sum_scores[key].recall + score.recall,
                fmeasure=sum_scores[key].fmeasure + score.fmeasure
            )

    # Calculate averages
    average_scores = {key: Score(
        precision=sum_score.precision / len(rouge_scores),
        recall=sum_score.recall / len(rouge_scores),
        fmeasure=sum_score.fmeasure / len(rouge_scores)
    ) for key, sum_score in sum_scores.items()}

    return average_scores

# Calculate and return average scores
average_scores = calculate_average_scores(rouge_scores)
average_scores

{'rouge1': Score(precision=0.15631046342772967, recall=0.6576987596250709, fmeasure=0.24948944290384137),
 'rouge2': Score(precision=0.06711451304495733, recall=0.2760758589061319, fmeasure=0.10697314305551948),
 'rougeL': Score(precision=0.10316571817843043, recall=0.4263545739431775, fmeasure=0.16414266883753376)}

he will be able to gamble in a casino buy a drink in a pub or see the horror film Hostel Part II currently six places below. he will be able to see the horror film Hostel Part II currently six places below. despite his growing fame and riches the actor says he is keeping his feet firmly on the ground. his latest outing as the boy wizard in Harry Potter and the Order of the Phoenix is breaking records on both sides of the Atlantic. he has filmed a tv movie called My Boy Jack about author Rudyard Kipling and his son due for release later this year He will also appear in December Boys an Australian film about four boys who escape an orphanage. he made his stage debut playing a tortured teenager in Peter Shaffer s Equus earlier this year.

# BART Transformer

In [3]:
BART_cs_ev = np.array([0.5370704273623851, 0.5237170832112014, 0.2122382874236857, 0.2570219889034265, 0.267363458454155, 0.32439025853747244, 0.3402558218753725, 0.2036475182641988, 0.4691296570802016, 0.14108683361247149, 3.1129734974218434e-155])
BART_bleu = np.array([0.209864568170383, 0.10501291293280349, 0.07970256924438435, 0.03682693546922536, 0.019116813335370467, 0.07187215550684418, 0.03261848475202703, 0.0495796869979592, 0.05815244807157816])
BART_METEOR = np.array([0.6832137645914398, 0.45410345185136475, 0.3433764094424756, 0.2989041877138667, 0.2876651141302811, 0.3446726993561499, 0.3000323700469166, 0.32336764705882354, 0.4589261167803141, 0.31695760598503736])

print(f"Average BART Scores: cosine similarity score {np.mean(BART_cs_ev)} , average of BLEU score {np.mean(BART_bleu)} , average of METEOR {np.mean(BART_METEOR)}")

Average BART Scores: cosine similarity score 0.29781103042950635 , average of BLEU score 0.07363850827561946 , average of METEOR 0.38112193669566696


In [4]:
from typing import List, Dict
from collections import namedtuple

# Define a namedtuple for the ROUGE scores
Score = namedtuple('Score', 'precision recall fmeasure')

# ROUGE scores for different evaluations
rouge_scores = [
    {'rouge1': Score(precision=0.3076923076923077, recall=0.9230769230769231, fmeasure=0.4615384615384615), 'rouge2': Score(precision=0.2413793103448276, recall=0.7368421052631579, fmeasure=0.3636363636363636), 'rougeL': Score(precision=0.2905982905982906, recall=0.8717948717948718, fmeasure=0.4358974358974359)},
    {'rouge1': Score(precision=0.22807017543859648, recall=0.7959183673469388, fmeasure=0.3545454545454545), 'rouge2': Score(precision=0.11176470588235295, recall=0.3958333333333333, fmeasure=0.1743119266055046), 'rougeL': Score(precision=0.14619883040935672, recall=0.5102040816326531, fmeasure=0.22727272727272724)},
    {'rouge1': Score(precision=0.12444444444444444, recall=0.6829268292682927, fmeasure=0.21052631578947367), 'rouge2': Score(precision=0.08035714285714286, recall=0.45, fmeasure=0.13636363636363638), 'rougeL': Score(precision=0.057777777777777775, recall=0.3170731707317073, fmeasure=0.09774436090225565)},
    {'rouge1': Score(precision=0.13274336283185842, recall=0.625, fmeasure=0.21897810218978103), 'rouge2': Score(precision=0.044642857142857144, recall=0.21739130434782608, fmeasure=0.07407407407407407), 'rougeL': Score(precision=0.07964601769911504, recall=0.375, fmeasure=0.1313868613138686)},
    {'rouge1': Score(precision=0.1111111111111111, recall=0.7073170731707317, fmeasure=0.1920529801324503), 'rouge2': Score(precision=0.038461538461538464, recall=0.25, fmeasure=0.06666666666666668), 'rougeL': Score(precision=0.07279693486590039, recall=0.4634146341463415, fmeasure=0.12582781456953643)},
    {'rouge1': Score(precision=0.16756756756756758, recall=0.7209302325581395, fmeasure=0.2719298245614035), 'rouge2': Score(precision=0.06521739130434782, recall=0.2857142857142857, fmeasure=0.10619469026548671), 'rougeL': Score(precision=0.10270270270270271, recall=0.4418604651162791, fmeasure=0.1666666666666667)},
    {'rouge1': Score(precision=0.11808118081180811, recall=0.6808510638297872, fmeasure=0.20125786163522014), 'rouge2': Score(precision=0.02962962962962963, recall=0.17391304347826086, fmeasure=0.05063291139240507), 'rougeL': Score(precision=0.05904059040590406, recall=0.3404255319148936, fmeasure=0.10062893081761007)},
    {'rouge1': Score(precision=0.1896551724137931, recall=0.5238095238095238, fmeasure=0.2784810126582279), 'rouge2': Score(precision=0.0782608695652174, recall=0.21951219512195122, fmeasure=0.11538461538461538), 'rougeL': Score(precision=0.1206896551724138, recall=0.3333333333333333, fmeasure=0.17721518987341772)},
    {'rouge1': Score(precision=0.216, recall=0.8181818181818182, fmeasure=0.34177215189873417), 'rouge2': Score(precision=0.0967741935483871, recall=0.375, fmeasure=0.15384615384615383), 'rougeL': Score(precision=0.12, recall=0.45454545454545453, fmeasure=0.189873417721519)},
    {'rouge1': Score(precision=0.11764705882352941, recall=0.5, fmeasure=0.19047619047619047), 'rouge2': Score(precision=0.03389830508474576, recall=0.14814814814814814, fmeasure=0.05517241379310345), 'rougeL': Score(precision=0.07563025210084033, recall=0.32142857142857145, fmeasure=0.12244897959183673)}
]

# Function to calculate average scores
def calculate_average_scores(rouge_scores: List[Dict[str, Score]]) -> Dict[str, Score]:
    # Initialize sums
    sum_scores = {key: Score(precision=0, recall=0, fmeasure=0) for key in rouge_scores[0].keys()}

    # Sum up all scores
    for scores in rouge_scores:
        for key, score in scores.items():
            sum_scores[key] = Score(
                precision=sum_scores[key].precision + score.precision,
                recall=sum_scores[key].recall + score.recall,
                fmeasure=sum_scores[key].fmeasure + score.fmeasure
            )

    # Calculate averages
    average_scores = {key: Score(
        precision=sum_score.precision / len(rouge_scores),
        recall=sum_score.recall / len(rouge_scores),
        fmeasure=sum_score.fmeasure / len(rouge_scores)
    ) for key, sum_score in sum_scores.items()}

    return average_scores

# Calculate and return average scores
average_scores = calculate_average_scores(rouge_scores)
average_scores

{'rouge1': Score(precision=0.17130123811350165, recall=0.6978011831242156, fmeasure=0.27215583554253964),
 'rouge2': Score(precision=0.08203859438210467, recall=0.32523544154069634, fmeasure=0.12962834520280095),
 'rougeL': Score(precision=0.11250810517323015, recall=0.44290801146441056, fmeasure=0.17749623846268742)}

Harry Potter star Daniel Radcliffe gains access to a reported 20 million 41 1 million fortune as he turns 18 on Monday. Radcliffe says he has no plans to fritter his cash away on fast cars drink and celebrity parties. Radcliffe s earnings from the first five Potter films have been held in a trust fund which he has not been able to touch. Details of how he ll mark his landmark birthday are under wraps. The Londoner has filmed a TV movie called My Boy Jack about author Rudyard Kipling and his son. He will also appear in December Boys an Australian film about four boys who escape an orphanage. Earlier this year he made his stage debut playing a tortured teenager in Peter Shaffer s Equus

# GPT Transformer

In [6]:
GPT_cs_ev = np.array([0.36085742689108996, 0.49380749182251815, 0.07778740451121077, 0.2552576067702628, 0.3246874064945309, 0.18473247134524573, 0.33362993647376604, 0.28290093428066077, 0.35162825977416934, 0.16218254333934326])
GPT_bleu = np.array([0.08275571211628001, 0.04646055781516015, 9.285743522911435e-79, 0.03550739701439182, 5.369641148426466e-155, 3.4580349761166087e-155, 1.515058646685186e-78, 0.0760951424958206, 1.4484515634481823e-78, 3.0598530310887206e-155])
GPT_METEOR = np.array([0.37161368244443355, 0.3661381152322822, 0.18197442046362908, 0.3655454950575272, 0.34790145985401455, 0.20561641453660237, 0.32203091521273336, 0.47863822638597414, 0.4197740365292726, 0.23474178403755872])

print(f"Average GPT Scores: cosine similarity score {np.mean(GPT_cs_ev)} , average of BLEU score {np.mean(GPT_bleu)} , average of METEOR {np.mean(GPT_METEOR)}")

Average GPT Scores: cosine similarity score 0.2827471481702798 , average of BLEU score 0.024081880944165256 , average of METEOR 0.3293974549754028


In [5]:
from typing import List, Dict
from collections import namedtuple

# Define a namedtuple for the ROUGE scores
Score = namedtuple('Score', 'precision recall fmeasure')

# ROUGE scores for different evaluations
rouge_scores = [
    {'rouge1': Score(precision=0.18518518518518517, recall=0.5128205128205128, fmeasure=0.272108843537415), 'rouge2': Score(precision=0.102803738317757, recall=0.2894736842105263, fmeasure=0.15172413793103448), 'rougeL': Score(precision=0.17592592592592593, recall=0.48717948717948717, fmeasure=0.2585034013605443)},
    {'rouge1': Score(precision=0.18787878787878787, recall=0.6326530612244898, fmeasure=0.2897196261682243), 'rouge2': Score(precision=0.06097560975609756, recall=0.20833333333333334, fmeasure=0.09433962264150944), 'rougeL': Score(precision=0.09696969696969697, recall=0.32653061224489793, fmeasure=0.14953271028037382)},
    {'rouge1': Score(precision=0.08641975308641975, recall=0.17073170731707318, fmeasure=0.11475409836065575), 'rouge2': Score(precision=0.0375, recall=0.075, fmeasure=0.05), 'rougeL': Score(precision=0.04938271604938271, recall=0.0975609756097561, fmeasure=0.06557377049180328)},
    {'rouge1': Score(precision=0.15384615384615385, recall=0.75, fmeasure=0.25531914893617025), 'rouge2': Score(precision=0.0603448275862069, recall=0.30434782608695654, fmeasure=0.10071942446043165), 'rougeL': Score(precision=0.09401709401709402, recall=0.4583333333333333, fmeasure=0.15602836879432622)},
    {'rouge1': Score(precision=0.2403846153846154, recall=0.6097560975609756, fmeasure=0.3448275862068965), 'rouge2': Score(precision=0.06796116504854369, recall=0.175, fmeasure=0.0979020979020979), 'rougeL': Score(precision=0.125, recall=0.3170731707317073, fmeasure=0.1793103448275862)},
    {'rouge1': Score(precision=0.14, recall=0.32558139534883723, fmeasure=0.19580419580419584), 'rouge2': Score(precision=0.020202020202020204, recall=0.047619047619047616, fmeasure=0.028368794326241134), 'rougeL': Score(precision=0.08, recall=0.18604651162790697, fmeasure=0.11188811188811189)},
    {'rouge1': Score(precision=0.28735632183908044, recall=0.5319148936170213, fmeasure=0.3731343283582089), 'rouge2': Score(precision=0.08139534883720931, recall=0.15217391304347827, fmeasure=0.10606060606060606), 'rougeL': Score(precision=0.14942528735632185, recall=0.2765957446808511, fmeasure=0.1940298507462687)},
    {'rouge1': Score(precision=0.25384615384615383, recall=0.7857142857142857, fmeasure=0.38372093023255816), 'rouge2': Score(precision=0.11627906976744186, recall=0.36585365853658536, fmeasure=0.17647058823529413), 'rougeL': Score(precision=0.12307692307692308, recall=0.38095238095238093, fmeasure=0.18604651162790697)},
    {'rouge1': Score(precision=0.26136363636363635, recall=0.696969696969697, fmeasure=0.3801652892561983), 'rouge2': Score(precision=0.08045977011494253, recall=0.21875, fmeasure=0.1176470588235294), 'rougeL': Score(precision=0.11363636363636363, recall=0.30303030303030304, fmeasure=0.1652892561983471)},
    {'rouge1': Score(precision=0.11188811188811189, recall=0.5714285714285714, fmeasure=0.18713450292397663), 'rouge2': Score(precision=0.014084507042253521, recall=0.07407407407407407, fmeasure=0.02366863905325444), 'rougeL': Score(precision=0.07692307692307693, recall=0.39285714285714285, fmeasure=0.1286549707602339)}
 ]

# Function to calculate average scores
def calculate_average_scores(rouge_scores: List[Dict[str, Score]]) -> Dict[str, Score]:
    # Initialize sums
    sum_scores = {key: Score(precision=0, recall=0, fmeasure=0) for key in rouge_scores[0].keys()}

    # Sum up all scores
    for scores in rouge_scores:
        for key, score in scores.items():
            sum_scores[key] = Score(
                precision=sum_scores[key].precision + score.precision,
                recall=sum_scores[key].recall + score.recall,
                fmeasure=sum_scores[key].fmeasure + score.fmeasure
            )

    # Calculate averages
    average_scores = {key: Score(
        precision=sum_score.precision / len(rouge_scores),
        recall=sum_score.recall / len(rouge_scores),
        fmeasure=sum_score.fmeasure / len(rouge_scores)
    ) for key, sum_score in sum_scores.items()}

    return average_scores

# Calculate and return average scores
average_scores = calculate_average_scores(rouge_scores)
average_scores

{'rouge1': Score(precision=0.19081687193181446, recall=0.5587570222001464, fmeasure=0.27966885497844995),
 'rouge2': Score(precision=0.06420060566724725, recall=0.19106255369040012, fmeasure=0.09469009694339987),
 'rougeL': Score(precision=0.10843570839547849, recall=0.3226159662247767, fmeasure=0.15948572969755026)}

Harry Potter star Daniel Radcliffe, who turned 18 on Monday, gains access to a reported £20 million ($41.1 million) fortune but insists that he won't be extravagant with his newfound wealth. Radcliffe says he has no plans to spend his money on fast cars, drink, or celebrity parties. He prefers buying things that cost around £10 like books, CDs, and DVDs. Radcliffe's earnings from the first five Potter films have been held in a trust fund, which he has not been able to touch. Despite his growing fame and riches, the actor says he is keeping his feet firmly on the ground and trying to avoid going off the rails like other child stars. Radcliffe's latest film, Harry Potter and the Order of the Phoenix, is breaking records in both the UK and the US. He also has other projects lined up, including a TV movie and an Australian film. With his newfound adult status, Radcliffe expects even closer media scrutiny.

# Fine-Tuned T5 Model

In [6]:
T5_ft_cs_ev = np.array([0.1289985907367188, 0.36992666978265637, 0.31576903178862953, 0.09482766850567939, 0.1713745597291317, 0.20134697774366345, 0.3422064513706536, 0.34211469363773894, 0.04701610374659439, 0.19342170935126668])
T5_ft_bleu = np.array([0.04926907641268068, 0.05316841432287423, 4.450692829663332e-155, 1.1465375328390843e-231, 1.4005706818653289e-78, 3.2125928308312104e-155, 1.6872397221559478e-78, 0.14743977874893247, 1.012071042130996e-231, 0.08191181829206526])
T5_ft_METEOR = np.array([0.2171731651376147, 0.1850481327982808, 0.20454003133202617, 0.1515151515151515, 0.21209183673469387, 0.182568965187635, 0.2678315789473684, 0.46028645833333326, 0.0746268656716418, 0.44537066163363737])

print(f"Average T5 Scores: cosine similarity score {np.mean(T5_ft_cs_ev)} , average of BLEU score {np.mean(T5_ft_bleu)} , average of METEOR {np.mean(T5_ft_METEOR)}")

Average T5 Scores: cosine similarity score 0.22070024563927326 , average of BLEU score 0.03317890877765527 , average of METEOR 0.24010528472913828


In [7]:
from typing import List, Dict
from collections import namedtuple

# Define a namedtuple for the ROUGE scores
Score = namedtuple('Score', 'precision recall fmeasure')

# ROUGE scores for different evaluations
rouge_scores = [
{'rouge1': Score(precision=0.16071428571428573, recall=0.23076923076923078, fmeasure=0.18947368421052632), 'rouge2': Score(precision=0.05454545454545454, recall=0.07894736842105263, fmeasure=0.06451612903225806), 'rougeL': Score(precision=0.10714285714285714, recall=0.15384615384615385, fmeasure=0.12631578947368421)}, {'rouge1': Score(precision=0.4358974358974359, recall=0.3469387755102041, fmeasure=0.3863636363636364), 'rouge2': Score(precision=0.10526315789473684, recall=0.08333333333333333, fmeasure=0.0930232558139535), 'rougeL': Score(precision=0.23076923076923078, recall=0.1836734693877551, fmeasure=0.20454545454545456)}, {'rouge1': Score(precision=0.2857142857142857, recall=0.43902439024390244, fmeasure=0.34615384615384615), 'rouge2': Score(precision=0.0967741935483871, recall=0.15, fmeasure=0.11764705882352941), 'rougeL': Score(precision=0.19047619047619047, recall=0.2926829268292683, fmeasure=0.23076923076923075)}, {'rouge1': Score(precision=0.14583333333333334, recall=0.2916666666666667, fmeasure=0.19444444444444448), 'rouge2': Score(precision=0.02127659574468085, recall=0.043478260869565216, fmeasure=0.028571428571428567), 'rougeL': Score(precision=0.10416666666666667, recall=0.20833333333333334, fmeasure=0.1388888888888889)}, {'rouge1': Score(precision=0.16923076923076924, recall=0.2682926829268293, fmeasure=0.2075471698113208), 'rouge2': Score(precision=0.046875, recall=0.075, fmeasure=0.05769230769230769), 'rougeL': Score(precision=0.13846153846153847, recall=0.21951219512195122, fmeasure=0.16981132075471697)}, {'rouge1': Score(precision=0.1518987341772152, recall=0.27906976744186046, fmeasure=0.19672131147540983), 'rouge2': Score(precision=0.01282051282051282, recall=0.023809523809523808, fmeasure=0.016666666666666666), 'rougeL': Score(precision=0.11392405063291139, recall=0.20930232558139536, fmeasure=0.14754098360655737)}, {'rouge1': Score(precision=0.2857142857142857, recall=0.3829787234042553, fmeasure=0.32727272727272727), 'rouge2': Score(precision=0.08064516129032258, recall=0.10869565217391304, fmeasure=0.09259259259259259), 'rougeL': Score(precision=0.1746031746031746, recall=0.23404255319148937, fmeasure=0.2)}, {'rouge1': Score(precision=0.48148148148148145, recall=0.6190476190476191, fmeasure=0.5416666666666666), 'rouge2': Score(precision=0.2830188679245283, recall=0.36585365853658536, fmeasure=0.3191489361702127), 'rougeL': Score(precision=0.2962962962962963, recall=0.38095238095238093, fmeasure=0.3333333333333333)}, {'rouge1': Score(precision=0.075, recall=0.09090909090909091, fmeasure=0.08219178082191782), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0.05, recall=0.06060606060606061, fmeasure=0.05479452054794521)}, {'rouge1': Score(precision=0.2708333333333333, recall=0.4642857142857143, fmeasure=0.3421052631578947), 'rouge2': Score(precision=0.10638297872340426, recall=0.18518518518518517, fmeasure=0.13513513513513514), 'rougeL': Score(precision=0.20833333333333334, recall=0.35714285714285715, fmeasure=0.2631578947368421)}
]

# Function to calculate average scores
def calculate_average_scores(rouge_scores: List[Dict[str, Score]]) -> Dict[str, Score]:
    # Initialize sums
    sum_scores = {key: Score(precision=0, recall=0, fmeasure=0) for key in rouge_scores[0].keys()}

    # Sum up all scores
    for scores in rouge_scores:
        for key, score in scores.items():
            sum_scores[key] = Score(
                precision=sum_scores[key].precision + score.precision,
                recall=sum_scores[key].recall + score.recall,
                fmeasure=sum_scores[key].fmeasure + score.fmeasure
            )

    # Calculate averages
    average_scores = {key: Score(
        precision=sum_score.precision / len(rouge_scores),
        recall=sum_score.recall / len(rouge_scores),
        fmeasure=sum_score.fmeasure / len(rouge_scores)
    ) for key, sum_score in sum_scores.items()}

    return average_scores

# Calculate and return average scores
average_scores = calculate_average_scores(rouge_scores)
average_scores

{'rouge1': Score(precision=0.24623179445964255, recall=0.3412982661205374, fmeasure=0.28139405303783904),
 'rouge2': Score(precision=0.08076019224920272, recall=0.11143029823291586, fmeasure=0.09249935104980844),
 'rougeL': Score(precision=0.1614173338382199, recall=0.2300094255992645, fmeasure=0.18691574166566532)}

he will be able to gamble in a casino buy a drink in a pub or see the horror film Hostel Part II currently six places below his number one movie on the UK box office chart. despite his growing fame and riches the young actor says he is keeping his feet firmly on the ground.

# Fine-Tuned BART:

In [8]:
BART_ft_cs_ev = np.array([0.6444823937847926, 0.4090179912362602, 0.1608096947628205, 0.194775963358456, 0.3260743091463692, 0.1460970800186441, 0.3379048414209187, 0.28691396739851954, 0.4246289206642457, 0.139475820918422])
BART_ft_bleu = np.array([0.3768331238005331, 0.042932320661346675, 0.1258955283490893, 0.030535796070597317, 0.02949662656591407, 0.006213841460704296, 0.06743392319512985, 0.05344759207671911, 0.030157820855373545, 0.03887213301533122])
BART_ft_METEOR = np.array([0.8182830121927016, 0.2588379845871692, 0.3871013566304426, 0.28629402529695774, 0.40641495601173017, 0.136986301369863, 0.27271638215034444, 0.520040161574891, 0.4058210981456922, 0.372896334434796])

print(f"Average BART Scores: cosine similarity score {np.mean(BART_ft_cs_ev)} , average of BLEU score {np.mean(BART_ft_bleu)} , average of METEOR {np.mean(BART_ft_METEOR)}")

Average BART Scores: cosine similarity score 0.30701809827094484 , average of BLEU score 0.08018187060507384 , average of METEOR 0.3865391612394588


In [9]:
from typing import List, Dict
from collections import namedtuple

# Define a namedtuple for the ROUGE scores
Score = namedtuple('Score', 'precision recall fmeasure')

# ROUGE scores for different evaluations
rouge_scores = [
{'rouge1': Score(precision=0.5230769230769231, recall=0.918918918918919, fmeasure=0.6666666666666666), 'rouge2': Score(precision=0.4375, recall=0.7777777777777778, fmeasure=0.56), 'rougeL': Score(precision=0.5230769230769231, recall=0.918918918918919, fmeasure=0.6666666666666666)}, {'rouge1': Score(precision=0.3541666666666667, recall=0.4594594594594595, fmeasure=0.4000000000000001), 'rouge2': Score(precision=0.0851063829787234, recall=0.1111111111111111, fmeasure=0.09638554216867469), 'rougeL': Score(precision=0.16666666666666666, recall=0.21621621621621623, fmeasure=0.18823529411764706)}, {'rouge1': Score(precision=0.30357142857142855, recall=0.4146341463414634, fmeasure=0.35051546391752575), 'rouge2': Score(precision=0.12727272727272726, recall=0.175, fmeasure=0.14736842105263157), 'rougeL': Score(precision=0.21428571428571427, recall=0.2926829268292683, fmeasure=0.24742268041237114)}, {'rouge1': Score(precision=0.16666666666666666, recall=0.4166666666666667, fmeasure=0.23809523809523808), 'rouge2': Score(precision=0.05084745762711865, recall=0.13043478260869565, fmeasure=0.07317073170731708), 'rougeL': Score(precision=0.13333333333333333, recall=0.3333333333333333, fmeasure=0.19047619047619044)}, {'rouge1': Score(precision=0.3220338983050847, recall=0.6129032258064516, fmeasure=0.4222222222222223), 'rouge2': Score(precision=0.15517241379310345, recall=0.3, fmeasure=0.20454545454545456), 'rougeL': Score(precision=0.22033898305084745, recall=0.41935483870967744, fmeasure=0.28888888888888886)}, {'rouge1': Score(precision=0.10869565217391304, recall=0.2631578947368421, fmeasure=0.15384615384615385), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0.07608695652173914, recall=0.18421052631578946, fmeasure=0.10769230769230768)}, {'rouge1': Score(precision=0.22580645161290322, recall=0.4117647058823529, fmeasure=0.29166666666666663), 'rouge2': Score(precision=0.06557377049180328, recall=0.12121212121212122, fmeasure=0.08510638297872339), 'rougeL': Score(precision=0.11290322580645161, recall=0.20588235294117646, fmeasure=0.14583333333333331)}, {'rouge1': Score(precision=0.3230769230769231, recall=0.65625, fmeasure=0.43298969072164956), 'rouge2': Score(precision=0.140625, recall=0.2903225806451613, fmeasure=0.18947368421052632), 'rougeL': Score(precision=0.23076923076923078, recall=0.46875, fmeasure=0.3092783505154639)}, {'rouge1': Score(precision=0.34615384615384615, recall=0.5454545454545454, fmeasure=0.4235294117647059), 'rouge2': Score(precision=0.09803921568627451, recall=0.15625, fmeasure=0.12048192771084337), 'rougeL': Score(precision=0.19230769230769232, recall=0.30303030303030304, fmeasure=0.23529411764705882)}, {'rouge1': Score(precision=0.2037037037037037, recall=0.39285714285714285, fmeasure=0.26829268292682923), 'rouge2': Score(precision=0.09433962264150944, recall=0.18518518518518517, fmeasure=0.125), 'rougeL': Score(precision=0.16666666666666666, recall=0.32142857142857145, fmeasure=0.21951219512195122)}
 ]

# Function to calculate average scores
def calculate_average_scores(rouge_scores: List[Dict[str, Score]]) -> Dict[str, Score]:
    # Initialize sums
    sum_scores = {key: Score(precision=0, recall=0, fmeasure=0) for key in rouge_scores[0].keys()}

    # Sum up all scores
    for scores in rouge_scores:
        for key, score in scores.items():
            sum_scores[key] = Score(
                precision=sum_scores[key].precision + score.precision,
                recall=sum_scores[key].recall + score.recall,
                fmeasure=sum_scores[key].fmeasure + score.fmeasure
            )

    # Calculate averages
    average_scores = {key: Score(
        precision=sum_score.precision / len(rouge_scores),
        recall=sum_score.recall / len(rouge_scores),
        fmeasure=sum_score.fmeasure / len(rouge_scores)
    ) for key, sum_score in sum_scores.items()}

    return average_scores

# Calculate and return average scores
average_scores = calculate_average_scores(rouge_scores)
average_scores

{'rouge1': Score(precision=0.28769521600080594, recall=0.5092066706123843, fmeasure=0.3647824196827657),
 'rouge2': Score(precision=0.125447659049126, recall=0.22472935585400525, fmeasure=0.1601532144374171),
 'rougeL': Score(precision=0.20364353924852657, recall=0.3663807987723255, fmeasure=0.25993000248718795)}

Harry Potter star Daniel Radcliffe gains access to a reported 20 million 41 1 million fortune as he turns 18 on Monday. The actor says he has no plans to fritter his cash away on fast cars drink and celebrity parties. Radcliffe s earnings from the first five Potter films have been held in a trust fund which he has not been able to touch.

# Fine-Tuned GPT:

In [22]:
GPT_ft_cs_ev = np.array([0.4249507100185822, 0.3895468223481394, 0.1002237462286768, 0.2946083876771906, 0.2790504707841771, 0.17201544463162935, 0.3038941952955195, 0.29541924656219637, 0.44596932947928963, 0.14372970081811628])
GPT_ft_bleu = np.array([0.17197504803923988, 0.05197426794975792, 0.02817056648462148, 0.04522690726491742, 5.279006155813533e-155, 3.0689352499743663e-155, 1.101004953905863e-78, 0.130792467406389, 0.0747916361399847, 4.017758378857155e-155])
GPT_ft_METEOR = np.array([0.6410897892162736, 0.3480152707172067, 0.22104347826086954, 0.3945570422843151, 0.32254428341384866, 0.1542649727767695, 0.21150592216582062, 0.5472743590744136, 0.4533372513924692, 0.23943661971830985])

print(f"Average Fine-Tuned GPT Scores: cosine similarity score {np.mean(GPT_ft_cs_ev)} , average of BLEU score {np.mean(GPT_ft_bleu)} , average of METEOR {np.mean(GPT_ft_METEOR)}")

Average GPT Scores: cosine similarity score 0.28494080538435174 , average of BLEU score 0.05029308932849104 , average of METEOR 0.35330689890202965


In [10]:
from typing import List, Dict
from collections import namedtuple

# Define a namedtuple for the ROUGE scores
Score = namedtuple('Score', 'precision recall fmeasure')

# ROUGE scores for different evaluations
rouge_scores = [
{'rouge1': Score(precision=0.29896907216494845, recall=0.7435897435897436, fmeasure=0.4264705882352941), 'rouge2': Score(precision=0.21875, recall=0.5526315789473685, fmeasure=0.31343283582089554), 'rougeL': Score(precision=0.27835051546391754, recall=0.6923076923076923, fmeasure=0.3970588235294118)},
{'rouge1': Score(precision=0.28125, recall=0.5510204081632653, fmeasure=0.37241379310344824), 'rouge2': Score(precision=0.08421052631578947, recall=0.16666666666666666, fmeasure=0.11188811188811187), 'rougeL': Score(precision=0.14583333333333334, recall=0.2857142857142857, fmeasure=0.19310344827586204)},
{'rouge1': Score(precision=0.125, recall=0.2926829268292683, fmeasure=0.17518248175182483), 'rouge2': Score(precision=0.021052631578947368, recall=0.05, fmeasure=0.02962962962962963), 'rougeL': Score(precision=0.0625, recall=0.14634146341463414, fmeasure=0.08759124087591241)},
{'rouge1': Score(precision=0.2, recall=0.75, fmeasure=0.31578947368421056), 'rouge2': Score(precision=0.06741573033707865, recall=0.2608695652173913, fmeasure=0.10714285714285714), 'rougeL': Score(precision=0.1111111111111111, recall=0.4166666666666667, fmeasure=0.17543859649122806)},
{'rouge1': Score(precision=0.23423423423423423, recall=0.6341463414634146, fmeasure=0.3421052631578948), 'rouge2': Score(precision=0.05454545454545454, recall=0.15, fmeasure=0.08), 'rougeL': Score(precision=0.14414414414414414, recall=0.3902439024390244, fmeasure=0.2105263157894737)},
{'rouge1': Score(precision=0.13953488372093023, recall=0.27906976744186046, fmeasure=0.186046511627907), 'rouge2': Score(precision=0.011764705882352941, recall=0.023809523809523808, fmeasure=0.015748031496062992), 'rougeL': Score(precision=0.08139534883720931, recall=0.16279069767441862, fmeasure=0.10852713178294575)},
{'rouge1': Score(precision=0.2222222222222222, recall=0.3829787234042553, fmeasure=0.28125), 'rouge2': Score(precision=0.025, recall=0.043478260869565216, fmeasure=0.03174603174603175), 'rougeL': Score(precision=0.12345679012345678, recall=0.2127659574468085, fmeasure=0.15625)},
{'rouge1': Score(precision=0.35555555555555557, recall=0.7619047619047619, fmeasure=0.48484848484848486), 'rouge2': Score(precision=0.1797752808988764, recall=0.3902439024390244, fmeasure=0.24615384615384617), 'rougeL': Score(precision=0.16666666666666666, recall=0.35714285714285715, fmeasure=0.22727272727272724)},
{'rouge1': Score(precision=0.2702702702702703, recall=0.6060606060606061, fmeasure=0.3738317757009346), 'rouge2': Score(precision=0.136986301369863, recall=0.3125, fmeasure=0.19047619047619047), 'rougeL': Score(precision=0.16216216216216217, recall=0.36363636363636365, fmeasure=0.22429906542056074)},
{'rouge1': Score(precision=0.17142857142857143, recall=0.42857142857142855, fmeasure=0.24489795918367346), 'rouge2': Score(precision=0.028985507246376812, recall=0.07407407407407407, fmeasure=0.041666666666666664), 'rougeL': Score(precision=0.12857142857142856, recall=0.32142857142857145, fmeasure=0.18367346938775508)}
]

# Function to calculate average scores
def calculate_average_scores(rouge_scores: List[Dict[str, Score]]) -> Dict[str, Score]:
    # Initialize sums
    sum_scores = {key: Score(precision=0, recall=0, fmeasure=0) for key in rouge_scores[0].keys()}

    # Sum up all scores
    for scores in rouge_scores:
        for key, score in scores.items():
            sum_scores[key] = Score(
                precision=sum_scores[key].precision + score.precision,
                recall=sum_scores[key].recall + score.recall,
                fmeasure=sum_scores[key].fmeasure + score.fmeasure
            )

    # Calculate averages
    average_scores = {key: Score(
        precision=sum_score.precision / len(rouge_scores),
        recall=sum_score.recall / len(rouge_scores),
        fmeasure=sum_score.fmeasure / len(rouge_scores)
    ) for key, sum_score in sum_scores.items()}

    return average_scores

# Calculate and return average scores
average_scores = calculate_average_scores(rouge_scores)
average_scores

{'rouge1': Score(precision=0.22984648095967325, recall=0.5430024707428605, fmeasure=0.3202836331293672),
 'rouge2': Score(precision=0.08284861381747392, recall=0.20242735720236138, fmeasure=0.11678842010202921),
 'rougeL': Score(precision=0.14041915004134298, recall=0.3349038457871323, fmeasure=0.1963740818825877)}

Daniel Radcliffe, the star of the Harry Potter films, has turned 18 and gained access to his £20 million fortune. Despite his new wealth, Radcliffe insists that he won't become extravagant and plans to continue his modest spending habits on books and DVDs. He also stated that he will have a party to celebrate his birthday, but details of the event are under wraps. Radcliffe’s earnings from the Harry Potter films have been held in a trust fund that he has not been able to touch.

# Everything put together:

T5 Evaluation:

- Average T5 Scores: cosine similarity score 0.3214415377008123 , average of BLEU score 0.0547375357558972 , average of METEOR 0.35688821790239855
- {'rouge1': Score(precision=0.15631046342772967, recall=0.6576987596250709, fmeasure=0.24948944290384137),
 'rouge2': Score(precision=0.06711451304495733, recall=0.2760758589061319, fmeasure=0.10697314305551948),
 'rougeL': Score(precision=0.10316571817843043, recall=0.4263545739431775, fmeasure=0.16414266883753376)}

T5 Fine-Tuned Evaluation:

- Average T5 Scores: cosine similarity score 0.22070024563927326 , average of BLEU score 0.03317890877765527 , average of METEOR 0.24010528472913828
- {'rouge1': Score(precision=0.24623179445964255, recall=0.3412982661205374, fmeasure=0.28139405303783904),
 'rouge2': Score(precision=0.08076019224920272, recall=0.11143029823291586, fmeasure=0.09249935104980844),
 'rougeL': Score(precision=0.1614173338382199, recall=0.2300094255992645, fmeasure=0.18691574166566532)}

BART Evaluation:

- Average BART Scores: cosine similarity score 0.29781103042950635 , average of BLEU score 0.07363850827561946 , average of METEOR 0.38112193669566696
- {'rouge1': Score(precision=0.17130123811350165, recall=0.6978011831242156, fmeasure=0.27215583554253964),
 'rouge2': Score(precision=0.08203859438210467, recall=0.32523544154069634, fmeasure=0.12962834520280095),
 'rougeL': Score(precision=0.11250810517323015, recall=0.44290801146441056, fmeasure=0.17749623846268742)}

BART Fine-Tuned Evaluation:

- Average BART Scores: cosine similarity score 0.30701809827094484 , average of BLEU score 0.08018187060507384 , average of METEOR 0.3865391612394588
- {'rouge1': Score(precision=0.28769521600080594, recall=0.5092066706123843, fmeasure=0.3647824196827657),
 'rouge2': Score(precision=0.125447659049126, recall=0.22472935585400525, fmeasure=0.1601532144374171),
 'rougeL': Score(precision=0.20364353924852657, recall=0.3663807987723255, fmeasure=0.25993000248718795)}

GPT Evaluation:

- Average GPT Scores: cosine similarity score 0.2827471481702798 , average of BLEU score 0.024081880944165256 , average of METEOR 0.3293974549754028
- {'rouge1': Score(precision=0.19081687193181446, recall=0.5587570222001464, fmeasure=0.27966885497844995),
 'rouge2': Score(precision=0.06420060566724725, recall=0.19106255369040012, fmeasure=0.09469009694339987),
 'rougeL': Score(precision=0.10843570839547849, recall=0.3226159662247767, fmeasure=0.15948572969755026)}


GPT Fine-Tuned Evaluation:

- Average GPT Scores: cosine similarity score 0.28494080538435174 , average of BLEU score 0.05029308932849104 , average of METEOR 0.35330689890202965
- {'rouge1': Score(precision=0.22984648095967325, recall=0.5430024707428605, fmeasure=0.3202836331293672),
 'rouge2': Score(precision=0.08284861381747392, recall=0.20242735720236138, fmeasure=0.11678842010202921),
 'rougeL': Score(precision=0.14041915004134298, recall=0.3349038457871323, fmeasure=0.1963740818825877)}







# T5 Evaluation

In [15]:
import plotly.graph_objects as go

models = ['T5', 'T5 Fine-Tuned']
rouge1_precision = [0.156, 0.246]
rouge1_recall = [0.658, 0.341]
rouge1_fm = [0.249, 0.281]

fig = go.Figure()
fig.add_trace(go.Bar(x=models, y=rouge1_precision, name='Rouge 1 Precision'))
fig.add_trace(go.Bar(x=models, y=rouge1_recall, name='Rouge 1 Recall'))
fig.add_trace(go.Bar(x=models, y=rouge1_fm, name='Rouge 1 Fmeasure'))

fig.update_layout(
    title='Comparison of T5 Model Performances on ROUGE 1 Metrics',
    xaxis_title='Models',
    yaxis_title='Scores',
    legend_title='Metrics',
    hovermode='x'
)

fig.show()

# BART Evaluation

In [16]:
import plotly.graph_objects as go

models = ['BART', 'BART Fine-Tuned']
rouge1_precision = [0.171, 0.288]
rouge1_recall = [0.698, 0.509]
rouge1_fm = [0.272, 0.364]
fig = go.Figure()
fig.add_trace(go.Bar(x=models, y=rouge1_precision, name='Rouge 1 Precision'))
fig.add_trace(go.Bar(x=models, y=rouge1_recall, name='Rouge 1 Recall'))
fig.add_trace(go.Bar(x=models, y=rouge1_fm, name='Rouge 1 Fmeasure'))

fig.update_layout(
    title='Comparison of BART Model Performances on ROUGE 1 Metrics',
    xaxis_title='Models',
    yaxis_title='Scores',
    legend_title='Metrics',
    hovermode='x'
)

fig.show()

# GPT Evaluation

In [18]:
import plotly.graph_objects as go

models = ['GPT', 'Prompt-Engineered GPT']
rouge1_precision = [0.191, 0.230]
rouge1_recall = [0.559, 0.543]
rouge1_fm = [0.279, 0.320]

fig = go.Figure()
fig.add_trace(go.Bar(x=models, y=rouge1_precision, name='Rouge 1 Precision'))
fig.add_trace(go.Bar(x=models, y=rouge1_recall, name='Rouge 1 Recall'))
fig.add_trace(go.Bar(x=models, y=rouge1_fm, name='Rouge 1 Fmeasure'))

fig.update_layout(
    title='Comparison of GPT Model Performances on ROUGE 1 Metrics',
    xaxis_title='Models',
    yaxis_title='Scores',
    legend_title='Metrics',
    hovermode='x'
)

fig.show()

# Overall Calculations

In [11]:
import plotly.graph_objects as go

models = ['T5', 'T5 Fine-Tuned', 'BART', 'BART Fine-Tuned', 'GPT', 'GPT Fine-Tuned']
rouge1_precision = [0.156, 0.246, 0.171, 0.288, 0.191, 0.230]
rouge1_recall = [0.658, 0.341, 0.698, 0.509, 0.559, 0.543]
rouge1_fm = [0.249, 0.281, 0.272, 0.364, 0.279, 0.320]

fig = go.Figure()
fig.add_trace(go.Bar(x=models, y=rouge1_precision, name='Rouge 1 Precision'))
fig.add_trace(go.Bar(x=models, y=rouge1_recall, name='Rouge 1 Recall'))
fig.add_trace(go.Bar(x=models, y=rouge1_fm, name='Rouge 1 Fmeasure'))

fig.update_layout(
    title='Comparison of NLP Model Performances on ROUGE 1 Metrics',
    xaxis_title='Models',
    yaxis_title='Scores',
    legend_title='Metrics',
    hovermode='x'
)

fig.show()

In [12]:
import plotly.graph_objects as go

models = ['T5', 'T5 Fine-Tuned', 'BART', 'BART Fine-Tuned', 'GPT', 'GPT Fine-Tuned']
rouge2_precision = [0.067, 0.081, 0.082, 0.125, 0.064, 0.083]
rouge2_recall = [0.276, 0.111, 0.325, 0.225, 0.191, 0.202]
rouge2_fm = [0.107, 0.092, 0.130, 0.160, 0.095, 0.117]

fig = go.Figure()
fig.add_trace(go.Bar(x=models, y=rouge2_precision, name='Rouge 2 Precision'))
fig.add_trace(go.Bar(x=models, y=rouge2_recall, name='Rouge 2 Recall'))
fig.add_trace(go.Bar(x=models, y=rouge2_fm, name='Rouge 2 Fmeasure'))

fig.update_layout(
    title='Comparison of NLP Model Performances on ROUGE 2 Metrics',
    xaxis_title='Models',
    yaxis_title='Scores',
    legend_title='Metrics',
    hovermode='x'
)

fig.show()


In [49]:
import plotly.graph_objects as go

models = ['T5', 'T5 Fine-Tuned', 'BART', 'BART Fine-Tuned', 'GPT', 'GPT Fine-Tuned']
rougeL_precision = [0.103, 0.161, 0.113, 0.204, 0.108, 0.140]
rougeL_recall = [0.426, 0.230, 0.443, 0.366, 0.323, 0.335]
rougeL_fm = [0.164, 0.187, 0.177, 0.260, 0.159, 0.196]

fig = go.Figure()
fig.add_trace(go.Bar(x=models, y=rougeL_precision, name='Rouge L Precision'))
fig.add_trace(go.Bar(x=models, y=rougeL_recall, name='Rouge L Recall'))
fig.add_trace(go.Bar(x=models, y=rougeL_fm, name='Rouge L Fmeasure'))

fig.update_layout(
    title='Comparison of NLP Model Performances on ROUGE L Metrics',
    xaxis_title='Models',
    yaxis_title='Scores',
    legend_title='Metrics',
    hovermode='x'
)

fig.show()
