# Install Libraries (Colab Only)

In [9]:
!pip install openai



In [10]:
!pip install tiktoken



# Define Parameters

### GPT version used for generation

In [1]:
gpt_version = "gpt-4-0125-preview" # to use GPT-3.5 instead, change this to "gpt-3.5-turbo-0125"

### Pricing

In [2]:
# Model name -> (input price, output price), in dollars per 1,000,000 tokens.
# compute_actual_price applies index 0 to prompt tokens and index 1 to completion tokens.
pricings = {"gpt-4-0125-preview":(10,30),
            "gpt-3.5-turbo-0125": (0.50, 1.50)}

### Max Tokens

In [3]:
# NOTE(review): max_tokens is not passed to any API call in the code visible here — confirm whether it should be.
max_tokens = 100

### Moral Values and their Descriptors

In [4]:
# Moral value name -> short textual description of that value.
# The insertion order of the keys defines the order of `possible_moral_values` built in the next cell.
moral_values = {"Care": "underlies the virtues of kindness and nurturance, empathy and compassion towards other people",
                "Harm": "which represents the opposite of care, characterised by violent actions and intentions",
                "Fairness": " which underlies the virtues of honesty, fair treatment, justice, and dependability.",
                "Cheating": "which represents the opposite of fairness, encapsulating instances of unfairness and injustice.",
                "Loyalty": "which deals with group loyalty, self-sacrifice, and vigilance against betrayal",
                "Betrayal":  "which underlies wrongdoing and betraying your group or your relationship.",
                "Authority": "which underlies virtues of leadership and respect within hierarchical relationships",
                "Subversion": "which refers to acts of defiance against authority or hierarchy, and  rebellion against control",
                "Purity": " which is concerned with the sanctity of the body and spirit, promoting virtues like chastity and self-restraint",
                "Degradation": "that denotes the violation of purity and sanctity, including both physical and emotional corruption"}

In [5]:
# Ordered list of the moral value names (the keys of `moral_values`).
# Built in one step so it is defined even when the dictionary is empty,
# instead of being re-copied on every loop iteration.
possible_moral_values = list(moral_values)
# The same names, each wrapped in its own single-element list.
all_moral_values = [[value] for value in possible_moral_values]

In [6]:
all_moral_values

[['Care'],
 ['Harm'],
 ['Fairness'],
 ['Cheating'],
 ['Loyalty'],
 ['Betrayal'],
 ['Authority'],
 ['Subversion'],
 ['Purity'],
 ['Degradation']]

### OpenAI API Key
Insert your OpenAI API key below. You need to create an OpenAI account and add credit to it in order to run the code below.

In [7]:
import openai

openai.api_key = "" # enter the API key here; do not share or commit the notebook with a real key filled in

# Utility Functions

#### Compute number of tokens and pricing (prices are per 1,000,000 tokens)

In [8]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return how many tokens `string` is encoded into for a given model.

    Despite its name, `encoding_name` is handed to
    `tiktoken.encoding_for_model`, i.e. it is looked up as a model name
    (the notebook passes `gpt_version`).
    """
    return len(tiktoken.encoding_for_model(encoding_name).encode(string))

print(num_tokens_from_string("Hello world, let's test tiktoken.", gpt_version))

9


In [9]:
def compute_actual_price(completion, gpt_version: str, pricing_table=None) -> float:
    """
    Compute the actual price of a finished API call from its reported usage.

    The prompt and completion token counts are read from `completion.usage`
    and each count is charged at the model's price per 1,000,000 tokens.

    Args:
        completion: response object exposing `usage.prompt_tokens` and
            `usage.completion_tokens`.
        gpt_version: key of the model in the pricing table.
        pricing_table: optional mapping of model name to a pair
            (input price, output price) per 1,000,000 tokens. Defaults to
            the module-level `pricings`.

    Returns:
        The price in dollars, as a float.

    Raises:
        KeyError: if `gpt_version` has no entry in the pricing table.
    """
    table = pricings if pricing_table is None else pricing_table
    model_prices = table[gpt_version]
    usage = completion.usage

    # Index 0 is the input (prompt) price, index 1 the output (completion) price.
    price_in = usage.prompt_tokens/1000000*model_prices[0]
    price_out = usage.completion_tokens/1000000*model_prices[1]

    return price_in+price_out

#### Accessing Moral Values Descriptors

In [10]:
def get_moral_value(moral_value):
  """Return the description of the first `moral_values` entry whose key contains `moral_value`.

  Keys are tested with the `in` operator (containment, not equality);
  None is returned when no key matches.
  """
  matching_descriptions = (
      text for name, text in moral_values.items() if moral_value in name
  )
  return next(matching_descriptions, None)

### Vectorise Moral Values
Create vector representation of moral values to store results

In [11]:
from typing import List

def vectorise_moral_values(moral_values: List[int], possible_moral_values):
  """Turn a list of label indices into a 0/1 indicator vector.

  The vector has one slot per entry of `possible_moral_values`; the slot at
  each index listed in `moral_values` is set to 1 and all others stay 0.
  """
  indicator = [0] * len(possible_moral_values)
  for position in moral_values:
    indicator[position] = 1
  return indicator

In [12]:
vectorise_moral_values([0, 6], possible_moral_values)

[1, 0, 0, 0, 0, 0, 1, 0, 0, 0]

### Prompt Creation and Response Formatting

In [13]:
import json

class JSON_QueryHandler:
    """Validate the JSON answer of the model and build a repair prompt when needed.

    `check_response_valid` accepts three answer shapes:
      * an object with a "moral_values" key holding the label indices,
      * an object with some other key, whose first value is taken as the indices,
      * a bare list of label indices.
    Every index must be one of 0-9. Anything else is reported as invalid and
    a chat prompt asking the model to correct the JSON is returned instead.
    """

    # Label indices offered in the classification prompts.
    VALID_INDICES = frozenset(range(10))

    def __init__(
        self, model, print_output=False
    ):
        self.print_output = print_output
        self.model = model

    def check_response_valid(self, model_response: str):
        """Parse and validate a model answer.

        Args:
            model_response: raw text returned by the model.

        Returns:
            (True, parsed) where `parsed` is a dict with a "moral_values"
            entry when the answer is usable; otherwise (False, prompt) where
            `prompt` is the message list built by `create_cast_to_json_prompt`.
        """
        parsed_ok, payload = self._parse_json(model_response)
        valid = False

        if parsed_ok and isinstance(payload, dict):
            if "moral_values" in payload:
                valid = self._indices_valid(payload["moral_values"])
            elif payload:
                # No "moral_values" key: fall back to the first value of the object.
                candidates = next(iter(payload.values()))
                valid = self._indices_valid(candidates)
                if valid:
                    payload["moral_values"] = candidates
        elif parsed_ok and isinstance(payload, list):
            # A bare list cannot take a string key, so wrap it in a dict.
            valid = self._indices_valid(payload)
            if valid:
                payload = {"moral_values": payload}

        if valid:
            return True, payload
        return False, self.create_cast_to_json_prompt(model_response)

    @staticmethod
    def _parse_json(model_response):
        """Return (True, value) if the response can be parsed as JSON, else (False, None)."""
        try:
            return True, json.loads(model_response)
        except (TypeError, ValueError):
            # ValueError covers json.JSONDecodeError; TypeError covers non-text input.
            pass

        print("Model response: " + str(model_response)[0:50] + "...")
        print(
            "Could not cast model response to JSON. " "Retrying by trimming ends."
        )
        # Second attempt: keep only the first "[...]" span of the text.
        if isinstance(model_response, str) and "[" in model_response:
            model_response_trimmed = "[" + model_response.split("[", 1)[1]
            model_response_trimmed = model_response_trimmed.split("]", 1)[0] + "]"
            try:
                return True, json.loads(model_response_trimmed)
            except ValueError:
                pass

        print(
            "Still could not cast model response to JSON. "
        )
        return False, None

    def _indices_valid(self, candidates):
        """Return True when every element of `candidates` is an allowed label index."""
        all_valid = True
        try:
            for value in candidates:
                if value not in self.VALID_INDICES:
                    print(f"Prediction index {value} out of possible moral value keys.")
                    all_valid = False
        except TypeError:
            # `candidates` is not iterable, or holds an unhashable element.
            return False
        return all_valid

    def create_cast_to_json_prompt(self, model_response: str):
        """Build the chat messages asking the model to correct an invalid JSON answer."""
        system_template = """
You are a JSON writing assistant. \
When given invalid JSON, you correct it. \
If JSON is already valid, return it as it is.
"""
        system_message_prompt = system_template
        human_prompt = f"Invalid JSON: \n{model_response} \n\nCorrected JSON:\n"
        prompt = [{"role": "system", "content": system_message_prompt},
                  {"role": "user", "content": human_prompt}]

        return prompt

In [14]:
import re

def prompt_creation(song,
                    gpt_version,
                    option=2,
                    CoT=False):
  """Build the chat messages used to classify one song's lyrics.

  Args:
    song: lyrics text; it is wrapped in #### delimiters in the user message.
    gpt_version: not used when building the prompt; kept so existing calls
      keep working.
    option: prompt variant.
      0 - single-label instructions, no role assignment.
      1 - multi-label instructions, no role assignment, no explanations.
      2 - multi-label instructions with role assignment.
      3 - multi-label instructions with moral foundation explanations.
      4 - role assignment and moral foundation explanations.
    CoT: when True the model is asked to output its reasoning together with
      the json results; otherwise it is told to output only the json.

  Returns:
    A two-element list of chat messages (system, then user).

  Raises:
    ValueError: if `option` is not one of the supported variants.
  """
  # Reject unknown variants up front; falsy values are treated as option 0.
  if option and option not in (1, 2, 3, 4):
    raise ValueError(f"Unsupported prompt option: {option!r} (expected 0, 1, 2, 3 or 4)")

  if CoT:
    ending = "\n\nLet's think it step by step and output your reasoning together with the json results."
  else:
    ending = "\n\nDO NOT explain your reasoning.\nDO NOT output anything else but the json results."
  if not option:
      system_message = """
                    You will be provided with song lyrics. \
                    The song lyrics will be delimited with \
                    #### characters.
                    Classify each lyrics into 10 Possible Moral Values \
                    as defined in Moral Foundation Theory.

                    The availablle Moral Values are: \
                    0: care \
                    1: harm \
                    2: fairness \
                    3: cheating \
                    4: loyalty \
                    5: betrayal \
                    6: authority \
                    7: subversion \
                    8: purity \
                    9: degradation

                    Report the results in json format such that the key of the
                    correct moral values is reported in a list:
                    e.g. {"moral_values":[0]}
                    """
  elif option<3:
      system_message = """
                    You will be provided with song lyrics. \
                    The song lyrics will be delimited with \
                    #### characters.
                    Classify each lyrics into 10 Possible Moral Values \
                    as defined in Moral Foundation Theory.

                    The availablle Moral Values are: \
                    0: care \
                    1: harm \
                    2: fairness \
                    3: cheating \
                    4: loyalty \
                    5: betrayal \
                    6: authority \
                    7: subversion \
                    8: purity \
                    9: degradation

                    This is a multilabel classification problem: more than
                    one category at a time can be assigned.
                    Report the results in json format such that the keys of the
                    correct moral values are reported in a list:
                    e.g. {"moral_values":[0,2]}
                    """
  else:
    system_message = """
                    You will be provided with song lyrics. \
                    The song lyrics will be delimited with \
                    #### characters.
                    Classify each lyrics into 10 Possible Moral Foundations \
                    as defined in Moral Foundation Theory.

                    The availablle Moral Foundations are: \
                    0: care \
                    1: harm \
                    2: fairness \
                    3: cheating \
                    4: loyalty \
                    5: betrayal \
                    6: authority \
                    7: subversion \
                    8: purity \
                    9: degradation

                    The explanation of the moral foundations is as following:
                    Care: underlies the virtues of kindness, nurturance, empathy, and compassion towards others. 
                    Harm: represents the opposite of care, characterized by violent actions and intentions. 
                    Fairness: underlies virtues of honesty, fair treatment, justice, and dependability. 
                    Cheating: represents the opposite of fairness, encapsulating instances of unfairness and injustice. 
                    Loyalty: represents group loyalty, self-sacrifice, and vigilance against betrayal. 
                    Betrayal: represents the opposite of loyalty and underlies wrongdoing and betraying your group or your relationship. 
                    Authority: underlies virtues of leadership and respect within hierarchical relationships.
                    Subversion: represents the opposite of authority, referring to acts of defiance against hierarchy or tradition and rebellion against control. 
                    Purity: is concerned with the sanctity of the body and spirit, promoting virtues like chastity and self-restraint. 
                    Degradation: represents the opposite of purity, denoting the violation of sanctity, including both physical and emotional corruption.

                    This is a multilabel classification problem: where it's possible to assign one or multiple moral categories simultaneously.
                    It's important to consider the moral foundations and their opposing polarities.
                    (Care versus Harm, Fairness versus Cheating, and Purity versus Degradation, and so on). 
                    Typically, the lyrics of a song can be interpreted through either a positive (e.g. Loyalty) or negative (e.g. Betrayal) moral foundation, but not both opposites at the same time. 
                    Report the results in json format such that the keys of the
                    correct moral values are reported in a list:
                    e.g. {"moral_values":[0,2]}
                    """

  system_message += ending

  # Options 2 and 4 additionally assign a role to the model.
  if option in (2, 4):
    system_message = f"You are an assistant to a researcher studying psychology of music. {system_message}"

  return [{"role": "system", "content": system_message},
          {"role": "user", "content": f"####{song}####"}]

def clean_completion(completion):
   """Remove every parenthesised aside that stands alone between two newlines.

   A match is a newline, an opening parenthesis, the shortest run of
   non-newline characters up to a closing parenthesis, and a newline; the
   whole match, both newlines included, is deleted.
   """
   # Raw string: "\(" and "\)" are invalid escapes in a normal string literal.
   return re.sub(r"\n\(.*?\)\n", "", completion)

def get_completion(song,
                   model,
                   query_handler=None,
                   temperature=0,
                   prompt_option=0,
                   CoT=False):
  """Ask the model to classify one song and return the parsed outcome.

  Args:
    song: lyrics text to classify.
    model: model name, used for the API call and for pricing.
    query_handler: optional object providing `check_response_valid`; when
      given, the answer is validated and one repair call is made if it is
      invalid.
    temperature: sampling temperature forwarded to the API.
    prompt_option: prompt variant forwarded to `prompt_creation`.
    CoT: whether a chain-of-thought prompt is used; in that case the json
      object is extracted from the free-text answer.

  Returns:
    A tuple (result, price, answer). `result` is the validated dict, the
    string "FAILED" when no usable answer was obtained, or the raw API
    response object when no `query_handler` is given. `price` is the summed
    cost of the API calls made and `answer` is the text of the first reply.
  """
  prompt = prompt_creation(song, gpt_version=model, option=prompt_option, CoT=CoT)

  completion = openai.chat.completions.create(
      model=model,
      messages=prompt,
      temperature=temperature,
  )
  answer = completion.choices[0].message.content
  actual_price = compute_actual_price(completion, model)

  if CoT:
    # The reasoning surrounds the json; accept the answer only when exactly
    # one {...} object is found in it.
    json_candidates = re.findall(r"\{[\n]?.*?[\n]?\}", answer)
    if len(json_candidates) != 1:
      return "FAILED", actual_price, answer
    json_answer = json_candidates[0]
  else:
    json_answer = answer

  if query_handler is None:
    return completion, actual_price, answer

  valid, response = query_handler.check_response_valid(json_answer)
  if not valid:
    # `response` holds the repair prompt here; give the model one retry.
    repair_completion = openai.chat.completions.create(
        model=model,
        messages=response,
        temperature=temperature,
    )
    actual_price += compute_actual_price(repair_completion, model)

    valid, response = query_handler.check_response_valid(
        repair_completion.choices[0].message.content
    )
    if not valid:
      response = "FAILED"

  return response, actual_price, answer

# Main Process

In [15]:
prompt_option = 4 # option 4: system prompt with both the role assignment and the moral value descriptions
temperature = 0 # increase for more creative outputs, decrease for more deterministic ones
CoT = True # Chain of Thought prompt: the model outputs its reasoning together with the answer (possibly improving results)

In [2]:
import pandas as pd

# Load the manually annotated dataset (replace the path with the location of your own copy)
df = pd.read_csv("../../Lyrics_Data/MFT_human_annotated_lyrics.csv")

In [3]:
df.head(3)

Unnamed: 0,song_id,song_title,artist_id,artist_name,song_year_combined,song_decade_combined,lyrics,Annotators,is_care,is_harm,is_fairness,is_cheating,is_loyalty,is_betrayal,is_authority,is_subversion,is_purity,is_degradation
0,5714dee725ac0d8aee5391e7,The Perfect Christmas,56d97c59cc2ddd0c0f6bcaba,The Cheetah Girls,2005,2000,The breeze with snow and mistletoe\nThe presen...,annotator_x,1,0,0,0,0,0,0,0,1,0
1,5714dec925ac0d8aee3ca657,Soul Collector,56d80c1753a7ddfc01f91d82,Cans,2004,2000,I move across the earth\nDark shadows call my ...,annotator_x,0,1,0,0,0,0,0,1,0,0
2,5714ded125ac0d8aee42d178,Candy Apple Red,56d834ce53a7ddfc01f9578a,George Hamilton IV,1964,1960,He came speeding through the town each day at ...,annotator_x,0,1,0,0,0,0,0,0,0,0


In [18]:
running_price = 0
results = []
answers = []

# The handler only stores the model name and a flag, so a single instance
# can be shared by every song instead of being rebuilt in each iteration.
query_handler = JSON_QueryHandler(gpt_version, print_output=True)

for idx, song in enumerate(df.lyrics):
  result, price, answer = get_completion(song,
                                         gpt_version,
                                         query_handler,
                                         temperature,
                                         prompt_option,
                                         CoT
                                        )
  running_price += price
  answers.append(answer)
  try:
      results.append(vectorise_moral_values(result["moral_values"], possible_moral_values))
  except TypeError:
      # get_completion returns the string "FAILED" when no usable answer was
      # obtained; such songs are recorded as an all-zero vector.
      results.append([0 for _ in range(len(possible_moral_values))])
  # Report the accumulated cost every 10 songs.
  if idx % 10 == 0:
    print(f"Running price: {running_price}")

Running price: 0.01711
Running price: 0.19152999999999998
Running price: 0.3755599999999999
Running price: 0.55238
Running price: 0.7250999999999999
Running price: 0.8803499999999999
Running price: 1.0710199999999996
Running price: 1.27362
Running price: 1.4629699999999997
Running price: 1.6413799999999994
Running price: 1.8240199999999995
Running price: 1.9925899999999992
Running price: 2.1645299999999996
Running price: 2.3392599999999995
Running price: 2.5194299999999985
Running price: 2.7047999999999988
Running price: 2.8852799999999985
Running price: 3.0741699999999983
Running price: 3.2382199999999983
Running price: 3.4379399999999984


In [19]:
# One column per moral value, filled row by row from the prediction vectors,
# followed by the raw text answer of the model.
label_columns = {label: [] for label in possible_moral_values}

for vector in results:
  for label, flag in zip(possible_moral_values, vector):
    label_columns[label].append(flag)

result_df = pd.DataFrame({**label_columns, "LLM-answer": answers})

In [20]:
result_df

Unnamed: 0,Care,Harm,Fairness,Cheating,Loyalty,Betrayal,Authority,Subversion,Purity,Degradation,LLM-answer
0,1,0,1,0,1,0,1,0,1,0,The lyrics describe a joyful and idealized Chr...
1,0,1,0,0,0,0,0,0,0,1,The lyrics describe a character who embodies d...
2,1,1,0,1,1,0,1,1,0,0,This song tells a tragic story involving sever...
3,0,1,0,1,1,1,0,1,0,0,"```json\n{""moral_values"":[1,3,4,5,7]}\n```"
4,1,0,0,0,1,0,0,0,0,0,The lyrics of this song convey a message of un...
...,...,...,...,...,...,...,...,...,...,...,...
195,0,1,0,0,0,1,0,0,0,1,The lyrics describe a situation where the spea...
196,0,1,0,0,0,1,1,1,0,0,The lyrics describe a scenario where the singe...
197,0,1,0,0,0,1,0,0,0,0,The lyrics describe a situation where the spea...
198,0,1,0,0,0,1,0,0,0,0,"This song lyric explores themes of betrayal, d..."


In [21]:
result_df.isna().sum()

Care           0
Harm           0
Fairness       0
Cheating       0
Loyalty        0
Betrayal       0
Authority      0
Subversion     0
Purity         0
Degradation    0
LLM-answer     0
dtype: int64

In [25]:
# Put the human annotations and the model predictions side by side (column-wise concatenation).
# NOTE(review): rows are matched by index label — assumes both frames are in the same song order; confirm.
GPT_human_annotators = pd.concat([df, result_df], axis = 1)

In [26]:
GPT_human_annotators.columns

Index(['song_id', 'song_title', 'artist_id', 'artist_name',
       'song_year_combined', 'song_decade_combined', 'lyrics', 'Annotators',
       'is_care', 'is_harm', 'is_fairness', 'is_cheating', 'is_loyalty',
       'is_betrayal', 'is_authority', 'is_subversion', 'is_purity',
       'is_degradation', 'Care', 'Harm', 'Fairness', 'Cheating', 'Loyalty',
       'Betrayal', 'Authority', 'Subversion', 'Purity', 'Degradation',
       'LLM-answer'],
      dtype='object')

In [27]:
# Human annotation columns ('is_<label>') become '<label>_annot' and the
# prediction columns ('<Label>') become '<label>_pred'.
_labels = ['care', 'harm', 'fairness', 'cheating', 'loyalty',
           'betrayal', 'authority', 'subversion', 'purity', 'degradation']
_column_renames = {}
for _label in _labels:
    _column_renames[f'is_{_label}'] = f'{_label}_annot'
    _column_renames[_label.capitalize()] = f'{_label}_pred'

GPT_human_annotators.rename(columns=_column_renames, inplace=True)

In [30]:
# GPT_human_annotators.to_csv('../GPT_Predicted_Lyrics/200_MFT_annotated_songs_by_humans_vs_predicted_by_gpt.csv', index = None)

In [17]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)


In [30]:
GPT_human_annotators[82:]

Unnamed: 0,song_id,song_title,artist_id,artist_name,song_year_combined,song_decade_combined,lyrics,Annotators,care_annot,harm_annot,fairness_annot,cheating_annot,loyalty_annot,betrayal_annot,authority_annot,subversion_annot,purity_annot,degradation_annot,care_pred,harm_pred,fairness_pred,cheating_pred,loyalty_pred,betrayal_pred,authority_pred,subversion_pred,purity_pred,degradation_pred,LLM-answer
82,5714dedb25ac0d8aee4a299c,Man In The Mirror,56d9391fce06f50c0fed80b4,Mark Chesnutt,2008,2000,I used to sit on the stool\nWatch him shaveIf ...,"Vjosa, Nelly",1,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,This song lyric narrates a story of generation...
83,5714dec825ac0d8aee3bd125,Wasteland Warriors,56d8070653a7ddfc01f91604,Bone Thugs-N-Harmony,1997,1990,(feat. Souljah Boy)\n\n[Krayzie] (Talking)\nYe...,"Vjosa, Nelly",0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,"```json\n{\n ""moral_values"": [1, 3, 5, 7, 9]\..."
84,5714deec25ac0d8aee57ce4d,Hold On,56d996e3cc2ddd0c0f6bf148,Young Buck,2007,2000,"(gunshot)\n\n(Intro: Young Buck)\nYeah niggaz,...","Vjosa, Nelly",0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,1,0,0,0,"```json\n{\n ""moral_values"": [1, 5, 6]\n}\n```"
85,5714ded925ac0d8aee48daf7,Ghost,56d85ba153a7ddfc01f99001,Little Boots,2008,2000,Withdrawn into your other world\nI'm speaking ...,"Vjosa, Nelly",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,The lyrics describe a situation where the spea...
86,5714ded625ac0d8aee471ce0,Lighter!,56d8505e53a7ddfc01f97ea3,Kardinal Offishall,2008,2000,Lighter - Kardinal offical Lyrics Provided by ...,"Vjosa, Nelly",0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,1,0,0,"The lyrics of ""Lighter"" by Kardinal Offishal, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,5714dee625ac0d8aee52ce4d,Null And Void,56d977becc2ddd0c0f6bc40c,T.A.T.u.,2006,2000,Tell me with affection in your voice\nThat you...,Nelly,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,The lyrics describe a situation where the spea...
196,5714dec425ac0d8aee392e31,Truth,56d7f63853a7ddfc01f8fd68,Amos Lee,2006,2000,"Well my woman, she showed up\nWith your number...",Nelly,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,1,1,1,0,0,The lyrics describe a scenario where the singe...
197,5714dee225ac0d8aee4fffd4,Lies,56d9651dcc2ddd0c0f6ba771,Royal Hunt,1999,1990,Feels like I'm covered in lies\nso turn off th...,Nelly,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,The lyrics describe a situation where the spea...
198,5714decc25ac0d8aee3edf66,Coverdale/Page:Over Now,56d81b5153a7ddfc01f932d9,David Coverdale,1993,1990,You said you'd give me love\nInstead you cause...,Nelly,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,"This song lyric explores themes of betrayal, d..."


In [7]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report

# NOTE: from here on `moral_values` is a plain list of lower-case labels
# (it replaces the name -> description dictionary defined earlier).
moral_values = ['care', 'harm', 'fairness', 'cheating', 'loyalty', 
                'betrayal', 'authority', 'subversion', 'purity', 'degradation']

# Work on a copy of the merged annotations / predictions frame
df = GPT_human_annotators.copy()

# One dictionary of scores per moral value
metrics_list = []

for value in moral_values:
    y_true = df[f'{value}_annot']
    y_pred = df[f'{value}_pred']

    # "Binary" scores describe the positive class only; "weighted" scores
    # average both classes weighted by their support.
    metrics_list.append({
        'Moral Value': value,
        'F1 Score (Binary)': f1_score(y_true, y_pred, average='binary', zero_division=0),
        'F1 Score (Weighted)': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'Precision (Binary)': precision_score(y_true, y_pred, average='binary', zero_division=0),
        'Precision (Weighted)': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'Recall (Binary)': recall_score(y_true, y_pred, average='binary', zero_division=0),
        'Recall (Weighted)': recall_score(y_true, y_pred, average='weighted', zero_division=0),
        'Accuracy': accuracy_score(y_true, y_pred)
    })

# Tabulate the scores, one row per moral value
metrics_df = pd.DataFrame(metrics_list)
print('GPT Model')
metrics_df


GPT Model


Unnamed: 0,Moral Value,F1 Score (Binary),F1 Score (Weighted),Precision (Binary),Precision (Weighted),Recall (Binary),Recall (Weighted),Accuracy
0,care,0.640884,0.684814,0.47541,0.836207,0.983051,0.675,0.675
1,harm,0.707865,0.745501,0.583333,0.80471,0.9,0.74,0.74
2,fairness,0.38835,0.723954,0.28169,0.806931,0.625,0.685,0.685
3,cheating,0.16,0.8008,0.142857,0.812575,0.181818,0.79,0.79
4,loyalty,0.339286,0.668358,0.253333,0.744507,0.513514,0.63,0.63
5,betrayal,0.31068,0.718168,0.190476,0.89969,0.842105,0.645,0.645
6,authority,0.424242,0.754605,0.3,0.845885,0.724138,0.715,0.715
7,subversion,0.392523,0.720312,0.272727,0.828714,0.7,0.675,0.675
8,purity,0.56338,0.859552,0.454545,0.88755,0.740741,0.845,0.845
9,degradation,0.405063,0.806488,0.275862,0.892451,0.761905,0.765,0.765


### Bootstrapping:

In [7]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

df = GPT_human_annotators.copy()


moral_values = ['care', 'harm', 'fairness', 'cheating', 'loyalty', 
                'betrayal', 'authority', 'subversion', 'purity', 'degradation']

# Initialize dictionaries for storing metric distributions from bootstrapping
metric_distributions = {
    'F1': {value: [] for value in moral_values},
    'Precision': {value: [] for value in moral_values},
    'Recall': {value: [] for value in moral_values},
    'Accuracy': {value: [] for value in moral_values}
}

num_samples = 1000  # Number of bootstrap samples

# Seeded generator so the reported mean ± std can be reproduced. One shared
# generator is advanced across iterations, so every resample is different
# (passing the same integer seed to each call would repeat one sample).
bootstrap_rng = np.random.RandomState(42)

# Perform bootstrapping: resample the songs with replacement and score each resample
for _ in range(num_samples):
    sample_df = df.sample(n=len(df), replace=True, random_state=bootstrap_rng)
    for value in moral_values:
        y_true = sample_df[f'{value}_annot']
        y_pred = sample_df[f'{value}_pred']

        # Binary-average metrics (positive class only) plus accuracy for this resample
        metric_distributions['F1'][value].append(
            f1_score(y_true, y_pred, average='binary', zero_division=0))
        metric_distributions['Precision'][value].append(
            precision_score(y_true, y_pred, average='binary', zero_division=0))
        metric_distributions['Recall'][value].append(
            recall_score(y_true, y_pred, average='binary', zero_division=0))
        metric_distributions['Accuracy'][value].append(
            accuracy_score(y_true, y_pred))

# Calculate mean and standard deviation over the resamples, then format results
results = {metric: {} for metric in metric_distributions}
for metric, values in metric_distributions.items():
    for value, scores in values.items():
        mean = np.mean(scores)
        std_dev = np.std(scores)
        results[metric][value] = f"{mean:.2f} ± {std_dev:.2f}"

# Convert results to DataFrame for visualization (moral values as rows, metrics as columns)
results_df = pd.DataFrame(results)

# Transposing for better readability: moral values as rows, metrics as columns
# results_df = results_df.T





In [8]:
# NOTE: despite the printed title, the table holds the bootstrap mean ± standard deviation of the
# binary-average F1 / precision / recall and of accuracy computed in the previous cell, not weighted averages.
print("Weighted Average Scores for Lyrics Morals predicted with 0 Shot GPT 4")
results_df

Weighted Average Scores for Lyrics Morals predicted with 0 Shot GPT 4


Unnamed: 0,F1,Precision,Recall,Accuracy
care,0.64 ± 0.04,0.48 ± 0.05,0.98 ± 0.02,0.68 ± 0.03
harm,0.71 ± 0.04,0.59 ± 0.05,0.90 ± 0.03,0.74 ± 0.03
fairness,0.39 ± 0.06,0.28 ± 0.05,0.62 ± 0.09,0.68 ± 0.03
cheating,0.16 ± 0.07,0.14 ± 0.07,0.18 ± 0.09,0.79 ± 0.03
loyalty,0.34 ± 0.06,0.25 ± 0.05,0.51 ± 0.08,0.63 ± 0.03
betrayal,0.31 ± 0.06,0.19 ± 0.04,0.84 ± 0.09,0.64 ± 0.03
authority,0.42 ± 0.06,0.30 ± 0.05,0.73 ± 0.08,0.72 ± 0.03
subversion,0.39 ± 0.06,0.27 ± 0.05,0.70 ± 0.09,0.67 ± 0.03
purity,0.56 ± 0.07,0.46 ± 0.07,0.74 ± 0.09,0.85 ± 0.02
degradation,0.40 ± 0.07,0.28 ± 0.06,0.76 ± 0.09,0.77 ± 0.03
