In [1]:
import pandas as pd
import random
import numpy as np
from tqdm import tqdm
import ipdb
import re
from datasets import load_dataset
from diversity import compression_ratio, homogenization_score, ngram_diversity_score, extract_patterns, get_pos, pos_patterns, token_patterns
import json
from collections import Counter

# Register tqdm with pandas so that Series gain `progress_apply`
# (used further down for language detection).
tqdm.pandas()
import matplotlib.pyplot as plt
# import mplcursors
import seaborn as sns
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global seaborn settings: darkgrid style, notebook context, 14x10 figures, font scale 2.
sns.set(style='darkgrid', context='notebook', rc={'figure.figsize':(14,10)}, font_scale=2)

# Notebook display settings, applied in one call as (option, value) pairs:
# show up to 100 rows and 100 columns, do not limit frame width or cell text
# width, and switch off pandas' chained-assignment check.
pd.set_option(
    'display.max_rows', 100,
    'display.max_columns', 100,
    'display.width', None,
    'display.max_colwidth', None,
    'chained_assignment', None,
)

# Seed the global RNGs of Python's `random` module and of NumPy so that any
# sampling done in this notebook is repeatable on a given machine.
random.seed(1)
np.random.seed(1)
# NOTE: a bare `np.random.RandomState(1)` call used to follow here. It only
# constructed a separate generator object and discarded it, so it never
# influenced the global NumPy seed; it has been removed as a no-op.

# Print NumPy floats with three digits of precision.
np.set_printoptions(precision=3)

## Diversity analysis of the human-written Dolly responses

In [2]:
# Load the persona descriptions, one per line. Later cells look personas up by
# position (`personas[persona_id]`), so line order must be preserved and no
# lines may be dropped.
# The encoding is pinned so the text decodes the same way on every platform
# instead of depending on the locale default
# (assumes the file is UTF-8 - TODO confirm).
with open('../data/sample_personas.txt', 'r', encoding='utf-8') as f:
    personas = [line.strip() for line in f]
personas[2]

'a Spanish-speaking patient with severe myopia interested in LASIK eye surgery'

In [3]:
# Keep only the `creative_writing` rows of the Dolly-15k train split and
# convert them to a DataFrame.
dolly_train = load_dataset("databricks/databricks-dolly-15k")["train"]
dolly = dolly_train.filter(
    lambda example: example['category']=='creative_writing'
).to_pandas()

# Tab-separated sample of creative-writing prompts.
sample = pd.read_csv(
    '../data/dolly_creative_prompts_sample.tsv',
    sep='\t',
)

In [4]:
sample.head()

Unnamed: 0,index,instruction,num_tokens,num_tokens_round,num_words_round
0,525,Please propose an argument to convince my mother that she increases the amount of money that I get every month during my studies. I think the current amount is too low.,211,210,180
1,172,"Write a paragraph to refute a claim by a colleague that ancient structures such as Stonehenge, the Great Pyramid are evidence of UFO activities on Earth",189,190,160
2,109,What is the best way to answer an interview question?,128,130,100
3,37,Write the first paragraph of an advertising brochure describing an hotel nearby the Annecy Lake in France,89,90,60
4,295,"Write an intro to a meetup about music, medicine, and machines",313,310,220


In [5]:
# Look up the human-written response for every sampled prompt by the `index`
# label stored in the sample, then number the prompts by row position.
sample['response'] = sample['index'].map(
    lambda row_label: dolly.at[row_label, 'response']
)
sample['prompt_id'] = list(range(len(sample)))
sample.head()

Unnamed: 0,index,instruction,num_tokens,num_tokens_round,num_words_round,response,prompt_id
0,525,Please propose an argument to convince my mother that she increases the amount of money that I get every month during my studies. I think the current amount is too low.,211,210,180,"Dear Mum, I would like to talk to you about the monthly money I get for my studies. I am very grateful that you support me financially during my studies. At the same time, I feel that the current amount is a little bit too low in order to sustain myself. I estimated my monthly expenses and it seems like the money is not enough. We can go through the details if you want. I want to ask you if you can increase the amount so that I can cover my expenses. I understand if you cannot do this or have other reasons against this. However, I want you to know that if you do not increase my monthly money, I will be forced to work part-time next to my studies. There is research that shows that students who work part-time have worse grades than people who do not have to work part-time to afford their studies. I would love to dedicate my full time to my studies to get the best grades and learning outcomes possible; thus, I would prefer not to work part-time. What do you think?",0
1,172,"Write a paragraph to refute a claim by a colleague that ancient structures such as Stonehenge, the Great Pyramid are evidence of UFO activities on Earth",189,190,160,"There are several strong arguments against concluding that alien races built the structures like the Stonehenge or the Pyramids. First, there is evidence that these magnificent structures were built by humans. For example, tombs of the pyramid builders were found in the vicinity of the Great Pyramind (with human remains!). The technology to move and assemble the massive stones of the Stonehenge has been shown to be available to humans at that time. Second, given that space is vast the chance of finding one tiny planet among billions is infinitesimal. If alien intelligence had indeed found our one inhabited planet they are unlikely to have just created couple of structures and avoided any further contact with Earth and its inhabitants. In science, Occam’s Razor (“the best explanation is the simplest one”) is a guiding principle and until there is much more evidence of alien visitation it would be inadvisable to jump to a conclusion that aliens built these structures.",1
2,109,What is the best way to answer an interview question?,128,130,100,"The first recommended step is to ask clarifying questions to ensure you understand the question correctly and gather the requirements needed to answer effectively. If it is a mathematical or computer science question, go ahead and work your way through the problem while you verbalize your thought process along the way. If it is a situation question, you should use the STAR method. 1. Situation - set the scene and provide context, 2. Task - describe what was needed to be done and why, 3. Action - explain what action you took, and 4. Result - describe the result of your actions and what you learned from the experience",2
3,37,Write the first paragraph of an advertising brochure describing an hotel nearby the Annecy Lake in France,89,90,60,"This fantastic hotel is ideally located, minutes away from the Annecy Lake, one of the purest in Europe. The water temperature reaches 27 degrees celcius at the peak season, allowing the guests to swim and enjoy several water activities including wake surf and pedalo. Finally, this region is known for being the home of several cheese specialties like ""Raclette"" and ""Tartiflette"" to name a few.",3
4,295,"Write an intro to a meetup about music, medicine, and machines",313,310,220,"Music, Medicine, and Machines\n\nJoin us for an imaginative, authentic, and offbeat networking experience that will advance your knowledge of the technologies that impact the health and well-being of our human condition. To make things interesting (but keep things real), our meetup incorporates an element of music to remind us, first and foremost, we want to advance technology for the greater good, but we don’t want to become robots ourselves. Instead, we believe that “music is medicine for the soul” so our events will use a dose of music to unite us and to keep things fun and grounded in our humanity. We’ll explore hot and emerging technologies such as:\n\nMachine Learning and all things AI\nComputer and Machine Vision\nTelemedicine\nGenomics\nAR/VR/MR \nRobotics\nCloud \nDevOps, CI/CD, and Robotic Process Automation (RPA)\nInfrastructure as Code (IaC) \nChatbots\nWearable Tech\n3D Printing\nBlockchain\nAnd many more\n\nWe’ll talk about how these disruptive technologies improve Health & Life Sciences and discuss the tenuous balance of innovation + opportunities vs privacy, security, open data, regulations, etc. We’ll network and get to know each other to explore how each of us can get involved to ensure “the machines” benefit the communities we serve. We encourage attendees such as developers, clinicians, researchers, industry experts, students, educators, industry analysts, regulators, investors, startups, musicians, and all those willing to contribute meaningfully to our mission.",4


In [6]:
# Plain Python list of the prompt texts; position in the list is the prompt id.
prompts = sample['instruction'].tolist()

In [7]:
# Plain Python list of the human-written responses, in the same order as `prompts`.
human_responses = sample['response'].tolist()

In [8]:
sample.num_tokens_round.sort_values().tail(5)

95     750
47     780
83     820
72    1440
97    2060
Name: num_tokens_round, dtype: int64

In [9]:
# Diversity metrics over the human-written responses.
# Compression ratio of the raw response texts using gzip.
cr = compression_ratio(human_responses, 'gzip')
# N-gram diversity score, called with 4 (presumably the maximum n-gram
# length - confirm against the `diversity` package).
nds = ngram_diversity_score(human_responses, 4)
# `get_pos` returns two values; only `joined_pos` is used in this cell
# (presumably the part-of-speech rendering of each response - confirm).
joined_pos, tuples = get_pos(human_responses)
# NOTE(review): `ngrams_pos` is not used in this cell; the equivalent call is
# commented out inside `calc_cr_nds_over_personas`.
ngrams_pos = token_patterns(joined_pos, 5, 10)
# Compression ratio of the POS sequences, reported as CR-POS.
cr_pos = compression_ratio(joined_pos, 'gzip')
print(f"CR: {cr}, NDS: {nds}, CR-POS: {cr_pos}")

CR: 2.508, NDS: 3.026, CR-POS: 4.908


# Llama-8B

## No persona, no cutoff

In [10]:
# Llama-3.1-8B-Instruct outputs for the no-persona, no-cutoff run.
np_df = pd.read_csv(
    '../output/llama8b-np/Llama-3.1-8B-Instruct_dolly_output.tsv',
    sep='\t',
)
# Trim leading/trailing whitespace from every generated response.
np_df['response'] = np_df['response'].map(lambda text: text.strip())
np_df.shape

(100, 2)

In [11]:
# Rows are numbered in prompt order; the prompt text is then looked up by that id.
np_df['prompt_id'] = list(range(len(prompts)))
np_df['prompt'] = np_df['prompt_id'].map(lambda pid: prompts[pid])
np_df.head(3)

Unnamed: 0,prompt,response,prompt_id
0,Please propose an argument to convince my mother that she increases the amount of money that I get every month during my studies. I think the current amount is too low.,"Here's a proposal you can use to convince your mother to increase the amount of money you get every month during your studies:\n\n**Start with a positive tone and appreciation**\n\n""Mom, I wanted to talk to you about my allowance. I really appreciate the support you've been providing me throughout my studies so far. I know that you're always there for me, and I'm grateful for that.""\n\n**Explain your financial situation and needs**\n\n""However, I've been thinking a lot about my expenses, and I feel that the current allowance might not be enough to cover everything I need. As you know, I have to pay for my tuition fees, textbooks, and other study materials. I also need money for transportation, food, and other living expenses. Sometimes, I have to make choices between buying the essentials and saving for the future. I was wondering if we could discuss the possibility of increasing my allowance to help me better manage my finances.""\n\n**Highlight the benefits of a higher allowance**\n\n""If we increase my allowance, I think I'll be able to focus more on my studies and achieve my academic goals. I'll be able to afford the resources I need to succeed, and I'll feel more confident and motivated. Additionally, a higher allowance will help me develop good financial habits and learn how to manage my money effectively. This will benefit me not only during my studies but also in the long run, when I'll be responsible for my own finances.""\n\n**Offer a solution and compromise**\n\n""I was thinking that we could discuss a possible increase of [specific amount] every month. I know that this might require some adjustments, but I'm willing to work hard and earn my keep. I could take on more responsibilities around the house, help with chores, or even get a part-time job to supplement my allowance. 
I'm open to finding a solution that works for both of us.""\n\n**Show that you're responsible and committed**\n\n""Mom, I want to assure you that I'm committed to my studies and my future. I'm working hard to achieve my goals, and I appreciate your support. I promise to be responsible with the increased allowance and use it wisely. I'll keep you updated on my expenses and finances, and I'll be grateful for your continued guidance and support.""\n\nRemember to have an open and respectful conversation with your mother, and be prepared to listen to her concerns and suggestions. Good luck!",0
1,"Write a paragraph to refute a claim by a colleague that ancient structures such as Stonehenge, the Great Pyramid are evidence of UFO activities on Earth","While I understand the intrigue behind the idea of ancient structures being evidence of UFO activities, I strongly disagree with the notion that sites like Stonehenge and the Great Pyramid are proof of extraterrestrial intervention. The overwhelming consensus among archaeologists, historians, and scientists is that these structures were built by the ingenuity and labor of ancient civilizations using the knowledge and tools available to them at the time. The construction of Stonehenge, for example, is now widely attributed to the Neolithic people of Britain, who likely used a combination of astronomical observations and engineering skills to align the stones in a precise manner. Similarly, the Great Pyramid of Giza is believed to have been constructed by the ancient Egyptians using a system of ramps, pulleys, and levers, rather than any extraterrestrial assistance. While the idea of ancient UFO activity may capture our imagination, it is essential to approach such claims with a critical and evidence-based perspective, rather than relying on speculation or unproven theories.",1
2,What is the best way to answer an interview question?,"The best way to answer an interview question involves a structured approach that showcases your skills, experience, and fit for the role. Here's a step-by-step guide to help you craft effective responses:\n\n1. **Listen carefully**: Pay attention to the question being asked and understand the context.\n2. **Take a moment to think**: Don't rush to answer. Take a brief pause to collect your thoughts and organize your response.\n3. **Use the STAR method**: Structure your answer using the Situation, Task, Action, and Result (STAR) framework:\n\t* **S** - Situation: Set the context for the story, providing relevant background information.\n\t* **T** - Task: Explain the specific task or challenge you faced.\n\t* **A** - Action: Describe the actions you took to address the challenge or complete the task.\n\t* **R** - Result: Share the outcome or results of your actions.\n4. **Be specific**: Use concrete examples and specific details to illustrate your points.\n5. **Show, don't tell**: Instead of simply stating your skills or qualifications, demonstrate them through examples.\n6. **Use the company's language**: Incorporate keywords and phrases from the job description and company culture to show your familiarity and enthusiasm.\n7. **Keep it concise**: Aim for a response that's 1-2 minutes long, max. Practice your responses to ensure you can deliver them within the allotted time.\n8. **Use a conversational tone**: Speak naturally and avoid jargon or overly technical language.\n9. **Highlight your achievements**: Emphasize the positive outcomes and accomplishments you've achieved in your previous roles.\n10. **Be prepared to ask follow-up questions**: Show your interest in the role and the company by asking thoughtful questions.\n\nExample:\n\n**Question:** Can you tell me about a time when you overcame a difficult challenge in your previous role?\n\n**Answer:**\n\n""Ah, yes. 
In my previous role as a marketing manager, I was tasked with launching a new product campaign within a tight deadline. The challenge was that we had limited budget and resources, and the team was short-staffed.\n\n**S** - Situation: The company was launching a new product, and we had to create a buzz around it.\n\n**T** - Task: I was responsible for leading the marketing team to develop a comprehensive campaign strategy.\n\n**A** - Action: I worked closely with the team to identify key target audiences, develop a social media strategy, and create engaging content. I also negotiated with suppliers to secure sponsorships and partnerships.\n\n**R** - Result: We successfully launched the product campaign, and it exceeded our sales projections by 25%. The team received positive feedback from stakeholders, and we received a bonus for exceeding our targets.""\n\nBy following these steps, you'll be well-prepared to answer interview questions confidently and effectively, showcasing your skills and experience to potential employers.",2


In [12]:
# The no-persona run uses the sentinel persona id -1 on every row so it forms a
# single group in `calc_cr_nds_over_personas`.
np_df['persona_id'] = [-1] * len(prompts)
np_df.head(3)

Unnamed: 0,prompt,response,prompt_id,persona_id
0,Please propose an argument to convince my mother that she increases the amount of money that I get every month during my studies. I think the current amount is too low.,"Here's a proposal you can use to convince your mother to increase the amount of money you get every month during your studies:\n\n**Start with a positive tone and appreciation**\n\n""Mom, I wanted to talk to you about my allowance. I really appreciate the support you've been providing me throughout my studies so far. I know that you're always there for me, and I'm grateful for that.""\n\n**Explain your financial situation and needs**\n\n""However, I've been thinking a lot about my expenses, and I feel that the current allowance might not be enough to cover everything I need. As you know, I have to pay for my tuition fees, textbooks, and other study materials. I also need money for transportation, food, and other living expenses. Sometimes, I have to make choices between buying the essentials and saving for the future. I was wondering if we could discuss the possibility of increasing my allowance to help me better manage my finances.""\n\n**Highlight the benefits of a higher allowance**\n\n""If we increase my allowance, I think I'll be able to focus more on my studies and achieve my academic goals. I'll be able to afford the resources I need to succeed, and I'll feel more confident and motivated. Additionally, a higher allowance will help me develop good financial habits and learn how to manage my money effectively. This will benefit me not only during my studies but also in the long run, when I'll be responsible for my own finances.""\n\n**Offer a solution and compromise**\n\n""I was thinking that we could discuss a possible increase of [specific amount] every month. I know that this might require some adjustments, but I'm willing to work hard and earn my keep. I could take on more responsibilities around the house, help with chores, or even get a part-time job to supplement my allowance. 
I'm open to finding a solution that works for both of us.""\n\n**Show that you're responsible and committed**\n\n""Mom, I want to assure you that I'm committed to my studies and my future. I'm working hard to achieve my goals, and I appreciate your support. I promise to be responsible with the increased allowance and use it wisely. I'll keep you updated on my expenses and finances, and I'll be grateful for your continued guidance and support.""\n\nRemember to have an open and respectful conversation with your mother, and be prepared to listen to her concerns and suggestions. Good luck!",0,-1
1,"Write a paragraph to refute a claim by a colleague that ancient structures such as Stonehenge, the Great Pyramid are evidence of UFO activities on Earth","While I understand the intrigue behind the idea of ancient structures being evidence of UFO activities, I strongly disagree with the notion that sites like Stonehenge and the Great Pyramid are proof of extraterrestrial intervention. The overwhelming consensus among archaeologists, historians, and scientists is that these structures were built by the ingenuity and labor of ancient civilizations using the knowledge and tools available to them at the time. The construction of Stonehenge, for example, is now widely attributed to the Neolithic people of Britain, who likely used a combination of astronomical observations and engineering skills to align the stones in a precise manner. Similarly, the Great Pyramid of Giza is believed to have been constructed by the ancient Egyptians using a system of ramps, pulleys, and levers, rather than any extraterrestrial assistance. While the idea of ancient UFO activity may capture our imagination, it is essential to approach such claims with a critical and evidence-based perspective, rather than relying on speculation or unproven theories.",1,-1
2,What is the best way to answer an interview question?,"The best way to answer an interview question involves a structured approach that showcases your skills, experience, and fit for the role. Here's a step-by-step guide to help you craft effective responses:\n\n1. **Listen carefully**: Pay attention to the question being asked and understand the context.\n2. **Take a moment to think**: Don't rush to answer. Take a brief pause to collect your thoughts and organize your response.\n3. **Use the STAR method**: Structure your answer using the Situation, Task, Action, and Result (STAR) framework:\n\t* **S** - Situation: Set the context for the story, providing relevant background information.\n\t* **T** - Task: Explain the specific task or challenge you faced.\n\t* **A** - Action: Describe the actions you took to address the challenge or complete the task.\n\t* **R** - Result: Share the outcome or results of your actions.\n4. **Be specific**: Use concrete examples and specific details to illustrate your points.\n5. **Show, don't tell**: Instead of simply stating your skills or qualifications, demonstrate them through examples.\n6. **Use the company's language**: Incorporate keywords and phrases from the job description and company culture to show your familiarity and enthusiasm.\n7. **Keep it concise**: Aim for a response that's 1-2 minutes long, max. Practice your responses to ensure you can deliver them within the allotted time.\n8. **Use a conversational tone**: Speak naturally and avoid jargon or overly technical language.\n9. **Highlight your achievements**: Emphasize the positive outcomes and accomplishments you've achieved in your previous roles.\n10. **Be prepared to ask follow-up questions**: Show your interest in the role and the company by asking thoughtful questions.\n\nExample:\n\n**Question:** Can you tell me about a time when you overcame a difficult challenge in your previous role?\n\n**Answer:**\n\n""Ah, yes. 
In my previous role as a marketing manager, I was tasked with launching a new product campaign within a tight deadline. The challenge was that we had limited budget and resources, and the team was short-staffed.\n\n**S** - Situation: The company was launching a new product, and we had to create a buzz around it.\n\n**T** - Task: I was responsible for leading the marketing team to develop a comprehensive campaign strategy.\n\n**A** - Action: I worked closely with the team to identify key target audiences, develop a social media strategy, and create engaging content. I also negotiated with suppliers to secure sponsorships and partnerships.\n\n**R** - Result: We successfully launched the product campaign, and it exceeded our sales projections by 25%. The team received positive feedback from stakeholders, and we received a bonus for exceeding our targets.""\n\nBy following these steps, you'll be well-prepared to answer interview questions confidently and effectively, showcasing your skills and experience to potential employers.",2,-1


## Detect the language of each response

In [13]:
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# import cell at the top so dependencies are visible in one place.
import fasttext
from huggingface_hub import hf_hub_download

# Fetch the fastText language-identification model file (`model.bin`) from the
# Hugging Face Hub repo and load it; `model` is reused by the language
# labelling cells below.
model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
model = fasttext.load_model(model_path)

In [14]:
# Label every response with the first label returned by the language-ID model.
# Newlines are replaced with spaces before prediction (presumably because
# fastText's `predict` expects single-line input - confirm).
np_df['lang'] = np_df.response.progress_apply(lambda x: model.predict(x.replace('\n', ' '))[0][0])

100%|████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 822.29it/s]


In [15]:
Counter(np_df.lang.values.tolist())

Counter({'__label__eng_Latn': 100})

## Compute CR, NDS, and CR-POS over all prompts *per persona*, then report the mean ± std of each metric across personas

In [24]:
def calc_cr_nds_over_personas(df):
    """Print mean ± std of CR, NDS and CR-POS, each computed per persona.

    For every distinct ``persona_id`` in ``df`` the persona's responses are
    collected (rows repeating a ``prompt_id`` are dropped, keeping the first)
    and scored three ways:

    * ``compression_ratio(responses, 'gzip')`` -> CR
    * ``ngram_diversity_score(responses, 4)`` -> NDS
    * ``compression_ratio`` over the first value returned by ``get_pos`` -> CR-POS

    The per-persona scores are summarised with ``np.mean`` and ``np.std``
    (NumPy's default, i.e. population, standard deviation), rounded to two
    decimals.

    Args:
        df: DataFrame with ``persona_id``, ``prompt_id`` and ``response`` columns.

    Returns:
        None. The three-line summary is printed.
    """
    scores_by_metric = {'CR': [], 'NDS': [], 'CR-POS': []}

    for pid in tqdm(df.persona_id.unique()):
        persona_rows = df.loc[df.persona_id == pid]
        texts = persona_rows.drop_duplicates(subset=['prompt_id'])['response'].values.tolist()

        text_cr = compression_ratio(texts, 'gzip')
        text_nds = ngram_diversity_score(texts, 4)
        # Only the joined POS sequences are needed for CR-POS.
        pos_sequences, _ = get_pos(texts)
        pos_cr = compression_ratio(pos_sequences, 'gzip')

        scores_by_metric['CR'].append(text_cr)
        scores_by_metric['NDS'].append(text_nds)
        scores_by_metric['CR-POS'].append(pos_cr)

    summary_lines = [
        f"{label}: {np.round(np.mean(values),2)} ± {np.round(np.std(values), 2)}"
        for label, values in scores_by_metric.items()
    ]
    print("\n".join(summary_lines))

In [17]:
calc_cr_nds_over_personas(np_df)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.38s/it]

CR: 2.78 ± 0.0
NDS: 2.84 ± 0.0
CR-POS: 5.8 ± 0.0





## No persona with cutoff

In [18]:
# Llama-3.1-8B-Instruct outputs for the no-persona run with cutoff.
npc_df = pd.read_csv(
    '../output/llama8b-cutoff-np/Llama-3.1-8B-Instruct_dolly_output.tsv',
    sep='\t',
)
npc_df['response'] = npc_df['response'].map(lambda text: text.strip())
# Rows are numbered in prompt order; -1 is the sentinel id for "no persona".
npc_df['prompt_id'] = list(range(len(prompts)))
npc_df['prompt'] = npc_df['prompt_id'].map(lambda pid: prompts[pid])
npc_df['persona_id'] = [-1] * len(prompts)

npc_df.shape

(100, 4)

In [19]:
calc_cr_nds_over_personas(npc_df)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.14s/it]

CR: 2.57 ± 0.0
NDS: 3.07 ± 0.0
CR-POS: 5.29 ± 0.0





## Persona, no cutoff

In [23]:
# Llama-3.1-8B-Instruct outputs for the persona run without cutoff.
persona_df = pd.read_csv(
    '../output/llama8b-persona/Llama-3.1-8B-Instruct_dolly_output.tsv',
    sep='\t',
)
persona_df['response'] = persona_df['response'].map(lambda text: text.strip())

# Id columns for a 100 x 100 grid: persona id varies slowest and prompt id
# fastest, i.e. the file is assumed to list all prompts for persona 0, then
# all prompts for persona 1, and so on.
persona_ids = [pid for pid in range(100) for _ in range(100)]
prompt_ids = [qid for _ in range(100) for qid in range(100)]
# Only annotate the frame when the file does not already carry the ids.
if 'persona_id' not in persona_df.columns:
    persona_df['prompt_id'] = prompt_ids
    persona_df['prompt'] = persona_df['prompt_id'].map(lambda qid: prompts[qid])
    persona_df['persona_id'] = persona_ids
    persona_df['persona'] = persona_df['persona_id'].map(lambda pid: personas[pid])

In [25]:
calc_cr_nds_over_personas(persona_df)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [06:13<00:00,  3.73s/it]

CR: 2.91 ± 0.1
NDS: 2.62 ± 0.08
CR-POS: 5.62 ± 0.21





### Check the language of the responses and recompute the metrics with non-English responses removed

In [26]:
# Label every persona response with the first label returned by the language-ID
# model. Newlines are replaced with spaces before prediction (presumably
# because fastText's `predict` expects single-line input - confirm).
persona_df['lang'] = persona_df.response.progress_apply(lambda x: model.predict(x.replace('\n', ' '))[0][0])

100%|███████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:05<00:00, 1917.96it/s]


In [27]:
# Personas that produced at least one response not labelled as English,
# in order of first appearance.
noneng_ids = persona_df.loc[persona_df['lang'] != '__label__eng_Latn', 'persona_id'].unique()
for p in noneng_ids:
    print(p,personas[p])

print(noneng_ids.tolist())

2 a Spanish-speaking patient with severe myopia interested in LASIK eye surgery
9 a contemporary Brazilian filmmaker who was inspired by the Cinema Novo movement
20 a Croatian travel blogger currently living in Zagreb
28 a Southeast Asian wildlife conservationist
41 a Japanese geologist
47 a die-hard Iowa Hawkeye wrestling fan and former wrestler
77 a polyglot who teaches Spanish and Portuguese, known for lively, interactive classes
78 a Chadian reporter passionate about highlighting the achievement of local icons
91 an active-duty military K9 trainer known for his strict discipline methods
92 a translator studying the Icelandic language


In [28]:
# Number of non-English responses per persona id.
noneng_counts = Counter(
    persona_df.loc[persona_df['lang'] != '__label__eng_Latn', 'persona_id'].tolist()
)
print(noneng_counts)

(99,)

In [30]:
# Personas with more than one non-English response; a single one is not enough
# to flag the persona.
possible_noneng_personas = [
    pid for pid, n_noneng in noneng_counts.items() if n_noneng > 1
]
print(possible_noneng_personas)

[2, 9, 20, 28, 41, 47, 77, 78, 91, 92]

First, keep only the responses labelled as English.

In [29]:
# Response-level filter: keep only rows whose detected language is English,
# then recompute the per-persona metrics on what remains.
eng_persona_df = persona_df.loc[persona_df['lang'].eq('__label__eng_Latn')]
print(eng_persona_df.shape)
calc_cr_nds_over_personas(eng_persona_df)

(9763, 6)


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [06:06<00:00,  3.67s/it]

CR: 2.9 ± 0.21
NDS: 2.63 ± 0.13
CR-POS: 5.55 ± 0.43





Next, remove every persona that may be responding in a non-English language (more than one non-English response).

In [31]:
# Persona-level filter: drop every row belonging to a persona flagged in
# `possible_noneng_personas`, then recompute the per-persona metrics.
eng_persona_df2 = persona_df[
    ~persona_df['persona_id'].isin(possible_noneng_personas)
]
print(eng_persona_df2.shape)
calc_cr_nds_over_personas(eng_persona_df2)

(9000, 6)


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [05:47<00:00,  3.87s/it]

CR: 2.92 ± 0.1
NDS: 2.62 ± 0.08
CR-POS: 5.59 ± 0.13





## Persona plus cutoff

In [32]:
# Llama-3.1-8B-Instruct outputs for the persona run with cutoff.
personac_df = pd.read_csv(
    '../output/llama8b-cutoff-persona/Llama-3.1-8B-Instruct_dolly_output.tsv',
    sep='\t',
)
personac_df['response'] = personac_df['response'].map(lambda text: text.strip())

# Id columns for a 100 x 100 grid: persona id varies slowest and prompt id
# fastest, i.e. the file is assumed to list all prompts for persona 0, then
# all prompts for persona 1, and so on.
persona_ids = [pid for pid in range(100) for _ in range(100)]
prompt_ids = [qid for _ in range(100) for qid in range(100)]
# Only annotate the frame when the file does not already carry the ids.
if 'persona_id' not in personac_df.columns:
    personac_df['prompt_id'] = prompt_ids
    personac_df['prompt'] = personac_df['prompt_id'].map(lambda qid: prompts[qid])
    personac_df['persona_id'] = persona_ids
    personac_df['persona'] = personac_df['persona_id'].map(lambda pid: personas[pid])

personac_df.head(3)

Unnamed: 0,prompt,response,prompt_id,persona_id,persona
0,Please propose an argument to convince my mother that she increases the amount of money that I get every month during my studies. I think the current amount is too low.,"I'd be happy to help you craft a persuasive argument, but I have to advise that approaching your mother through a press release might not be the most effective method. However, if we were to create a draft press release, here's a possible approach:\n\n---\n\nFOR IMMEDIATE RELEASE:\n\n""Request for Increased Allowance from Dedicated Student""\n\nDear Mother,\n\nAs you are aware, I have been diligently pursuing my studies and have consistently demonstrated my commitment to academic excellence. Despite my best efforts, I have found it increasingly challenging to manage my finances with the current allowance. I kindly request that we reassess and increase the monthly allowance to better align with the rising costs of living and educational expenses.\n\nI believe this adjustment will enable me to maintain a healthier work-life balance, focus on my studies, and ultimately achieve my academic goals.\n\nThank you for considering my request.\n\nSincerely,\n[Your Name]",0,0,a PR manager with insights into public relations strategy and press releases for influencers
1,"Write a paragraph to refute a claim by a colleague that ancient structures such as Stonehenge, the Great Pyramid are evidence of UFO activities on Earth","As a spokesperson for a leading organization, I'd like to address the recent claim made by our colleague regarding ancient structures being evidence of UFO activities. While we understand the intrigue surrounding these enigmatic sites, such as Stonehenge and the Great Pyramid, there is no credible evidence to support the notion that they are of extraterrestrial origin. Archaeological and historical research overwhelmingly suggests that these structures were built by ancient civilizations using their own ingenuity, skills, and knowledge. Theories about UFO involvement are speculative and lack concrete evidence, often relying on unverifiable sources and unproven claims. As a responsible and evidence-based organization, we encourage our colleagues and the public to rely on credible sources and scientific consensus when discussing these fascinating sites. By doing so, we can foster a deeper understanding and appreciation of our shared cultural heritage.",1,0,a PR manager with insights into public relations strategy and press releases for influencers
2,What is the best way to answer an interview question?,"As a PR manager, I emphasize the importance of preparation and clarity when answering interview questions. The best approach is to take a moment to collect your thoughts, then provide a concise and specific response that directly addresses the question. Use the STAR method to structure your answer: Situation, Task, Action, Result. This helps to paint a clear picture of your experience and achievements, while also showcasing your skills and expertise. Additionally, be authentic and confident, and always highlight the key takeaways and outcomes. This will leave a lasting impression on the interviewer.",2,0,a PR manager with insights into public relations strategy and press releases for influencers


In [29]:
calc_cr_nds_over_personas(personac_df)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [03:40<00:00,  2.20s/it]

CR: 2.73 ± 0.09
NDS: 2.85 ± 0.08
CR-POS: 5.32 ± 0.25





### Check the language of the responses and recompute the metrics with non-English responses removed

In [33]:
# Tag every response with the classifier's top label (e.g. '__label__eng_Latn').
# Newlines are replaced by spaces before the text is handed to the model.
# NOTE(review): `model` is defined outside this section - presumably a
# fastText-style language-ID model; confirm.
personac_df['lang'] = personac_df['response'].progress_apply(
    lambda text: model.predict(text.replace('\n', ' '))[0][0]
)

100%|███████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:02<00:00, 3481.00it/s]


In [35]:
# Ids of personas with at least one response that was not labelled English.
noneng_persona_ids = personac_df.loc[
    personac_df['lang'] != '__label__eng_Latn', 'persona_id'
].unique()

# One line per persona: id followed by its description.
for persona_idx in noneng_persona_ids:
    print(persona_idx, personas[persona_idx])

print(noneng_persona_ids.tolist())

2 a Spanish-speaking patient with severe myopia interested in LASIK eye surgery
5 an Indian mother who has been traditionally attending all public functions with her family
9 a contemporary Brazilian filmmaker who was inspired by the Cinema Novo movement
20 a Croatian travel blogger currently living in Zagreb
28 a Southeast Asian wildlife conservationist
31 an experienced Perl programmer with a focus on writing scripts for system administration and monitoring
63 a resident in Liechtenstein who is bored and weary of the ongoing restrictions due to the pandemic
77 a polyglot who teaches Spanish and Portuguese, known for lively, interactive classes
78 a Chadian reporter passionate about highlighting the achievement of local icons
86 an organic farmer from Australia interested in introducing honey bees on their farm for increased crop production
91 an active-duty military K9 trainer known for his strict discipline methods
92 a translator studying the Icelandic language
99 a media mogul who

In [39]:
# How many responses per persona id were not labelled English.
noneng_counts = Counter(
    personac_df.loc[personac_df['lang'] != '__label__eng_Latn', 'persona_id'].tolist()
)
print(noneng_counts)

Counter({2: 100, 92: 44, 77: 25, 9: 14, 78: 11, 91: 7, 20: 6, 28: 2, 5: 1, 31: 1, 63: 1, 86: 1, 99: 1})


In [40]:
# Flag personas with more than one non-English-labelled response
# (presumably so that a single stray label does not disqualify a persona).
possible_noneng_personas = [
    persona_idx for persona_idx, n_noneng in noneng_counts.items() if n_noneng > 1
]
print(possible_noneng_personas)

[2, 9, 20, 28, 77, 78, 91, 92]


In [32]:
# Shape of the array of distinct prompt ids that drew at least one
# non-English-labelled response.
personac_df.loc[personac_df['lang'] != '__label__eng_Latn', 'prompt_id'].unique().shape

(100,)

First, keep only the responses labelled as English.

In [38]:
# Response-level filter: keep only the rows labelled English, then recompute
# the diversity metrics on that subset.
eng_personac_df = personac_df.loc[personac_df['lang'].eq('__label__eng_Latn')]
print(eng_personac_df.shape)
calc_cr_nds_over_personas(eng_personac_df)

(9786, 6)


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 99/99 [03:40<00:00,  2.23s/it]

CR: 2.73 ± 0.09
NDS: 2.85 ± 0.08
CR-POS: 5.29 ± 0.11





Next, remove every persona that is possibly non-English (more than one non-English response).

In [43]:
# Persona-level filter: drop every row that belongs to a flagged persona,
# whatever language the individual response was labelled with.
eng_personac_df2 = personac_df[~personac_df['persona_id'].isin(possible_noneng_personas)]
print(eng_personac_df2.shape)
calc_cr_nds_over_personas(eng_personac_df2)

(9200, 6)


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 92/92 [03:17<00:00,  2.15s/it]

CR: 2.74 ± 0.09
NDS: 2.85 ± 0.08
CR-POS: 5.29 ± 0.11





In [40]:
# Eyeball five random rows among the responses not labelled English.
personac_df.loc[personac_df['lang'].ne('__label__eng_Latn')].sample(5)

Unnamed: 0,prompt,response,prompt_id,persona_id,persona,lang
932,Write a diary entry about a real life incident on your recent trip with your family,"""Diário de Viagem, 15 de Julho. \n\nHoje, em uma pequena cidade ao longo da BR-116, nossa van quebrou. Meu filho, João, chorou por causa da falta de refrigerante, mas me lembrei daquela cena do 'Vidas Secas' de Nelson Pereira dos Santos. A desolação, a fome, a resiliência. Enquanto reparávamos a van, vimos a beleza das pessoas, trabalhando juntas. Foi um momento de cinema, em plena vida.""",32,9,a contemporary Brazilian filmmaker who was inspired by the Cinema Novo movement,__label__por_Latn
9285,"Write a friendly letter to my landlord, convincing them to let me get a cat. His name is George and my name is Steve. I live alone in Unit 2C and I work long hours. Include details regarding my rental contract that currently only permits dogs under 25 pounds and caged animals and requires pet deposit and rent. Give him examples as to why those animals are not ideal (include my dog allergy) and why a cat is better. Let him know I plan a robotic vacuum and mop to keep things clean. Tell him I've already found a potential pet at the local animal shelter, who is a two-year-old shelter cat who is housebroken and friendly. Remind him that Unit 2C does not have any carpeted floors and a tiled bathroom which is good for the future litter box. Convince him I'm open to a conversation to discuss further.","[My Name]\n[My Address]\n[City, Iceland]\n[Email Address]\n[Date]\n\nLandsverðir [Landlord's Name]\n[Landsverðir's Address]\n[City, Iceland]\n\nSkál!\n\nDear [Landlord's Name],\n\nÉg er Steve, býr ég einum í Hálsi 2C. Ég heiti á þig með því að senda þér þennan lítið breytt. Ég er að leita eftir aðgerðum til að fá ákvörðun um að geta hafa kæti í byrjun. Ég heiti kætinum George og ég heiti á þig með því að biðja um að geta hafa hann.\n\nÉg vitað um það að í dag er það staðfest í leiguágreinum okkar að einu dýr sem getur hafaður í húsnum er hundur sem er undir 25 pund og eingöngu kæti sem eru inni í gallerí. Ég heiti á þig með því að biðja um að geta hafa kætinu, George, sem ég heiti á. Ég heiti á þig með því að segja því að ég heiti á þig með því að biðja um að láta ég hafa kætinu.\n\nÉg vitað um það að margir hafa dýr sem eru eingöngu kæti og hundar undir 25 pund, og ég heiti á þig með því að segja því að ég heiti á þig með því að biðja um að láta ég hafa kætinu. 
Ég heiti á þig með því að segja því að ég heiti á þig með því að segja því að ég heiti á þig með því að biðja um að láta ég hafa kætinu.\n\nÉg heiti á þig með því að segja því að ég heiti á þig með því að segja því að ég heiti á þig með því að biðja um að láta ég hafa kætinu. Ég heiti á þig með því að segja því að ég heiti á þig með því að segja því að ég heiti á þig með því að biðja um að láta ég hafa kætinu.\n\nÉg heiti á þig með því að segja því að ég heiti á þig með því að segja því að ég heiti á þig með því að biðja um að láta ég hafa kætinu. Ég heiti á þig með því að segja því að ég heiti á þig með því að segja því að ég heiti á þig með því að biðja um að láta ég hafa kætinu.\n\nÉg heiti á þig með því að segja því að ég heiti á þig með því að segja því að ég heiti á þig með því að biðja um að láta ég hafa kætinu.\n\nÉg heiti á þig með því að segja því að ég heiti á þig með því að segja því að ég heiti á þig með því að biðja um að láta ég hafa kætinu.\n\nÉg heiti á þig með því að segja því að ég heiti á þig með því að segja því að ég heiti á þig með því að biðja um að láta ég hafa kætinu.\n\nÉg heiti á þig með því að segja því að ég heiti á þig með því að segja því að ég heiti á þig með því að biðja um að láta é",85,92,a translator studying the Icelandic language,__label__isl_Latn
930,Write a haiku about how much I love pickles.,"Dourado, amargo\nSuco de limão em escama\nAcordei salgado",30,9,a contemporary Brazilian filmmaker who was inspired by the Cinema Novo movement,__label__por_Latn
931,Why should you visit Ireland?,"Irlanda, um país que me fascina. A sua história, a sua cultura, a sua beleza. É como se o Cinema Novo tivesse se deslocado para o outro lado do Atlântico. A luta dos povos, a resistência, a luta pela liberdade, tudo está lá, em cada favela, em cada vila, em cada montanha. Visitar Irlanda é visitar um país que não se deixa dominar, que não se deixa esmagar. É um país que respira liberdade, que respira vida. Vá e descubra por si mesmo. Vá e faça um filme sobre isso. É uma história que precisa ser contada.",31,9,a contemporary Brazilian filmmaker who was inspired by the Cinema Novo movement,__label__por_Latn
251,Write a self evaluation with the goal of getting a promotion.,"No puedo creer que esté pensando en una evaluación de desempeño en este momento. Como paciente con miopía severa, mi objetivo principal es encontrar una solución para mis ojos. Sin embargo, si debo escribir una evaluación, diría que he trabajado con dedicación y esfuerzo en mi puesto actual, y estoy listo para tomar el próximo paso en mi carrera profesional.",51,2,a Spanish-speaking patient with severe myopia interested in LASIK eye surgery,__label__spa_Latn


In [42]:
# Rows for persona 99 that were not labelled English.
personac_df[personac_df['persona_id'].eq(99) & personac_df['lang'].ne('__label__eng_Latn')]

Unnamed: 0,prompt,response,prompt_id,persona_id,persona,lang
9930,Write a haiku about how much I love pickles.,"""Clickbait: 'Pickle Obsession: Shocking Truth Exposed'",30,99,a media mogul who is solely focused on maximizing profit through sensationalized content,__label__deu_Latn


## Llama-70B

In [44]:
# Llama-70B: no persona, no cutoff
np70_df = pd.read_csv('../output/llama70b-np/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
np70_df['response'] = np70_df['response'].map(lambda text: text.strip())
# Assumes one row per prompt, in prompt order. -1 is the persona id used for
# runs without a persona.
np70_df['prompt_id'] = list(range(len(prompts)))
np70_df['persona_id'] = [-1] * len(prompts)
calc_cr_nds_over_personas(np70_df)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.42s/it]

CR: 2.76 ± 0.0
NDS: 2.87 ± 0.0
CR-POS: 5.63 ± 0.0





In [45]:
# Llama-70B: no persona, with cutoff
npc70_df = pd.read_csv('../output/llama70b-cutoff-np/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
npc70_df['response'] = npc70_df['response'].map(lambda text: text.strip())
# Assumes one row per prompt, in prompt order. -1 is the persona id used for
# runs without a persona.
npc70_df['prompt_id'] = list(range(len(prompts)))
npc70_df['persona_id'] = [-1] * len(prompts)
calc_cr_nds_over_personas(npc70_df)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.04s/it]

CR: 2.57 ± 0.0
NDS: 3.1 ± 0.0
CR-POS: 5.09 ± 0.0





In [47]:
# Llama-70B: persona plus cutoff
personac70_df = pd.read_csv('../output/llama70b-cutoff-persona/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
personac70_df['response'] = personac70_df['response'].map(lambda text: text.strip())

# When the file carries no persona_id column, rebuild the ids positionally.
# This assumes persona-major row order: 100 prompts for persona 0, then 100
# prompts for persona 1, and so on for 100 personas.
if 'persona_id' not in personac70_df.columns:
    print("Adding persona prompt ids")
    persona_ids = [persona_idx for persona_idx in range(100) for _ in range(100)]
    prompt_ids = list(range(100)) * 100
    personac70_df['prompt_id'] = prompt_ids
    personac70_df['prompt'] = personac70_df['prompt_id'].map(lambda idx: prompts[idx])
    personac70_df['persona_id'] = persona_ids
    personac70_df['persona'] = personac70_df['persona_id'].map(lambda idx: personas[idx])

personac70_df.head(3)

Unnamed: 0,prompt_id,persona_id,response
0,0,0,"As a PR manager, I'd frame your request as a compelling narrative. Emphasize your academic goals and how increased financial support will enable you to focus on your studies, leading to better grades and a brighter future. Highlight specific expenses, such as textbooks, transportation, or extracurricular activities, that the current amount doesn't cover. \n\nYou could say, ""Mom, I understand that you want me to be responsible with money, but I believe that with a bit more support, I can achieve my academic goals and make you proud. I've done some research and found that my expenses are higher than what I currently receive. I'd like to discuss a possible increase to ensure I can cover all my necessary expenses and focus on my studies without added stress."" This approach shows that you've thoughtfully considered your needs and are committed to your education."
1,1,0,"I understand where my colleague is coming from, but as a PR manager, I've seen how misinformation can spread like wildfire. The notion that ancient structures like Stonehenge and the Great Pyramid are evidence of UFO activities is a fascinating narrative, but it's essential to separate fact from fiction. These iconic landmarks have been extensively studied by archaeologists, historians, and scientists, who attribute their construction to the ingenuity and skill of our ancient ancestors. While it's intriguing to consider alternative explanations, there's no credible evidence to support the claim of UFO involvement. Let's focus on promoting accurate information and respecting the cultural heritage of these incredible achievements."
2,2,0,"As a PR manager, I advise influencers to use the STAR method: Situation, Task, Action, Result. This framework helps them structure their response, providing context, highlighting their actions, and showcasing the outcome, making their answer clear, concise, and engaging for the audience."


In [53]:
# Diversity metrics for the Llama-70B persona+cutoff responses. The helper
# (defined outside this section) prints CR, NDS and CR-POS.
calc_cr_nds_over_personas(personac70_df)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [03:04<00:00,  1.84s/it]

CR: 2.75 ± 0.1
NDS: 2.86 ± 0.08
CR-POS: 5.24 ± 0.17





In [54]:
# Tag every 70B response with the classifier's top label (e.g. '__label__eng_Latn').
# Newlines are replaced by spaces before the text is handed to the model.
# NOTE(review): `model` is defined outside this section - presumably a
# fastText-style language-ID model; confirm.
personac70_df['lang'] = personac70_df['response'].progress_apply(
    lambda text: model.predict(text.replace('\n', ' '))[0][0]
)

100%|███████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:02<00:00, 3621.49it/s]


In [55]:
# Ids of personas with at least one 70B response that was not labelled English.
noneng_persona_ids = personac70_df.loc[
    personac70_df['lang'] != '__label__eng_Latn', 'persona_id'
].unique()

# One line per persona: id followed by its description.
for persona_idx in noneng_persona_ids:
    print(persona_idx, personas[persona_idx])

print(noneng_persona_ids.tolist())

2 a Spanish-speaking patient with severe myopia interested in LASIK eye surgery
9 a contemporary Brazilian filmmaker who was inspired by the Cinema Novo movement
19 a TV series enthusiast who never misses a TV review or recap, especially for dark comedies
20 a Croatian travel blogger currently living in Zagreb
22 a compassionate therapist who provides emotional support to Pauline during difficult times
33 a museum exhibit designer who specializes in illustrating scientific concepts through storytelling and visual narratives
36 a pop culture analyst who provides in-depth background information on trending topics
38 a project manager responsible for overseeing the development and implementation of power optimization strategies
41 a Japanese geologist
52 a cognitive psychologist interested in understanding the relationship between perception and immersion in virtual reality
55 Séainín Brennan's enthusiastic and supportive parents
69 a retired professional speedway racer who had previously

In [58]:
# How many 70B responses per persona id were not labelled English.
# NOTE: this rebinds `noneng_counts`, which an earlier cell filled from personac_df.
noneng_counts = Counter(
    personac70_df.loc[personac70_df['lang'] != '__label__eng_Latn', 'persona_id'].tolist()
)
print(noneng_counts)

Counter({2: 89, 41: 3, 77: 3, 92: 3, 74: 2, 97: 2, 9: 1, 19: 1, 20: 1, 22: 1, 33: 1, 36: 1, 38: 1, 52: 1, 55: 1, 69: 1, 91: 1, 94: 1, 98: 1, 99: 1})


In [59]:
# Flag personas with more than one non-English-labelled response.
# NOTE: this rebinds `possible_noneng_personas` from whatever `noneng_counts`
# currently holds.
possible_noneng_personas = [
    persona_idx for persona_idx, n_noneng in noneng_counts.items() if n_noneng > 1
]
print(possible_noneng_personas)

[2, 41, 74, 77, 92, 97]


In [60]:
# Shape of the array of distinct prompt ids that drew at least one
# non-English-labelled 70B response.
personac70_df.loc[personac70_df['lang'] != '__label__eng_Latn', 'prompt_id'].unique().shape

(89,)

First, keep only the responses labelled as English.

In [61]:
# Response-level filter: keep only the rows labelled English, then recompute
# the diversity metrics on that subset.
eng_personac70_df = personac70_df.loc[personac70_df['lang'].eq('__label__eng_Latn')]
print(eng_personac70_df.shape)
calc_cr_nds_over_personas(eng_personac70_df)

(9884, 4)


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:59<00:00,  1.80s/it]

CR: 2.75 ± 0.11
NDS: 2.86 ± 0.09
CR-POS: 5.22 ± 0.12





Next, remove every persona that is possibly non-English (more than one non-English response).

In [62]:
# Persona-level filter: drop every row that belongs to a flagged persona,
# whatever language the individual response was labelled with.
eng_personac70_df2 = personac70_df[~personac70_df['persona_id'].isin(possible_noneng_personas)]
print(eng_personac70_df2.shape)
calc_cr_nds_over_personas(eng_personac70_df2)

(9400, 4)


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 94/94 [02:51<00:00,  1.82s/it]

CR: 2.76 ± 0.1
NDS: 2.86 ± 0.08
CR-POS: 5.23 ± 0.1





In [63]:
# Llama-70B: persona, no cutoff
persona70_df = pd.read_csv('../output/llama70b-persona/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
persona70_df['response'] = persona70_df['response'].map(lambda text: text.strip())

# When the file carries no persona_id column, rebuild the ids positionally.
# This assumes persona-major row order: 100 prompts for persona 0, then 100
# prompts for persona 1, and so on for 100 personas.
if 'persona_id' not in persona70_df.columns:
    print("Adding persona prompt ids")
    persona_ids = [persona_idx for persona_idx in range(100) for _ in range(100)]
    prompt_ids = list(range(100)) * 100
    persona70_df['prompt_id'] = prompt_ids
    persona70_df['prompt'] = persona70_df['prompt_id'].map(lambda idx: prompts[idx])
    persona70_df['persona_id'] = persona_ids
    persona70_df['persona'] = persona70_df['persona_id'].map(lambda idx: personas[idx])

# Keep only the first row seen for each (prompt, persona) pair.
persona70_df = persona70_df.drop_duplicates(['prompt_id', 'persona_id'], keep='first')
persona70_df.shape

(9694, 3)

## Response length distribution

In [None]:
# Character length of every completion, tagged with the setting it came from.
# The human-written dolly responses go first, then the four generation settings.
# NOTE(review): relies on `sample` having a 'response' column at this point - confirm.
lengths = []
sources = []
for frame, source_name in [
    (sample, 'dolly'),
    (np_df, 'No persona'),
    (npc_df, 'No persona+cutoff'),
    (persona_df, 'Persona'),
    (personac_df, 'Persona+cutoff'),
]:
    lengths.extend(len(text) for text in frame['response'])
    sources.extend([source_name] * len(frame))

len_df = pd.DataFrame({'len': lengths, 'source': sources})

# One density curve per source; common_norm=False normalises each curve on its own.
g = sns.kdeplot(len_df, x='len', hue='source', common_norm=False, fill=True, clip=[-1000,6000])
g.set_xlabel('Completion length (chars)')
g.set_ylabel('Density')
# Strip the frame and the y tick labels; only the curve shapes are of interest.
for side in ("top", "right", "left"):
    g.spines[side].set_visible(False)
g.set(yticklabels=[])
g.grid(axis='x')
# plt.legend([], [], frameon=False)
plt.show()

## llama-1b

In [None]:
# Llama-1B: no persona, no cutoff
np1_df = pd.read_csv('../output/llama1b-np/Llama-3.2-1B-Instruct_dolly_output.tsv', sep='\t')
np1_df['response'] = np1_df['response'].map(lambda text: text.strip())
# Assumes one row per prompt, in prompt order. -1 is the persona id used for
# runs without a persona.
np1_df['prompt_id'] = list(range(len(prompts)))
np1_df['persona_id'] = [-1] * len(prompts)
calc_cr_nds_over_personas(np1_df)

In [None]:
# Llama-1B: no persona, with cutoff
npc1_df = pd.read_csv('../output/llama1b-cutoff-np/Llama-3.2-1B-Instruct_dolly_output.tsv', sep='\t')
npc1_df['response'] = npc1_df['response'].map(lambda text: text.strip())
# Assumes one row per prompt, in prompt order. -1 is the persona id used for
# runs without a persona.
npc1_df['prompt_id'] = list(range(len(prompts)))
npc1_df['persona_id'] = [-1] * len(prompts)
calc_cr_nds_over_personas(npc1_df)

In [None]:
# Llama-1B: persona, no cutoff
persona1_df = pd.read_csv('../output/llama1b-persona/Llama-3.2-1B-Instruct_dolly_output.tsv', sep='\t')
persona1_df['response'] = persona1_df['response'].map(lambda text: text.strip())

# Ids are assigned positionally. This assumes persona-major row order: 100
# prompts for persona 0, then 100 prompts for persona 1, and so on.
persona_ids = [persona_idx for persona_idx in range(100) for _ in range(100)]
prompt_ids = list(range(100)) * 100
persona1_df['prompt_id'] = prompt_ids
persona1_df['persona_id'] = persona_ids
calc_cr_nds_over_personas(persona1_df)

In [None]:
# Llama-1B: persona plus cutoff
personac1_df = pd.read_csv('../output/llama1b-cutoff-persona/Llama-3.2-1B-Instruct_dolly_output.tsv', sep='\t')
personac1_df['response'] = personac1_df['response'].map(lambda text: text.strip())

# Ids are assigned positionally. This assumes persona-major row order: 100
# prompts for persona 0, then 100 prompts for persona 1, and so on.
persona_ids = [persona_idx for persona_idx in range(100) for _ in range(100)]
prompt_ids = list(range(100)) * 100
personac1_df['prompt_id'] = prompt_ids
personac1_df['persona_id'] = persona_ids

calc_cr_nds_over_personas(personac1_df)

## 70B

In [None]:
# Llama-70B: no persona, no cutoff.
# The filename is aligned with the executed Llama-70B cell that loads from this
# same directory; the 'Llama-3.2-70b-Instruct' name used here before does not
# match the file that cell reads.
np70_df = pd.read_csv('../output/llama70b-np/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
np70_df['response'] = np70_df.response.apply(lambda x: x.strip())
# Assumes one row per prompt, in prompt order. -1 is the persona id used for
# runs without a persona.
np70_df['prompt_id'] = [i for i in range(len(prompts))]
np70_df['persona_id'] = [-1 for i in range(len(prompts))]
calc_cr_nds_over_personas(np70_df)

In [None]:
# Llama-70B: no persona, with cutoff.
# The filename is aligned with the executed Llama-70B cell that loads from this
# same directory; the 'Llama-3.2-70b-Instruct' name used here before does not
# match the file that cell reads.
npc70_df = pd.read_csv('../output/llama70b-cutoff-np/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
npc70_df['response'] = npc70_df.response.apply(lambda x: x.strip())
# Assumes one row per prompt, in prompt order. -1 is the persona id used for
# runs without a persona.
npc70_df['prompt_id'] = [i for i in range(len(prompts))]
npc70_df['persona_id'] = [-1 for i in range(len(prompts))]
calc_cr_nds_over_personas(npc70_df)

In [None]:
# Llama-70B: persona, no cutoff.
# The filename is aligned with the executed "Persona no cutoff" cell that loads
# from this same directory; the name used here before pointed at an 8B model
# file inside the llama70b-persona directory.
persona70_df = pd.read_csv('../output/llama70b-persona/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
persona70_df['response'] = persona70_df.response.apply(lambda x: x.strip())

# Only synthesise ids when the file does not already provide them (same guard
# as the executed 70B cells). Overwriting unconditionally would discard the
# file's own ids, and would raise if the file does not have exactly 100 * 100 rows.
# Positional ids assume persona-major row order.
if 'persona_id' not in persona70_df.columns:
    persona_ids = []
    prompt_ids = []
    for persona_id in range(100):
        for prompt_id in range(100):
            persona_ids += [persona_id]
            prompt_ids += [prompt_id]
    persona70_df['prompt_id'] = prompt_ids
    persona70_df['persona_id'] = persona_ids

# Score the frame loaded in this cell, not the similarly named persona_df.
calc_cr_nds_over_personas(persona70_df)

In [None]:
# Llama-70B: persona plus cutoff.
# Bound to personac70_df so that this 70B load no longer overwrites
# personac_df, which the language analysis and the length plot read.
# The filename is aligned with the executed "Persona plus cutoff" 70B cell that
# loads from this same directory.
personac70_df = pd.read_csv('../output/llama70b-cutoff-persona/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
personac70_df['response'] = personac70_df.response.apply(lambda x: x.strip())

# Only synthesise ids when the file does not already provide them (same guard
# as the executed 70B cells). Positional ids assume persona-major row order.
if 'persona_id' not in personac70_df.columns:
    persona_ids = []
    prompt_ids = []
    for persona_id in range(100):
        for prompt_id in range(100):
            persona_ids += [persona_id]
            prompt_ids += [prompt_id]
    personac70_df['prompt_id'] = prompt_ids
    personac70_df['persona_id'] = persona_ids

calc_cr_nds_over_personas(personac70_df)

In [None]:
# Re-run the diversity metrics on the Llama-1B no-persona frame (np1_df is
# loaded in the llama-1b section).
calc_cr_nds_over_personas(np1_df)