In [1]:
import pandas as pd
import random
import numpy as np
from tqdm import tqdm
import ipdb
import re
from datasets import load_dataset
from diversity import compression_ratio, homogenization_score, ngram_diversity_score, extract_patterns, get_pos, pos_patterns, token_patterns
import json

import matplotlib.pyplot as plt
# import mplcursors
import seaborn as sns
%matplotlib inline
sns.set(style='darkgrid', context='notebook', rc={'figure.figsize':(14,10)}, font_scale=2)

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('chained_assignment',None)

# Set random seeds for reproducibility on a specific machine
random.seed(1)
np.random.seed(1)
np.random.RandomState(1)
np.set_printoptions(precision=3)

## Dolly human written responses diversity analysis

In [10]:
dolly = load_dataset("databricks/databricks-dolly-15k")["train"].filter(lambda row: row['category']=='creative_writing').to_pandas()
sample = pd.read_csv('../data/dolly_creative_prompts_sample.tsv', sep='\t')

In [11]:
sample.head()

Unnamed: 0,index,instruction
0,525,Please propose an argument to convince my mother that she increases the amount of money that I get every month during my studies. I think the current amount is too low.
1,172,"Write a paragraph to refute a claim by a colleague that ancient structures such as Stonehenge, the Great Pyramid are evidence of UFO activities on Earth"
2,109,What is the best way to answer an interview question?
3,37,Write the first paragraph of an advertising brochure describing an hotel nearby the Annecy Lake in France
4,295,"Write an intro to a meetup about music, medicine, and machines"


In [12]:
dolly

Unnamed: 0,instruction,context,response,category
0,write a scene between two actors discussing movie Inception,,"Actress 1: ""Oh my god, have you seen Inception?""\n\nActor 2: ""Yes! That movie blew my mind. What did you think of it?""\n\nActress 1: ""Honestly, I was so confused for most of it. But it was so interesting, I couldn't look away. I had to know what was going on.""\n\nActor 2: ""That's exactly how I felt. It's one of those movies that you have to pay attention to every detail, or you'll get lost.""\n\nActress 1: ""Absolutely. And the way they played with time and reality was so cool. I don't think I've ever seen a movie like that before.""\n\nActor 2: ""Yeah, the concept of entering someone's dreams to implant an idea was mind-boggling. And the way they executed it was so well done.""\n\nActress 1: ""It was. And the cast was amazing too. Leonardo DiCaprio was incredible as usual, and I loved seeing Ellen Page in a different type of role.""\n\nActor 2: ""Definitely. And the action scenes were so intense. The zero-gravity hallway fight was insane.""\n\nActress 1: ""Oh my god, yes. I was on the edge of my seat for that one. And the ending...I still can't decide what it means.""\n\nActor 2: ""I know, right? It's one of those movies that leaves you with so many questions. But that's part of what makes it so great.""\n\nActress 1: ""Exactly. I love when movies challenge me to think and leave me wanting more. Inception definitely did that.""\n\nActor 2: ""Agreed. It's definitely one of my all-time favorites. I could watch it a hundred times and still discover something new.""\n\nActress 1: ""Same here. It's just one of those movies that sticks with you long after it's over.""",creative_writing
1,Write a short paragraph about why you should not have both a pet cat and a pet bird.,,"Having both a pet bird and a pet cat is not a good idea because cats will try to hunt birds, which are naturally the prey of cats. This hunting instinct is completely normal, and it isn't fair to the cat to try to train it not to attack birds, as it is simply in its nature to do so. However, letting a cat hunt your pet bird is similarly not humane, as it will be very stressful to the bird and could lead to the bird being injured. Larger birds are also not a good idea, as the bird could cause injury to the cat if the cat tries to attack it. Overall, it just isn't a good idea to have both cats and birds, since they tend not to coexist well, and pet owners should just pick one species to have.",creative_writing
2,Write a short story about a person who discovers a hidden room in their house. The story should include a plot twist and a clear resolution at the end.,,"It was going to be a long day. With her brother-in-law out of town, Lisa had agreed to watch her niece Stella for the afternoon and the little girl was known to be a bit of a handful. Her plan had been to spend the afternoon at the park to let Stella run and climb, to wear her out a bit. Unfortunately, this outing was foiled by the constant downpour of rain so Lisa resigned herself to a day inside with the tiny tyrant. Stella arrived early and excited to play. Lisa decided to begin with a game of hide and seek, after all this could take up a good part of the morning without too much trouble. Stella ran off to hide and Lisa sat quietly drinking coffee while calling out to the little girl, pretending to seek. Eventually she set off to find the little girl but soon she realized that something was wrong. No matter where she looked, Stella was nowhere to be found. The house wasn't that large, making her disappearance even more perplexing. Lisa had only recently moved into the home that she bought from her great aunt shortly before her death and she hasn't even moved out all of the existing furniture. Maybe Stella had gotten stuck somewhere behind the large wardrobe in the bedroom or she had climbed up into a cupboard in the laundry room. Panic was starting to settle in as she moved the wardrobe further from the wall, where only a small child could have hidden before. Suddenly, saw a small crack in the wall, and then realized it was a small door that she never knew was there. The door was slightly ajar and she eventually moved the wardrobe enough to open the door and climb inside. What was this place? The entrance opened up to reveal a small room that Lisa never knew existed! It was full of treasure, gold bars and jewels. Stella sat in the middle of the room quietly playing with the treasure, unaware of the immense value she was surrounded by. Lisa had known her aunt was a secretive woman but this was an incredible find and would change her life forever. Stella's visit was the good luck charm she needed to unlock this bounty and she would be forever grateful to the little girl! Time to call her brother-in-law to thank him for letting Stella visit.",creative_writing
3,"Plan for a three day road trip for May long weekend from Westfield, New Jersey",,"May 26: Drive to Montreal from Westfield, NJ. Time taken 6 hrs. Stay there overnight\nMay 27: have a quick breakfast..checkout and drive to Mount Tremblant. Time: 90 minutes\nMay 27: Hike Grand Brule ( red path) on the way up and green path on the way down\nMay 27: Stroll around the Tremblant village\nMay 28: Drive to Quebec City. Time taken: 4 hrs\nMay 28: Explore the city, aquarium and go for wildlife watching\nMay 29: Drive to Montreal. Spend time exploring the city\nMay 30: Drive back to Westfield, New Jersey",creative_writing
4,write a short essay on California wild fire,,"California is known for its hot, dry summers, which increase the risk of wildfires. Every year, the state experiences dozens of fires, which can be devastating to both the environment and people's homes and businesses.\n\nIn recent years, California has seen some of the worst wildfires in its history, with tens of thousands of acres of land destroyed and hundreds of homes lost. These fires are often started by a combination of high temperatures, dry conditions, and strong winds, which can quickly spread flames and make them difficult to control.\n\nThe effects of these fires are felt not just by those directly impacted, but by everyone in the state. Smoke and ash from the fires can cause respiratory problems and other health issues, and the destruction of habitats can harm wildlife populations and disrupt entire ecosystems.\n\nDespite the challenges, California has made significant progress in fighting wildfires in recent years. The state has invested in new technologies and equipment to help firefighters respond more quickly and effectively to fires, and has increased efforts to prevent fires from starting in the first place.\n\nWhile wildfires will likely always be a risk in California, the state's proactive approach to managing them gives hope that we can continue to protect people, wildlife, and the environment from their devastating effects.",creative_writing
...,...,...,...,...
704,Write a short story about a cat named Bistro that is full of adventure.,,"Bistro patiently waited outside his backdoor in the early afternoon. The clouds were starting to clear, revealing the first full sunlight of what had been a dreary spring day in Seattle. As Bistro sat pondering his afternoon plans, he saw something red out of the corner of his eye! a small brown dog with a red collar was sprinting down the alley! \n\nUnable to resist, Bistro leapt to his feet and immediately gave chase. They bounded along together, with Bistro slowly closing the gap. They ran through the neighborhood park, narrowly avoiding the large black lab that plays on the baseball field. After 10 blocks, the small brown dog came to a quick halt. It wagged it's tail and waited patiently outside the front door of a blue house. The owner of the house opened the door, letting both Bistro and the dog inside. Bistro quickly realized he had made a mistake! The dog lived there and now he was far from home and didn't know his way back.\n\nThe next morning, the dog and its owner went for a walk around the neighborhood. They were able to find Bistro family and brought it home safely immediately. Hooray for Bistro!",creative_writing
705,Write a rap about work,,Hard work they say it always pays off\nBut I think that smart work is the way to go\nIt starts by finding and pursuing your passion\nThe money will follow you'll have plenty of cash then\nThis ain't gonna happen overnight\nYou've gotta hustle with grit to do it right\nSurround yourself with the best achievers\nKeep your mind right don't stop believing \nYou'll hit bumps in the road as you go on your journey \nSo you gotta set goals and you gotta keep learning\nIt's true there's more to life than your career\nSo keep your eye on the prize keep your vision clear\nBecause all the early mornings and all the late nights\nThey will fly by without warning so enjoy the great ride,creative_writing
706,"If you were the last person on Earth, what thoughts would you leave behind in a diary if future life were to find it?",,"I sit here today writing this diary entry as the last person on Earth. It is a feeling like no other and I can say, never in a million years would I have believed this to be a distinction bestowed upon me. But, here I am... just me and Mother Earth. Idle time is not my friend so I try to stay busy. Today, I am busy reflecting upon my 56 years on Earth and lessons learned. Not just lessons I learned from my personal experience, but what I believe are lessons learned for all mankind. The lessons are clear. And all of the lessons are simple practices that I feel we have drifted away from in our modern day. If I had to summarize all of my learnings into one statement it would read like this... ""Live simply, love, find faith/hope in a higher source, move every day, work hard on relationships and building communities (real and face to face), find time to rest and creating space and be thankful."" That's it! As I said, the lessons are clear. I would like to think that life on Earth will somehow, someway start again and someone will find this diary. And, if that happens, then maybe, just maybe, another human race can benefit from my feelings on life and ""living"".",creative_writing
707,Will commercial space travel be affordable to me in my lifetime and what would I pay for it?,,"I have always been fascinated by space and space travel. The cost of space travel is astronomical by most peoples standards, however, we are starting to hear of companies that are working on making space travel more affordable. Depending on my financial situation at retirement, I would pay up to $100000 to travel to space.",creative_writing


In [18]:
sample['response'] = sample['index'].apply(lambda x: dolly.loc[x, 'response'])
sample.head()

Unnamed: 0,index,instruction,response
0,525,Please propose an argument to convince my mother that she increases the amount of money that I get every month during my studies. I think the current amount is too low.,"Dear Mum, I would like to talk to you about the monthly money I get for my studies. I am very grateful that you support me financially during my studies. At the same time, I feel that the current amount is a little bit too low in order to sustain myself. I estimated my monthly expenses and it seems like the money is not enough. We can go through the details if you want. I want to ask you if you can increase the amount so that I can cover my expenses. I understand if you cannot do this or have other reasons against this. However, I want you to know that if you do not increase my monthly money, I will be forced to work part-time next to my studies. There is research that shows that students who work part-time have worse grades than people who do not have to work part-time to afford their studies. I would love to dedicate my full time to my studies to get the best grades and learning outcomes possible; thus, I would prefer not to work part-time. What do you think?"
1,172,"Write a paragraph to refute a claim by a colleague that ancient structures such as Stonehenge, the Great Pyramid are evidence of UFO activities on Earth","There are several strong arguments against concluding that alien races built the structures like the Stonehenge or the Pyramids. First, there is evidence that these magnificent structures were built by humans. For example, tombs of the pyramid builders were found in the vicinity of the Great Pyramind (with human remains!). The technology to move and assemble the massive stones of the Stonehenge has been shown to be available to humans at that time. Second, given that space is vast the chance of finding one tiny planet among billions is infinitesimal. If alien intelligence had indeed found our one inhabited planet they are unlikely to have just created couple of structures and avoided any further contact with Earth and its inhabitants. In science, Occam’s Razor (“the best explanation is the simplest one”) is a guiding principle and until there is much more evidence of alien visitation it would be inadvisable to jump to a conclusion that aliens built these structures."
2,109,What is the best way to answer an interview question?,"The first recommended step is to ask clarifying questions to ensure you understand the question correctly and gather the requirements needed to answer effectively. If it is a mathematical or computer science question, go ahead and work your way through the problem while you verbalize your thought process along the way. If it is a situation question, you should use the STAR method. 1. Situation - set the scene and provide context, 2. Task - describe what was needed to be done and why, 3. Action - explain what action you took, and 4. Result - describe the result of your actions and what you learned from the experience"
3,37,Write the first paragraph of an advertising brochure describing an hotel nearby the Annecy Lake in France,"This fantastic hotel is ideally located, minutes away from the Annecy Lake, one of the purest in Europe. The water temperature reaches 27 degrees celcius at the peak season, allowing the guests to swim and enjoy several water activities including wake surf and pedalo. Finally, this region is known for being the home of several cheese specialties like ""Raclette"" and ""Tartiflette"" to name a few."
4,295,"Write an intro to a meetup about music, medicine, and machines","Music, Medicine, and Machines\n\nJoin us for an imaginative, authentic, and offbeat networking experience that will advance your knowledge of the technologies that impact the health and well-being of our human condition. To make things interesting (but keep things real), our meetup incorporates an element of music to remind us, first and foremost, we want to advance technology for the greater good, but we don’t want to become robots ourselves. Instead, we believe that “music is medicine for the soul” so our events will use a dose of music to unite us and to keep things fun and grounded in our humanity. We’ll explore hot and emerging technologies such as:\n\nMachine Learning and all things AI\nComputer and Machine Vision\nTelemedicine\nGenomics\nAR/VR/MR \nRobotics\nCloud \nDevOps, CI/CD, and Robotic Process Automation (RPA)\nInfrastructure as Code (IaC) \nChatbots\nWearable Tech\n3D Printing\nBlockchain\nAnd many more\n\nWe’ll talk about how these disruptive technologies improve Health & Life Sciences and discuss the tenuous balance of innovation + opportunities vs privacy, security, open data, regulations, etc. We’ll network and get to know each other to explore how each of us can get involved to ensure “the machines” benefit the communities we serve. We encourage attendees such as developers, clinicians, researchers, industry experts, students, educators, industry analysts, regulators, investors, startups, musicians, and all those willing to contribute meaningfully to our mission."


In [20]:
human_responses = sample['response'].values.tolist()

In [22]:
cr = compression_ratio(human_responses, 'gzip')
nds = ngram_diversity_score(human_responses, 4)
print(f"CR: {cr}, NDS: {nds}")

CR: 2.508, NDS: 3.026


In [23]:
joined_pos, tuples = get_pos(human_responses)

In [26]:
ngrams_pos = token_patterns(joined_pos, 5, 10)
ngrams_pos

[('DT NN IN DT NN', 53),
 ('NN IN DT JJ NN', 49),
 ('IN DT NN IN DT', 43),
 ('DT JJ NN IN DT', 43),
 ('IN DT JJ NN IN', 33),
 ('IN DT JJ NN .', 32),
 ('JJ NN IN DT NN', 30),
 ('IN DT NN IN NN', 28),
 ('VB DT NN IN DT', 26),
 ('NN IN DT NN IN', 25)]

In [28]:
cr = compression_ratio(joined_pos, 'gzip')
cr

4.908

## Llama model diversity

In [51]:
llama8b = pd.read_csv('../output/Llama-3.1-8B-Instruct_dolly_output.tsv', sep='\t')

In [52]:
persona_ids = []
prompt_ids = []
for persona_id in range(100):
    for prompt_id in range(100):
        persona_ids += [persona_id]*3
        prompt_ids += [prompt_id]*3

len(persona_ids), len(prompt_ids)

(30000, 30000)

In [53]:
llama8b['persona_id'] = persona_ids
llama8b['prompt_id'] = prompt_ids
llama8b.head()

Unnamed: 0,prompt,response,persona_id,prompt_id
0,Assume you are the following persona: a PR manager with insights into public relations strategy and press releases for influencers.\n\nNow respond to the following question/instruction appropriately from the perspective of the above persona:\n\nPlease propose an argument to convince my mother that she increases the amount of money that I get every month during my studies. I think the current amount is too low.,"\n\nAs a PR manager, I'd love to help you craft a compelling argument to present to your mother. Here's a suggested approach:\n\n**Emphasize your commitment to your studies and future goals**\n\nStart by expressing your gratitude for your mother's current support and reassure her that you're committed to your studies and future goals. Highlight your academic achievements, any notable projects or extracurricular activities you've been involved in, and your plans for after graduation.\n\n**Provide a breakdown of your expenses**\n\nCreate a detailed breakdown of your monthly expenses, including tuition fees, textbooks, living costs, transportation, and any other relevant expenses. This will help your mother understand where the money is going and why you need a higher allowance. Be sure to include any potential increases in expenses, such as rent or utility costs.\n\n**Highlight the benefits of a higher allowance**\n\nExplain how a higher allowance will benefit you in the short and long term. For example, you may be able to:\n\n* Focus more on your studies and less on part-time jobs, leading to better academic performance\n* Invest in personal development opportunities, such as courses or workshops, to enhance your skills and career prospects\n* Build an emergency fund to cover unexpected expenses or financial setbacks\n* Develop financial literacy and responsibility",0,0
1,Assume you are the following persona: a PR manager with insights into public relations strategy and press releases for influencers.\n\nNow respond to the following question/instruction appropriately from the perspective of the above persona:\n\nPlease propose an argument to convince my mother that she increases the amount of money that I get every month during my studies. I think the current amount is too low.,"\n\nAs a PR manager, I'd love to help you craft a compelling argument to persuade your mother to increase your monthly allowance. Here's a suggested approach:\n\n**Start with empathy and understanding**\n\n Begin by acknowledging your mother's concerns and values. Express your appreciation for her support and the importance she places on your education. This will help set a positive tone for the conversation.\n\n**Highlight your achievements and responsibilities**\n\n Emphasize your academic performance, any part-time jobs you've taken on, or extracurricular activities you're involved in. This demonstrates your commitment to your studies and your ability to manage your finances responsibly. Mention any specific expenses you've incurred due to your academic pursuits, such as textbooks, transportation, or equipment.\n\n**Present a clear and realistic budget**\n\n Create a breakdown of your monthly expenses, including essential costs like food, rent, utilities, and entertainment. Be sure to include any specific financial goals you have, such as saving for a car or a laptop. This will help your mother understand your financial situation and the need for a higher allowance.\n\n**Propose a solution, not a demand**\n\n Instead of simply asking for a specific amount, suggest a range or a percentage increase. For example, you could propose a 10-20% increase in your",0,0
2,Assume you are the following persona: a PR manager with insights into public relations strategy and press releases for influencers.\n\nNow respond to the following question/instruction appropriately from the perspective of the above persona:\n\nPlease propose an argument to convince my mother that she increases the amount of money that I get every month during my studies. I think the current amount is too low.,"\n\nAs a PR manager, I'd approach this situation by reframing your request as a strategic investment in your future. Here's a potential argument you could make to your mother:\n\n""Mom, I know we've been discussing my monthly allowance, and I appreciate the support you've been providing so far. However, I believe that increasing my allowance would be a smart investment in my future. Think of it as a 'brand sponsorship' for my education and personal development. Just as a brand sponsors an influencer to promote their products, you're sponsoring my education and future by providing me with the resources I need to succeed.\n\n""Increasing my allowance would not only allow me to better manage my finances and reduce stress, but it would also enable me to take on more opportunities, such as internships, extracurricular activities, and personal projects that can help me build my skills and network. These experiences will ultimately make me a more valuable asset to myself and to society, and they'll also increase my earning potential in the long run.\n\n""Consider it as a'return on investment' (ROI) for your support. By investing in me now, you'll see a higher return in the future, not just in terms of my financial success, but also in the person I",0,0
3,"Assume you are the following persona: a PR manager with insights into public relations strategy and press releases for influencers.\n\nNow respond to the following question/instruction appropriately from the perspective of the above persona:\n\nWrite a paragraph to refute a claim by a colleague that ancient structures such as Stonehenge, the Great Pyramid are evidence of UFO activities on Earth","\n\nAs a public relations specialist, I would craft a response that addresses the claim in a clear and concise manner, while also considering the potential impact on our influencer's reputation and the broader public perception. Here's a possible paragraph:\n\n""While some theories may suggest a connection between ancient structures like Stonehenge and the Great Pyramid and extraterrestrial activity, there is no credible evidence to support this claim. Archaeological research and scientific consensus overwhelmingly indicate that these structures were built by ancient civilizations using advanced engineering skills and knowledge of astronomy. The construction of these monuments is a testament to the ingenuity and creativity of our ancestors, and it's essential to approach these incredible feats of human achievement with respect and accuracy. Our influencer is committed to promoting a culture of curiosity and critical thinking, and we encourage our audience to engage in respectful and informed discussions about these fascinating topics.""\n\nIn this response, I've aimed to:\n\n1. Acknowledge the claim without endorsing it\n2. Provide a balanced perspective by highlighting the scientific consensus\n3. Emphasize the importance of respecting the achievements of ancient civilizations\n4. Position our influencer as a champion of critical thinking and respectful discussion\n5. Avoid directly attacking the colleague's claim, which can escalate the situation and create unnecessary tension.\n\nBy",0,1
4,"Assume you are the following persona: a PR manager with insights into public relations strategy and press releases for influencers.\n\nNow respond to the following question/instruction appropriately from the perspective of the above persona:\n\nWrite a paragraph to refute a claim by a colleague that ancient structures such as Stonehenge, the Great Pyramid are evidence of UFO activities on Earth","\n\n""I strongly disagree with the notion that ancient structures such as Stonehenge and the Great Pyramid are evidence of UFO activities on Earth. As a PR manager who has worked with numerous experts in archaeology and history, I can confidently say that these incredible feats of engineering were constructed by our ancestors using advanced knowledge and techniques that were available to them at the time. While it's true that these structures continue to fascinate us with their mystery and grandeur, there is overwhelming evidence to suggest that they were built by human hands, not extraterrestrial ones. In fact, archaeological research and excavations have provided a wealth of information about the lives and cultures of the people who built these structures, including their tools, techniques, and social organization. Rather than speculating about UFOs, we should be celebrating the ingenuity and creativity of our ancestors and the incredible achievements that they accomplished without the aid of advanced technology.""",0,1


In [73]:
crs = []
ndss = []
for persona_id in tqdm(range(100)):
    ## Take first sample
    responses = llama8b.loc[llama8b.persona_id==persona_id].drop_duplicates(subset=['prompt_id'], keep='first')['response'].values.tolist()
    cr = compression_ratio(responses, 'gzip')
    nds = ngram_diversity_score(responses, 4)
    crs.append(cr)
    ndss.append(nds)
print(f"CR: {np.round(np.mean(cr),2)} +- {np.round(np.std(crs),2)}, NDS: {np.round(np.mean(ndss),2)} +- {np.round(np.std(ndss), 2)}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 51.97it/s]

CR: 2.51 +- 0.1, NDS: 2.79 +- 0.09





In [72]:
crs = []
ndss = []
for persona_id in tqdm(range(100)):
    ## Take last sample
    responses = llama8b.loc[llama8b.persona_id==persona_id].drop_duplicates(subset=['prompt_id'], keep='last')['response'].values.tolist()
    cr = compression_ratio(responses, 'gzip')
    nds = ngram_diversity_score(responses, 4)
    crs.append(cr)
    ndss.append(nds)
print(f"CR: {np.round(np.mean(cr),2)} +- {np.round(np.std(crs),2)}, NDS: {np.round(np.mean(ndss),2)} +- {np.round(np.std(ndss), 2)}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 49.55it/s]

CR: 2.48 +- 0.1, NDS: 2.79 +- 0.09





In [74]:
crs = []
ndss = []
for persona_id in tqdm(range(100)):
    ## Take middle sample of three
    responses = llama8b.loc[llama8b.persona_id==persona_id].groupby(['prompt_id']).tail(2).groupby(['prompt_id']).head(1)['response'].values.tolist()
    cr = compression_ratio(responses, 'gzip')
    nds = ngram_diversity_score(responses, 4)
    crs.append(cr)
    ndss.append(nds)
print(f"CR: {np.round(np.mean(cr),2)} +- {np.round(np.std(crs),2)}, NDS: {np.round(np.mean(ndss),2)} +- {np.round(np.std(ndss), 2)}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 44.11it/s]

CR: 2.51 +- 0.1, NDS: 2.79 +- 0.09





In [75]:
crs_pos = []
for persona_id in tqdm(range(100)):
    responses = llama8b.loc[llama8b.persona_id==persona_id].drop_duplicates(subset=['prompt_id'])['response'].values.tolist()
    joined_pos, tuples = get_pos(responses)
    cr = compression_ratio(joined_pos, 'gzip')
    crs_pos.append(cr)
print(f"CR-POS: {np.round(np.mean(cr),2)} +- {np.round(np.std(crs),2)}")

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [04:59<00:00,  2.99s/it]

CR-POS: 5.14 +- 0.1





## See examples of most common POS patterns

In [76]:
unique_responses = llama8b.drop_duplicates(subset=['persona_id', 'prompt_id'])['response'].values.tolist()

n = 5 
top_n = 100
patterns = extract_patterns(unique_responses, n, top_n)

In [77]:
patterns

{'DT NN IN DT NN': {'a diamond in the eye',
  'a master of the drill',
  'the weight of the universe',
  'the middle of the night',
  'the absurdity of a system',
  'the cream of the crop',
  'the likelihood of an attack',
  'a copy of an object',
  'the probability of an event',
  'the behavior of the dog',
  'the land of the midnight',
  'the pinnacle of the sport',
  'the content of this song',
  'a hallmark As a game',
  'a feel for the lay',
  'the midst of a match',
  'the din of the coffee',
  'the star of the show',
  'a cuppa on the hotel',
  'a reference to the concept',
  'a reflection of the soul',
  'the complexity of the issue',
  'the rhythm of the flow',
  'The energy in the stadium',
  'A reflection of the world',
  'the heart of this phenomenon',
  'the heart of a quaint',
  'the size of the maze',
  'the midst of an interview',
  'a reconnaissance of the city',
  'a departure from the fantasy',
  'a spacesuit with a life',
  'a parent to a child',
  'the face of a gr

In [None]:
len_dict = {'human': [len(x) for x in human_responses], 'llama8b': [len(x) for x in unique_responses]}
sns.kdeplot(len_dict)

## llama-1b

In [44]:
llama1b = pd.read_csv('../output/Llama-3.2-1B-Instruct_dolly_output.tsv', sep='\t')
llama1b['persona_id'] = persona_ids
llama1b['prompt_id'] = prompt_ids

In [45]:
unique_ll_responses = llama1b.drop_duplicates(subset=['persona_id', 'prompt_id'])['response'].values.tolist()

In [46]:
cr = compression_ratio(unique_ll_responses, 'gzip')
nds = ngram_diversity_score(unique_ll_responses, 4)
print(f"CR: {cr}, NDS: {nds}")

CR: 2.768, NDS: 1.701


In [48]:
joined_pos, tuples = get_pos(unique_ll_responses)
ngrams_pos = token_patterns(joined_pos, 5, 10)
ngrams_pos

[('DT NN IN DT NN', 4731),
 ('DT JJ NN IN DT', 4469),
 ('NN IN DT JJ NN', 4005),
 ('NN NN NN NN NN', 3633),
 ('IN DT JJ NN ,', 3555),
 ('IN DT JJ NN IN', 3479),
 ('JJ NN IN DT NN', 3139),
 ('IN DT NN IN DT', 3107),
 ('DT JJ NN IN NN', 2976),
 ('NN , PRP MD VB', 2942)]

In [49]:
cr = compression_ratio(joined_pos, 'gzip')
cr

5.537