In [31]:
import pandas as pd
import random
import numpy as np
from tqdm import tqdm
import ipdb
import re
from datasets import load_dataset
import os
from diversity import compression_ratio, homogenization_score, ngram_diversity_score, extract_patterns, get_pos, pos_patterns, token_patterns, self_repetition_score
import json
from collections import Counter
from random import shuffle, randint

tqdm.pandas()

import matplotlib.pyplot as plt
# import mplcursors
import seaborn as sns
%matplotlib inline
sns.set(style='darkgrid', context='notebook', rc={'figure.figsize':(14,10)}, font_scale=2)

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('chained_assignment',None)

# Set random seeds for reproducibility on a specific machine
random.seed(1)
np.random.seed(1)
np.random.RandomState(1)
np.set_printoptions(precision=3)

## Dolly human written responses diversity analysis

In [2]:
with open('../data/sample_personas.txt', 'r') as f:
    personas = [x.strip() for x in f.readlines()]
personas[2]

'a Spanish-speaking patient with severe myopia interested in LASIK eye surgery'

In [3]:
dolly = load_dataset("databricks/databricks-dolly-15k")["train"].filter(lambda row: row['category']=='creative_writing').to_pandas()
sample = pd.read_csv('../data/dolly_creative_prompts_sample.tsv', sep='\t')

In [4]:
sample.head()

Unnamed: 0,index,instruction,num_tokens,num_tokens_round,num_words_round
0,525,Please propose an argument to convince my mother that she increases the amount of money that I get every month during my studies. I think the current amount is too low.,211,210,180
1,172,"Write a paragraph to refute a claim by a colleague that ancient structures such as Stonehenge, the Great Pyramid are evidence of UFO activities on Earth",189,190,160
2,109,What is the best way to answer an interview question?,128,130,100
3,37,Write the first paragraph of an advertising brochure describing an hotel nearby the Annecy Lake in France,89,90,60
4,295,"Write an intro to a meetup about music, medicine, and machines",313,310,220


In [5]:
sample['response'] = sample['index'].apply(lambda x: dolly.loc[x, 'response'])
sample['prompt_id'] = [i for i in range(len(sample))]
sample.head()

Unnamed: 0,index,instruction,num_tokens,num_tokens_round,num_words_round,response,prompt_id
0,525,Please propose an argument to convince my mother that she increases the amount of money that I get every month during my studies. I think the current amount is too low.,211,210,180,"Dear Mum, I would like to talk to you about the monthly money I get for my studies. I am very grateful that you support me financially during my studies. At the same time, I feel that the current amount is a little bit too low in order to sustain myself. I estimated my monthly expenses and it seems like the money is not enough. We can go through the details if you want. I want to ask you if you can increase the amount so that I can cover my expenses. I understand if you cannot do this or have other reasons against this. However, I want you to know that if you do not increase my monthly money, I will be forced to work part-time next to my studies. There is research that shows that students who work part-time have worse grades than people who do not have to work part-time to afford their studies. I would love to dedicate my full time to my studies to get the best grades and learning outcomes possible; thus, I would prefer not to work part-time. What do you think?",0
1,172,"Write a paragraph to refute a claim by a colleague that ancient structures such as Stonehenge, the Great Pyramid are evidence of UFO activities on Earth",189,190,160,"There are several strong arguments against concluding that alien races built the structures like the Stonehenge or the Pyramids. First, there is evidence that these magnificent structures were built by humans. For example, tombs of the pyramid builders were found in the vicinity of the Great Pyramind (with human remains!). The technology to move and assemble the massive stones of the Stonehenge has been shown to be available to humans at that time. Second, given that space is vast the chance of finding one tiny planet among billions is infinitesimal. If alien intelligence had indeed found our one inhabited planet they are unlikely to have just created couple of structures and avoided any further contact with Earth and its inhabitants. In science, Occam’s Razor (“the best explanation is the simplest one”) is a guiding principle and until there is much more evidence of alien visitation it would be inadvisable to jump to a conclusion that aliens built these structures.",1
2,109,What is the best way to answer an interview question?,128,130,100,"The first recommended step is to ask clarifying questions to ensure you understand the question correctly and gather the requirements needed to answer effectively. If it is a mathematical or computer science question, go ahead and work your way through the problem while you verbalize your thought process along the way. If it is a situation question, you should use the STAR method. 1. Situation - set the scene and provide context, 2. Task - describe what was needed to be done and why, 3. Action - explain what action you took, and 4. Result - describe the result of your actions and what you learned from the experience",2
3,37,Write the first paragraph of an advertising brochure describing an hotel nearby the Annecy Lake in France,89,90,60,"This fantastic hotel is ideally located, minutes away from the Annecy Lake, one of the purest in Europe. The water temperature reaches 27 degrees celcius at the peak season, allowing the guests to swim and enjoy several water activities including wake surf and pedalo. Finally, this region is known for being the home of several cheese specialties like ""Raclette"" and ""Tartiflette"" to name a few.",3
4,295,"Write an intro to a meetup about music, medicine, and machines",313,310,220,"Music, Medicine, and Machines\n\nJoin us for an imaginative, authentic, and offbeat networking experience that will advance your knowledge of the technologies that impact the health and well-being of our human condition. To make things interesting (but keep things real), our meetup incorporates an element of music to remind us, first and foremost, we want to advance technology for the greater good, but we don’t want to become robots ourselves. Instead, we believe that “music is medicine for the soul” so our events will use a dose of music to unite us and to keep things fun and grounded in our humanity. We’ll explore hot and emerging technologies such as:\n\nMachine Learning and all things AI\nComputer and Machine Vision\nTelemedicine\nGenomics\nAR/VR/MR \nRobotics\nCloud \nDevOps, CI/CD, and Robotic Process Automation (RPA)\nInfrastructure as Code (IaC) \nChatbots\nWearable Tech\n3D Printing\nBlockchain\nAnd many more\n\nWe’ll talk about how these disruptive technologies improve Health & Life Sciences and discuss the tenuous balance of innovation + opportunities vs privacy, security, open data, regulations, etc. We’ll network and get to know each other to explore how each of us can get involved to ensure “the machines” benefit the communities we serve. We encourage attendees such as developers, clinicians, researchers, industry experts, students, educators, industry analysts, regulators, investors, startups, musicians, and all those willing to contribute meaningfully to our mission.",4


In [6]:
prompts = sample['instruction'].values.tolist()

In [7]:
human_responses = sample['response'].values.tolist()

In [8]:
cr = compression_ratio(human_responses, 'gzip')
nds = ngram_diversity_score(human_responses, 4)
joined_pos, tuples = get_pos(human_responses)
# ngrams_pos = token_patterns(joined_pos, 5, 10)
cr_pos = compression_ratio(joined_pos, 'gzip')

srep = self_repetition_score(human_responses, verbose=True)

print(f"CR: {np.round(cr,2)}\nNDS: {np.round(nds,2)}\nCR-POS: {np.round(cr_pos,2)}\nSelf-rep: {np.round(srep, 2)}")

Calculating self-repetition score: 100%|███████████████████████████████████████████████████████| 100/100 [00:00<00:00, 86213.85it/s]

CR: 2.51
NDS: 3.03
CR-POS: 4.93
Self-rep: 0.55





In [9]:
rouge = homogenization_score(human_responses, 'rougel', verbose=True)
print(rouge)
bleu = homogenization_score(human_responses, 'bleu', verbose=True)
print(bleu)
# bertscore = homogenization_score(human_responses, 'bertscore', verbose=True)
# print(f"HS-RougeL: {np.round(rouge,2)}\nself-bleu: {np.round(bleu,2)}\nHS-bert: {np.round(bertscore,2)}")

==> Scoring all pairs


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:15<00:00,  6.29it/s]


0.096
==> Scoring all pairs


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:00<00:00,  1.65it/s]

0.0





## Main metric calculation function (can find mean and SD over persona column)

In [9]:
def calc_cr_nds_over_personas(df):
    crs = []
    ndss = []
    crs_pos = []
    sreps = []
    for persona_id in tqdm(df.persona_id.unique()):
        responses = df.loc[df.persona_id==persona_id].drop_duplicates(subset=['prompt_id'])['response'].values.tolist()
        cr = compression_ratio(responses, 'gzip')
        nds = ngram_diversity_score(responses, 4)
        #CR-POS
        joined_pos, tuples = get_pos(responses)
        # ngrams_pos = token_patterns(joined_pos, 5, 10)
        cr_pos = compression_ratio(joined_pos, 'gzip')
        srep = self_repetition_score(responses, verbose=False)
        crs.append(cr)
        ndss.append(nds)
        crs_pos.append(cr_pos)
        sreps.append(srep)
    print(f"CR: {np.round(np.mean(crs),2)} ± {np.round(np.std(crs),2)}\nCR-POS: {np.round(np.mean(crs_pos),2)} ± {np.round(np.std(crs_pos),2)}\nNDS: {np.round(np.mean(ndss),2)} ± {np.round(np.std(ndss),2)}\nSelf-rep:{np.round(np.mean(sreps),2)} ± {np.round(np.std(sreps),2)}")

In [20]:
def calc_hom_over_personas(df):
    bss = []
    bleus = []
    rls = []
    for persona_id in tqdm(df.persona_id.unique()):
        responses = df.loc[df.persona_id==persona_id].drop_duplicates(subset=['prompt_id'])['response'].values.tolist()
        
        bleu = homogenization_score(responses, 'bleu', verbose=False)
        bleus.append(bleu)

        # bs = homogenization_score(responses, 'bertscore', verbose=False)
        # bss.append(bs)

        rl = homogenization_score(responses, 'rougel', verbose=False)
        rls.append(rl)
        
    print(f"Hom-bleu: {np.round(np.mean(bleus),2)} ± {np.round(np.std(bleus),2)}\nHom-RL:: {np.round(np.mean(rls),2)} ± {np.round(np.std(rls),2)}")

In [11]:
# No persona, no cutoff
npnc = pd.read_csv('../output/deepseek-np/DeepSeek-V3_dolly_output.tsv', sep='\t')
npnc['response'] = npnc.response.apply(lambda x: x.strip())
npnc['prompt_id'] = [i for i in range(len(prompts))]
npnc['persona_id'] = [-1 for i in range(len(prompts))]
calc_cr_nds_over_personas(npnc)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.54s/it]

CR: 2.57 ± 0.0
CR-POS: 5.7 ± 0.0
NDS: 3.02 ± 0.0
Self-rep:1.23 ± 0.0





In [14]:
calc_hom_over_personas(npnc)

==> Scoring all pairs


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:05<00:00,  1.53it/s]

==> Scoring all pairs



100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:26<00:00,  3.72it/s]


Hom-bleu: 0.02 ± 0.0
Hom-RL:: 0.1 ± 0.0


In [16]:
# No persona with cutoff
npc = pd.read_csv('../output/deepseek-np-cutoff/DeepSeek-V3_dolly_output.tsv', sep='\t')
npc['response'] = npc.response.apply(lambda x: x.strip())
npc['prompt_id'] = [i for i in range(len(prompts))]
npc['persona_id'] = [-1 for i in range(len(prompts))]
calc_cr_nds_over_personas(npc)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.17s/it]

CR: 2.45 ± 0.0
CR-POS: 5.08 ± 0.0
NDS: 3.22 ± 0.0
Self-rep:0.27 ± 0.0





In [17]:
calc_hom_over_personas(npc)

==> Scoring all pairs


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:00<00:00,  1.65it/s]

==> Scoring all pairs



100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:12<00:00,  7.82it/s]

Hom-bleu: 0.0 ± 0.0
Hom-RL:: 0.08 ± 0.0





In [18]:
# Persona plus cutoff
pc = pd.read_csv('../output/deepseek-cutoff-persona/DeepSeek-V3_dolly_output.tsv', sep='\t')
pc['response'] = pc.response.apply(lambda x: x.strip())
pc = pc.drop_duplicates(subset=['prompt_id', 'persona_id'])
calc_cr_nds_over_personas(pc)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [09:17<00:00,  5.58s/it]

CR: 2.56 ± 0.08
CR-POS: 5.04 ± 0.24
NDS: 3.09 ± 0.06
Self-rep:1.8 ± 0.72





In [21]:
calc_hom_over_personas(pc)

Process SpawnPoolWorker-759:                                                                      | 2/100 [02:24<1:57:30, 71.94s/it]
Process SpawnPoolWorker-755:
Process SpawnPoolWorker-757:
Process SpawnPoolWorker-753:
Process SpawnPoolWorker-754:
Process SpawnPoolWorker-758:
Process SpawnPoolWorker-756:
Process SpawnPoolWorker-752:                                                                      | 2/100 [02:29<2:02:17, 74.87s/it]
Process SpawnPoolWorker-751:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/venkat/micromamba/envs/diversity/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/venkat/micromamba/envs/diversity/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)


KeyboardInterrupt: 

Interrupt
  File "/Users/venkat/micromamba/envs/diversity/lib/python3.11/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "/Users/venkat/micromamba/envs/diversity/lib/python3.11/multiprocessing/pool.py", line 48, in mapstar
    return list(map(*args))
           ^^^^^^^^^^^^^^^^
  File "/Users/venkat/micromamba/envs/diversity/lib/python3.11/site-packages/diversity/homogenization.py", line 64, in wrapped_score
    return _calculate_score(*args)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/venkat/micromamba/envs/diversity/lib/python3.11/site-packages/diversity/utils/memoize.py", line 23, in __call__
    value = self.func(*args)
            ^^^^^^^^^^^^^^^^
  File "/Users/venkat/micromamba/envs/diversity/lib/python3.11/site-packages/diversity/homogenization.py", line 82, in _calculate_score
    score = scorer.compute(predictions=[pair[0]],
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Us

In [40]:
pc.shape

(9996, 3)

## Coarse persona

In [17]:
# Persona plus cutoff
coarse_pc70 = pd.read_csv('../output/coarse/llama-cutoff-persona/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
coarse_pc70['response'] = coarse_pc70.response.apply(lambda x: x.strip())
coarse_pc70 = coarse_pc70.drop_duplicates(subset=['prompt_id', 'persona_id'])
calc_cr_nds_over_personas(coarse_pc70)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [09:08<00:00,  5.48s/it]

CR: 2.65 ± 0.06
CR-POS: 5.18 ± 0.15
NDS: 2.96 ± 0.06
Self-rep:2.27 ± 0.72





In [15]:
coarse_pc70[coarse_pc70.response.str.len()<8].shape

(0, 3)

In [16]:
coarse_pc70.persona_id.unique()

array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
       '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34',
       '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45',
       '46', '47', 'persona_id', '48', '49', '50', '51', '52', '53', '54',
       '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65',
       '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76',
       '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87',
       '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98',
       '99'], dtype=object)

## What if we calculate the metrics with each prompt answered by a different persona?

In [8]:
def calc_cr_nds_over_personas(df):
    crs = []
    ndss = []
    crs_pos = []
    sreps = []
    for persona_id in tqdm(df.persona_id.unique()):
        responses = df.loc[df.persona_id==persona_id].drop_duplicates(subset=['prompt_id'])['response'].values.tolist()
        cr = compression_ratio(responses, 'gzip')
        nds = ngram_diversity_score(responses, 4)
        #CR-POS
        joined_pos, tuples = get_pos(responses)
        # ngrams_pos = token_patterns(joined_pos, 5, 10)
        cr_pos = compression_ratio(joined_pos, 'gzip')
        srep = self_repetition_score(responses, verbose=False)
        crs.append(cr)
        ndss.append(nds)
        crs_pos.append(cr_pos)
        sreps.append(srep)
    print(f"CR: {np.round(np.mean(crs),2)} ± {np.round(np.std(crs),2)}\nCR-POS: {np.round(np.mean(crs_pos),2)} ± {np.round(np.std(crs_pos),2)}\nNDS: {np.round(np.mean(ndss),2)} ± {np.round(np.std(ndss),2)}\nSelf-rep:{np.round(np.mean(sreps),2)} ± {np.round(np.std(sreps),2)}")

In [9]:
pc = pd.read_csv('../output/deepseek-cutoff-persona/DeepSeek-V3_dolly_output.tsv', sep='\t')
pc['response'] = pc.response.apply(lambda x: x.strip())
pc = pc.drop_duplicates(subset=['prompt_id', 'persona_id'])
pc.shape

(9996, 3)

In [14]:
newpc = pc.set_index(['persona_id', 'prompt_id'])


response    "Mom, I really appreciate everything you’re doing to support my studies—it means so much to me. I want to propose that we revisit the amount you’re sending me each month because, with rising costs, it’s becoming challenging to manage everything. I’m fully committed to my studies and my future, and I know this investment will pay off. I’ve been tracking my expenses carefully, and things like textbooks, software, and even basic living costs have gone up significantly. I’m not asking for more than necessary—just enough to ensure I can focus on my education without unnecessary stress. \n\nThink of it this way: by increasing my budget slightly, you’re helping me avoid distractions like taking on additional part-time work that could take away from my studies. I’m dedicated to making the most of this opportunity and giving myself the best shot at a secure and successful career. Your support now is an investment in my future, and I’m committed to making you proud. Let’s work togeth

In [28]:
def calc_cr_nds_sr(responses):
    cr = compression_ratio(responses, 'gzip')
    nds = ngram_diversity_score(responses, 4)
    #CR-POS
    joined_pos, tuples = get_pos(responses)
    # ngrams_pos = token_patterns(joined_pos, 5, 10)
    cr_pos = compression_ratio(joined_pos, 'gzip')
    srep = self_repetition_score(responses, verbose=False)
    return cr, cr_pos, nds, srep
    # print(f"CR: {np.round(np.mean(cr),2)} \nCR-POS: {np.round(np.mean(cr_pos),2)}\nNDS: {np.round(np.mean(nds),2)}\nSelf-rep: {np.round(np.mean(srep),2)}")

In [29]:
# What if every prompt was answered by a different persona?
crs = []
ndss = []
crs_pos = []
sreps = []

for _ in tqdm(range(10)):
    # Get random personas paired with every prompt
    persona_ids_shuffled = [i for i in range(100)]
    shuffle(persona_ids_shuffled)
    prompt_ids = [i for i in range(100)]
    pairs = list(zip(persona_ids_shuffled, prompt_ids))
    responses = newpc.loc[pairs, 'response'].values.tolist()
    
    # Calculate metrics
    cr, cr_pos, nds, srep = calc_cr_nds_sr(responses)

    crs.append(cr)
    ndss.append(nds)
    crs_pos.append(cr_pos)
    sreps.append(srep)

print(f"CR: {np.round(np.mean(crs),2)} ± {np.round(np.std(crs),2)}\nCR-POS: {np.round(np.mean(crs_pos),2)} ± {np.round(np.std(crs_pos),2)}\nNDS: {np.round(np.mean(ndss),2)} ± {np.round(np.std(ndss),2)}\nSelf-rep:{np.round(np.mean(sreps),2)} ± {np.round(np.std(sreps),2)}")

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:53<00:00,  5.39s/it]

CR: 2.42 ± 0.01
CR-POS: 4.95 ± 0.04
NDS: 3.21 ± 0.02
Self-rep:0.32 ± 0.06





In [33]:
# What if there are 10 different personas that answered all 100 prompts?

crs = []
ndss = []
crs_pos = []
sreps = []

for _ in tqdm(range(10)):
    # Get random personas paired with every prompt
    persona_ids_shuffled = [randint(0,99) for _ in range(10)]*10
    shuffle(persona_ids_shuffled)
    prompt_ids = [i for i in range(100)]
    pairs = list(zip(persona_ids_shuffled, prompt_ids))
    responses = newpc.loc[pairs, 'response'].values.tolist()
    
    # Calculate metrics
    cr, cr_pos, nds, srep = calc_cr_nds_sr(responses)

    crs.append(cr)
    ndss.append(nds)
    crs_pos.append(cr_pos)
    sreps.append(srep)

print(f"CR: {np.round(np.mean(crs),2)} ± {np.round(np.std(crs),2)}\nCR-POS: {np.round(np.mean(crs_pos),2)} ± {np.round(np.std(crs_pos),2)}\nNDS: {np.round(np.mean(ndss),2)} ± {np.round(np.std(ndss),2)}\nSelf-rep:{np.round(np.mean(sreps),2)} ± {np.round(np.std(sreps),2)}")


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:53<00:00,  5.35s/it]

CR: 2.45 ± 0.04
CR-POS: 4.99 ± 0.08
NDS: 3.18 ± 0.02
Self-rep:0.83 ± 0.22



