In [6]:
import pandas as pd
import random
import numpy as np
from tqdm import tqdm
import ipdb
import re
from datasets import load_dataset
import os
from diversity import compression_ratio, ngram_diversity_score, extract_patterns, get_pos, pos_patterns, token_patterns, self_repetition_score
import json
from collections import Counter
from random import shuffle
import markdown
from bs4 import BeautifulSoup
import emoji

tqdm.pandas()

import matplotlib.pyplot as plt
# import mplcursors
import seaborn as sns
%matplotlib inline
sns.set(style='darkgrid', context='notebook', rc={'figure.figsize':(14,10)}, font_scale=2)

# Display options: show up to 100 rows/columns and never truncate cell text,
# so long model responses can be read in full when a frame is displayed.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
# Turn off the chained-assignment check for the whole notebook.
pd.set_option('chained_assignment',None)

# Set random seeds for reproducibility on a specific machine
random.seed(1)
np.random.seed(1)
# NOTE: a bare `np.random.RandomState(1)` is intentionally not called here.
# Constructing a RandomState without keeping the object seeds nothing; the
# global NumPy generator is already seeded by `np.random.seed(1)` above.
np.set_printoptions(precision=3)

In [2]:
def calc_cr_nds_sr(responses):
    '''
    Computes the diversity metrics for one collection of responses.

    Returns a 4-tuple, in this order:
      - compression_ratio(responses, 'gzip')
      - compression_ratio of the first value returned by get_pos(responses)
        (the CR-POS metric), also with 'gzip'
      - ngram_diversity_score(responses, 4)
      - self_repetition_score(responses, verbose=False)
    '''
    text_cr = compression_ratio(responses, 'gzip')
    ngram_score = ngram_diversity_score(responses, 4)

    # CR-POS: only the first element returned by get_pos is needed here.
    pos_sequences, _unused_tuples = get_pos(responses)
    pos_cr = compression_ratio(pos_sequences, 'gzip')

    self_rep = self_repetition_score(responses, verbose=False)
    return text_cr, pos_cr, ngram_score, self_rep

def calc_diversity(df, num_shuffles=10, num_pairs=100):
    '''
    Randomly assigns personas to prompts, calculates metrics over the responses
    for these pairings, then prints the mean and S.D. of each metric over
    `num_shuffles` different random pairings.

    Args:
        df: DataFrame with `persona_id`, `prompt_id` and `response` columns.
            Every (persona_id, prompt_id) pair drawn below must be present;
            callers are expected to have de-duplicated on that pair so each
            lookup yields a single response.
        num_shuffles: number of random persona/prompt pairings to evaluate.
        num_pairs: number of personas and prompts; ids are taken to be
            0..num_pairs-1 for both. Defaults to 100.

    Returns:
        None. The summary is printed.

    Note:
        Re-seeds the global `random` module with 1 on every call, so repeated
        calls evaluate the same sequence of pairings.
    '''
    random.seed(1)
    crs = []
    ndss = []
    crs_pos = []
    sreps = []
    # Index once so each pairing is a plain MultiIndex lookup.
    new_df = df.set_index(['persona_id', 'prompt_id'])
    prompt_ids = list(range(num_pairs))
    for _ in tqdm(range(num_shuffles)):
        # Get random personas paired with every prompt. The persona list is
        # rebuilt in sorted order before each shuffle so that the pairings
        # depend only on the seed and the iteration number.
        persona_ids_shuffled = list(range(num_pairs))
        random.shuffle(persona_ids_shuffled)
        pairs = list(zip(persona_ids_shuffled, prompt_ids))
        responses = new_df.loc[pairs, 'response'].values.tolist()

        # Calculate metrics
        cr, cr_pos, nds, srep = calc_cr_nds_sr(responses)

        crs.append(cr)
        ndss.append(nds)
        crs_pos.append(cr_pos)
        sreps.append(srep)

    print(f"CR: {np.round(np.mean(crs),2)} ± {np.round(np.std(crs),2)}\nCR-POS: {np.round(np.mean(crs_pos),2)} ± {np.round(np.std(crs_pos),2)}\nNDS: {np.round(np.mean(ndss),2)} ± {np.round(np.std(ndss),2)}\nSelf-rep:{np.round(np.mean(sreps),2)} ± {np.round(np.std(sreps),2)}")

## Dolly human responses

In [3]:
# Persona descriptions, one per line.
with open('../data/sample_personas.txt', 'r') as persona_file:
    personas = [line.strip() for line in persona_file]

# Creative-writing subset of Dolly, as a DataFrame.
dolly = (
    load_dataset("databricks/databricks-dolly-15k")["train"]
    .filter(lambda row: row['category']=='creative_writing')
    .to_pandas()
)

# Sampled prompts; `index` is used as a row label into `dolly` to fetch the
# human-written response for each sampled prompt.
sample = pd.read_csv('../data/dolly_creative_prompts_sample.tsv', sep='\t')
sample['response'] = sample['index'].apply(lambda row_label: dolly.loc[row_label, 'response'])
sample['prompt_id'] = list(range(len(sample)))
prompts = sample['instruction'].values.tolist()

In [None]:
# No persona, no cutoff
# Load, strip surrounding whitespace from every response, and keep the first
# row for each (prompt_id, persona_id) pair.
np70_df = (
    pd.read_csv('../output/llama-temp0.7/llama70b-np/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
cr, cr_pos, nds, srep = calc_cr_nds_sr(np70_df['response'].values.tolist())
print(f"CR: {np.round(cr,2)}\nCR-POS: {np.round(cr_pos,2)}\nNDS: {np.round(nds,2)}\nSelf-rep: {np.round(srep, 2)}")

In [None]:
# No persona with cutoff
# Load, strip surrounding whitespace from every response, and keep the first
# row for each (prompt_id, persona_id) pair.
npc70_df = (
    pd.read_csv('../output/llama-temp0.7/llama70b-cutoff-np/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
cr, cr_pos, nds, srep = calc_cr_nds_sr(npc70_df['response'].values.tolist())
print(f"CR: {np.round(cr,2)}\nCR-POS: {np.round(cr_pos,2)}\nNDS: {np.round(nds,2)}\nSelf-rep: {np.round(srep, 2)}")

In [None]:
# Persona, no cutoff: one de-duplicated row per (prompt_id, persona_id),
# scored over 100 random persona/prompt pairings.
persona70_df = (
    pd.read_csv('../output/llama-temp0.7/llama70b-persona/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(persona70_df, num_shuffles=100)

In [None]:
# Persona plus cutoff
personac70_df = pd.read_csv('../output/llama-temp0.7/llama70b-cutoff-persona/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
personac70_df['response'] = personac70_df.response.apply(lambda x: x.strip())
# Keep one row per (prompt_id, persona_id), as every other persona cell does.
# calc_diversity looks responses up by that pair, so a duplicated pair would
# contribute more than one response to a pairing and skew the metrics.
personac70_df = personac70_df.drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
calc_diversity(personac70_df, num_shuffles=100)

In [None]:
# Coarse personas plus cutoff
# One de-duplicated row per (prompt_id, persona_id), scored over 100 random
# persona/prompt pairings.
persona70_df_coarse = (
    pd.read_csv('../output/coarse/llama-cutoff-persona/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(persona70_df_coarse, num_shuffles=100)

## DeepSeek

In [41]:
def unformat(text):
    '''
    Reduces a Markdown-formatted response to a single line of plain text.

    The stripped text is rendered to HTML with `markdown`, the text content is
    pulled back out with BeautifulSoup, emoji are removed, and every newline
    is replaced by a space.
    '''
    rendered_html = markdown.markdown(text.strip())
    plain_text = BeautifulSoup(rendered_html, features='html.parser').get_text()
    without_emoji = emoji.replace_emoji(plain_text, replace='')
    return without_emoji.replace('\n', ' ')

In [42]:
# No persona, no cutoff
# Responses are passed through `unformat` before scoring.
deep_npnc = (
    pd.read_csv('../output/deepseek-np/DeepSeek-V3_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(unformat))
)
cr, cr_pos, nds, srep = calc_cr_nds_sr(deep_npnc['response'].values.tolist())
print(f"CR: {np.round(cr,2)}\nCR-POS: {np.round(cr_pos,2)}\nNDS: {np.round(nds,2)}\nSelf-rep: {np.round(srep, 2)}")

CR: 2.32
CR-POS: 5.07
NDS: 3.13
Self-rep: 0.85


In [43]:
# No persona, cutoff
# Responses are passed through `unformat` before scoring.
deep_npc = (
    pd.read_csv('../output/deepseek-np-cutoff/DeepSeek-V3_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(unformat))
)
cr, cr_pos, nds, srep = calc_cr_nds_sr(deep_npc['response'].values.tolist())
print(f"CR: {np.round(cr,2)}\nCR-POS: {np.round(cr_pos,2)}\nNDS: {np.round(nds,2)}\nSelf-rep: {np.round(srep, 2)}")

CR: 2.28
CR-POS: 4.74
NDS: 3.27
Self-rep: 0.11


In [11]:
# Persona cutoff
# Unformat every response, keep one row per (prompt_id, persona_id), then
# score over 100 random persona/prompt pairings.
deep_pc = (
    pd.read_csv('../output/deepseek-cutoff-persona/DeepSeek-V3_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(unformat))
    .drop_duplicates(subset=['prompt_id', 'persona_id'])
)
calc_diversity(deep_pc, 100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [09:17<00:00,  5.57s/it]

CR: 2.2 ± 0.02
CR-POS: 4.64 ± 0.03
NDS: 3.4 ± 0.01
Self-rep:0.09 ± 0.04





In [12]:
# Coarse persona cutoff
# Unformat every response, keep one row per (prompt_id, persona_id), then
# score over 100 random persona/prompt pairings.
deep_pc_coarse = (
    pd.read_csv('../output/coarse/deepseek-cutoff-persona/DeepSeek-V3_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(unformat))
    .drop_duplicates(subset=['prompt_id', 'persona_id'])
)
calc_diversity(deep_pc_coarse,100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [09:18<00:00,  5.59s/it]

CR: 2.21 ± 0.02
CR-POS: 4.68 ± 0.03
NDS: 3.39 ± 0.02
Self-rep:0.09 ± 0.04





In [35]:
# Draw a random (prompt index, persona index) pair to inspect by hand.
# randrange excludes its upper bound, so both values are always valid list
# indices; randint(0, 100) includes 100, which is past the end of a 100-item list.
random.randrange(len(prompts)), random.randrange(len(personas))

(39, 96)

In [36]:
# Show the prompt and persona text for the pair drawn in the previous cell
# (indices are hard-coded from that cell's recorded output).
prompts[39], personas[96]

('How to stay calm in stressful situations ?',
 'a savvy talent agent specializing in representing independent film directors')

In [None]:
# Build a long-form table with one completion length (in characters) per row
# and the source it came from, then plot one density curve per source.
len_records = {'len': [], 'source': []}

# Load all the human responses first.
human_lens = sample['response'].apply(len).values.tolist()
len_records['len'] += human_lens
len_records['source'] += ['Human responses'] * len(human_lens)

for (df, source_name) in [(deep_npc,'No persona+cutoff'), (deep_pc, 'Persona+cutoff'), (deep_pc_coarse, 'Coarse Persona+cutoff')]:
    if df.shape[0]>100:
        # More rows than the 100-row threshold: collapse to one value per
        # prompt by averaging lengths. The `len` column is left on `df`
        # because a later cell reads it.
        df['len'] = df.response.apply(len)
        source_lens = df.loc[:, ['prompt_id', 'len']].groupby('prompt_id').mean()['len'].values.tolist()
    else:
        source_lens = df['response'].apply(len).values.tolist()
    len_records['len'] += source_lens
    # Label exactly as many rows as were just added, so the two columns stay
    # the same length even when a source does not contribute exactly 100 values.
    len_records['source'] += [source_name] * len(source_lens)

len_df = pd.DataFrame(len_records)

# common_norm=False normalises each source's density independently.
g = sns.kdeplot(len_df, x='len', hue='source', common_norm=False, fill=True, clip=[0,6000])
g.set_xlabel('Completion length (chars)')
g.set_ylabel('Density')
g.spines["top"].set_visible(False)
g.spines["right"].set_visible(False)
g.spines["left"].set_visible(False)
g.set(yticklabels=[])
g.grid(axis='x')
# plt.legend([], [], frameon=False)
plt.show()

The above isn't right, though: it makes the density look higher because I have more responses in the persona case 🤔

In [None]:
# Inspect the length table built for the density plot (columns: len, source).
len_df

In [None]:
# Mean completion length per prompt for the persona+cutoff run. Relies on the
# `len` column that the plotting cell adds to `deep_pc`.
deep_pc.groupby('prompt_id')['len'].mean().tolist()

In [37]:
# No persona, no cutoff: response to the sampled prompt (id 39).
deep_npnc.query('prompt_id == 39')

Unnamed: 0,prompt_id,persona_id,response
39,39,-1,"Staying calm in stressful situations is a skill that can be developed with practice. Here are some effective strategies to help you manage stress and stay composed: \n1. Pause and Breathe\n\nTake deep breaths: Slow, deep breathing activates your parasympathetic nervous system, reducing stress. Try the 4-7-8 technique: Inhale for 4 seconds, hold for 7, exhale for 8. \nCount to 10: A brief pause gives your brain time to reset before reacting. \n\n2. Ground Yourself in the Present\n\nUse the 5-4-3-2-1 technique: Name 5 things you see, 4 things you feel, 3 things you hear, 2 things you smell, and 1 thing you taste to bring your focus back to the present. \nMantras/Affirmations: Repeat a calming phrase like, ""This is temporary, I can handle this."" \n\n3. Reframe Your Thoughts\n\nChallenge negative thinking: Ask yourself, ""Is this as bad as it seems?"" or ""What’s the worst that can happen?"" \nFocus on solutions, not problems: Shift from ""This is too much"" to ""What’s the first step I can take?"" \n\n4. Control What You Can\n\nMake a quick action plan: Breaking tasks into smaller steps makes stress manageable. \nAccept what you can’t control: Some things are out of your hands—focus on reactions, not outcomes. \n\n5. Use Physical Calming Techniques\n\nRelax your muscles: Tense and release each muscle group (progressive muscle relaxation). \nSplash cold water on your face: Triggers the ""dive reflex,"" slowing your heart rate. \nMove your body: A short walk or stretching can reduce stress hormones. \n\n6. Shift Perspective\n\nAsk: ""Will this matter in 5 years?"" Often, stress feels bigger than it really is. \nHumor: Lightening the mood with a joke (even internally) can reduce tension. \n\n7. Prepare for Future Stress\n\nPractice mindfulness/meditation: Regular practice builds resilience. \nTrain under pressure: Simulate stressful situations to improve response (e.g., public speaking practice). \n\n8. 
Seek Support (If Needed)\n\nTalk to someone: Verbalizing stress can provide relief and new solutions. \nStep away if possible: A brief break (even 2 minutes) can help reset emotions. \n\nQuick Reminder\nStress is a reaction—not a life sentence. The calmer you stay, the better decisions you’ll make. \nWould you like specific tips for a particular situation (e.g., work, exams, conflicts)?"


In [38]:
# No persona, cutoff: response to the sampled prompt (id 39).
deep_npc.query('prompt_id == 39')

Unnamed: 0,prompt_id,persona_id,response
39,39,-1,"To stay calm in stressful situations:\n1. Breathe deeply to slow your heart rate.\n2. Pause before reacting—take a moment to assess.\n3. Reframe thoughts—focus on solutions, not problems.\n4. Prioritize tasks—break them into manageable steps.\n5. Practice mindfulness or quick meditation.\n6. Use positive self-talk to stay composed.\n7. Take breaks to recharge when needed. \nStaying calm improves decision-making and reduces stress."


In [39]:
# Persona + cutoff: response for the sampled prompt (39) and persona (96).
deep_pc.query('prompt_id == 39 and persona_id == 96')

Unnamed: 0,prompt_id,persona_id,response
9639,39,96,"In high-stress moments—like a blown budget or a temperamental A-lister—I ground myself with deep breaths and focus on solutions, not panic. I remind my directors (and myself) that stress is temporary; the film isn’t. Preparation is key: anticipate fires before they spark. And humor? Always. A quick laugh diffuses tension. Stay cool, stay sharp—your clarity keeps the set steady. That’s how we turn chaos into art."


In [40]:
# Coarse persona + cutoff: response for the sampled prompt (39) and persona (96).
deep_pc_coarse.query('prompt_id == 39 and persona_id == 96')

Unnamed: 0,prompt_id,persona_id,response
9638,39,96,"""Stay sharp under pressure by breathing deeply and focusing on what you can control. Visualize success—confidence is key. Prioritize tasks, delegate when possible, and trust your prep. Take quick mental breaks to reset; even 10 seconds can recharge you. Remember, stress is adrenaline—harness it to perform, not panic. Always keep your endgame in sight. Clients and deals thrive on your calm, so own it like the pro you are."""


In [None]:
# NOTE(review): `hom_bs`, `data` and `args` are not defined in any cell visible
# in this notebook, so this cell fails on a fresh kernel. It looks like it was
# pasted from a command-line script — confirm where these names come from.
print(hom_bs(data, batch_size=16, verbose=args.noverb))

In [None]:
# Score `df` with `hom_bs`: once over all responses when there is a single
# persona id, otherwise over `args.num_shuffles` random persona/prompt pairings.
# NOTE(review): `hom_bs` and `args` are not defined in any cell visible in this
# notebook, and `df` is whatever an earlier cell last bound to that name —
# confirm before running.
if df.persona_id.unique().shape[0]==1:
    data = df.response.values.tolist()
    print(hom_bs(data, batch_size=args.batch, verbose=args.noverb))
else:
    bs_scores = []
    random.seed(1)
    # Build the (persona_id, prompt_id) index and the prompt id list once;
    # neither changes between shuffles.
    new_df = df.set_index(['persona_id', 'prompt_id'])
    prompt_ids = list(range(100))
    for _ in range(args.num_shuffles):
        # Start from the sorted persona ids each time so the pairings depend
        # only on the seed and the iteration number.
        persona_ids_shuffled = list(range(100))
        random.shuffle(persona_ids_shuffled)
        pairs = list(zip(persona_ids_shuffled, prompt_ids))

        data = new_df.loc[pairs, 'response'].values.tolist()
        bs_scores.append(hom_bs(data, batch_size=args.batch, verbose=args.noverb))
    print(bs_scores)
    print(f"Mean:{np.round(np.mean(bs_scores),2)}, SD: {np.round(np.std(bs_scores),3)}")