In [1]:
import pandas as pd
import random
import numpy as np
from tqdm import tqdm
import ipdb
import re
from datasets import load_dataset
import os
from diversity import compression_ratio, ngram_diversity_score, extract_patterns, get_pos, pos_patterns, token_patterns, self_repetition_score
import json
from collections import Counter
from random import shuffle
from transformers import AutoTokenizer
tqdm.pandas()

import matplotlib.pyplot as plt
# import mplcursors
import seaborn as sns
%matplotlib inline
# Plot styling shared by every figure in this notebook.
sns.set(style='darkgrid', context='notebook', rc={'figure.figsize':(14,10), 'font.family': 'Times'}, font_scale=3)

# pandas display options: show up to 100 rows/columns and never truncate cell
# text, so full responses are readable when a frame is displayed.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('chained_assignment',None)

# Set random seeds for reproducibility on a specific machine
random.seed(1)
np.random.seed(1)
# NOTE: a bare `np.random.RandomState(1)` call used to follow here. It only
# constructed a separate generator object and threw it away, so it seeded
# nothing; the global NumPy RNG is seeded by `np.random.seed(1)` above.
np.set_printoptions(precision=3)

In [2]:
def calc_cr_nds_sr(responses):
    """Compute the notebook's four diversity metrics for one set of responses.

    Args:
        responses: the response texts to score (callers pass a list of str).

    Returns:
        Tuple ``(cr, cr_pos, nds, srep)``:
        ``cr``     - ``compression_ratio`` of the raw responses using 'gzip';
        ``cr_pos`` - ``compression_ratio`` ('gzip') of the first value returned
                     by ``get_pos(responses)``;
        ``nds``    - ``ngram_diversity_score(responses, 4)``;
        ``srep``   - ``self_repetition_score`` with its progress output disabled.
    """
    text_cr = compression_ratio(responses, 'gzip')
    diversity_4gram = ngram_diversity_score(responses, 4)

    # CR-POS: only the first return value of get_pos is needed here; the
    # second one is discarded.
    pos_joined, _ = get_pos(responses)
    pos_cr = compression_ratio(pos_joined, 'gzip')

    self_rep = self_repetition_score(responses, verbose=False)
    return text_cr, pos_cr, diversity_4gram, self_rep

def calc_diversity(df, num_shuffles=10, num_personas=100, num_prompts=100):
    '''
    Randomly assigns personas to prompts, calculates the diversity metrics over
    the responses for that pairing, and prints the mean and S.D. of each metric
    over `num_shuffles` independent random pairings.

    Args:
        df: frame with `persona_id`, `prompt_id` and `response` columns. Every
            (persona_id, prompt_id) pair that can be drawn must be present,
            otherwise the `.loc` lookup raises a KeyError.
        num_shuffles: number of random persona/prompt pairings to evaluate.
        num_personas: persona ids are taken to be 0..num_personas-1.
        num_prompts: prompt ids are taken to be 0..num_prompts-1.

    Returns:
        None. The summary (CR, CR-POS, NDS, Self-rep as "mean ± std") is printed.

    Notes:
        Each pairing is a permutation of the persona ids zipped against the
        prompt ids, so a persona is used at most once per pairing and `zip`
        truncates to the shorter of the two id lists.
        The global `random` module is re-seeded with 1 on every call, so
        repeated calls draw the same sequence of pairings.
    '''
    random.seed(1)
    crs = []
    ndss = []
    crs_pos = []
    sreps = []
    new_df = df.set_index(['persona_id', 'prompt_id'])
    # Prompt ids never change between shuffles, so build them once.
    prompt_ids = list(range(num_prompts))
    for _ in tqdm(range(num_shuffles)):
        # Get random personas paired with every prompt
        persona_ids_shuffled = list(range(num_personas))
        random.shuffle(persona_ids_shuffled)
        pairs = list(zip(persona_ids_shuffled, prompt_ids))
        responses = new_df.loc[pairs, 'response'].values.tolist()

        # Calculate metrics
        cr, cr_pos, nds, srep = calc_cr_nds_sr(responses)

        crs.append(cr)
        ndss.append(nds)
        crs_pos.append(cr_pos)
        sreps.append(srep)

    print(f"CR: {np.round(np.mean(crs),2)} ± {np.round(np.std(crs),2)}\nCR-POS: {np.round(np.mean(crs_pos),2)} ± {np.round(np.std(crs_pos),2)}\nNDS: {np.round(np.mean(ndss),2)} ± {np.round(np.std(ndss),2)}\nSelf-rep:{np.round(np.mean(sreps),2)} ± {np.round(np.std(sreps),2)}")

## Dolly human responses

In [3]:
# Persona descriptions: one per line, surrounding whitespace removed.
with open('../data/sample_personas.txt', 'r') as f:
    personas = [line.strip() for line in f]

# Creative-writing subset of the Dolly training split, as a DataFrame.
dolly_train = load_dataset("databricks/databricks-dolly-15k")["train"]
dolly = dolly_train.filter(lambda row: row['category']=='creative_writing').to_pandas()

# Sampled prompts; `index` is used as a row label into `dolly` to fetch the
# matching human response.
sample = pd.read_csv('../data/dolly_creative_prompts_sample.tsv', sep='\t')
sample['response'] = sample['index'].apply(lambda row_label: dolly.loc[row_label, 'response'])
sample['prompt_id'] = list(range(len(sample)))

prompts = sample['instruction'].tolist()
human_responses = sample['response'].tolist()

In [4]:
# Diversity metrics for the human-written Dolly responses (reference values).
# Same calls as calc_cr_nds_sr, except that self_repetition_score is left at
# its default verbosity here.
# HS-RougeL and self-BLEU (homogenization_score with 'rougel' / 'bleu') and the
# token_patterns(joined_pos, 5, 10) n-gram extraction are disabled in this cell.
cr = compression_ratio(human_responses, 'gzip')
nds = ngram_diversity_score(human_responses, 4)

joined_pos, tuples = get_pos(human_responses)
cr_pos = compression_ratio(joined_pos, 'gzip')

srep = self_repetition_score(human_responses)

print(f"CR: {np.round(cr,2)}\nNDS: {np.round(nds,2)}\nCR-POS: {np.round(cr_pos,2)}\nSelf-rep: {np.round(srep, 2)}")

Calculating self-repetition score: 100%|██████████████████████████████████████████████████████| 100/100 [00:00<00:00, 103307.98it/s]

CR: 2.51
NDS: 3.03
CR-POS: 4.93
Self-rep: 0.55





## Llama-70B

In [38]:
# Llama-3.3-70B: no persona, no cutoff.
# Responses are whitespace-stripped and one row is kept per (prompt, persona) pair.
df = (
    pd.read_csv('../output/llama-temp1/llama70b-np/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
cr, cr_pos, nds, srep = calc_cr_nds_sr(df['response'].tolist())
print(f"CR: {np.round(cr,2)}\nCR-POS: {np.round(cr_pos,2)}\nNDS: {np.round(nds,2)}\nSelf-rep: {np.round(srep, 2)}")

CR: 2.77
CR-POS: 5.73
NDS: 2.87
Self-rep: 1.89


In [39]:
# Llama-3.3-70B: no persona, with cutoff.
# Responses are whitespace-stripped and one row is kept per (prompt, persona) pair.
df = (
    pd.read_csv('../output/llama-temp1/llama70b-cutoff-np/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
cr, cr_pos, nds, srep = calc_cr_nds_sr(df['response'].tolist())
print(f"CR: {np.round(cr,2)}\nCR-POS: {np.round(cr_pos,2)}\nNDS: {np.round(nds,2)}\nSelf-rep: {np.round(srep, 2)}")

CR: 2.57
CR-POS: 5.16
NDS: 3.08
Self-rep: 0.52


In [40]:
# Llama-3.3-70B: persona run without cutoff (output directory `llama70b-persona`).
# Metrics are averaged over 100 random persona/prompt pairings.
df = (
    pd.read_csv('../output/llama-temp1/llama70b-persona/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(df, num_shuffles=100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [10:21<00:00,  6.21s/it]

CR: 2.71 ± 0.01
CR-POS: 5.38 ± 0.03
NDS: 2.84 ± 0.01
Self-rep:2.5 ± 0.1





In [41]:
# Llama-3.3-70B: persona plus cutoff.
# Metrics are averaged over 100 random persona/prompt pairings.
df = (
    pd.read_csv('../output/llama-temp1/llama70b-cutoff-persona/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(df, num_shuffles=100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [10:31<00:00,  6.31s/it]

CR: 2.51 ± 0.02
CR-POS: 5.04 ± 0.03
NDS: 3.08 ± 0.02
Self-rep:0.68 ± 0.09





In [42]:
# Llama-3.3-70B: coarse persona (no cutoff in the output directory name).
# Metrics are averaged over 100 random persona/prompt pairings.
df = (
    pd.read_csv('../output/llama-temp1/llama70b-coarse/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(df, num_shuffles=100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [10:04<00:00,  6.05s/it]

CR: 2.71 ± 0.01
CR-POS: 5.41 ± 0.03
NDS: 2.85 ± 0.02
Self-rep:2.39 ± 0.13





In [43]:
# Llama-3.3-70B: coarse personas plus cutoff.
# Metrics are averaged over 100 random persona/prompt pairings.
df = (
    pd.read_csv('../output/llama-temp1/llama70b-coarse-cutoff/Llama-3.3-70B-Instruct-Turbo_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(df, num_shuffles=100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [09:35<00:00,  5.76s/it]

CR: 2.51 ± 0.02
CR-POS: 5.06 ± 0.04
NDS: 3.09 ± 0.02
Self-rep:0.61 ± 0.08





## Deepseek

In [44]:
# DeepSeek-V3: no persona, no cutoff.
# Responses are whitespace-stripped; no de-duplication is applied in this cell.
df = (
    pd.read_csv('../output/deepseek/deepseek-np/DeepSeek-V3_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
)
cr, cr_pos, nds, srep = calc_cr_nds_sr(df['response'].tolist())
print(f"CR: {np.round(cr,2)}\nCR-POS: {np.round(cr_pos,2)}\nNDS: {np.round(nds,2)}\nSelf-rep: {np.round(srep, 2)}")

CR: 2.36
CR-POS: 5.5
NDS: 3.15
Self-rep: 0.86


In [46]:
# DeepSeek-V3: no persona, with cutoff.
# Responses are whitespace-stripped; no de-duplication is applied in this cell.
df = (
    pd.read_csv('../output/deepseek/deepseek-np-cutoff/DeepSeek-V3_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
)
cr, cr_pos, nds, srep = calc_cr_nds_sr(df['response'].tolist())
print(f"CR: {np.round(cr,2)}\nCR-POS: {np.round(cr_pos,2)}\nNDS: {np.round(nds,2)}\nSelf-rep: {np.round(srep, 2)}")

CR: 2.29
CR-POS: 4.95
NDS: 3.32
Self-rep: 0.11


In [47]:
# DeepSeek-V3: persona (no cutoff in the output directory name).
# Metrics are averaged over 100 random persona/prompt pairings.
df = (
    pd.read_csv('../output/deepseek/deepseek-persona/DeepSeek-V3_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(df, num_shuffles=100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [10:03<00:00,  6.04s/it]

CR: 2.27 ± 0.01
CR-POS: 4.9 ± 0.03
NDS: 3.26 ± 0.01
Self-rep:0.59 ± 0.11





In [48]:
# DeepSeek-V3: persona with cutoff.
# Metrics are averaged over 100 random persona/prompt pairings.
df = (
    pd.read_csv('../output/deepseek/deepseek-cutoff-persona/DeepSeek-V3_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(df, num_shuffles=100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [10:27<00:00,  6.27s/it]

CR: 2.2 ± 0.02
CR-POS: 4.71 ± 0.03
NDS: 3.38 ± 0.01
Self-rep:0.09 ± 0.04





In [51]:
# DeepSeek-V3: coarse persona, no cutoff.
# Metrics are averaged over 100 random persona/prompt pairings.
df = (
    pd.read_csv('../output/deepseek/deepseek-coarse/DeepSeek-V3_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(df, num_shuffles=100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [10:27<00:00,  6.27s/it]

CR: 2.3 ± 0.01
CR-POS: 5.01 ± 0.03
NDS: 3.23 ± 0.02
Self-rep:0.54 ± 0.1





In [52]:
# DeepSeek-V3: coarse persona with cutoff.
# Metrics are averaged over 100 random persona/prompt pairings.
df = (
    pd.read_csv('../output/deepseek/deepseek-coarse-cutoff/DeepSeek-V3_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(df, num_shuffles=100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [09:57<00:00,  5.98s/it]

CR: 2.24 ± 0.02
CR-POS: 4.78 ± 0.04
NDS: 3.37 ± 0.02
Self-rep:0.09 ± 0.04





## Llama-8B

In [12]:
# Llama-3.1-8B: no persona, no cutoff.
# Responses are whitespace-stripped; no de-duplication is applied in this cell.
df = (
    pd.read_csv('../output/llama-temp1/llama8b-np/Llama-3.1-8B-Instruct_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
)
cr, cr_pos, nds, srep = calc_cr_nds_sr(df['response'].tolist())
print(f"CR: {np.round(cr,2)}\nCR-POS: {np.round(cr_pos,2)}\nNDS: {np.round(nds,2)}\nSelf-rep: {np.round(srep, 2)}")

CR: 2.77
CR-POS: 5.78
NDS: 2.86
Self-rep: 1.59


In [13]:
# Llama-3.1-8B: no persona, with cutoff.
# Responses are whitespace-stripped; no de-duplication is applied in this cell.
df = (
    pd.read_csv('../output/llama-temp1/llama8b-cutoff-np/Llama-3.1-8B-Instruct_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
)
cr, cr_pos, nds, srep = calc_cr_nds_sr(df['response'].tolist())
print(f"CR: {np.round(cr,2)}\nCR-POS: {np.round(cr_pos,2)}\nNDS: {np.round(nds,2)}\nSelf-rep: {np.round(srep, 2)}")

CR: 2.52
CR-POS: 5.24
NDS: 3.13
Self-rep: 0.5


In [14]:
# Llama-3.1-8B: fine persona, no cutoff.
# Metrics are averaged over 100 random persona/prompt pairings.
df = (
    pd.read_csv('../output/llama-temp1/llama8b-persona/Llama-3.1-8B-Instruct_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(df, num_shuffles=100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [33:22<00:00, 20.03s/it]

CR: 2.63 ± 0.02
CR-POS: 5.36 ± 0.04
NDS: 2.9 ± 0.02
Self-rep:2.04 ± 0.13





In [15]:
# Llama-3.1-8B: fine persona with cutoff.
# Metrics are averaged over 100 random persona/prompt pairings.
df = (
    pd.read_csv('../output/llama-temp1/llama8b-cutoff-persona/Llama-3.1-8B-Instruct_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(df, num_shuffles=100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [09:32<00:00,  5.73s/it]

CR: 2.47 ± 0.02
CR-POS: 5.06 ± 0.03
NDS: 3.09 ± 0.02
Self-rep:0.77 ± 0.11





In [17]:
# Llama-3.1-8B: coarse persona, no cutoff.
# Metrics are averaged over 100 random persona/prompt pairings.
df = (
    pd.read_csv('../output/llama-temp1/llama8b-coarse/Llama-3.1-8B-Instruct_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(df, num_shuffles=100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [10:09<00:00,  6.09s/it]

CR: 2.64 ± 0.02
CR-POS: 5.42 ± 0.03
NDS: 2.9 ± 0.01
Self-rep:2.0 ± 0.1





In [18]:
# Llama-3.1-8B: coarse persona with cutoff.
# Metrics are averaged over 100 random persona/prompt pairings.
df = (
    pd.read_csv('../output/llama-temp1/llama8b-coarse-cutoff/Llama-3.1-8B-Instruct_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(df, num_shuffles=100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [09:37<00:00,  5.77s/it]

CR: 2.48 ± 0.02
CR-POS: 5.1 ± 0.03
NDS: 3.1 ± 0.02
Self-rep:0.7 ± 0.08





## Llama-1B

In [19]:
# Llama-3.2-1B: no persona, no cutoff.
# Responses are whitespace-stripped; no de-duplication is applied in this cell.
df = (
    pd.read_csv('../output/llama-temp1/llama1b-np/Llama-3.2-1B-Instruct_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
)
cr, cr_pos, nds, srep = calc_cr_nds_sr(df['response'].tolist())
print(f"CR: {np.round(cr,2)}\nCR-POS: {np.round(cr_pos,2)}\nNDS: {np.round(nds,2)}\nSelf-rep: {np.round(srep, 2)}")

CR: 2.74
CR-POS: 5.7
NDS: 2.87
Self-rep: 1.57


In [20]:
# Llama-3.2-1B: no persona, with cutoff.
# Responses are whitespace-stripped; no de-duplication is applied in this cell.
df = (
    pd.read_csv('../output/llama-temp1/llama1b-cutoff-np/Llama-3.2-1B-Instruct_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
)
cr, cr_pos, nds, srep = calc_cr_nds_sr(df['response'].tolist())
print(f"CR: {np.round(cr,2)}\nCR-POS: {np.round(cr_pos,2)}\nNDS: {np.round(nds,2)}\nSelf-rep: {np.round(srep, 2)}")

CR: 2.56
CR-POS: 5.37
NDS: 3.0
Self-rep: 0.58


In [21]:
# Llama-3.2-1B: fine persona, no cutoff.
# Metrics are averaged over 100 random persona/prompt pairings.
df = (
    pd.read_csv('../output/llama-temp1/llama1b-persona/Llama-3.2-1B-Instruct_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(df, num_shuffles=100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [10:05<00:00,  6.06s/it]

CR: 2.62 ± 0.02
CR-POS: 5.34 ± 0.03
NDS: 2.91 ± 0.02
Self-rep:1.88 ± 0.13





In [22]:
# Llama-3.2-1B: fine persona with cutoff.
# Metrics are averaged over 100 random persona/prompt pairings.
df = (
    pd.read_csv('../output/llama-temp1/llama1b-cutoff-persona/Llama-3.2-1B-Instruct_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(df, num_shuffles=100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [09:46<00:00,  5.86s/it]

CR: 2.47 ± 0.04
CR-POS: 5.12 ± 0.07
NDS: 3.08 ± 0.04
Self-rep:0.68 ± 0.1





In [23]:
# Llama-3.2-1B: coarse persona, no cutoff.
# Metrics are averaged over 100 random persona/prompt pairings.
df = (
    pd.read_csv('../output/llama-temp1/llama1b-coarse/Llama-3.2-1B-Instruct_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(df, num_shuffles=100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [09:50<00:00,  5.90s/it]

CR: 2.61 ± 0.03
CR-POS: 5.38 ± 0.04
NDS: 2.91 ± 0.02
Self-rep:1.87 ± 0.13





In [24]:
# Llama-3.2-1B: coarse persona with cutoff.
# Metrics are averaged over 100 random persona/prompt pairings.
df = (
    pd.read_csv('../output/llama-temp1/llama1b-coarse-cutoff/Llama-3.2-1B-Instruct_dolly_output.tsv', sep='\t')
    .assign(response=lambda frame: frame['response'].apply(lambda text: text.strip()))
    .drop_duplicates(subset=['prompt_id', 'persona_id'], keep='first')
)
calc_diversity(df, num_shuffles=100)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [09:28<00:00,  5.69s/it]

CR: 2.47 ± 0.03
CR-POS: 5.13 ± 0.04
NDS: 3.09 ± 0.02
Self-rep:0.66 ± 0.1





## Response length distribution

In [54]:
# Tokenizer used to measure completion lengths in the cells below.
tokenizer = AutoTokenizer.from_pretrained('deepseek-ai/DeepSeek-V3-0324')

# Smoke test: encode a short string without special tokens and display the ids.
sanity_encoding = tokenizer('world hello', add_special_tokens=False)
sanity_encoding['input_ids']

[29616, 44388]

In [55]:
# Build a long-format table of completion lengths (in tokenizer tokens) with one
# `source` label per value: the human Dolly responses plus three DeepSeek-V3
# settings (no persona, fine persona, coarse persona).
len_df = {'len': [], 'source': []}


def _n_tokens(text):
    """Number of ids `tokenizer` produces for `text`, without special tokens."""
    return len(tokenizer(text, add_special_tokens=False)['input_ids'])


# Load all the human responses first.
human_lens = sample['response'].apply(_n_tokens).values.tolist()
len_df['len'] += human_lens
len_df['source'] += ['Dolly'] * len(human_lens)

df_np = pd.read_csv('../output/deepseek/deepseek-np/DeepSeek-V3_dolly_output.tsv', sep='\t')
df_fp = pd.read_csv('../output/deepseek/deepseek-persona/DeepSeek-V3_dolly_output.tsv', sep='\t')
# Fixed: this frame is labelled 'Deepseek-CP' below but was being read from the
# Llama-3.1-8B coarse-persona output; it now reads the DeepSeek coarse-persona run.
df_cp = pd.read_csv('../output/deepseek/deepseek-coarse/DeepSeek-V3_dolly_output.tsv', sep='\t')

for (df, source_name) in [(df_np, 'Deepseek-NP'), (df_fp, 'Deepseek-FP'), (df_cp, 'Deepseek-CP')]:
    if df.shape[0] > 100:
        # More than 100 rows: several responses per prompt, so use the mean
        # length per prompt_id.
        df['len'] = df.response.apply(_n_tokens)
        source_lens = df.loc[:, ['prompt_id', 'len']].groupby('prompt_id').mean()['len'].values.tolist()
    else:
        source_lens = df['response'].apply(_n_tokens).values.tolist()
    len_df['len'] += source_lens
    # One label per value actually added (previously a fixed 100), so the two
    # columns cannot drift out of sync.
    len_df['source'] += [source_name] * len(source_lens)

len_df = pd.DataFrame(len_df)

In [61]:
# Filled KDE of completion length, one curve per source; lengths are clipped to
# [0, 1250] tokens and common_norm=False is passed so the curves are not
# normalised jointly.
g = sns.kdeplot(len_df, x='len', hue='source', common_norm=False, fill=True, clip=[0,1250])

g.set_xlabel('Completion length (tokens)')
g.set_ylabel('')

# Hide every spine except the bottom one and drop the y tick labels.
for side in ("top", "right", "left"):
    g.spines[side].set_visible(False)
g.set(yticklabels=[])
g.grid(axis='x')

# plt.legend([], [], frameon=False)
# plt.show()
plt.savefig('length.pdf', bbox_inches='tight', transparent=True)