In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np


In [3]:
def compute_diversity(lst):
    # Compute the average cosine similarity between all pairs of sentences in the list
    model = SentenceTransformer("all-mpnet-base-v2")
    embeddings = model.encode(lst)
    sim = np.inner(embeddings, embeddings)
    return np.mean(sim)

In [4]:
def compute_overall_diversity(filenanme):

    df = pd.read_csv(filenanme)
    df_train = df[df["info"] =="train"]
    df_test = df[df["info"] =="test"]

    train_diversity = []
    for _, row in df_train.iterrows():
        train_diversity.append(compute_diversity(list(row)[3:13]))
    test_diversity = []
    for _, row in df_test.iterrows():
        test_diversity.append(compute_diversity(list(row)[3:13]))
    train_diversity = 1 - np.mean(train_diversity)
    test_diversity = 1 - np.mean(test_diversity)
    print("train diversity: ", train_diversity, "test diversity: ", test_diversity)
    return train_diversity, test_diversity  

In [6]:
compute_overall_diversity("dpo_gpto_400_parsed_results.csv")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



train diversity:  0.7147020101547241 test diversity:  0.7057546377182007


(0.7147020101547241, 0.7057546377182007)

In [7]:
np.range(0, 10)

AttributeError: module 'numpy' has no attribute 'range'

In [None]:
def compute_overall_gpto_diversity(filenanme):
    
    df = pd.read_csv(filenanme)
    df_train = df[df["info"] =="train"]
    df_test = df[df["info"] =="test"]

    train_diversity = []
    for _, row in df_train.iterrows():
        train_diversity.append(compute_diversity(list(row)[0:13]))
    test_diversity = []
    for _, row in df_test.iterrows():
        test_diversity.append(compute_diversity(list(row)[3:13]))
    train_diversity = 1 - np.mean(train_diversity)
    test_diversity = 1 - np.mean(test_diversity)
    print("train diversity: ", train_diversity, "test diversity: ", test_diversity)
    return train_diversity, test_diversity  

compute_overall_gpto_diversity("gpto_cap.csv")

In [None]:
compute_overall_diversity("dpo_gpto_40_parsed_result.csv")

In [68]:
results = compute_overall_diversity("sft_results.csv")

train diversity:  0.5858057737350464 test diversity:  0.5407741665840149


In [69]:
compute_overall_diversity("sft_top1000_results.csv")

train diversity:  0.5794702470302582 test diversity:  0.5424542129039764


(0.5794702470302582, 0.5424542129039764)

In [70]:
compute_overall_diversity("sft_top100_results.csv")

train diversity:  0.488442063331604 test diversity:  0.4799564480781555


(0.488442063331604, 0.4799564480781555)

In [71]:
compute_overall_diversity("sft_dpo_results.csv") # DPO trained on Contest 711 

train diversity:  0.5365310311317444 test diversity:  0.5353531241416931


(0.5365310311317444, 0.5353531241416931)

In [72]:
compute_overall_diversity("dpo_results.csv")

train diversity:  0.6438504159450531 test diversity:  0.5977967381477356


(0.6438504159450531, 0.5977967381477356)

In [11]:
for i in range(100,1001, 100): 
    print("dpo {} i iter:".format(i)) 
    compute_overall_diversity("dpo_results_ckpt{}.csv".format(i))

dpo 100 i iter:
train diversity:  0.5907401144504547 test diversity:  0.5666814148426056
dpo 200 i iter:
train diversity:  0.6341615617275238 test diversity:  0.6147825419902802
dpo 300 i iter:
train diversity:  0.6385546922683716 test diversity:  0.6277735233306885
dpo 400 i iter:
train diversity:  0.6230852007865906 test diversity:  0.6135237812995911
dpo 500 i iter:
train diversity:  0.6277709305286407 test diversity:  0.6222366392612457
dpo 600 i iter:
train diversity:  0.6237146556377411 test diversity:  0.6169854998588562
dpo 700 i iter:
train diversity:  0.6167405843734741 test diversity:  0.5888241529464722
dpo 800 i iter:
train diversity:  0.6134612262248993 test diversity:  0.5922105610370636
dpo 900 i iter:
train diversity:  0.6159301996231079 test diversity:  0.5834469199180603
dpo 1000 i iter:
train diversity:  0.6249347925186157 test diversity:  0.621736615896225


In [17]:
def compute_unique_words(sent):
    return  set([''.join(c for c in s if c.isalpha()) for s in sent.split()])

def compute_dict_diversity(lst):
    jaccard_indices = []  
    lsts = [compute_unique_words(s) for s in lst]
    for i in lsts:
        for j in lsts: 
            if i != j:
                jaccard_indices.append(len(i.intersection(j))/len(i.union(j)))
    return np.mean(jaccard_indices)
            
def compute_overall_dict_diversity(filenanme):
    
    df = pd.read_csv(filenanme)
    df_train = df[df["info"] =="train"]
    df_test = df[df["info"] =="test"]

    train_diversity = []
    for _, row in df_train.iterrows():
        train_diversity.append(compute_dict_diversity(list(row)[3:8]))
    test_diversity = []
    for _, row in df_test.iterrows():
        test_diversity.append(compute_dict_diversity(list(row)[3:8]))
    train_diversity = 1 - np.mean(train_diversity)
    test_diversity = 1 - np.mean(test_diversity)
    print("train diversity: ", train_diversity, "test diversity: ", test_diversity)
    return train_diversity, test_diversity  

In [18]:
for i in range(100,1001, 100): 
    print("dpo {} i iter:".format(i)) 
    compute_overall_dict_diversity("dpo_results_ckpt{}.csv".format(i))

dpo 100 i iter:
train diversity:  0.9414171534385013 test diversity:  0.9318706769466559
dpo 200 i iter:
train diversity:  0.9504627438080477 test diversity:  0.9435193475156249
dpo 300 i iter:
train diversity:  0.9491013504564905 test diversity:  0.9438309805954141
dpo 400 i iter:
train diversity:  0.9431267781475152 test diversity:  0.9429968519239856
dpo 500 i iter:
train diversity:  0.944726184579757 test diversity:  0.9453017977636887
dpo 600 i iter:
train diversity:  0.9383937718926292 test diversity:  0.9381115470697334
dpo 700 i iter:
train diversity:  0.9460554366571772 test diversity:  0.9387151411509401
dpo 800 i iter:
train diversity:  0.9475344778698822 test diversity:  0.9393691957192183
dpo 900 i iter:
train diversity:  0.9460287621768707 test diversity:  0.9435984006519336
dpo 1000 i iter:
train diversity:  0.9426759985894563 test diversity:  0.9484276327745458
