In [1]:
from datasets import load_dataset
import numpy as np
from sklearn.metrics.pairwise import cosine_distances
import random
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
import numpy as np
import json

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
import pandas as pd

In [2]:
# Read the split CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df = pd.read_csv("/datadrive/pavan/CurLL/src/data/stage0_split.csv")

In [3]:
# Number of rows for each value of the 'split' column.
df['split'].value_counts()

split
train    3275181
val        42540
test        2610
Name: count, dtype: int64

In [4]:
# Rows whose 'split' value is 'val'.
df_val = df[df['split']=='val']

In [21]:
# Rows whose 'split' value is 'test'.
df_test = df[df['split']=='test']


In [19]:
# For the rows with id 'i1' (all splits), count rows per context_template.
df[df['id']=='i1']['context_template'].value_counts()


context_template
Repeat back instructions        903
Describe what happened          903
Continue the story              903
Follow simple directions        903
Explain the action              903
Mimic the sound                 903
Predict what's next             903
Respond to question             903
Confirm understanding           903
Re-tell the sequence            903
Action demonstration request    903
Imitate observed behavior       903
Describe common features        903
Identify differences            903
Expand on statement             903
Explain feelings                903
Simple cause effect             903
Answer yes or no                903
Identify missing piece          903
What comes before?              903
What comes after?               903
Extend the idea                 903
Describe the problem            903
Ask clarifying question         903
Respond to greeting             903
Echo the phrase                 903
Extend the conversation         903
React to st

In [22]:
# Same count restricted to the test rows: which templates of id 'i1' appear in df_test.
df_test[df_test['id']=='i1']['context_template'].value_counts()

context_template
Repeat back instructions    88
Identify missing piece      50
Name: count, dtype: int64

In [23]:
# Preview the first rows of df_test that have id 'i1'.
df_test[df_test['id']=='i1'].head()

Unnamed: 0,output,id,indicator,skill,subskill,goal,age_group,stage,context_template,word_list,instruction,response,POS,split
23465,"{'instruction': 'Okay, Leo, I want you to tell...",i1,"Shows ongoing connection to a conversation, gr...",Language and Communication,Attending and Understanding,Child attends to communication and language fr...,0-5,0,Repeat back instructions,rest,"Okay, Leo, I want you to tell me what I just a...",You want me to say what you said? You wanted m...,Noun,test
23469,"{'instruction': ""Okay, Leo, I'm going to tell ...",i1,"Shows ongoing connection to a conversation, gr...",Language and Communication,Attending and Understanding,Child attends to communication and language fr...,0-5,0,Repeat back instructions,rinse,"Okay, Leo, I'm going to tell you what we need ...",Rinse our hands! We have to rinse our hands wi...,Verb,test
23472,"{'instruction': 'Okay, so I want you to tell m...",i1,"Shows ongoing connection to a conversation, gr...",Language and Communication,Attending and Understanding,Child attends to communication and language fr...,0-5,0,Repeat back instructions,reach,"Okay, so I want you to tell me what I just ask...",You want me to...to tell you what you said! Yo...,Verb,test
23480,"{'instruction': ""Okay, Leo, I'm going to tell ...",i1,"Shows ongoing connection to a conversation, gr...",Language and Communication,Attending and Understanding,Child attends to communication and language fr...,0-5,0,Repeat back instructions,rule,"Okay, Leo, I'm going to tell you something imp...",Put the blocks back in the bin... that's the r...,Noun,test
23494,"{'instruction': 'Okay, so I want you to tell m...",i1,"Shows ongoing connection to a conversation, gr...",Language and Communication,Attending and Understanding,Child attends to communication and language fr...,0-5,0,Repeat back instructions,pretzel,"Okay, so I want you to tell me what I just ask...",Um... you want me to tell you what you said? Y...,Noun,test


In [2]:
def get_dataframe_stage(stage):
    """Load the instruct dataset for ``stage`` and return its train split as a DataFrame.

    Args:
        stage: Value interpolated into the dataset name
            ``Pavankalyan/stage{stage}_instruct_cleaned``.

    Returns:
        The ``train`` split of that dataset converted with ``to_pandas()``.
    """
    dataset_name = f"Pavankalyan/stage{stage}_instruct_cleaned"
    train_split = load_dataset(dataset_name)['train']
    return train_split.to_pandas()

In [3]:
# Stage whose dataset is loaded; `stage` is also interpolated into the output path further down.
stage = 0
# NOTE: rebinds `df`, which an earlier cell bound to the CSV read from disk.
df = get_dataframe_stage(stage)

In [4]:
def compute_distance(embeddings, A, B):
    """Return the mean pairwise cosine distance between rows ``A`` and rows ``B``.

    Args:
        embeddings: Indexable collection of embedding vectors.
        A: Index list selecting the first group of rows.
        B: Index list selecting the second group of rows.
    """
    pairwise = cosine_distances(embeddings[A], embeddings[B])
    return pairwise.mean()

def relative_distance_zscores(embeddings, selected, Z_indices, num_samples=1000):
    """Compare the selected subset's distances against random same-size subsets of Z.

    Three mean cosine distances are computed for ``selected`` and, for each, a
    null distribution from ``num_samples`` random subsets of ``Z_indices`` with
    ``len(selected)`` elements:

    * D1: distance to every index not in the subset,
    * D2: distance to the Z indices not in the subset,
    * D3: distance to the indices outside Z.

    Args:
        embeddings: Array of embedding rows; ``len(embeddings)`` defines Y.
        selected: Indices of the chosen subset.
        Z_indices: Indices forming the pool Z that random subsets are drawn from.
        num_samples: Number of random subsets per null distribution.

    Returns:
        dict mapping a label to ``(actual, mu, sigma, z)``. Entries whose
        reference set is empty are reported as NaN instead of raising.
    """
    nan = float('nan')
    all_indices = set(range(len(embeddings)))
    selected_set = set(selected)
    Z_set = set(Z_indices)
    # random.sample needs a sequence, so keep the pool as a list.
    Z_pool = list(Z_indices)

    complement_Z = list(Z_set - selected_set)
    nonZ = list(all_indices - Z_set)
    x = len(selected)

    def compute_distribution(target_fn):
        samples = [random.sample(Z_pool, x) for _ in range(num_samples)]
        scores = [target_fn(s) for s in samples]
        return np.mean(scores), np.std(scores)

    def get_z(actual, mu, sigma):
        # A NaN sigma fails the comparison and therefore also yields NaN.
        return (actual - mu) / sigma if sigma > 0 else nan

    results = {}

    # D1: to Y - selected
    actual_D1 = compute_distance(embeddings, selected, list(all_indices - selected_set))
    mu1, sigma1 = compute_distribution(
        lambda s: compute_distance(embeddings, s, list(all_indices - set(s)))
    )
    results["D1 (to Y-Zₓ)"] = (actual_D1, mu1, sigma1, get_z(actual_D1, mu1, sigma1))

    # D2: to Z - selected
    actual_D2 = compute_distance(embeddings, selected, complement_Z) if complement_Z else nan
    if x < len(Z_set):
        mu2, sigma2 = compute_distribution(
            lambda s: compute_distance(embeddings, s, list(Z_set - set(s)))
        )
    else:
        # A random subset as large as Z leaves nothing to compare against;
        # computing the distance on an empty set would raise.
        mu2, sigma2 = nan, nan
    results["D2 (to Z-Zₓ)"] = (actual_D2, mu2, sigma2, get_z(actual_D2, mu2, sigma2))

    # D3: to Y - Z
    if nonZ:
        actual_D3 = compute_distance(embeddings, selected, nonZ)
        mu3, sigma3 = compute_distribution(lambda s: compute_distance(embeddings, s, nonZ))
    else:
        # Z covers all of Y, so there is no outside set to measure against.
        actual_D3, mu3, sigma3 = nan, nan, nan
    results["D3 (to Y-Z)"] = (actual_D3, mu3, sigma3, get_z(actual_D3, mu3, sigma3))

    return results
    
def find_diverse_subset_from_Z(Y, Z_indices, x, num_samples=1000):
    """Greedily pick ``x`` elements of Z that are far from the rest of Y.

    The texts in ``Y`` are embedded with the ``all-MiniLM-L6-v2`` sentence
    transformer. Starting from an empty selection, each step adds the
    candidate from ``Z_indices`` that maximizes the mean cosine distance
    between (selection + candidate) and every other element of Y.

    Args:
        Y: List of texts.
        Z_indices: Indices into ``Y`` that may be selected.
        x: Number of elements to select.
        num_samples: Number of random subsets used for the z-score baselines.

    Returns:
        selected: The ``x`` chosen indices (into ``Y``), in selection order.
        results: Output of ``relative_distance_zscores`` for the selection.
        embeddings: The embedding matrix computed for ``Y``.

    NOTE(review): a later cell defines another function with this same name,
    which replaces this one once that cell has run.
    """
    assert x <= len(Z_indices), "x must be <= size of Z"
    assert x < len(Y), "x must be < total size of Y"

    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(Y, normalize_embeddings=True)
    dist_matrix = cosine_distances(embeddings)

    # Step 1: Greedy selection from Z_indices
    selected = []
    candidates = list(Z_indices)
    Y_indices = set(range(len(Y)))

    for _ in range(x):
        best_idx = None
        best_score = -np.inf

        for idx in candidates:
            temp_selected = selected + [idx]
            temp_rest = list(Y_indices - set(temp_selected))
            avg_dist = dist_matrix[np.ix_(temp_selected, temp_rest)].mean()
            # Strict comparison: on ties the earliest candidate wins.
            if avg_dist > best_score:
                best_score = avg_dist
                best_idx = idx

        selected.append(best_idx)
        candidates.remove(best_idx)

    # Step 2: forward num_samples so the caller's value is actually used.
    results = relative_distance_zscores(embeddings, selected, Z_indices, num_samples)

    return selected, results, embeddings


def plot_tsne(embeddings, selected, Z_indices, Y):
    """Show a 2-D t-SNE projection of ``embeddings`` coloured by group.

    Points are drawn in three groups: indices outside Z (gray), indices in Z
    that were not selected (blue) and the selected indices (red). Each
    non-empty group is drawn with one scatter call, so the legend contains
    exactly one entry per group.

    Args:
        embeddings: Embedding matrix, one row per element of ``Y``.
        selected: Indices of the selected subset.
        Z_indices: Indices forming Z.
        Y: The embedded items; only ``len(Y)`` is used.
    """
    tsne = TSNE(n_components=2, perplexity=5, random_state=42)
    reduced = tsne.fit_transform(embeddings)

    Z_set = set(Z_indices)
    selected_set = set(selected)
    positions = range(len(Y))

    # Background groups first so the selected points end up on top.
    groups = [
        ('Y - Z',
         [i for i in positions if i not in selected_set and i not in Z_set],
         {'color': 'gray', 'alpha': 0.3}),
        ('Z - Zₓ',
         [i for i in positions if i not in selected_set and i in Z_set],
         {'color': 'blue', 'alpha': 0.5}),
        ('Zₓ (Selected)',
         [i for i in positions if i in selected_set],
         {'color': 'red'}),
    ]

    plt.figure(figsize=(8, 6))
    for label, members, style in groups:
        if members:  # skip empty groups so they add no legend entry
            plt.scatter(reduced[members, 0], reduced[members, 1], label=label, **style)
    plt.title("t-SNE of Topic Embeddings")
    plt.legend()
    plt.tight_layout()
    plt.show()


def get_test_templates(df, id, num_templates, num_indi):
    """Pick held-out context templates and the nearest indicators for one id.

    Args:
        df: DataFrame with 'id', 'context_template' and 'indicator' columns.
        id: Value of the 'id' column to analyse (the name shadows the built-in
            ``id`` but is kept because callers pass it by keyword).
        num_templates: Number of templates to select for this id.
        num_indi: Number of closest indicators to report.

    Returns:
        dict with one entry per distance statistic (each a dict with
        'actual', 'mu', 'sigma', 'z') plus 'selected_topics',
        'closest_indicators', 'closest_indicators_distances' (numpy array)
        and 'closest_indicators_ids'.

    Raises:
        ValueError: if no row of ``df`` has the requested id.
    """
    id_rows = df[df['id'] == id]
    if id_rows.empty:
        # Fail before any model is loaded, with a message naming the id.
        raise ValueError(f"no rows found for id {id!r}")

    topics = list(df['context_template'].unique())
    topic_position = {topic: pos for pos, topic in enumerate(topics)}
    # unique() keeps first-appearance order, so Z_indices is the same on every run
    # (iterating a set of strings depends on hash randomization).
    Z_topics = list(id_rows['context_template'].unique())
    Z_indices = [topic_position[t] for t in Z_topics]
    selected_ids, results, embeddings = find_diverse_subset_from_Z(topics, Z_indices, x=num_templates)
    selected_topics = [topics[i] for i in selected_ids]

    results_dict = {
        name: {"actual": actual, "mu": mu, "sigma": sigma, "z": z}
        for name, (actual, mu, sigma, z) in results.items()
    }
    results_dict["selected_topics"] = selected_topics

    # Rank all indicator texts by cosine distance to this id's indicator text.
    all_inds = list(df['indicator'].unique())
    main_ind = id_rows['indicator'].values[0]
    index_ind = all_inds.index(main_ind)
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings_inds = model.encode(all_inds, normalize_embeddings=True)
    distances = cosine_distances(embeddings_inds)[index_ind]
    closest_indices = np.argsort(distances)
    # Drop the indicator itself, then keep the num_indi nearest.
    closest_indices = closest_indices[closest_indices != index_ind][:num_indi]
    closest_elements = [all_inds[j] for j in closest_indices]
    # The id reported for an indicator is that of its first row in df.
    closest_ids = [df[df['indicator'] == element]['id'].values[0] for element in closest_elements]

    results_dict["closest_indicators"] = closest_elements
    results_dict["closest_indicators_distances"] = distances[closest_indices]
    results_dict["closest_indicators_ids"] = closest_ids
    return results_dict

def make_json_serializable(obj):
    """Recursively replace numpy values with plain Python equivalents.

    Dicts, lists and tuples are traversed; numpy arrays become nested lists
    and any numpy scalar (not only float32/float64/int32/int64) is unwrapped
    with ``.item()``. Everything else is returned unchanged.

    Args:
        obj: Arbitrary nested structure.

    Returns:
        A structure of the same shape without numpy arrays or scalars.
    """
    if isinstance(obj, dict):
        return {k: make_json_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [make_json_serializable(i) for i in obj]
    if isinstance(obj, tuple):
        # Tuples can carry numpy values too (e.g. (actual, mu, sigma, z)).
        return tuple(make_json_serializable(i) for i in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # np.generic is the base class of every numpy scalar type.
        return obj.item()
    return obj

  """


In [5]:
# Template/indicator selection for id "i0" (2 templates, 2 nearest indicators), converted to plain Python types.
res = make_json_serializable(get_test_templates(df, id="i0", num_templates=2, num_indi=2))

In [6]:
def concat_instruction_response(example):
    """Join an example's 'instruction' and 'response' strings with a blank line between them."""
    parts = (example["instruction"], example["response"])
    return "\n\n".join(parts)

In [7]:
# Label every row 'train'; this overwrites any existing 'split' values unconditionally.
df['split'] = "train"

In [8]:
# NOTE: `id` shadows the built-in id(); the name is kept because later cells read it.
id = "i0"
# Rows of the indicator under study.
tempdf1 = df[df['id'] == id].reset_index(drop=True)
# Rows of its nearest indicators. isin() uses every id in the list instead of
# hard-coding exactly two entries (which raised IndexError for shorter lists).
tempdf2 = df[df['id'].isin(res['closest_indicators_ids'])].reset_index(drop=True)
# Instruction and response joined by a blank line, built column-wise rather than row by row.
tempdf1['ir_output'] = tempdf1['instruction'] + "\n\n" + tempdf1['response']
tempdf2['ir_output'] = tempdf2['instruction'] + "\n\n" + tempdf2['response']
# tempdf1's rows come first, so positions 0..len(tempdf1)-1 of tempdf belong to `id`.
tempdf = pd.concat([tempdf1, tempdf2]).reset_index(drop=True)

In [9]:
# First selected template for this indicator.
sel_topic = res['selected_topics'][0]
# Rows of the indicator that use that template, re-indexed from 0.
topic_rows = tempdf1['context_template'] == sel_topic
tempdf_t = tempdf1.loc[topic_rows].reset_index(drop=True)
# Texts of the candidate pool (this template) and of the full comparison set.
main_irs = tempdf_t['ir_output'].tolist()
all_irs = tempdf['ir_output'].tolist()

In [10]:
# Position in all_irs of each candidate text. A dict of first occurrences replaces the
# repeated list.index() scans, and duplicates are dropped: identical texts map to the
# same position, which would otherwise put that index in the candidate list twice and
# allow it to be selected twice.
first_position = {}
for position, text in enumerate(all_irs):
    first_position.setdefault(text, position)
Z_indices = list(dict.fromkeys(first_position[m] for m in main_irs))

In [11]:
# Sentence-embedding model placed on the GPU.
# NOTE(review): device="cuda" is hard-coded — presumably this fails on a machine without CUDA; confirm.
model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")

In [12]:
# Embed every text in all_irs; normalize_embeddings=True requests normalized vectors.
embeddings = model.encode(all_irs, normalize_embeddings=True, show_progress_bar=True)

Batches: 100%|██████████| 1694/1694 [00:16<00:00, 99.67it/s] 


In [13]:
# Pairwise cosine distances between all embeddings.
# NOTE(review): this is a dense len(all_irs) x len(all_irs) matrix — check memory for large inputs.
dist_matrix = cosine_distances(embeddings)

In [14]:
# Step 1: state for the greedy selection — nothing picked yet, every Z index is a
# candidate, and Y_indices is the set of all positions in all_irs.
selected, candidates = [], [*Z_indices]
Y_indices = set(range(len(all_irs)))

In [15]:
# Number of greedy picks the selection loop makes.
num_test_samples = 50

In [16]:
# Greedy selection: each round adds the candidate that maximizes the mean distance
# between (selected + candidate) and all remaining indices.
# The number of rounds is capped at the candidate count; without the cap an exhausted
# candidate list appended None to `selected` and then raised in candidates.remove(None).
# NOTE: this cell extends `selected` in place, so re-running it picks further samples.
for _ in tqdm(range(min(num_test_samples, len(candidates)))):
    best_idx = None
    best_score = -np.inf

    for idx in candidates:
        temp_selected = selected + [idx]
        temp_rest = list(Y_indices - set(temp_selected))
        avg_dist = dist_matrix[np.ix_(temp_selected, temp_rest)].mean()
        # Strict comparison: on ties the earliest candidate wins.
        if avg_dist > best_score:
            best_score = avg_dist
            best_idx = idx

    selected.append(best_idx)
    candidates.remove(best_idx)

100%|██████████| 50/50 [04:51<00:00,  5.84s/it]


In [17]:
# `selected` holds positions in all_irs, which was built from tempdf, so index tempdf by position.
selected_rows = tempdf.iloc[selected]

In [18]:
# Mark as 'test' every row of this indicator/template whose instruction matches one
# of the selected rows. The id and template comparisons do not depend on the row, so
# they are evaluated once and combined with a single isin() test instead of rebuilding
# three full-frame masks per selected row.
test_instructions = tempdf.loc[selected_rows.index, 'instruction']
topic_mask = (df['id'] == id) & (df['context_template'] == sel_topic)
df.loc[topic_mask & df['instruction'].isin(test_instructions), 'split'] = 'test'

In [21]:
# Create a boolean mask for all matching rows
mask = (
    df['id'] == id
) & (
    df['context_template'] == sel_topic
) & (
    df['instruction'].isin(tempdf.loc[selected_rows.index, 'instruction'])
)

# Assign 'val' to the 'split' column for the matching rows
df.loc[mask, 'split'] = 'test'


In [22]:
# All tempdf rows that were not selected (this includes rows of other templates and ids).
remaining_rows = tempdf.drop(selected_rows.index)

In [23]:
# Create a boolean mask for all matching rows
mask = (
    df['id'] == id
) & (
    df['context_template'] == sel_topic
) & (
    df['instruction'].isin(tempdf.loc[remaining_rows.index, 'instruction'])
)

# Assign 'val' to the 'split' column for the matching rows
df.loc[mask, 'split'] = 'val'


In [24]:
# Row count per split label after the assignments above.
df['split'].value_counts()

split
train    3319428
val          853
test          50
Name: count, dtype: int64

In [None]:
# Run the template/indicator selection once per distinct id, in first-appearance order.
all_ids = list(df['id'].unique())
all_results = []
for current_id in tqdm(all_ids):
    res = get_test_templates(df, id=current_id, num_templates=2, num_indi=2)
    # Appending inside the loop keeps partial results if the run is interrupted.
    all_results.append(res)

In [15]:
# Save the results as JSON.
# get_test_templates returns numpy scalars and arrays, which json.dump rejects with a
# TypeError, so convert them to plain Python values first.
# NOTE(review): the file is named .jsonl but holds one indented JSON array, not one
# object per line; the format is left unchanged in case other code reads it this way.
import json
with open(f"/datadrive/pavan/CurLL/data/stage{stage}_test_topics.jsonl", "w") as f:
    json.dump(make_json_serializable(all_results), f, indent=4)

In [18]:
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

@torch.no_grad()
def encode_texts(texts, model, batch_size=64):
    """Encode ``texts`` in slices of ``batch_size`` and stack the results.

    Args:
        texts: Sequence of inputs passed slice by slice to ``model.encode``.
        model: Object whose ``encode`` accepts ``convert_to_tensor`` and
            ``normalize_embeddings`` keyword arguments and returns a tensor.
        batch_size: Number of texts per ``encode`` call.

    Returns:
        The per-slice tensors concatenated along dimension 0.
    """
    chunks = [
        model.encode(
            texts[start:start + batch_size],
            convert_to_tensor=True,
            normalize_embeddings=True,
        )
        for start in range(0, len(texts), batch_size)
    ]
    return torch.cat(chunks, dim=0)  # one row per text

def compute_distance_torch(embeddings, A, B):
    """Return the mean pairwise cosine distance between rows ``A`` and rows ``B``.

    Both groups are L2-normalized and multiplied as matrices, which yields the
    |A| x |B| cosine-similarity matrix directly. The previous broadcasting form
    paired every row of A with every row of B across the feature dimension
    before reducing, which needs far more memory for large groups.

    Args:
        embeddings: 2-D tensor with one embedding per row.
        A: List of row indices for the first group.
        B: List of row indices for the second group.

    Returns:
        Python float: mean of ``1 - cosine_similarity`` over all (a, b) pairs.
    """
    A_unit = torch.nn.functional.normalize(embeddings[A], dim=1)
    B_unit = torch.nn.functional.normalize(embeddings[B], dim=1)
    similarity = A_unit @ B_unit.T  # shape (|A|, |B|)
    return (1 - similarity).mean().item()

def relative_distance_zscores_torch(embeddings, selected, Z_indices, num_samples=1000):
    """Score the selected subset against random same-size subsets drawn from Z.

    For each of three reference sets (everything outside the subset, the rest
    of Z, everything outside Z) the mean cosine distance of ``selected`` is
    compared with the mean and standard deviation obtained from
    ``num_samples`` random subsets of ``Z_indices`` of the same size.

    Args:
        embeddings: Tensor of embeddings; ``embeddings.shape[0]`` defines Y.
        selected: Indices of the chosen subset.
        Z_indices: Pool the random subsets are sampled from.
        num_samples: Number of random subsets per distribution.

    Returns:
        dict mapping a label to ``(actual, mu, sigma, z)``; the D2 entry is
        all NaN when Z contains nothing besides the selected indices.
    """
    universe = set(range(embeddings.shape[0]))
    chosen = set(selected)
    Z_set = set(Z_indices)

    outside_selected = list(universe - chosen)
    Z_without_selected = list(Z_set - chosen)
    outside_Z = list(universe - Z_set)
    subset_size = len(selected)

    def null_stats(reference_for):
        # Draw every random subset first, then score each against its reference set.
        draws = [random.sample(Z_indices, subset_size) for _ in range(num_samples)]
        scores = [compute_distance_torch(embeddings, draw, reference_for(draw)) for draw in draws]
        return np.mean(scores), np.std(scores)

    def summarize(actual, mu, sigma):
        z = (actual - mu) / sigma if sigma > 0 else float('nan')
        return (actual, mu, sigma, z)

    results = {}

    # D1: distance to everything that is not part of the subset.
    actual = compute_distance_torch(embeddings, selected, outside_selected)
    mu, sigma = null_stats(lambda draw: list(universe - set(draw)))
    results["D1 (to Y-Zₓ)"] = summarize(actual, mu, sigma)

    # D2: distance to the remainder of Z, if there is one.
    if len(Z_without_selected) > 0:
        actual = compute_distance_torch(embeddings, selected, Z_without_selected)
        mu, sigma = null_stats(lambda draw: list(Z_set - set(draw)))
    else:
        actual, mu, sigma = float('nan'), float('nan'), float('nan')
    results["D2 (to Z-Zₓ)"] = summarize(actual, mu, sigma)

    # D3: distance to everything outside Z.
    actual = compute_distance_torch(embeddings, selected, outside_Z)
    mu, sigma = null_stats(lambda draw: outside_Z)
    results["D3 (to Y-Z)"] = summarize(actual, mu, sigma)

    return results


@torch.no_grad()
def find_diverse_subset_from_Z(Y, Z_indices, x, num_samples=1000):
    """Greedily pick ``x`` elements of Z that are far from the rest of Y (torch version).

    The texts in ``Y`` are embedded with ``all-MiniLM-L6-v2``; each greedy step
    adds the candidate from ``Z_indices`` that maximizes the mean cosine
    distance between (selection + candidate) and every other element of Y.

    Args:
        Y: List of texts.
        Z_indices: Indices into ``Y`` that may be selected.
        x: Number of elements to select.
        num_samples: Number of random subsets used for the z-score baselines.

    Returns:
        selected: The ``x`` chosen indices (into ``Y``), in selection order.
        results: Output of ``relative_distance_zscores_torch``.
        embeddings: Tensor of embeddings computed for ``Y``.

    NOTE(review): this replaces the earlier function of the same name.
    NOTE(review): the model is loaded on every call and a full N x N distance
    matrix is built, both of which are expensive when called in a loop.
    """
    assert x <= len(Z_indices), "x must be <= size of Z"
    assert x < len(Y), "x must be < total size of Y"

    # Use the GPU when one is available instead of failing on CPU-only machines.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
    embeddings = encode_texts(Y, model)

    N = embeddings.shape[0]
    # encode_texts requests normalized embeddings, so the dot product is the cosine similarity.
    dist_matrix = 1 - torch.matmul(embeddings, embeddings.T)  # cosine distance matrix

    selected = []
    candidates = list(Z_indices)
    Y_indices = set(range(N))

    for _ in range(x):
        # -inf (as in the numpy version) rather than -1, so any finite score can win.
        best_score, best_idx = float('-inf'), None
        for idx in candidates:
            temp_selected = selected + [idx]
            temp_rest = list(Y_indices - set(temp_selected))
            avg_dist = dist_matrix[temp_selected][:, temp_rest].mean().item()
            # Strict comparison: on ties the earliest candidate wins.
            if avg_dist > best_score:
                best_score, best_idx = avg_dist, idx
        selected.append(best_idx)
        candidates.remove(best_idx)

    results = relative_distance_zscores_torch(embeddings, selected, Z_indices, num_samples)
    return selected, results, embeddings


In [None]:
def concat_instruction_response(example):
    """Return ``{"output": text}`` where text is instruction, a blank line, then response.

    NOTE(review): an earlier cell defines a function with this name that returns
    the bare string; this definition replaces it once the cell has run.
    """
    combined = "\n\n".join((example["instruction"], example["response"]))
    return {"output": combined}

# Ensure 'split' column exists with default "train"
if 'split' not in df.columns:
    df['split'] = "train"

# For every indicator id and each of its selected templates, pick up to 50 diverse
# samples as 'test' and label the template's other samples 'val'.
# The timing prints that used to end this cell referenced `time` and t0..t4, none of
# which were defined, so the loop raised NameError; tqdm already reports progress.
for i in tqdm(range(len(all_results))):
    current_id = all_ids[i]

    tempdf1 = df[df['id'] == current_id].reset_index(drop=True)
    # isin() uses every neighbour id in the list instead of hard-coding two entries.
    tempdf2 = df[df['id'].isin(all_results[i]['closest_indicators_ids'])].reset_index(drop=True)
    tempdf = pd.concat([tempdf1, tempdf2]).reset_index(drop=True)

    # Texts of the whole comparison set; they do not depend on the template, so they
    # are built once per id rather than once per template.
    all_irs = (tempdf['instruction'] + "\n\n" + tempdf['response']).tolist()
    # First position of each distinct text (replaces repeated list.index() scans).
    first_position = {}
    for position, text in enumerate(all_irs):
        first_position.setdefault(text, position)

    id_mask = df['id'] == current_id

    for t in tqdm(all_results[i]['selected_topics'], leave=False):
        tempdf_t = tempdf1[tempdf1['context_template'] == t].reset_index(drop=True)
        main_irs = (tempdf_t['instruction'] + "\n\n" + tempdf_t['response']).tolist()

        # Position in all_irs of every row of tempdf_t (same index as tempdf_t).
        positions = pd.Series([first_position[m] for m in main_irs], dtype='int64')
        # Candidate indices without duplicates: identical texts share one position.
        Z_indices = list(dict.fromkeys(positions.tolist()))

        # Never ask for more samples than exist (the selection asserts x <= |Z| and x < |Y|).
        num_test = min(50, len(Z_indices), len(all_irs) - 1)
        if num_test < 1:
            continue

        selected_ids, results, embeddings = find_diverse_subset_from_Z(all_irs, Z_indices, x=num_test)

        # selected_ids are positions in all_irs (i.e. in tempdf), not in tempdf_t, so map
        # them back through `positions` instead of calling tempdf_t.iloc[selected_ids].
        is_selected = positions.isin(selected_ids)
        test_instructions = tempdf_t.loc[is_selected, 'instruction']
        val_instructions = tempdf_t.loc[~is_selected, 'instruction']

        # update split: one vectorized assignment per label instead of one full-frame
        # mask per row. 'val' is written last, as before, so it wins when the same
        # instruction appears in both groups.
        topic_mask = id_mask & (df['context_template'] == t)
        df.loc[topic_mask & df['instruction'].isin(test_instructions), 'split'] = 'test'
        df.loc[topic_mask & df['instruction'].isin(val_instructions), 'split'] = 'val'

  0%|          | 0/182 [00:00<?, ?it/s]

  0%|          | 0/182 [05:22<?, ?it/s]


KeyboardInterrupt: 

In [1]:
import pickle

# Load the pickled inference results.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
results_path = "/datadrive/pavan/CurLL/nanotron/inference_results_test.pkl"
with open(results_path, "rb") as f:
    d = pickle.load(f)

In [2]:
# Display the first loaded record.
d[0]

{'original_prompt_text': "<|user|>I'm going to say three letters, and you tell me which one doesn't belong, and why. Okay? Here they are: A, P, and 2. Which one is different?<|assistant|>",
 'generated_response': "<|user|>\n<|user|> word First<|user|><|user|><|user|><|user|>'s<|user|>,<|user|>.'m<|user|><|user|><|user|><|user|>,<|user|> you<|user|> First going<|user|><|user|><|user|><|user|>,<|user|>...<|user|>'m to<|user|>\n<|user|><|user|> I<|user|>.<|user|> going tell<|user|>,<|user|>\n just\n I<|user|> to three<|user|> let<|user|>'m going already'm<|user|> start sounds<|user|> really\n<|user|> to<|user|> going<|user|> something,\n thinking, to write<|user|> to<|user|> things but, to Leo tell two write say<|user|>, you Leo tell started you things some a<|user|> and tell have a thinking people., numbers<|user|> each me<|user|> foods to. starting were,\n with if to, say like I\n Can<|user|> them one say and we you'm a me<|user|> makes sounds your... o read you can if<|user|>'t't kids 