In [30]:
import os

# Pin the visible GPU *before* torch is imported: CUDA_VISIBLE_DEVICES is only
# honoured if it is set before CUDA gets initialised, and setting it ahead of
# the import is the ordering that is safe regardless of what the import does.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

import torch


# SAMPLE VAL DATA

In [31]:
# Path prefix that is prepended to the gradient file paths loaded below.
grad_relative_path = '../../'

In [32]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def load_grads(path, target_device):
    """Load a stored gradient matrix and return it as a float32 tensor on `target_device`.

    Args:
        path (str): location of the `.pt` file written with `torch.save`.
        target_device (torch.device): device the returned tensor should live on.

    Returns:
        torch.Tensor: the loaded gradients, converted to a tensor if the file
        held a non-tensor object, moved to `target_device` and cast to float.
    """
    # NOTE(review): torch.load unpickles the file -- only use it on trusted files.
    # map_location='cpu' keeps loading independent of the CUDA device index the
    # tensor was saved from (which may not be visible in this process).
    grads = torch.load(path, map_location='cpu')
    if not torch.is_tensor(grads):
        grads = torch.tensor(grads)
    return grads.to(target_device).float()


# load sgd gradients of ref points
ref_grads = load_grads(
    grad_relative_path + 'grads/TinyLlama/TinyLlama-1.1B-Chat-v1.0-p0.1-lora-seed3/mmlu-ckpt92-sgd/dim8192/all_orig.pt',
    device,
)
# load adam gradients of val points
val_grads = load_grads(
    grad_relative_path + 'grads/TinyLlama/TinyLlama-1.1B-Chat-v1.0-p0.1-lora-seed3/fake_val/dolly-ckpt92-adam/dim8192/all_orig.pt',
    device,
)

In [33]:
# Shape check of the loaded validation gradients.
val_grads.shape

torch.Size([7506, 8192])

In [34]:
def sample_tensor(tensor, sample_fraction=0.2, seed=0):
    """Draw a reproducible random subset of rows from `tensor`.

    Args:
        tensor (torch.Tensor): tensor to sample from along dimension 0.
        sample_fraction (float): fraction of rows to keep; the row count is
            `int(tensor.size(0) * sample_fraction)`. Values >= 1 return all
            rows in permuted order.
        seed (int): seed for the permutation.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: the sampled rows and the (CPU)
        indices of those rows in the original tensor.

    Raises:
        ValueError: if `sample_fraction` is negative (a negative count would
            otherwise silently slice from the end and return the wrong number
            of rows).
    """
    if sample_fraction < 0:
        raise ValueError(f"sample_fraction must be non-negative, got {sample_fraction}")
    total_rows = tensor.size(0)
    num_samples = int(total_rows * sample_fraction)
    # Use a private generator instead of torch.manual_seed so that sampling
    # does not reset the global RNG state for the rest of the notebook.
    generator = torch.Generator()
    generator.manual_seed(seed)
    indices = torch.randperm(total_rows, generator=generator)[:num_samples]
    return tensor[indices], indices

# Keep 20% of the validation gradients; sampled_indices holds the row positions
# of the kept samples in the original gradient file.
sampled_val_grads, sampled_indices = sample_tensor(val_grads, sample_fraction=0.2, seed=0)
print(sampled_val_grads.shape)

torch.Size([1501, 8192])


# COS

In [35]:
def calculate_cosine_similarity(training_info: torch.Tensor, validation_info: torch.Tensor,
                                normalize: bool = False, eps: float = 1e-8):
    """Calculate pairwise similarity between training and validation rows.

    By default this returns the raw inner products (no normalisation), which
    equal the cosine similarity only when the rows are already unit-norm.
    Pass `normalize=True` to divide by the row norms and get a true cosine
    similarity.

    Args:
        training_info (torch.Tensor): training info (gradients/representations) stored in a tensor of shape N x N_DIM
        validation_info (torch.Tensor): validation info (gradients/representations) stored in a tensor of shape N_VALID x N_DIM
        normalize (bool): if True, divide each inner product by the product of
            the two row norms. Defaults to False (original behaviour).
        eps (float): lower bound applied to the norms when normalising, so
            all-zero rows produce 0 instead of NaN.

    Returns:
        torch.Tensor: similarity matrix of shape N x N_VALID.
    """
    # N x N_VALID
    similarity = torch.matmul(
        training_info, validation_info.transpose(0, 1))
    if normalize:
        train_norms = training_info.norm(dim=1).clamp_min(eps)
        valid_norms = validation_info.norm(dim=1).clamp_min(eps)
        similarity = similarity / (train_norms[:, None] * valid_norms[None, :])
    return similarity

In [36]:
# Rows are the sampled validation points, columns are the reference points.
grads_cos = calculate_cosine_similarity(
    training_info=sampled_val_grads,
    validation_info=ref_grads,
)
grads_cos.shape

torch.Size([1501, 285])

In [37]:
# Average each sampled point's similarity over all reference points.
mean_cos = torch.mean(grads_cos, dim=-1)
mean_cos.shape

torch.Size([1501])

# THRESHOLD

In [38]:
# Ascending order, so tercile boundaries can be read off by position.
sorted_mean_cos = torch.sort(mean_cos).values


In [39]:
# Split the sorted scores into three equally sized chunks; the values at the
# two chunk boundaries become the low / high thresholds.
n_samples = sorted_mean_cos.shape[0]
chunk_size = n_samples // 3
low_threshold, high_threshold = sorted_mean_cos[chunk_size], sorted_mean_cos[2 * chunk_size]
print(low_threshold, high_threshold)

tensor(2.7546e-07, device='cuda:0') tensor(0.0056, device='cuda:0')


# Label

In [40]:
# Label = number of thresholds a score reaches:
#   0 -> below low_threshold
#   1 -> in [low_threshold, high_threshold)
#   2 -> at or above high_threshold
# (high_threshold >= low_threshold because both come from the sorted scores.)
reaches_low = (mean_cos >= low_threshold).long()
reaches_high = (mean_cos >= high_threshold).long()
labels = reaches_low + reaches_high

labels.shape

torch.Size([1501])

In [41]:
# Peek at the labels of the first five sampled points.
labels[:5]

tensor([0, 0, 2, 2, 1], device='cuda:0')

# Save (Text, Cos, Label)

In [45]:
from datasets import load_dataset
from typing import Union, List

def load_raw_dataset(train_files: Union[List[str], str]):
    """Load one or more JSON files and return the resulting "train" split.

    Args:
        train_files: a single file path or a list of file paths.

    Returns:
        The "train" split produced by `load_dataset("json", ...)`.
    """
    # A lone path is wrapped in a list before being handed to load_dataset.
    data_files = [train_files] if isinstance(train_files, str) else train_files
    return load_dataset("json", data_files=data_files)["train"]

In [46]:
# Raw validation examples, loaded from the jsonl file.
text_path = '../data/train/processed/dolly/val_dolly_data.jsonl'
texts = load_raw_dataset(train_files=text_path)

In [47]:
from datasets import Dataset

def sample_dataset(dataset: Dataset, indices: torch.Tensor) -> Dataset:
    """Pick the rows of `dataset` addressed by an index tensor.

    Args:
        dataset (Dataset): dataset to take rows from.
        indices (torch.Tensor): positions of the rows to keep.

    Returns:
        Dataset: result of `dataset.select` on those positions.
    """
    # select() is given plain Python ints rather than a tensor.
    row_ids = indices.tolist()
    return dataset.select(row_ids)



In [48]:
import json

# mean_cos and labels were computed on the *sampled* validation gradients, so
# the texts must be subsampled with the same indices before pairing them up.
# Zipping the full `texts` dataset would attach each score to an unrelated row.
# NOTE(review): assumes row i of the gradient file corresponds to row i of the
# text file -- confirm against the gradient extraction script.
sampled_texts = sample_dataset(texts, sampled_indices)
assert len(sampled_texts) == mean_cos.size(0) == labels.size(0), \
    "texts, scores and labels must describe the same sampled points"

output_path = '../data/filter/dolly/all.jsonl'
# Create the target directory if needed so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'w', encoding='utf-8') as file:
    # tolist() moves the values to Python scalars in one transfer instead of
    # one .item() call per element.
    for text, cos, label in zip(sampled_texts, mean_cos.tolist(), labels.tolist()):
        data = {
            "text": text,
            "cos": cos,
            "label": label
        }
        file.write(json.dumps(data, ensure_ascii=False) + '\n')


In [49]:
# First row of the full (unsampled) text dataset.
# NOTE(review): mean_cos[0] / labels[0] belong to the first *sampled* point,
# i.e. row sampled_indices[0] of the gradient file, not necessarily this row.
texts[0]

{'dataset': 'dolly',
 'id': 'dolly_7505',
 'messages': [{'role': 'user',
   'content': "What was Canon EOS DCS 1 resolution?\nInput: The Canon EOS DCS 1 was Kodak's third Canon-based Digital SLR camera (a rebranded Kodak EOS DCS-1). It was released in December 1995, following the cheaper EOS DCS 3, which was released earlier that year. Like that camera, it combined an EOS-1N body with a modified Kodak DCS 460 digital back. Despite offering a then-enormous resolution of 6 megapixels with a relatively large APS-H sensor, a number of technical issues (together with its 3.6 million yen price) meant that it was never a very popular camera other than for a few people with specialized roles.\n\nAlthough the sensor was much larger than the EOS DCS 3, the DCS 1 had a lower fixed sensitivity of ISO 80. The large image size resulted in a burst rate of just over one image per second for two images, followed by an eight-second delay to clear the buffer. A typical contemporary 340MB PCMCIA card or I

In [50]:
# Mean similarity score of the first sampled validation point.
mean_cos[0]

tensor(-0.0063, device='cuda:0')

In [51]:
# Label assigned to the first sampled validation point.
labels[0]

tensor(0, device='cuda:0')