In [59]:
from rosemary import jpt_in_notebook
from llm.submit import submit_job

# Batch script run by every SLURM job. `{cmd}`, `{log_dir}` and `{save_dir}` are
# filled in with `str.format` below, so any literal shell brace added here would
# have to be doubled.
#
# The final loop moves this job's `<job id>*.out` log next to the results. The
# glob is left unquoted so the shell expands it; each match is then checked
# with `-f`, so nothing is moved (and no error is raised) when no log exists.
#
# NOTE(review): `RDZV_ENDPOINT` is computed but the torchrun command below
# hard-codes `localhost:29400` — confirm which one is intended.
shell_scripts_template = """
echo "Running on $SLURM_JOB_NODELIST"
echo "======"

master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
master_port=10002
RDZV_ENDPOINT=$master_addr:$master_port

source ~/.profile
conda activate open-instruct
cd /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/open-instruct/scripts

set -e
set -x
echo "======"
srun {cmd}

for log_file in {log_dir}/$SLURM_JOB_ID*.out; do
    if [ -f "$log_file" ]; then mv "$log_file" {save_dir}; fi
done
"""

# When True the value is forwarded to `submit_job` and the result is not printed.
test_run = False
model_name_or_path = "../results/baselines/huggyllama/llama-7b"
save_dir = "/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/scripts/llama-7b_outputs/"
log_dir = '/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/scripts/'

# `datasets` is the list built by the dataset-listing cell (datasets that do
# not yet have a `<dataset>.pkl` in the output directory); run that cell first.
for dataset in datasets:
    # The trailing backslashes are Python line continuations inside the
    # f-string, so `cmd` ends up as a single shell line.
    # NOTE(review): torchrun starts 4 processes while the job below requests
    # 6 GPUs — confirm the mismatch is intentional.
    cmd = f"""
    torchrun --nnodes=1 --nproc_per_node=4 \
        --rdzv_id=100 --rdzv_backend=c10d --rdzv_endpoint=localhost:29400 \
        note_llama_embeddings.py \
        --dataset {dataset} \
        --model_name_or_path={model_name_or_path} \
        --save_dir={save_dir} \
    """.strip()


    shell_scripts = shell_scripts_template.format(
        cmd=cmd, log_dir=log_dir, save_dir=save_dir)
    out = submit_job(
        shell_scripts,
        job_name=f'compute_LM_outputs.{dataset}',
        nodes=1,
        num_cpus=32,
        cpu_mem=64,
        num_gpus=6,
        gpu_type='v100',
        test_run=test_run,
        job_duration=6,
    )
    if not test_run:
        print(out)


Submiting job with:
{
    "job_name": "compute_LM_outputs.sharegpt",
    "nodes": 1,
    "num_cpus": 32,
    "cpu_mem": 64,
    "num_gpus": 6,
    "gpu_type": "v100",
    "test_run": false,
    "queue": "el8",
    "num_jobs": 1
}
[{'args': 'sbatch --job-name=compute_LM_outputs.sharegpt --partition=el8 --nodes=1 --ntasks-per-node=1 --cpus-per-task=32 --mem=64GB --gres=gpu:6 --output=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/open-instruct/scripts/%J.out --time=6:00:00 /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/open-instruct/scripts/tmpljdnzfpz', 'job_id': 962317}]

Submiting job with:
{
    "job_name": "compute_LM_outputs.open_orca",
    "nodes": 1,
    "num_cpus": 32,
    "cpu_mem": 64,
    "num_gpus": 6,
    "gpu_type": "v100",
    "test_run": false,
    "queue": "el8",
    "num_jobs": 1
}
[{'args': 'sbatch --job-name=compute_LM_outputs.open_orca --partition=el8 --nodes=1 --ntasks-per-node=1 --cpus-per-task=32 --mem=64GB --gres=gpu:6 --output=/gpfs/u/scrat

In [56]:
# Build the list of processed datasets that still need outputs: keep every
# sub-directory of `processed_dir` except 'tulu' whose `<dataset>.pkl` is not
# already present in `save_dir`.
datasets = []
for dataset in os.listdir(processed_dir):
    dataset_path = os.path.join(processed_dir, dataset)
    save_path = os.path.join(save_dir, f'{dataset}.pkl')
    if (dataset not in ['tulu']
            and os.path.isdir(dataset_path)
            and not os.path.isfile(save_path)):
        datasets.append(dataset)
datasets
    

['sharegpt',
 'open_orca',
 'dolly',
 'gpt4_alpaca',
 'flan_v2',
 'wizardlm',
 'super_ni',
 'stanford_alpaca',
 'baize',
 'code_alpaca',
 'self_instruct',
 'unnatural_instructions',
 'oasst1']

In [2]:
from rosemary import jpt_parse_args, jpt_setup, jpt_in_notebook

jpt_setup()

if jpt_in_notebook():
    import os

    # Restrict this kernel to GPUs 0-3. Must be set before CUDA is initialised.
    # (Use '0' instead for a single-GPU session.)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
    

In [3]:
import os
import pickle
import time
from functools import partial

import pyarrow # import before `torch`, `transformers`, `datasets`
import numpy as np
import torch
import torch.distributed as dist
from torch.utils.data import DataLoader
from tqdm import tqdm 

from datasets import load_dataset

from transformers import AutoModelForCausalLM, AutoTokenizer

from open_instruct.finetune_trainer import encode_with_prompt_completion_format, encode_with_messages_format

[2023-09-27 17:43:11,401] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [4]:
# --- Paths -------------------------------------------------------------------
# Base model checkpoint, root of the per-dataset processed data, and the
# directory that receives one `<dataset>.pkl` per dataset.
model_name_or_path = '../results/baselines/huggyllama/llama-7b'
processed_dir = '../data/processed'
save_dir = '/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/scripts/llama-7b_outputs'

# --- Run options -------------------------------------------------------------
# With `test_run` enabled the main cell keeps only the first 10 examples and
# skips writing the pickle.
test_run = True
device = 'cuda'

# Make sure the output directory exists before anything tries to write to it.
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Load the causal LM in half precision onto the first visible GPU and put it
# in eval mode (`eval()` returns the module itself, so it can be chained).
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path, device_map='cuda:0', torch_dtype=torch.float16,
).eval()

# Fast tokenizer with left padding; reuse EOS as the pad token when the
# checkpoint does not define one.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
tokenizer.padding_side = 'left'
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:

def datasets_shard_chunk_size(N, num_shards, index):
    """Return how many of `N` rows fall into shard `index` of `num_shards`.

    Rows are split as evenly as possible: every shard gets `N // num_shards`
    rows and the first `N % num_shards` shards get one extra row.
    """
    base, extra = divmod(N, num_shards)
    return base + 1 if index < extra else base


In [None]:
# Compute, for one dataset, the mean-pooled last-layer hidden state and the
# negated LM loss of every example, optionally sharded across processes, and
# pickle the result on rank 0.
dataset = 'flan_v2'
use_dist = False


if use_dist:
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    # Use the size of the process group, not the number of visible GPUs: the
    # two differ when fewer processes than GPUs are launched, and sharding by
    # GPU count would leave some shards unprocessed.
    world_size = dist.get_world_size()
    # Bind this process to its GPU so NCCL collectives use the right device.
    torch.cuda.set_device(rank)
else:
    rank = 0
    # A single process must own the whole dataset (one shard).
    world_size = 1

# NOTE(review): the model-loading cell places the model on 'cuda:0'; for
# rank > 0 the model would also have to live on `device` — confirm.
device = f'cuda:{str(rank)}'

# Derive the output path from the dataset processed here rather than relying
# on a `save_path` left over from another cell.
save_path = os.path.join(save_dir, f'{dataset}.pkl')

train_file = os.path.join(processed_dir, dataset, f'{dataset}_data.jsonl')
assert os.path.isfile(train_file), f'missing processed file: {train_file}'

data_files = {'train': train_file}
raw_datasets = load_dataset("json", data_files=data_files)
if test_run:
    raw_datasets['train'] = raw_datasets['train'].select(range(10))
print(f"{dataset} dataset length = {len(raw_datasets['train'])}")

encode_function = partial(
    encode_with_messages_format, tokenizer=tokenizer, max_seq_length=2048)


def tokenize_raw_datasets():
    """Apply `encode_function` to every example of `raw_datasets`."""
    return raw_datasets.map(
        encode_function, batched=False, num_proc=16,
        desc="Tokenizing and reformatting instruction data")


# Rank 0 tokenizes first; the other ranks wait at the barrier and only then
# run the same map.
if rank == 0:
    lm_datasets = tokenize_raw_datasets()
if use_dist:
    dist.barrier()
if rank != 0:
    lm_datasets = tokenize_raw_datasets()

train_dataset = lm_datasets['train']
train_dataset.set_format(
    type="torch",
    output_all_columns=False,
    columns=['input_ids', 'labels', 'attention_mask'])
# Remember the full length: the per-rank chunk sizes below must be derived
# from the unsharded dataset, not from this rank's shard.
num_examples = len(train_dataset)
train_dataset = train_dataset.shard(
    num_shards=world_size,
    index=rank,
    contiguous=True)
loader = DataLoader(train_dataset, shuffle=False, batch_size=1, pin_memory=True)


text_embeddings = []
log_probs = []
for batch in tqdm(loader, disable=rank!=0, total=len(loader)):
    batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
    with torch.inference_mode():
        outputs = model(**batch, output_hidden_states=True)

    # (bsz, seq_len, hidden_size) -> (bsz, hidden_size)
    text_embedding = outputs['hidden_states'][-1].mean(1)
    # Negated LM loss. For Hugging Face causal LMs the loss is averaged over
    # label tokens, so this is a mean (not a sum) log-prob — TODO confirm.
    log_prob = -outputs['loss']

    text_embeddings.append(text_embedding.detach().cpu())
    log_probs.append(log_prob.detach().cpu())

text_embeddings = torch.vstack(text_embeddings).to(torch.float32)
# Each loss is a scalar; `vstack` turns the list into an (N, 1) tensor.
log_probs = torch.vstack(log_probs)


# Number of examples each rank received from the contiguous `shard` call.
chunk_sizes = [datasets_shard_chunk_size(num_examples, num_shards=world_size, index=i)
               for i in range(world_size)]

def gather_2d_tensors(tensor):
    """Gather each rank's (B_rank, D) tensor onto rank 0.

    `dist.gather` needs identically sized tensors on every rank and the NCCL
    backend needs them on the GPU, so each rank zero-pads its rows up to the
    largest chunk on `device`; rank 0 then trims the padding.

    Returns:
        On rank 0, a list with one float32 CPU tensor of shape
        (chunk_sizes[i], D) per rank; on every other rank an empty list.
    """
    D = tensor.shape[1]
    padded = torch.zeros((max(chunk_sizes), D), dtype=torch.float32, device=device)
    padded[:tensor.shape[0]] = tensor.to(device=device, dtype=torch.float32)
    if rank == 0:
        gathered = [torch.zeros_like(padded) for _ in range(world_size)]
        dist.gather(padded, gather_list=gathered, dst=0)
        return [t[:B].cpu() for t, B in zip(gathered, chunk_sizes)]
    dist.gather(padded, dst=0)
    return []

if use_dist:
    text_embeddings = gather_2d_tensors(text_embeddings)
    log_probs = gather_2d_tensors(log_probs)
    if rank == 0:
        # Stitch the per-rank chunks back together in dataset order.
        text_embeddings = torch.vstack(text_embeddings)
        log_probs = torch.vstack(log_probs)


if rank == 0:
    output = {'text_embeddings': text_embeddings,
              'log_probs': log_probs}
    print([(k, v.shape) for k, v in output.items()])
    if not test_run:
        with open(save_path, 'wb') as f:
            pickle.dump(output, f, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:

# Show the `id` column of contiguous shard 0 of 3 of `train_dataset`
# (presumably a manual check of how `shard` splits the rows).
train_dataset.shard(num_shards=3, index=0, contiguous=True)['id']


In [None]:
# Show the `id` column of contiguous shard 1 of 3 of `train_dataset`
# (presumably a manual check of how `shard` splits the rows).
train_dataset.shard(num_shards=3, index=1, contiguous=True)['id']


In [None]:
# Show the `id` column of contiguous shard 2 of 3 of `train_dataset`
# (presumably a manual check of how `shard` splits the rows).
train_dataset.shard(num_shards=3, index=2, contiguous=True)['id']
