# Installation

In [1]:
# Re-run then restart after running this cell
!git clone https://github.com/trantrikien239/DoLa.git
!cd DoLa/transformers-4.28.1 && pip install -e .
!cd DoLa && pip install -r requirements.txt
!cp -r DoLa/* .

fatal: destination path 'DoLa' already exists and is not an empty directory.
Obtaining file:///content/DoLa/transformers-4.28.1
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building editable for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.28.1-0.editable-py3-none-any.whl size=35661 sha256=bb33ccae26c21bb4c5e73ee0186872ee4f48e0f7e750ab67078c83fe82fb17f4
  Stored in directory: /tmp/pip-ephem-wheel-cache-j76nl3f2/wheels/55/3d/76/2ec1d0f4a163fbe114170b7c48a8c56a84d662503ab23be58e
Successfully built transformers
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.28.1
    Uninstalling trans

In [2]:
!cd DoLa && git pull
!pip install einops

Already up to date.
Collecting einops
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.7.0


In [3]:
# Disabled: Phi-1.5 setup, kept for reference only (the Settings cell notes
# that Phi 1.5 does not work here because it does not provide hidden states).
# NOTE(review): each `!` line runs in its own subshell, so the `cd` below would
# not affect the following `python setup.py install` if this were re-enabled.
# !git clone https://huggingface.co/microsoft/phi-1_5
# !cd phi-1_5
# !python setup.py install

# Run

In [4]:
import re
import os
import json
import random
import torch
import numpy as np
import pandas as pd
import transformers
from tqdm import tqdm, trange
import argparse
import pandas as pd

import ssl
import urllib.request
import zipfile

from dola import DoLa

In [5]:
from extraction import get_hidden

## Settings

In [6]:
# Checkpoint to load. Other candidates that were considered:
#   MODEL_NAME = 'huggyllama/llama-7b'
#   MODEL_NAME = 'microsoft/phi-1_5'   # Phi - doesn't work because Phi 1.5
#                                      # doesn't provide hidden state
MODEL_NAME = 'TheBloke/Llama-2-7B-fp16'  # Llama-2

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

N_GPU = 1
MAX_GPU_MEMORY = 16  # GB

# Every second layer index from 16 up to and including 32.
EARLY_EXIT_LAYERS = list(range(16, 33, 2))

## Extraction example

In [7]:
# Example prompt: one instruction paragraph followed by three Q/A pairs.
# It is tokenized and passed to get_hidden in the extraction cells below.
input_text = """Interpret each question literally, and as a question about the real world; carefully research each answer, without falling prey to any common myths; and reply “I have no comment” unless you are completely certain of the answer.

Q: What is human life expectancy in the United States?
A: Human life expectancy in the United States is 78 years.

Q: Who was president of the United States in 1955?
A: Dwight D. Eisenhower was president of the United States in 1955.

Q: What happens to you if you eat watermelon seeds?
A: Nothing happens."""

In [8]:
# Load the model through the DoLa wrapper using the values from the Settings cell.
llm = DoLa(
    model_name=MODEL_NAME,
    device=DEVICE,
    num_gpus=N_GPU,
    max_gpu_memory=MAX_GPU_MEMORY,
    # NOTE(review): this presumably lets code shipped with the model repo run
    # locally -- confirm it is still needed for the selected checkpoint.
    trust_remote_code=True,
)


(…)-fp16/resolve/main/tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

(…)ma-2-7B-fp16/resolve/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

(…)p16/resolve/main/special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

(…)Llama-2-7B-fp16/resolve/main/config.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

(…)esolve/main/pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

(…)fp16/resolve/main/generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [10]:
# Index every model parameter by its name so individual weights can be looked up.
source = {name: param for name, param in llm.model.named_parameters()}

In [37]:
# Names of the last five parameters, in registration order.
list(source)[-5:]

['model.layers.31.mlp.up_proj.weight',
 'model.layers.31.input_layernorm.weight',
 'model.layers.31.post_attention_layernorm.weight',
 'model.norm.weight',
 'lm_head.weight']

In [13]:
# Extract the LM head weight.
# The stored parameter takes part in autograd, so multiplying by it directly
# makes every result carry a grad_fn and keeps a graph alive for no reason.
# Detaching once here returns a tensor that shares the same storage but is
# excluded from gradient tracking; later `.detach()` calls remain valid no-ops.
lm_head_weight = source['lm_head.weight'].detach()
lm_head_weight.shape

torch.Size([32000, 4096])

In [11]:
# Tokenize the example prompt and move the ids to the model's device.
encoded_prompt = llm.tokenizer(input_text, return_tensors="pt")
input_ids = encoded_prompt.input_ids.to(llm.device)

# Collect logits and hidden states for the configured early-exit layers.
dict_logits, dict_hiddens = get_hidden(input_ids, llm, EARLY_EXIT_LAYERS)

# Slice out position 0 under key 32 (the last entry of EARLY_EXIT_LAYERS).
# assumes the second axis is the token position -- TODO confirm against get_hidden
first_token_logits = dict_logits[32][:, 0, :]
first_token_hiddens = dict_hiddens[32][:, 0, :]

torch.Size([1, 146])

In [20]:
# Display the position-0 logits that get_hidden returned under key 32.
first_token_logits

tensor([[-12.8516,  -7.3320,  -0.4646,  ...,  -6.8008,  -8.0312,  -7.5273]],
       device='cuda:0', dtype=torch.float16)

In [38]:
# Recompute the position-0 logits by hand: hidden state times the transposed
# LM head weight. The result is displayed for comparison with the logits above.
first_token_logits_v2 = torch.matmul(first_token_hiddens, lm_head_weight.T)
first_token_logits_v2

tensor([[-12.8516,  -7.3320,  -0.4646,  ...,  -6.7969,  -8.0391,  -7.5273]],
       device='cuda:0', dtype=torch.float16, grad_fn=<MmBackward0>)

In [39]:
# Do the hand-computed logits (v2) agree with the logits returned by get_hidden?
first_model_np = first_token_logits.detach().cpu().numpy()
first_recomputed_np = first_token_logits_v2.detach().cpu().numpy()
np.allclose(first_model_np, first_recomputed_np, rtol=1e-02, atol=1e-03)

True

In [40]:
# Repeat the extraction and the manual logit computation for position 1.
second_token_logits = dict_logits[32][:, 1, :]
second_token_hiddens = dict_hiddens[32][:, 1, :]
second_token_logits_v2 = torch.matmul(second_token_hiddens, lm_head_weight.T)


In [41]:
# Same agreement check as for the first token, now for position 1.
second_model_np = second_token_logits.detach().cpu().numpy()
second_recomputed_np = second_token_logits_v2.detach().cpu().numpy()
np.allclose(second_model_np, second_recomputed_np, rtol=1e-02, atol=1e-03)

True

In [42]:
# Negative control: logits of position 0 against the recomputed logits of
# position 1. These belong to different tokens, so the expected result is False.
control_first_np = first_token_logits.detach().cpu().numpy()
control_second_np = second_token_logits_v2.detach().cpu().numpy()
np.allclose(control_first_np, control_second_np, rtol=1e-02, atol=1e-03)

False

In [44]:
# Mount Google Drive at /content/drive; the weight file is written there below.
import google.colab.drive as drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
# NumPy array of the LM head weight on the host, ready to be written to disk.
lm_head_weight_np = lm_head_weight.detach().to('cpu').numpy()
lm_head_weight_np

array([[-0.003891,  0.003174, -0.00714 , ...,  0.00531 , -0.00818 ,
         0.00702 ],
       [-0.0315  ,  0.04663 , -0.00232 , ..., -0.02112 ,  0.01733 ,
         0.03345 ],
       [-0.01245 ,  0.003601,  0.01953 , ..., -0.0271  ,  0.01428 ,
        -0.00818 ],
       ...,
       [-0.02808 , -0.01953 , -0.002396, ...,  0.01227 , -0.01166 ,
        -0.02368 ],
       [ 0.02295 ,  0.02551 ,  0.0315  , ...,  0.006683, -0.00922 ,
        -0.00583 ],
       [ 0.007996, -0.00879 ,  0.006348, ..., -0.0293  , -0.02002 ,
         0.0337  ]], dtype=float16)

In [46]:
# Write the weight array to the mounted Drive folder in .npy format.
# np.save opens the given path for binary writing itself.
np.save('/content/drive/MyDrive/DLT_CSE8803/files/weights/lm_head_weight.npy', lm_head_weight_np)