In [1]:
import matplotlib.pyplot as plt
import torch
from torch.nn import functional as F 
import seaborn as sns
import numpy as np
import json
from tqdm import tqdm
from skimage.metrics import structural_similarity as ssim
from PIL import Image
import numpy as np
import os

## Helper functions

In [2]:
def ElayerAvg(layer):
    '''
        Mean nearest-neighbour (Ising-style) energy of a 2-D tensor.

        For every interior element, the four direct neighbours (up, down,
        left, right) are summed with a 3x3 cross-shaped convolution, the sum
        is multiplied by the element itself and negated; the mean over all
        interior elements is returned.

    Arguments:
        layer: 2-D tensor of at least 3x3 elements; it is converted to float.
    Returns:
        0-dim float tensor. It lives on the GPU when CUDA is available and
        on the CPU otherwise.
    '''
    # Do not hard-code 'cuda': that raises on CPU-only machines.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    filters=torch.tensor([[0,1,0],[1,0,1],[0,1,0]]).float().to(device).reshape(1,1,3,3)
    layerF=layer.float().to(device)
    # conv2d expects (batch, channel, H, W); padding=0 drops the border, so
    # the result lines up with layerF[1:-1,1:-1].
    neighbour_sum=F.conv2d(layerF.reshape(1,1,*layerF.shape),filters,padding=0)
    neighbour_sum=neighbour_sum.squeeze(0).squeeze(0)
    return -(neighbour_sum*layerF[1:-1,1:-1]).mean()

def ensure_divisibility(numerator, denominator):
    """Assert that dividing numerator by denominator leaves no remainder."""
    remainder = numerator % denominator
    operands = (numerator, denominator)
    assert remainder == 0, '{} is not divisible by {}'.format(*operands)
    
def divide(numerator, denominator):
    """Return numerator // denominator after checking, via
    ensure_divisibility, that the division is exact."""
    ensure_divisibility(numerator, denominator)
    quotient = numerator // denominator
    return quotient

def split_tensor_along_last_dim(tensor, num_partitions,
                                contiguous_split_chunks=False):
    """Cut a tensor into equally sized chunks along its last dimension.
    Arguments:
        tensor: input tensor.
        num_partitions: how many chunks to produce; must divide the size
                        of the last dimension exactly.
        contiguous_split_chunks: If True, every returned chunk is made
                                 contiguous in memory.
    Returns:
        A tuple with num_partitions tensors.
    """
    # Width of one chunk along the last dimension.
    chunk_width = divide(tensor.shape[-1], num_partitions)
    # Perform the split along the last dimension.
    chunks = torch.split(tensor, chunk_width, dim=-1)
    if not contiguous_split_chunks:
        # torch.split does not make the chunks contiguous on its own.
        return chunks

    return tuple(piece.contiguous() for piece in chunks)

def calculateEnergyMat(model,n_heads,weightName,n_layers=None):
    '''
        Build the per-layer, per-head energy matrix of one attention weight.
    Arguments:
        model: input LLM model dict mapping parameter names to weight tensors.
        n_heads: the n_heads in attention module in model; every weight is
                 split into this many chunks along its last dimension.
        weightName(str) : the weight name to calculate option: wq,wk,wv,wo
        n_layers: number of layers to scan. When None, the layers are counted
                  by probing consecutive
                  "layers.<i>.attention.<weightName>.weight" keys in model.
    Returns:
        Tensor with one row per layer and one column per head; entry [i, j]
        is ElayerAvg of head j in layer i.
    Raises:
        KeyError: if no matching layer is found in model.
    '''
    key_template = "layers.{}.attention.{}.weight"
    if n_layers is None:
        # The layer index used to run over range(n_heads), which is only
        # correct when the model has exactly as many layers as heads.
        n_layers = 0
        while key_template.format(n_layers, weightName) in model:
            n_layers += 1
        if n_layers == 0:
            raise KeyError(key_template.format(0, weightName))

    Energy_Mat = []
    for i in range(n_layers):
        cur_layer_W = model[key_template.format(i, weightName)]
        cur_heads = split_tensor_along_last_dim(cur_layer_W, n_heads)
        Energy_Mat.append([ElayerAvg(head) for head in cur_heads])
    return torch.tensor(Energy_Mat)

def Elayer(layer):
    '''
        Element-wise nearest-neighbour (Ising-style) energy map of a 2-D tensor.

        For every interior element, the four direct neighbours (up, down,
        left, right) are summed with a 3x3 cross-shaped convolution, the sum
        is multiplied by the element itself and negated.

    Arguments:
        layer: 2-D tensor of at least 3x3 elements; it is converted to float.
    Returns:
        Float tensor with two fewer rows and two fewer columns than layer
        (the border is dropped). It lives on the GPU when CUDA is available
        and on the CPU otherwise.
    '''
    # Do not hard-code 'cuda': that raises on CPU-only machines.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    filters=torch.tensor([[0,1,0],[1,0,1],[0,1,0]]).float().to(device).reshape(1,1,3,3)
    layerF=layer.float().to(device)
    # conv2d expects (batch, channel, H, W); padding=0 drops the border, so
    # the result lines up with layerF[1:-1,1:-1].
    neighbour_sum=F.conv2d(layerF.reshape(1,1,*layerF.shape),filters,padding=0)
    neighbour_sum=neighbour_sum.squeeze(0).squeeze(0)
    return -(neighbour_sum*layerF[1:-1,1:-1])

def avg_pool2attention(single_head, kernel_size):
    '''
        Down-sample a 2-D tensor with average pooling.

    Arguments:
        single_head: 2-D tensor; it is converted to float.
        kernel_size: pooling window, passed straight to F.avg_pool2d
                     (the stride therefore defaults to the window size).
    Returns:
        2-D float tensor of pooled values. ceil_mode=True keeps a partial
        window at the bottom/right edge instead of discarding it. The result
        lives on the GPU when CUDA is available and on the CPU otherwise.
    '''
    # Do not hard-code 'cuda': that raises on CPU-only machines.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    layershape = single_head.shape
    # avg_pool2d expects (batch, channel, H, W).
    batched = single_head.float().to(device).reshape(1,1,*layershape)
    out = F.avg_pool2d(batched, kernel_size, ceil_mode=True)
    return out.squeeze(0).squeeze(0)

def calculateEnergyMat_65B(model,n_layers,n_heads,weightName):
    '''
        Build the per-layer, per-head energy matrix of one attention weight.
    Arguments:
        model: model dict mapping parameter names to weight tensors.
        n_layers: number of layers to scan (indices 0 .. n_layers-1).
        n_heads: number of chunks each weight is split into along its
                 last dimension.
        weightName(str): name of the attention weight inside each layer.
    Returns:
        Tensor with one row per layer and one column per head, each entry
        being ElayerAvg of the corresponding chunk.
    '''
    rows = []
    for layer_idx in range(n_layers):
        weight = model["layers.{}.attention.{}.weight".format(str(layer_idx),weightName)]
        head_chunks = split_tensor_along_last_dim(weight, n_heads)
        rows.append([ElayerAvg(chunk) for chunk in list(head_chunks)])
    return torch.tensor(rows)

## Similarity comparison

In [4]:
def calc_ssim(img1, img2):
    '''
    Structural similarity (SSIM) comparison of two images.
    '''
    first = np.array(img1)
    second = np.array(img2)
    # Images converted to grayscale span 0-255, hence data_range=255. If the
    # images are converted to floats in the 0-1 range, data_range should be 1.
    return ssim(first, second, data_range=255)

def KL(energy_map,energy_single_head):
    '''
    Summed KL divergence between two softmax-normalised energy maps.

    The flattened single-head map supplies the log-probabilities (input of
    F.kl_div); energy_map with its last row and last column removed supplies
    the target probabilities. Both are moved to the CPU before comparison.
    '''
    log_probs = energy_single_head.view(-1).softmax(dim=-1).log().cpu()
    trimmed_map = energy_map[:-1,:-1].contiguous()
    target_probs = trimmed_map.view(-1).softmax(dim=-1).cpu()
    return F.kl_div(log_probs, target_probs, reduction='sum')

def corrlation(a,b):
    '''
    Pearson correlation coefficient between two tensors.

    Both tensors are flattened and moved to the CPU, stacked as the two rows
    of a matrix, and the off-diagonal entry of torch.corrcoef is returned.
    '''
    flat_a = a.view(-1).cpu()
    flat_b = b.view(-1).cpu()
    coeff_matrix = torch.corrcoef(torch.stack((flat_a, flat_b), dim=0))
    return coeff_matrix[0,1]

In [6]:
path='/data/shhliu19/wonder/llama/65B'

# Pooling window applied to the single-head energy map.
# NOTE(review): presumably chosen so the pooled map has as many elements as
# energy_map[:-1,:-1], which KL() compares it against - confirm.
kernel_size = (13, 2)
with open(path+'/params.json', 'r') as f:
    params = json.load(f)
n_heads=params['n_heads']
n_layers=params['n_layers']
# SECURITY: torch.load unpickles the file, so only load trusted checkpoints.
model=torch.load(path+'/consolidated.00.pth')
energy_map_wv = calculateEnergyMat_65B(model,n_layers,n_heads,"wv")
# `energy_map` was referenced here (and in later cells) without ever being
# assigned, so a fresh-kernel run raised NameError. Bind it to the wv map,
# which comes from the same weights as `out`.
# NOTE(review): confirm wv is the map that was intended.
energy_map = energy_map_wv
# Despite the "W_q" name, this holds the wv weight of layer 0.
cur_layer_W_q = model["layers.0.attention.wv.weight"]
cur_heads = split_tensor_along_last_dim(cur_layer_W_q, n_heads)
# Element-wise energy map of the first head, pooled down before comparison.
Single_head_Energy_map = Elayer(list(cur_heads)[0])
out = avg_pool2attention(Single_head_Energy_map, kernel_size)
KL(energy_map,out)

In [13]:
# Correlation between the raw wq and wk weights of layer 0.
corrlation(
    model["layers.0.attention.wq.weight"].float(),
    model["layers.0.attention.wk.weight"].float(),
)

tensor(0.6704)

In [16]:
# `energy_map` is never assigned in the visible notebook; use the wv energy
# map computed above, which comes from the same weights as `out`.
# NOTE(review): confirm wv is the map that produced the recorded output.
corrlation(energy_map_wv[:-1,:-1].contiguous(),out)

tensor(0.0270)

In [17]:
# Energy maps of the query and key projections, computed in that order.
energy_map_wq, energy_map_wk = (
    calculateEnergyMat_65B(model, n_layers, n_heads, weight_name)
    for weight_name in ("wq", "wk")
)

# Correlation between the two energy maps.
corrlation(energy_map_wk, energy_map_wq)

tensor(0.5664)