In [1]:
import matplotlib.pyplot as plt
import torch
from torch.nn import functional as F 
import seaborn as sns
import numpy as np
import json
from tqdm import tqdm
from skimage.metrics import structural_similarity as ssim
from PIL import Image
import numpy as np
import os

In [2]:
def ElayerAvg(layer):
    filters=torch.tensor([[0,1,0],[1,0,1],[0,1,0]]).float().to('cuda').reshape(1,1,3,3)
    layerF=layer.float().to('cuda')
    layerShape=layerF.shape
    layerF=layerF.reshape(1,1,*layerShape)
    elayer=F.conv2d(layerF,filters,padding=0)
    layerF=layerF.reshape(*layerShape)
    elayer=elayer.squeeze(0).squeeze(0)
    return -(elayer*layerF[1:-1,1:-1]).mean()
def ensure_divisibility(numerator, denominator):
    """Ensure that numerator is divisible by the denominator."""
    assert numerator % denominator == 0, '{} is not divisible by {}'.format(
        numerator, denominator)


def divide(numerator, denominator):
    """Ensure that numerator is divisible by the denominator and return
    the division value."""
    ensure_divisibility(numerator, denominator)
    return numerator // denominator
def split_tensor_along_last_dim(tensor, num_partitions,
                                contiguous_split_chunks=False):
    """Split a tensor along its last dimension.
    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory.
    """
    # Get the size and dimension.
    last_dim = tensor.dim() - 1
    last_dim_size = divide(tensor.size()[last_dim], num_partitions) # 得到每个切分的size
    # Split.
    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) # 对张量进行切分
    # Note: torch.split does not create contiguous tensors by default.
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in tensor_list)

    return tensor_list
def calculateEnergyMat(model,n_heads,weightName):
    Energy_Mat = []
    for i in range(n_heads):
        cur_layer_W_q = model["layers.{}.attention.{}.weight".format(str(i),weightName)]
        cur_heads = split_tensor_along_last_dim(cur_layer_W_q, n_heads)

        cur_energy = map(ElayerAvg ,list(cur_heads))
        Energy_Mat.append(list(cur_energy))
    return torch.tensor(Energy_Mat)

def Elayer(layer):
    filters=torch.tensor([[0,1,0],[1,0,1],[0,1,0]]).float().to('cuda').reshape(1,1,3,3)
    layerF=layer.float().to('cuda')
    layerShape=layerF.shape
    layerF=layerF.reshape(1,1,*layerShape)
    elayer=F.conv2d(layerF,filters,padding=0)
    layerF=layerF.reshape(*layerShape)
    elayer=elayer.squeeze(0).squeeze(0)
    return -(elayer*layerF[1:-1,1:-1])

def avg_pool2attention(single_head, kernel_size):
    layershape = single_head.shape
    single_head=single_head.float().to('cuda')
    single_head=single_head.reshape(1,1,*layershape)
    out = F.avg_pool2d(single_head, kernel_size, ceil_mode=True)
    out=out.squeeze(0).squeeze(0)
    return out

    def calculateEnergyMat_65B(model,n_layers,n_heads,weightName):
        Energy_Mat = []
        for i in range(n_layers):
            cur_layer_W_q = model["layers.{}.attention.{}.weight".format(str(i),weightName)]
            cur_heads = split_tensor_along_last_dim(cur_layer_W_q, n_heads)
            cur_energy = map(ElayerAvg ,list(cur_heads))
            Energy_Mat.append(list(cur_energy))
        return torch.tensor(Energy_Mat)

In [3]:
path='/data/shhliu19/wonder/llama/65B'
model=torch.load(path+'/consolidated.00.pth')
kernel_size=(13,2)
cur_layer_W_q = model["layers.20.attention.wq.weight"]
with open(path+'/params.json', 'r') as f:
    params = json.load(f)
n_heads=params['n_heads']
cur_heads = split_tensor_along_last_dim(cur_layer_W_q, n_heads)
print(list(cur_heads)[0].shape)
Single_head_Energy_map = Elayer(list(cur_heads)[0])
print(Single_head_Energy_map.shape)
out = avg_pool2attention(Single_head_Energy_map, kernel_size)
print(out.shape)

torch.Size([1024, 128])
torch.Size([1022, 126])
torch.Size([79, 63])


#### 矩阵行翻转

In [4]:
out = torch.flip(out, [0])
out.shape

torch.Size([79, 63])

### SSIM结构相似性比较

In [5]:
def calc_ssim(img1, img2):  
    img1, img2 = np.array(img1), np.array(img2)
    # 此处因为转换为灰度值之后的图像范围是0-255，所以data_range为255，如果转化为浮点数，且是0-1的范围，则data_range应为1
    ssim_score = ssim(img1, img2, data_range=255)
    return ssim_score

### KL散度

#### pooling q矩阵 vs energy map

In [9]:
import torch.nn.functional as F
kl = F.kl_div(out.view(-1).softmax(dim=-1).log().cpu(), energy_map[:-1,:-1].contiguous().view(-1).softmax(dim=-1).cpu(), reduction='sum')
kl

NameError: name 'energy_map' is not defined

#### pooling q矩阵 vs self

In [None]:
kl = F.kl_div(out.view(-1).softmax(dim=-1).log().cpu(), out.view(-1).softmax(dim=-1).cpu(), reduction='sum')
kl

#### random matrix vs energy map

In [None]:
kl = F.kl_div(torch.randn(79,64).view(-1).softmax(dim=-1).log().cpu(), energy_map[:-1,:].view(-1).softmax(dim=-1).cpu(), reduction='sum')
kl

In [None]:
kl = F.kl_div(energy_map[:-1,:].view(-1).softmax(dim=-1).log().cpu(), torch.randn(79,64).view(-1).softmax(dim=-1).cpu(), reduction='sum')
kl

In [None]:
sns.heatmap(energy_map[:-1,:-1].cpu())

In [None]:
sns.heatmap(out.cpu())

In [None]:
kernel_size = (13, 2)
with open(path+'/params.json', 'r') as f:
    params = json.load(f)
n_heads=params['n_heads']
n_layers=params['n_layers']

for i in range(8):
    model=torch.load(path+'/consolidated.0{}.pth'.format(i))
    energy_map = calculateEnergyMat_65B(model,n_layers,n_heads,"wq")
    cur_layer_W_q = model["layers.0.attention.wq.weight"]
    cur_heads = split_tensor_along_last_dim(cur_layer_W_q, n_heads)
    Single_head_Energy_map = Elayer(list(cur_heads)[0])
    out = avg_pool2attention(Single_head_Energy_map, kernel_size)
    kl = F.kl_div(out.view(-1).softmax(dim=-1).log().cpu(), energy_map[:-1,:-1].contiguous().view(-1).softmax(dim=-1).cpu(), reduction='sum')
    print(kl)

tensor(-8.5075e-07)
tensor(-7.4094e-07)
tensor(-1.6282e-06)
tensor(1.6820e-06)
tensor(-1.2859e-06)
tensor(2.1419e-06)
tensor(-1.8161e-06)


#### energy map & out 拼接

In [None]:
a=torch.stack([out.view(-1).cpu(), out.view(-1).cpu()*2],axis = 0)
a.shape

#### 65B kernel size & crop

In [None]:
path='/data/shhliu19/wonder/llama/65B'

kernel_size = (13, 2)
with open(path+'/params.json', 'r') as f:
    params = json.load(f)
n_heads=params['n_heads']
n_layers=params['n_layers']
model=torch.load(path+'/consolidated.00.pth'.format(i))
energy_map = calculateEnergyMat_65B(model,n_layers,n_heads,"wv")
cur_layer_W_q = model["layers.0.attention.wv.weight"]
cur_heads = split_tensor_along_last_dim(cur_layer_W_q, n_heads)
print(list(cur_heads)[0].shape)
Single_head_Energy_map = Elayer(list(cur_heads)[0])
print(Single_head_Energy_map.shape)
out = avg_pool2attention(Single_head_Energy_map, kernel_size)
print(out.shape)
print(energy_map.shape)
kl = F.kl_div(out.view(-1).softmax(dim=-1).log().cpu(), energy_map[:-1,:-1].contiguous().view(-1).softmax(dim=-1).cpu(), reduction='sum')
print(kl)

#### 13B kernel size & crop

In [None]:
path='/data/shhliu19/wonder/llama/13B'
kernel_size = (64,3) #13B
with open(path+'/params.json', 'r') as f:
    params = json.load(f)
n_heads=params['n_heads']
n_layers=params['n_layers']
model=torch.load(path+'/consolidated.00.pth'.format(i))
energy_map = calculateEnergyMat_65B(model,n_layers,n_heads,"wv")
cur_layer_W_q = model["layers.0.attention.wv.weight"]
cur_heads = split_tensor_along_last_dim(cur_layer_W_q, n_heads)
print(list(cur_heads)[0].shape)
Single_head_Energy_map = Elayer(list(cur_heads)[0])
print(Single_head_Energy_map.shape)
out = avg_pool2attention(Single_head_Energy_map, kernel_size)
print(out.shape)
print(energy_map.shape)
kl = F.kl_div(out[:,:-2].contiguous().view(-1).softmax(dim=-1).log().cpu(), energy_map.view(-1).softmax(dim=-1).cpu(), reduction='sum')
print(kl)

In [None]:
path='/data/shhliu19/wonder/llama/65B'

kernel_size = (13, 2)
with open(path+'/params.json', 'r') as f:
    params = json.load(f)
n_heads=params['n_heads']
n_layers=params['n_layers']
model=torch.load(path+'/consolidated.00.pth.fake'.format(i))
energy_map = calculateEnergyMat_65B(model,n_layers,n_heads,"wv")
cur_layer_W_q = model["layers.0.attention.wv.weight"]
cur_heads = split_tensor_along_last_dim(cur_layer_W_q, n_heads)
print(list(cur_heads)[0].shape)
Single_head_Energy_map = Elayer(list(cur_heads)[0])
print(Single_head_Energy_map.shape)
out = avg_pool2attention(Single_head_Energy_map, kernel_size)
print(out.shape)
print(energy_map.shape)
kl = F.kl_div(out.view(-1).softmax(dim=-1).log().cpu(), energy_map[:-1,:-1].contiguous().view(-1).softmax(dim=-1).cpu(), reduction='sum')
print(kl)

In [None]:
torch.randn_like()