In [None]:
from transformers import AutoImageProcessor, Dinov2Model
import torch
from datasets import load_dataset

dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]

image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
model = Dinov2Model.from_pretrained("facebook/dinov2-base")

inputs = image_processor([image,image], return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

In [None]:
inputs.cuda()

In [None]:
model.config._name_or_path

In [None]:
import torch
import torch.nn as nn

class SiglipMLP(nn.Module):
    def __init__(self, input_dim, intermediate_dim, output_dim):
        super().__init__()
        self.pre_norm = nn.LayerNorm(input_dim)
        self.proj = nn.Sequential(
            nn.Linear(input_dim, intermediate_dim),
            nn.GELU(),
            nn.Linear(intermediate_dim, output_dim)
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.pre_norm(hidden_states)
        hidden_states = hidden_states+self.proj(hidden_states)
        return hidden_states

class VLContrastHead(nn.Module):
    def __init__(self, vision_dimesion, text_dimension, device, target_dimension=512, linear=False):
        super(VLContrastHead, self).__init__()
        self.device = device if device else ('cuda' if torch.cuda.is_available() else 'cpu')
        self.linear = linear
        if self.linear:
            self.vision_mapping_network = nn.Linear(vision_dimesion, target_dimension)
            self.text_mapping_network = nn.Linear(text_dimension, target_dimension)
        else:
            # self.vision_mapping_network = SiglipMLP(vision_dimesion, target_dimension, target_dimension)
            # self.text_mapping_network = SiglipMLP(text_dimension, target_dimension, target_dimension)
            self.vision_mapping_network = nn.Linear(vision_dimesion, target_dimension)
            self.text_mapping_network = nn.Linear(text_dimension, target_dimension)
            self.mapping_network = SiglipMLP(target_dimension, target_dimension, target_dimension)

        self.vision_layer_norm = nn.LayerNorm(vision_dimesion)
        self.text_layer_norm = nn.LayerNorm(text_dimension)
        self.logit_scale = nn.Parameter(torch.randn(1))
        self.logit_bias = nn.Parameter(torch.randn(1))

        self._initialize_weights()
    
    def _initialize_weights(self):

        for module in self.modules():
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    torch.nn.init.zeros_(module.bias)
            elif isinstance(module, nn.LayerNorm):
                torch.nn.init.ones_(module.weight)
                torch.nn.init.zeros_(module.bias)

        # Initialize logit_scale and logit_bias
        logit_scale_init = torch.log(torch.tensor(10.0))
        self.logit_scale.data.fill_(logit_scale_init)
        self.logit_bias.data.fill_(torch.tensor(-10.0))

In [None]:
head = VLContrastHead(512, 512, 'cuda')

In [None]:
head.mapping_network.proj[2].bias

In [None]:
import torch

# 尝试释放显存
torch.cuda.empty_cache()

# 查看显存使用情况
print(torch.cuda.memory_allocated(0))
print(torch.cuda.memory_reserved(0))


In [None]:
import torch
import torch.nn as nn
from typing import Optional
class StarMLP(nn.Module):
    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        intermediate_dim: Optional[int] = None,
    ):
        super().__init__()
        intermediate_dim = intermediate_dim if intermediate_dim is not None else output_dim
        self.Wa = nn.Linear(input_dim, input_dim, bias=False)
        self.Wb = nn.Linear(input_dim, input_dim, bias=False)
        self.g = nn.Linear(input_dim, output_dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        a = self.Wa(x)  # N x d
        b = self.Wb(x)  # N x d
        x = torch.einsum('bij,bj->bi', torch.sigmoid(a.unsqueeze(-1) * b.unsqueeze(1)), x)
        x = self.g(x)

        assert not torch.isnan(x).any(), "Output contains NaN"
        assert not torch.isinf(x).any(), "Output contains infinite values"

        return x

In [None]:
networ = StarMLP(64, 128)


In [None]:
x = torch.randn(16, 64)
networ(x).shape

In [3]:
import torch
from transformers import AutoModel, AutoTokenizer

path = 'Alibaba-NLP/gte-base-en-v1.5'
device = torch.device('cuda')
tokenzier = AutoTokenizer.from_pretrained(path)
model = AutoModel.from_pretrained(
    path,
    trust_remote_code=True,
    unpad_inputs=True,
    use_memory_efficient_attention=True,
).to(device)




A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [13]:
input_texts = [
    "what is the capital of China?",
    "how to implement quick sort in python?",
    "Beijing",
    "sorting algorithms"
]
inputs = tokenzier(input_texts, max_length=8192, padding=True, truncation=True, return_tensors='pt')
with torch.autocast(device_type=device.type, dtype=torch.float16):  # or bfloat16
    with torch.inference_mode():
        outputs = model(**inputs.to(device))

In [14]:
outputs.last_hidden_state[:, 0]

tensor([[ 0.2642,  0.5342, -0.3053,  ...,  0.5917,  0.3331,  0.6498],
        [ 0.7925,  0.2668,  1.0746,  ..., -0.5660,  0.7037, -0.4509],
        [-0.2670,  0.1429, -0.6497,  ...,  1.0327, -0.1330,  0.0383],
        [ 0.3925, -0.7887,  0.3967,  ..., -0.8746, -0.0094, -0.3476]],
       device='cuda:0')

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

model = AutoModel.from_pretrained("google/siglip-base-patch16-224")




config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/813M [00:00<?, ?B/s]

In [6]:
model.logit_scale.exp()

tensor([117.3308], grad_fn=<ExpBackward0>)

In [11]:
import torch

def random_mask(self, v1, v2, r: float):
    """
    随机从每个向量中挑选 k 个维度，同时保证梯度传导
    :param v1: 第一个向量, 形状为 (n, d)
    :param v2: 第二个向量, 形状为 (n, d)
    :param r: 选择的维度比例（例如 0.5 表示选择一半的维度）
    :return: 选中的维度对应的子向量 v1 和 v2
    """
    assert v1.shape == v2.shape, "两个张量的形状必须相同"
    n, d = v1.shape
    k = int(d * r)
    
    # 获取 v1 所在设备
    device = v1.device
    
    # 在相同的设备上生成随机索引
    indices = torch.randperm(d, device=device)[:k]  # 从 d 中随机选择 k 个不重复的索引
    print(indices)
    # 选取对应维度
    selected_v1 = v1[:, indices]  # 使用高级索引选择维度
    selected_v2 = v2[:, indices]
    
    return selected_v1, selected_v2

# 示例代码
v1 = torch.randn(3, 5, device='cuda', requires_grad=True)  # 形状为 (5, 10) 的向量
v2 = torch.randn(3, 5, device='cuda', requires_grad=True)  # 形状为 (5, 10) 的向量
r = 0.6  # 选择50%的维度

selected_v1, selected_v2 = random_mask(None, v1, v2, r)
print("随机选择维度的结果:", selected_v1, selected_v2)


tensor([2, 4, 1], device='cuda:0')
随机选择维度的结果: tensor([[-1.0714, -0.5365, -0.2856],
        [ 0.5856,  0.8731,  0.0737],
        [ 0.4370, -0.6297,  0.4586]], device='cuda:0',
       grad_fn=<IndexBackward0>) tensor([[ 0.7240, -1.4434,  0.6400],
        [ 1.2815, -0.5635, -1.3642],
        [ 0.4747, -0.4199, -0.5053]], device='cuda:0',
       grad_fn=<IndexBackward0>)


In [12]:
selected_v1.shape

torch.Size([3, 3])

In [5]:
import torch

# 假设你的张量形状为 (n, d)
n, d = 5, 4  # 示例数据
tensor = torch.randn(n, d)

# 定义分组数 m
m = 2  # 比如将 d 维度分成 2 组

# 检查 d 能否被 m 整除
assert d % m == 0, "d 维度必须能被 m 整除"

# 计算每组的大小
group_size = d // m

# 重塑张量，以便在 m 组上进行平均池化
tensor_reshaped = tensor.view(n, m, group_size)

# 对每组进行平均池化
pooled_result = tensor_reshaped.max(dim=-1)

print(pooled_result)


torch.return_types.max(
values=tensor([[ 0.2201,  2.1231],
        [ 0.6620,  0.7037],
        [ 0.4444,  1.6779],
        [ 1.9595, -0.0101],
        [-0.1047,  0.2060]]),
indices=tensor([[1, 0],
        [0, 0],
        [1, 1],
        [0, 1],
        [1, 1]]))


In [6]:
tensor

tensor([[-1.9637,  0.2201,  2.1231, -1.3038],
        [ 0.6620, -0.1820,  0.7037,  0.0283],
        [ 0.2498,  0.4444, -0.6127,  1.6779],
        [ 1.9595, -0.2025, -2.1426, -0.0101],
        [-2.3939, -0.1047, -0.5598,  0.2060]])

In [11]:
max_v, max_index = tensor[:, 2:4].max(dim=-1)
max_v

tensor([ 2.1231,  0.7037,  1.6779, -0.0101,  0.2060])

In [1]:
import torch
from transformers import AutoModel, AutoTokenizer

path = 'Alibaba-NLP/gte-Qwen2-1.5B-instruct'
device = torch.device('cuda')
tokenzier = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    path,
    trust_remote_code=True,
    unpad_inputs=True,
    use_memory_efficient_attention=True,
    torch_dtype=torch.float16
).to(device)

inputs = tokenzier(['test input'], truncation=True, max_length=8192, padding=True, return_tensors='pt')

with torch.inference_mode():
    outputs = model(**inputs.to(device))


A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct:
- tokenization_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct:
- modeling_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

TypeError: __init__() got an unexpected keyword argument 'unpad_inputs'

In [2]:
from transformers import TRANSFORMERS_CACHE
print(TRANSFORMERS_CACHE)


/network/scratch/l/le.zhang/hub


In [9]:
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import requests
import torch

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
device = torch.device('cuda')
image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-large",use_fast=True)
model = AutoModel.from_pretrained("facebook/vit-mae-large", attn_implementation="sdpa", torch_dtype=torch.float16).to(device)

In [8]:
inputs = image_processor(images=[image]*1024, return_tensors="pt").to(model.device, dtype=torch.float16)
with torch.no_grad():
    outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state

In [1]:
from transformers import AutoImageProcessor, Dinov2Model
import torch
from datasets import load_dataset

# dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
# image = dataset["test"]["image"][0]
# device = torch.device('cuda')
image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
# model = Dinov2Model.from_pretrained("facebook/dinov2-base", torch_dtype=torch.float16).to(device)



In [11]:
from PIL import Image   
image = Image.open('/home/mila/l/le.zhang/scratch/datasets/LAION/LAION30M/images/0000009/0007528.jpg')
image.size

FileNotFoundError: [Errno 2] No such file or directory: '/home/mila/l/le.zhang/scratch/datasets/LAION/LAION30M/images/0000009/0007528.jpg'

In [20]:
inputs = image_processor(image, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

[1, 257, 768]

In [1]:
import torch
weight = torch.load('/home/mila/l/le.zhang/scratch/light_align/logs/cc3m_gtendinoL_bs_65536_lion_mean_lr_1e-5_star7_d1024_scale50_negbias10/checkpoints/epoch_10.pt')

  weight = torch.load('/home/mila/l/le.zhang/scratch/light_align/logs/cc3m_gtendinoL_bs_65536_lion_mean_lr_1e-5_star7_d1024_scale50_negbias10/checkpoints/epoch_10.pt')


In [17]:
print(weight['state_dict']['vision_mapping_network.f1.weight'].shape)
print(weight['state_dict']['vision_mapping_network.f2.weight'].shape)
print(weight['state_dict']['vision_mapping_network.g.weight'].shape)

print(weight['state_dict']['text_mapping_network.f1.weight'].shape)
print(weight['state_dict']['text_mapping_network.f2.weight'].shape)
print(weight['state_dict']['text_mapping_network.g.weight'].shape)

torch.Size([4096, 2048])
torch.Size([4096, 2048])
torch.Size([1024, 4096])
torch.Size([2048, 1024])
torch.Size([2048, 1024])
torch.Size([1024, 2048])


In [1]:
import torch

def z_score_normalize(features):
    mean = features.mean(dim=0, keepdim=True)
    std = features.std(dim=0, keepdim=True)
    return (features - mean) / std

features = torch.randn(10, 1024)
features = z_score_normalize(features)
print(features)
print((features - features.mean(0)) / features.std(0))

tensor([[-0.7246, -0.3332,  1.0531,  ...,  0.5195,  0.6481, -1.0260],
        [-0.3381, -0.4691, -0.6043,  ...,  0.7669,  0.3860,  1.0222],
        [-0.9226, -0.9556, -1.7079,  ...,  1.2693, -1.3151, -0.9646],
        ...,
        [ 0.8557,  0.8758,  0.9854,  ..., -0.4275, -0.0440,  0.2412],
        [ 0.6166,  0.3915,  0.2303,  ..., -0.1609,  1.2541, -0.4990],
        [-0.6820,  0.5480, -0.2736,  ..., -1.4394,  0.1189, -0.1482]])
tensor([[-0.7246, -0.3332,  1.0531,  ...,  0.5195,  0.6481, -1.0260],
        [-0.3381, -0.4691, -0.6043,  ...,  0.7669,  0.3860,  1.0222],
        [-0.9226, -0.9556, -1.7079,  ...,  1.2693, -1.3151, -0.9646],
        ...,
        [ 0.8557,  0.8758,  0.9854,  ..., -0.4275, -0.0440,  0.2412],
        [ 0.6166,  0.3915,  0.2303,  ..., -0.1609,  1.2541, -0.4990],
        [-0.6820,  0.5480, -0.2736,  ..., -1.4394,  0.1189, -0.1482]])
