In [1]:
import torch
import torchvision.models as models
from torchvision import transforms
from PIL import Image

In [2]:
# 加载预训练的 ResNet-50 模型
resnet50 = models.resnet50(pretrained=True)

# 去掉最后的全连接层，只保留卷积层和全局平均池化层
resnet50 = torch.nn.Sequential(*list(resnet50.children())[:-1])

# 将模型设置为评估模式
resnet50.eval()



Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


In [3]:
# 定义图像预处理步骤
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [4]:
# 加载图像
image = Image.open("/Users/yonglxie/Downloads/dog0.jpeg")

In [5]:
# 预处理图像
input_tensor = preprocess(image)
input_batch = input_tensor.unsqueeze(0)  # 添加 batch 维度

In [6]:
# 使用 GPU 加速（如果可用）
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
resnet50 = resnet50.to(device)
input_batch = input_batch.to(device)

In [7]:
# 生成 embedding
with torch.no_grad():
    embedding = resnet50(input_batch)

In [8]:
# 将 embedding 转换为 numpy 数组
embedding = embedding.squeeze().numpy()

In [9]:
embedding.shape

(2048,)

In [10]:
import torch
from torch import nn

In [11]:
from torchvision import datasets
from torch.utils.data import Dataset, DataLoader

In [12]:
class ContrastiveLearningDataset(Dataset):
    def __init__(self, dataset, transform):
        """
        dataset: 原始数据集（如 ImageFolder 或其他自定义数据集）
        transform: 数据增强方法（如 transform_train）
        """
        self.dataset = dataset
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        # 获取原始图像和标签
        img, _ = self.dataset[index]

        # 应用 transform 生成两个不同的增强视图
        img1 = self.transform(img)
        img2 = self.transform(img)

        # 返回两个增强视图和标签
        return img1, img2


In [13]:
# 加载原始数据集
root_dir = '/Users/yonglxie/Desktop'  # 假设图片在 'data/train' 文件夹中
original_dataset = datasets.ImageFolder(root=root_dir)

# 创建对比学习的数据集
contrastive_dataset = ContrastiveLearningDataset(original_dataset, preprocess)

# 使用 DataLoader 载入对比学习数据集
train_loader = DataLoader(contrastive_dataset, batch_size=32, shuffle=True, num_workers=0)


In [14]:
import torchvision.models as models
class ResNet50Embedding(nn.Module):
    """
    Neural network model for ranking translations
    """
    def __init__(self):
        """
        Initialize the model with a pre-trained backbone and regression head
        """
        super(ResNet50Embedding, self).__init__()
        resnet50 = models.resnet50(pretrained=True)
        # 去掉最后的全连接层，只保留卷积层和全局平均池化层
        self.features = torch.nn.Sequential(*list(resnet50.children())[:-1])
        self.flatten = torch.nn.Flatten()

    def forward(self, imgs, **kwargs):
        """
        Forward pass of the model
        Returns: Ranking scores for input sequences
        """
        embedding = self.features(imgs)
        embedding = self.flatten(embedding)
        
        return embedding

In [None]:
class NTXentLoss(nn.Module):
    def __init__(self, temperature=0.5, device='cpu'):
        super(NTXentLoss, self).__init__()
        self.temperature = temperature
        self.cosine_similarity = nn.CosineSimilarity(dim=-1)
        self.criterion = nn.CrossEntropyLoss()
        self.device = device

    def forward(self, embeddings_i, embeddings_j):
        N = embeddings_i.size(0)  # batch size
        # 计算相似性矩阵
        embeddings = torch.cat([embeddings_i, embeddings_j], dim=0) #[2N, embedding_dim]
        sim = self.cosine_similarity(embeddings.unsqueeze(0), embeddings.unsqueeze(1)) / self.temperature
        
        labels = torch.cat([torch.arange(N) for _ in range(2)], dim=0)
        positive_mask = labels.unsqueeze(0) == (labels.unsqueeze(1)).float()
        positive_mask = positive_mask.to(self.device)
        negative_mask = ~positive_mask
        #(2N, 2N) 4*N*N - 4*N 
        
        positives = sim[positive_mask].view(2 * N, -1) #(2N, 2)
        negatives = sim[negative_mask].view(2 * N, -1) #(2N, 2N - 2)
        
        logits = torch.cat([positives, negatives], dim=1) #(2N, 2N)
        labels = torch.zeros(2 * N, dtype=torch.long).to(self.device)  # 正样本对的标签
        return self.criterion(logits, labels)

In [85]:
import torch.optim as optim
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ResNet50Embedding().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = NTXentLoss(temperature=0.5, device=device)



In [86]:
# 5. 开始训练
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for img_i, img_j in train_loader:
        img_i, img_j = img_i.to(device), img_j.to(device)
        embeddings_i = model(img_i)
        embeddings_j = model(img_j)
        
        loss = criterion(embeddings_i, embeddings_j)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader)}")


torch.Size([18, 18])
Epoch [1/10], Loss: 2.5048186779022217
torch.Size([18, 18])
Epoch [2/10], Loss: 2.0428295135498047
torch.Size([18, 18])
Epoch [3/10], Loss: 1.789481282234192
torch.Size([18, 18])
Epoch [4/10], Loss: 1.674331784248352
torch.Size([18, 18])
Epoch [5/10], Loss: 1.6051310300827026
torch.Size([18, 18])
Epoch [6/10], Loss: 1.5611814260482788
torch.Size([18, 18])
Epoch [7/10], Loss: 1.5321450233459473
torch.Size([18, 18])
Epoch [8/10], Loss: 1.5123257637023926
torch.Size([18, 18])
Epoch [9/10], Loss: 1.498137354850769
torch.Size([18, 18])
Epoch [10/10], Loss: 1.4873777627944946


In [87]:
# 6. 提取最后一层的embedding
def extract_embedding(model, img):
    model.eval()
    with torch.no_grad():
        embedding = model(img.to(device))
    return embedding.cpu().numpy()

In [91]:
extract_embedding(model, input_batch).shape

(1, 2048)