In [450]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor

In [451]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [452]:
import os
import  pickle
import h5py
import json

class CustomImageDataset(Dataset):
    def __init__(self, mode = "train", max_sequence_length=128):
        self.max_sequence_length = max_sequence_length
        if mode == "train":
            with open("./train.txt", "r") as f:
                self.datalist = f.readlines()
        elif mode == "test":
            with open("./test.txt", "r") as f:
                self.datalist = f.readlines()
        text_tmp = open("data.json")
        data = json.load(text_tmp)
        ann = {"假": 0, "真": 1, "辟谣": 2}
        self.label_dict = {}
        for i in data:
            self.label_dict[i["video_id"]] = ann[i["annotation"]]

    def __len__(self):
        return len(self.datalist)

    def __getitem__(self, idx):
        imf = open('/mnt/c/Users/IT/Downloads/all/ptvgg19_frames/'+self.datalist[idx].strip()+'.pkl', 'rb')
        data_imf = pickle.load(imf)
        vf = h5py.File('/mnt/c/Users/IT/Downloads/all/c3d/'+self.datalist[idx].strip()+'.hdf5', 'r')
        data_vf = vf[self.datalist[idx].strip()]["c3d_features"][:]
        af = open('/mnt/c/Users/IT/Downloads/all/audiosconvfea/'+self.datalist[idx].strip(), 'rb')
        data_af = pickle.load(af)
        label = self.label_dict[self.datalist[idx].strip()]

        data_imf = self.pad_or_truncate_sequence(data_imf)
        data_vf = self.pad_or_truncate_sequence(data_vf)
        data_af = self.pad_or_truncate_sequence(data_af)

        return data_imf, data_vf, data_af, label
    

    def pad_or_truncate_sequence(self, sequence):
        # 将 NumPy 数组转换为 PyTorch 张量
        sequence = torch.from_numpy(sequence)

        # 填充或截断序列，使其具有相同的长度
        if len(sequence) < self.max_sequence_length:
            # 填充
            padding = torch.zeros((self.max_sequence_length - len(sequence), sequence.shape[1]))
            sequence = torch.cat((sequence, padding))
        elif len(sequence) > self.max_sequence_length:
            # 截断
            sequence = sequence[:self.max_sequence_length, :]

        return sequence

In [453]:
c = CustomImageDataset()

In [454]:
data_imf, data_vf, data_af, label = c[10]

In [455]:
data_imf.shape, data_vf.shape, data_af.shape

(torch.Size([128, 4096]), torch.Size([128, 4096]), torch.Size([128, 12288]))

In [456]:
inputI_size = 4096
inputV_size = 4096
inputA_size = 12288
hidden_size = 128
output_size = 3

In [457]:
a = torch.randn(128,128)
b = torch.randn(128,128)
c = torch.randn(128,128)


In [458]:
d = torch.cat([a, b, c], dim=1)
d.shape

torch.Size([128, 384])

In [459]:
d = nn.ReLU(d)

In [460]:
class NeuralNetwork(nn.Module):
    def __init__(self, inputI_size, inputV_size, inputA_size, hidden_size, output_size):
        super(NeuralNetwork, self).__init__()
        self.fc_imf = nn.Linear(inputI_size, hidden_size) 
        self.fc_vf = nn.Linear(inputV_size, hidden_size)  
        self.fc_af = nn.Linear(inputA_size, hidden_size)  
        self.relu = nn.ReLU()
        self.fc_fusion = nn.Linear(hidden_size * 3, output_size)  
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x_imf, x_vf, x_af):
        x_imf = self.fc_imf(x_imf)
        x_vf = self.fc_vf(x_vf)
        x_af = self.fc_af(x_af)
        # 将三种特征连接在一起
        fused_features = torch.cat([x_imf, x_vf, x_af], dim=1)

        # 对融合后的特征进行全连接
        fused_features = self.relu(fused_features)
        fused_features = self.fc_fusion(fused_features)
        fused_features = self.softmax(fused_features)

        return fused_features

In [461]:
fusion_model = NeuralNetwork(inputI_size, inputV_size, inputA_size, hidden_size, output_size)

In [462]:
# License: BSD
# Author: Sasank Chilamkurthy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import time
import os
from PIL import Image
from tempfile import TemporaryDirectory

In [463]:
train = CustomImageDataset()

In [464]:
test = CustomImageDataset(mode="test")

In [465]:
import torch.nn.utils.rnn as rnn_utils

def collate_fn(batch):
    # batch 是一个包含多个样本的列表，每个样本是一个元组 (data_imf, data_vf, data_af, label)
    imfs, vfs, afs, labels = zip(*batch)

    # 使用 pad_sequence 进行填充
    padded_imfs = rnn_utils.pad_sequence(imfs, batch_first=True)
    padded_vfs = rnn_utils.pad_sequence(vfs, batch_first=True)
    padded_afs = rnn_utils.pad_sequence(afs, batch_first=True)

    # 返回填充后的数据和标签
    return padded_imfs, padded_vfs, padded_afs, torch.tensor(labels)

# 创建 DataLoader 时指定 collate_fn
train_loader = torch.utils.data.DataLoader(train, batch_size=4, shuffle=True, collate_fn=collate_fn)
val_loader = torch.utils.data.DataLoader(test, batch_size=4, shuffle=True, collate_fn=collate_fn)

dataloaders = {'train': train_loader, 'val': val_loader}


In [466]:
import torch.nn.utils.rnn as rnn_utils

def collate_fn(batch):
    # batch 是一个包含多个样本的列表，每个样本是一个元组 (data_imf, data_vf, data_af, label)
    imfs, vfs, afs, labels = zip(*batch)

    # 使用 pad_sequence 进行填充
    padded_imfs = rnn_utils.pad_sequence(imfs, batch_first=True)
    padded_vfs = rnn_utils.pad_sequence(vfs, batch_first=True)
    padded_afs = rnn_utils.pad_sequence(afs, batch_first=True)

    # 直接将填充后的数据转换为 PyTorch 张量
    padded_imfs = torch.tensor(padded_imfs)
    padded_vfs = torch.tensor(padded_vfs)
    padded_afs = torch.tensor(padded_afs)

    # 返回填充后的数据和标签
    return padded_imfs, padded_vfs, padded_afs, torch.tensor(labels)


In [467]:
dataloaders = {
    'train': torch.utils.data.DataLoader(train, batch_size=4, shuffle=True),
    'val': torch.utils.data.DataLoader(test, batch_size=4, shuffle=True)
}

In [468]:
# dataloaders = {'train': torch.utils.data.DataLoader(train, batch_size=4,
#                                              shuffle=True),
# 'val': torch.utils.data.DataLoader(test, batch_size=4,
#                                              shuffle=True)
#                                              }

In [469]:
train[0]

(tensor([[-0.4256, -0.3148, -1.3210,  ...,  0.0560, -0.7306, -0.8196],
         [-0.7379, -0.2910, -1.5157,  ...,  0.0189, -0.6578, -0.9554],
         [-0.5325, -0.2531, -1.5586,  ...,  0.1187, -0.7828, -0.8430],
         ...,
         [-0.1492,  1.0484, -0.1234,  ..., -0.2936, -0.3012, -0.2957],
         [-0.1312,  1.0226, -0.1796,  ..., -0.2905, -0.3724, -0.2205],
         [-0.2015,  1.0845, -0.2335,  ..., -0.2483, -0.2853, -0.2584]]),
 tensor([[0.0000, 0.0000, 2.5520,  ..., 0.0000, 0.0000, 3.3453],
         [0.0000, 0.0000, 4.0873,  ..., 0.0000, 0.0000, 2.2750],
         [0.0000, 0.0000, 5.3454,  ..., 0.0000, 0.0000, 2.3237],
         ...,
         [0.0000, 0.5312, 0.5379,  ..., 0.0000, 0.0000, 1.7034],
         [0.3185, 0.0000, 3.3585,  ..., 0.0000, 0.0000, 3.0217],
         [0.8294, 0.0000, 3.8933,  ..., 0.0000, 0.0000, 2.8031]]),
 tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0371, 0.0000, 0.0000],
         [0.0955, 0.00

In [470]:
for b in dataloaders["train"]:
    print(b)
    break

[tensor([[[-0.3602, -0.4651, -0.3777,  ...,  0.0718, -0.2641, -0.6900],
         [-0.1122, -0.7719, -0.6257,  ..., -0.0328, -0.2836, -0.8467],
         [-0.1533, -0.7232, -0.5291,  ...,  0.0277, -0.4541, -0.9145],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.6466, -0.2027, -1.9240,  ...,  0.2948, -0.5452, -1.2570],
         [-0.0830,  0.4102, -1.4344,  ...,  0.1057,  0.0891, -0.7472],
         [ 0.0260,  0.2577, -1.5819,  ...,  0.1294, -0.0723, -0.6357],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-2.0768, -0.9201, -4.0298,  ..., -1.4944, -1.4145, -5.2806],
         [-2.1723, -1.0488, -3.9071,  ..., -

In [471]:
len(train)

2500

In [472]:
dataset_sizes = {"train":len(train), "val": len(test)}

In [473]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    # Create a temporary directory to save training checkpoints
    with TemporaryDirectory() as tempdir:
        best_model_params_path = os.path.join(tempdir, 'best_model_params.pt')

        torch.save(model.state_dict(), best_model_params_path)
        

        for epoch in range(num_epochs):
            print(f'Epoch {epoch}/{num_epochs - 1}')
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                if phase == 'train':
                    model.train()  # Set model to training mode
                else:
                    model.eval()   # Set model to evaluate mode

                running_loss = 0.0
                running_corrects = 0

                # Iterate over data.
                for inputsV, inputsI, inputsA, labels in dataloaders[phase]:
                    inputsV = inputsV.to(device)
                    inputsV = inputsI.to(device)
                    inputsA = inputsA.to(device)
                    labels = labels.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputsV, inputsV, inputsA)
                        # _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics
                    running_loss += loss.item() * inputsV.size(0)
                    # running_corrects += torch.sum(preds == labels.data)
                if phase == 'train':
                    scheduler.step()

                epoch_loss = running_loss / dataset_sizes[phase]

                print(f'{phase} Loss: {epoch_loss:.4f}')

        time_elapsed = time.time() - since
        print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')

        # load best model weights
        model.load_state_dict(torch.load(best_model_params_path))
    return model

In [474]:
criterion = nn.CrossEntropyLoss()

In [475]:
train[0][0]

tensor([[-0.4256, -0.3148, -1.3210,  ...,  0.0560, -0.7306, -0.8196],
        [-0.7379, -0.2910, -1.5157,  ...,  0.0189, -0.6578, -0.9554],
        [-0.5325, -0.2531, -1.5586,  ...,  0.1187, -0.7828, -0.8430],
        ...,
        [-0.1492,  1.0484, -0.1234,  ..., -0.2936, -0.3012, -0.2957],
        [-0.1312,  1.0226, -0.1796,  ..., -0.2905, -0.3724, -0.2205],
        [-0.2015,  1.0845, -0.2335,  ..., -0.2483, -0.2853, -0.2584]])

In [476]:
model_ft = NeuralNetwork(inputI_size, inputV_size, inputA_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

In [477]:
out = model_ft(test[0][0], test[0][1], test[0][2])

In [480]:
out

tensor([[0.4067, 0.2947, 0.2987],
        [0.3824, 0.3153, 0.3023],
        [0.3447, 0.4089, 0.2464],
        [0.3649, 0.3973, 0.2379],
        [0.4001, 0.3518, 0.2481],
        [0.4032, 0.3639, 0.2329],
        [0.4607, 0.2985, 0.2409],
        [0.4375, 0.3102, 0.2522],
        [0.4275, 0.3249, 0.2477],
        [0.3536, 0.3699, 0.2765],
        [0.3801, 0.3766, 0.2433],
        [0.3875, 0.3615, 0.2510],
        [0.4193, 0.3383, 0.2424],
        [0.3767, 0.3446, 0.2787],
        [0.3829, 0.3868, 0.2303],
        [0.3158, 0.4291, 0.2551],
        [0.3612, 0.3992, 0.2395],
        [0.3716, 0.3651, 0.2632],
        [0.4054, 0.3650, 0.2296],
        [0.3884, 0.3681, 0.2435],
        [0.3440, 0.4138, 0.2422],
        [0.3900, 0.3423, 0.2677],
        [0.3636, 0.3692, 0.2672],
        [0.3417, 0.3981, 0.2603],
        [0.4241, 0.3318, 0.2441],
        [0.4572, 0.3386, 0.2042],
        [0.4581, 0.3435, 0.1983],
        [0.4391, 0.3213, 0.2396],
        [0.3755, 0.3819, 0.2426],
        [0.372

In [429]:
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=25)

Epoch 0/24
----------


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1536x128 and 384x3)

In [61]:
data_vf

array([[0.21621627, 1.2825677 , 1.447454  , ..., 0.        , 1.7883198 ,
        0.12972558],
       [0.        , 1.1042053 , 0.47129965, ..., 0.        , 0.        ,
        0.7814119 ],
       [0.        , 0.00869048, 0.        , ..., 0.        , 0.        ,
        1.7861747 ],
       ...,
       [0.        , 0.        , 2.665276  , ..., 0.55926245, 0.        ,
        0.        ],
       [0.        , 0.        , 2.6870937 , ..., 0.95208526, 0.        ,
        0.        ],
       [0.        , 0.        , 2.6269674 , ..., 1.0240164 , 0.        ,
        0.        ]], dtype=float32)

text

In [489]:
import json

keywords = []
comments = []
count = 0

with open('./data_complete.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

for i in range(len(data)):
    keywords.append(data[i]["keywords"])
    if data[i]["comments"]:
        comments.append(data[i]["comments"])
    if not data[i]["comments"]:
        count = count + 1
count

1647

In [490]:
len(comments)

3848

In [500]:
import json

keywords = []
comments = []
count = 0

with open('./data_complete.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

for i in range(len(data)):
    keywords.append(data[i]["keywords"])
    if data[i]["comments"]:
        comments.append(data[i]["comments"])
#     if not data[i]["comments"]:
#         count = count + 1
# count

In [501]:
keywords

['100多颗恒星“离奇消失”是被外星人操控了',
 '100多颗恒星“离奇消失”是被外星人操控了',
 '100多颗恒星“离奇消失”是被外星人操控了',
 '100多颗恒星“离奇消失”是被外星人操控了',
 '100多颗恒星“离奇消失”是被外星人操控了',
 '119徐州化工厂爆炸',
 '119徐州化工厂爆炸',
 '119徐州化工厂爆炸',
 '120被堵路口致患者抢救无效死亡',
 '120被堵路口致患者抢救无效死亡',
 '16名韩国护士因疫情严重辞职',
 '16名韩国护士因疫情严重辞职',
 '16名韩国护士因疫情严重辞职',
 '16名韩国护士因疫情严重辞职',
 '16名韩国护士因疫情严重辞职',
 '16名韩国护士因疫情严重辞职',
 '16名韩国护士因疫情严重辞职',
 '16名韩国护士因疫情严重辞职',
 '16名韩国护士因疫情严重辞职',
 '16名韩国护士因疫情严重辞职',
 '16名韩国护士因疫情严重辞职',
 '16名韩国护士因疫情严重辞职',
 '19岁消防员救火时被坍塌的墙体压倒而牺牲',
 '19岁消防员救火时被坍塌的墙体压倒而牺牲',
 '19岁消防员救火时被坍塌的墙体压倒而牺牲',
 '19岁消防员救火时被坍塌的墙体压倒而牺牲',
 '19岁消防员救火时被坍塌的墙体压倒而牺牲',
 '19岁消防员救火时被坍塌的墙体压倒而牺牲',
 '19岁消防员救火时被坍塌的墙体压倒而牺牲',
 '19岁消防员救火时被坍塌的墙体压倒而牺牲',
 '19岁消防员救火时被坍塌的墙体压倒而牺牲',
 '19岁消防员救火时被坍塌的墙体压倒而牺牲',
 '19岁消防员救火时被坍塌的墙体压倒而牺牲',
 '19岁消防员救火时被坍塌的墙体压倒而牺牲',
 '2019年8月一架客机在黑龙江失事，42人遇难',
 '2019年8月一架客机在黑龙江失事，42人遇难',
 '2019年8月一架客机在黑龙江失事，42人遇难',
 '2019年8月一架客机在黑龙江失事，42人遇难',
 '2019年8月一架客机在黑龙江失事，42人遇难',
 '2019环邢台国际自行车赛连环大撞车',
 '2019环邢台国际自行车赛连环大撞车',
 '2019环邢台国际自行车赛连环大撞车',
 '2019环邢台国际自行车赛连环大撞车',
 '2019环邢台国际自行车赛连环大撞车',


In [502]:
len(keywords)

5495

In [503]:
import torch
from transformers import BertTokenizer, BertModel

model_name = './bert-base-chinese/'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

indexed_keywords = [(i, keywords) for i, group in enumerate(keywords) for comment in group]

# 提取keywords特征
features = []
for i, keyword in indexed_keywords:
    inputs = tokenizer(keyword, return_tensors='pt', padding=True, truncation=True, max_length=50)
    outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state

    feature = last_hidden_states  
    features.append({
        'index': data[i]["video_id"],
        'input_ids': feature.flatten().detach().numpy()
    })

# 保存特征
features_path = './text_keywords.pt'
torch.save(features, features_path)

KeyboardInterrupt: 

In [504]:
feature

tensor([[[-0.1541,  0.4216, -0.7977,  ..., -0.0375, -0.2507,  0.3271],
         [-0.5716,  0.1056,  0.2506,  ..., -0.4534, -0.0112,  0.2752],
         [-0.0119, -0.1084, -0.3049,  ...,  0.2324,  0.8298, -0.3112],
         ...,
         [ 1.7274, -0.1018, -0.2066,  ...,  0.3424,  0.5099,  0.1122],
         [ 0.7383,  0.4764,  0.0848,  ..., -0.6071, -0.7224, -0.3605],
         [-0.1490, -0.0086,  0.2895,  ..., -0.1016,  0.0882, -0.0244]]],
       grad_fn=<NativeLayerNormBackward0>)

In [507]:
b = torch.load('./text_comments.pt')

In [509]:
b

[{'index': '3x4zj7hyemptkvm',
  'input_ids': tensor([ 101, 1427, 1427,  784,  720, 8024, 2802, 1378, 4413, 3766, 4413,  749,
          8024, 6375, 2769, 2897, 6814, 3341, 2496, 1378, 4413, 4500,  749, 8024,
           671, 2661,  671,  724, 4638,  511,  102]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1])},
 {'index': '3x4zj7hyemptkvm',
  'input_ids': tensor([ 101, 2772, 6387, 2769,  812, 6963, 3221, 3378,  702, 2342, 1920, 4495,
          4289,  860, 1079, 4638, 2164, 4495, 6001, 5387,  749,  510,  102]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])},
 {'index': '3x4zj7hyemptkvm',
  'input_ids': tensor([ 101, 3766,  752, 8024,  679, 2512, 1510, 2769, 6814, 2399, 2218, 6121,
           102]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])},
 {'index': '3x4zj7hyemptkvm',
  'input_ids': tensor([ 101, 2608, 3215, 1726, 2157, 6814, 

In [65]:
b[0]['input_ids'].shape

(16128,)

In [48]:
grouped_comments = {}
for comment in b:
    index = comment['index']
    if index not in grouped_comments:
        grouped_comments[index] = []
    grouped_comments[index].append(comment)

In [58]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

model_name = './bert-base-chinese/'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def calculate_attention_weights(group):
    num_keywords = len(group)
    return np.ones(num_keywords) / num_keywords

final_features = []

# 遍历分组后的数据
for index, group in grouped_comments.items():
    all_input_ids = []
    all_attention_weights = []

    for comment in group:
        feature = comment['input_ids']
        all_input_ids.append(feature)

    # 计算注意力权重
    attention_weights = calculate_attention_weights(group)

    # 使用注意力权重对特征进行融合
    merged_feature = np.average(all_input_ids, axis=0, weights=attention_weights)

    # 存储最终结果
    final_features.append({'index': index, 'merged_feature': merged_feature})
all_merged_features = np.vstack([feature['merged_feature'] for feature in final_features])
# 打印最终结果
for final_feature in final_features:
    print(final_feature)


{'index': '3x4zj7hyemptkvm', 'merged_feature': array([ 3.31136286e-01, -5.27080297e-02, -8.61083642e-02, -3.28927953e-03,
        5.10410130e-01,  1.73011228e-01, -2.29835033e-01,  2.82268465e-01,
       -3.37671816e-01,  3.77170026e-01,  1.28737852e-01, -3.13593328e-01,
        8.80487189e-02,  6.25338733e-01,  8.47829640e-01, -8.83926302e-02,
       -4.35855798e-02,  3.46386105e-01, -2.16170311e-01,  2.12434128e-01,
       -2.24721834e-01,  3.89261055e-03,  5.90885244e-02,  1.70138001e-01,
        5.85317433e-01, -4.66737658e-01,  5.74524462e-01, -4.26458955e-01,
        2.52191633e-01, -1.78976178e-01, -2.58813620e-01,  3.59810203e-01,
        5.98238781e-02,  3.09427008e-02, -8.68773833e-02, -1.42996653e-03,
       -8.05156350e-01,  1.03550829e-01, -5.02201676e-01, -4.95455831e-01,
        5.07095754e-01, -3.08932811e-01, -1.82036042e-01, -3.53716850e-01,
        1.66972518e-01,  3.01892199e-02, -2.49630526e-01, -9.71173868e-02,
        4.30593073e-01,  4.52418596e-01,  6.47896752e

In [60]:
final_feature['merged_feature']

all_merged_features.shape

(1, 768)

In [13]:
import torch
from transformers import BertTokenizer, BertModel

model_name = './bert-base-chinese/'
tokenizer = BertTokenizer.from_pretrained(model_name)

inputs = tokenizer(keywords, return_tensors='pt', padding=True, truncation=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ValueError: too many values to unpack (expected 2)

In [26]:
model = BertModel.from_pretrained(model_name)

with torch.no_grad():
    outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state
features = last_hidden_states.mean(dim=1)  

print(features)


tensor([[ 0.1991, -0.0138, -0.1109,  ...,  0.2633,  0.0440, -0.0900],
        [ 0.1991, -0.0138, -0.1109,  ...,  0.2633,  0.0440, -0.0900],
        [ 0.1991, -0.0138, -0.1109,  ...,  0.2633,  0.0440, -0.0900],
        ...,
        [-0.2289,  0.5408, -0.4242,  ..., -0.0441, -0.1671, -0.2258],
        [-0.2289,  0.5408, -0.4242,  ..., -0.0441, -0.1671, -0.2258],
        [-0.2289,  0.5408, -0.4242,  ..., -0.0441, -0.1671, -0.2258]])


In [7]:
features.shape

NameError: name 'features' is not defined

In [1]:
features_path = './fea/text_f.pt'

In [2]:
torch.save(features, features_path)

NameError: name 'torch' is not defined