In [1]:
from transformers import ViltProcessor, ViltForImagesAndTextClassification
import requests
from PIL import Image
import torch

In [None]:
image1 = Image.open(
    requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg", stream=True).raw
)
image2 = Image.open(
    requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg", stream=True).raw
)
text = "The left image contains twice the number of dogs as the right image."

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")
model = ViltForImagesAndTextClassification.from_pretrained(
    "dandelin/vilt-b32-finetuned-nlvr2"
)
# prepare inputs
small_encoding = processor([image1, image2], text, return_tensors="pt")


# forward pass
outputs = model(
    input_ids=small_encoding.input_ids,
    pixel_values=small_encoding.pixel_values.unsqueeze(0),
    labels=torch.tensor([0]),
)
logits = outputs.logits
idx = logits.argmax(-1).item()
print("Predicted answer:", model.config.id2label[idx])

In [39]:
print(small_encoding["pixel_mask"].shape)

torch.Size([2, 384, 544])


In [40]:
print(outputs)

ViltForImagesAndTextClassificationOutput(loss=tensor(4.8357, grad_fn=<NllLossBackward0>), logits=tensor([[-2.1189,  2.7089]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


## 训练

In [2]:
%load_ext autoreload
%autoreload 2

In [12]:
from datasets import DatasetDict, load_dataset

train_dataset = load_dataset("json", data_files="train.json")["train"]
test_dataset = load_dataset("json", data_files="test.json")["train"]
val_dataset = load_dataset("json", data_files="val.json")["train"]

# Create a DatasetDict
mydb = DatasetDict({"train": train_dataset, "val": val_dataset, "test": test_dataset})

print(mydb)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'path', 'text', 'label'],
        num_rows: 3600
    })
    val: Dataset({
        features: ['id', 'path', 'text', 'label'],
        num_rows: 400
    })
    test: Dataset({
        features: ['id', 'path', 'text', 'label'],
        num_rows: 511
    })
})


In [4]:
id2label = {0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"}
label2id = {"NEGATIVE": 0, "NEUTRAL": 1, "POSITIVE": 2}

In [7]:
# autodl加速，如果不是使用autodl，可以注释掉
import subprocess
import os

result = subprocess.run(
    'bash -c "source /etc/network_turbo && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
)
output = result.stdout
for line in output.splitlines():
    if "=" in line:
        var, value = line.split("=", 1)
        os.environ[var] = value

In [8]:
from transformers import ViltConfig, ViltForImagesAndTextClassification

# Initializing a ViLT dandelin/vilt-b32-mlm style configuration
configuration = ViltConfig(num_images=1, id2label=id2label, label2id=label2id)

# Initializing a model from the dandelin/vilt-b32-mlm style configuration
model = ViltForImagesAndTextClassification(configuration)


processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")

In [9]:
for i in range(10):
    print(mydb["train"][i])

{'text': 'Frances prefers parking lot to hotel room #couple #oral #sex #teen #blonde #small #tits #b¡\xad ', 'id': '3603', 'path': 'data/3603.jpg', 'label': 2}
{'text': '@haremprotag ... what if i told you looking at that actually made me a little flustered ', 'id': '2947', 'path': 'data/2947.jpg', 'label': 2}
{'text': '×î\x8f\x8a¥Þ¥ó¥¬Õi¤ß·Åî}¥³¥ß¥Ã¥¯¥¢¥×¥ê!!! http://t.co/AsudmZm5Xf', 'id': '4934', 'path': 'data/4934.jpg', 'label': 2}
{'text': 'Airsoft/Paintball Multicam Camo BDU Uniform Set - Jacket & Pants - Medium - Full read by e¡\xad ', 'id': '3318', 'path': 'data/3318.jpg', 'label': 1}
{'text': '#my #first #tattoo #today #soo #happy #staystrong #stay #strong #brown #baby #brownie #fed¡\xad ', 'id': '1225', 'path': 'data/1225.jpg', 'label': 2}
{'text': "Zoe's first love #Rattled @JohnnyHarper15 ", 'id': '1669', 'path': 'data/1669.jpg', 'label': 2}
{'text': 'RT @ShaunFrankson: RT to mind people that 640', 'id': '4420', 'path': 'data/4420.jpg', 'label': 0}
{'text': '$89.95 ?¡\xad h

In [13]:
def preprocess_function(examples):
    images = [Image.open(path).resize((224, 224)) for path in examples["path"]]
    texts = examples["text"]
    encoding = processor(
        images,
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    # image = Image.open(examples["path"])
    # text = examples["text"]
    # encoding = processor([image], text, return_tensors="pt")
    # targets = []
    # for label in examples["label"]:
    #     target = torch.zeros(len(label2id))
    #     target[label] = 1
    #     targets.append(target)
    encoding["labels"] = examples["label"]
    return encoding


processed_db = mydb.map(preprocess_function, batched=True, batch_size=256)

Map:   0%|          | 0/3600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/511 [00:00<?, ? examples/s]

In [14]:
print(processed_db)

DatasetDict({
    train: Dataset({
        features: ['id', 'path', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'pixel_values', 'pixel_mask', 'labels'],
        num_rows: 3600
    })
    val: Dataset({
        features: ['id', 'path', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'pixel_values', 'pixel_mask', 'labels'],
        num_rows: 400
    })
    test: Dataset({
        features: ['id', 'path', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'pixel_values', 'pixel_mask', 'labels'],
        num_rows: 511
    })
})


In [15]:
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [16]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [21]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="multimodalmodel",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    # evaluation_strategy="epoch",
    # save_strategy="epoch",
    # load_best_model_at_end=True,
    # remove_unused_columns=False,  # 防止删掉标签
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_db["train"],
    eval_dataset=processed_db["val"],
    tokenizer=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [22]:
trainer.train()

Step,Training Loss
500,0.8888


TrainOutput(global_step=565, training_loss=0.8843758979729847, metrics={'train_runtime': 5299.9042, 'train_samples_per_second': 3.396, 'train_steps_per_second': 0.107, 'total_flos': 383247741600000.0, 'train_loss': 0.8843758979729847, 'epoch': 5.0})

In [23]:
# Suppose 'trainer' is your Trainer instance
results = trainer.evaluate()

# Print the results
for key, value in results.items():
    print(f"{key}: {value}")

eval_loss: 0.9010838270187378
eval_accuracy: 0.5725
eval_runtime: 117.5372
eval_samples_per_second: 3.403
eval_steps_per_second: 0.111
epoch: 5.0


In [55]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def inference_function(examples):
    images = Image.open(examples["path"]).resize((224, 224))
    texts = examples["text"]
    encoding = processor(
        images,
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    outputs = model(
        input_ids=encoding.input_ids.to(device),
        pixel_values=encoding.pixel_values.unsqueeze(0).to(device),
    )
    logits = outputs.logits
    idx = logits.argmax(-1).item()
    # print("Predicted answer:", model.config.id2label[idx])
    return model.config.id2label[idx]

In [74]:
# 训练集正确率
from tqdm import tqdm

correct = 0
for i in tqdm(range(len(mydb["train"]))):
    if mydb["train"][i]["label"] == label2id[inference_function(mydb["train"][i])]:
        correct += 1
print(correct / len(mydb["train"]))

correct = 0
for i in tqdm(range(len(mydb["val"]))):
    if mydb["val"][i]["label"] == label2id[inference_function(mydb["val"][i])]:
        correct += 1
print(correct / len(mydb["val"]))

  0%|          | 0/3600 [00:00<?, ?it/s]

100%|██████████| 3600/3600 [02:10<00:00, 27.63it/s]


0.6175


100%|██████████| 400/400 [00:14<00:00, 27.24it/s]

0.5725





## 消融实验

### 仅图像

In [71]:
def inference_without_txt(examples):
    images = Image.open(examples["path"]).resize((224, 224))
    # texts = examples["text"]
    texts = ""
    encoding = processor(
        images,
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    outputs = model(
        input_ids=encoding.input_ids.to(device),
        pixel_values=encoding.pixel_values.unsqueeze(0).to(device),
    )
    logits = outputs.logits
    idx = logits.argmax(-1).item()
    # print("Predicted answer:", model.config.id2label[idx])
    return model.config.id2label[idx]


# 训练集正确率
from tqdm import tqdm

correct = 0
for i in tqdm(range(len(mydb["train"]))):
    if mydb["train"][i]["label"] == label2id[inference_without_txt(mydb["train"][i])]:
        correct += 1
print(correct / len(mydb["train"]))

correct = 0
for i in tqdm(range(len(mydb["val"]))):
    if mydb["val"][i]["label"] == label2id[inference_without_txt(mydb["val"][i])]:
        correct += 1
print(correct / len(mydb["val"]))

  0%|          | 0/3600 [00:00<?, ?it/s]

100%|██████████| 3600/3600 [02:09<00:00, 27.81it/s]


0.6133333333333333


100%|██████████| 400/400 [00:14<00:00, 27.75it/s]

0.5625





### 仅文本

In [73]:
def inference_without_img(examples):
    # images = Image.open(examples["path"]).resize((224, 224))
    images = Image.fromarray(np.full((224, 224, 3), 255, dtype=np.uint8))
    texts = examples["text"]
    encoding = processor(
        images,
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    outputs = model(
        input_ids=encoding.input_ids.to(device),
        pixel_values=encoding.pixel_values.unsqueeze(0).to(device),
    )
    logits = outputs.logits
    idx = logits.argmax(-1).item()
    # print("Predicted answer:", model.config.id2label[idx])
    return model.config.id2label[idx]


# 训练集正确率
from tqdm import tqdm

correct = 0
for i in tqdm(range(len(mydb["train"]))):
    if mydb["train"][i]["label"] == label2id[inference_without_img(mydb["train"][i])]:
        correct += 1
print(correct / len(mydb["train"]))

correct = 0
for i in tqdm(range(len(mydb["val"]))):
    if mydb["val"][i]["label"] == label2id[inference_without_img(mydb["val"][i])]:
        correct += 1
print(correct / len(mydb["val"]))

100%|██████████| 3600/3600 [01:44<00:00, 34.42it/s]


0.40444444444444444


100%|██████████| 400/400 [00:11<00:00, 34.61it/s]

0.3825





## 输出测试结果

In [49]:
!cp test_without_label.txt test_with_label.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [51]:
import json

with open("test.json", "r") as f:
    # Load the data from the file
    test_data = json.load(f)

# Now 'data' is a Python object containing the data from the JSON file
print(test_data)

[{'id': '8', 'text': 'Energetic training today with our San Antonio New Dollars/New Partners trainees ', 'label': None, 'path': 'data/8.jpg'}, {'id': '1576', 'text': 'Let your voice be heard! 18+ #endsuicide #blithe #selfharm #thinspo #bonespo #edo #hurt #cut ', 'label': None, 'path': 'data/1576.jpg'}, {'id': '2320', 'text': "RT @Austin_Powers__: Shark Week would be so much better if the sharks had laser beams attached to their frickin' heads. ", 'label': None, 'path': 'data/2320.jpg'}, {'id': '4912', 'text': '#TheTruthCaster http://t.co/S8jvqpKq5h', 'label': None, 'path': 'data/4912.jpg'}, {'id': '3821', 'text': "RT @jarpad: Hey #WBSDCC look what we're up to!!!! @JensenAckles @paulwesley @iansomerhalder ", 'label': None, 'path': 'data/3821.jpg'}, {'id': '1306', 'text': '@CogTn: How do we get fresh oil? Separate the flesh from the SEED!#crushed @bryancutshall ', 'label': None, 'path': 'data/1306.jpg'}, {'id': '4555', 'text': "RT @DailyMailCeleb: Little Mix's Jesy has been showing off h

In [52]:
print(len(test_data))

511


In [61]:
import pandas as pd

df = pd.read_csv("test_with_label.txt", sep=",", header=None)
print(df.head())
for i in range(len(test_data)):
    label = inference_function(test_data[i])
    df.iloc[i + 1, 1] = label
print(df.head())

      0    1
0  guid  tag
1     8  NaN
2  1576  NaN
3  2320  NaN
4  4912  NaN
      0         1
0  guid       tag
1     8  POSITIVE
2  1576  POSITIVE
3  2320  POSITIVE
4  4912  NEGATIVE


In [63]:
df.to_csv("test_with_label.txt", index=False, header=False)