In [3]:
!pip install torch
!pip install transformers




[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch

# model_name = "facebook/opt-6.7b"
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).cuda()

# # the fast tokenizer currently does not work correctly
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# prompt = "What is the color of a carrot?\nA:"

# input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()

# generated_ids = model.generate(input_ids)

# tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [4]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Set the device to GPU if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1

In [11]:
import json

def read_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            example = json.loads(line)
            data.append(example)
    return data

def preprocess_data(data, tokenizer):
    inputs = []
    labels = []
    for example in data:
        premise = example['sentence1']
        hypothesis = example['sentence2']
        label = example['gold_label']
        inputs.append((premise, hypothesis))
        labels.append(label)

    encoded_inputs = tokenizer.batch_encode_plus(
        inputs,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoded_inputs['input_ids'].to(device)
    attention_mask = encoded_inputs['attention_mask'].to(device)
    labels = torch.tensor([0 if label == 'contradiction' else 1 if label == 'entailment' else 2 for label in labels]).to(device)

    return input_ids, attention_mask, labels

# Load and preprocess the data
data_path_matched = 'E:\\desktop\\5018\\dev_matched_sampled-1.jsonl'
data_path_mismatched = 'E:\\desktop\\5018\\dev_mismatched_sampled-1.jsonl'
data_matched = read_data(data_path_matched)
data_mismatched = read_data(data_path_mismatched)
input_ids_matched, attention_mask_matched, labels_matched = preprocess_data(data_matched, tokenizer)
input_ids_mismatched, attention_mask_mismatched, labels_mismatched = preprocess_data(data_mismatched, tokenizer)


In [12]:
def predict_batch(model, input_ids, attention_mask):
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=-1)
        predictions = torch.argmax(probs, dim=-1)
    return predictions


In [26]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import json
import pandas as pd

def read_data(path):
    with open(path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    # collecting the sentences and labels from the dataset
    gold_label = [item['gold_label'] for item in data if 'gold_label' in item]
    sentence1 = [item['sentence1'] for item in data if 'sentence1' in item]
    sentence2 = [item['sentence2'] for item in data if 'sentence2' in item]

    df = pd.DataFrame({'gold_label': gold_label, 'sentence1': sentence1, 'sentence2': sentence2})
    df = df[df['gold_label'] != '-']
    df = df.reset_index(drop=True)

    return [{'premise': row['sentence1'], 'hypothesis': row['sentence2'], 'label': row['gold_label']} for _, row in df.iterrows()]


# 自定义数据集类
class NLIDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_map = {'contradiction': 0, 'neutral': 1, 'entailment': 2}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        premise = self.data[index]['premise']
        hypothesis = self.data[index]['hypothesis']
        label = self.data[index]['label']
        encoding = self.tokenizer(premise, hypothesis, padding='max_length', max_length=self.max_length, truncation=True, return_tensors='pt')
        input_ids = encoding['input_ids'][0]
        attention_mask = encoding['attention_mask'][0]
        label_id = self.label_map[label]
        return input_ids, attention_mask, label_id


# 加载数据集
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
data_matched = read_data('E:\\desktop\\5018\\dev_matched_sampled-1.jsonl')
dataset_matched = NLIDataset(data_matched, tokenizer, max_length=128)
dataloader_matched = DataLoader(dataset_matched, batch_size=8, shuffle=True)

# 定义模型
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# 定义训练函数
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    train_loss = 0
    for input_ids, attention_mask, label_id in dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        label_id = label_id.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=label_id)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    return train_loss / len(dataloader)

# 定义评估函数
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    total_correct = 0
    with torch.no_grad():
        for input_ids, attention_mask, label_id in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            label_id = label_id.to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=label_id)
            loss = outputs.loss
            total_loss += loss.item()
            _, predicted = torch.max(outputs.logits, dim=1)
            total_correct += torch.sum(predicted == label_id).item()
    return total_loss / len(dataloader), total_correct / len(dataloader.dataset)


# 训练模型
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()
for epoch in range(5):
    train_loss = train(model, dataloader_matched, optimizer, criterion, device)
    print('Epoch', epoch, 'Train Loss:', train_loss)
    val_loss, val_acc = evaluate(model, dataloader_matched, criterion, device)
    print('Epoch', epoch, 'Val Loss:', val_loss, 'Val Acc:', val_acc)

   


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 0 Train Loss: 1.0885078773483055


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Epoch 0 Val Loss: 0.9545623362064362 Val Acc: 0.5898373983739837


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Epoch 1 Train Loss: 0.8846498398811786


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Epoch 1 Val Loss: 0.4772443244257918 Val Acc: 0.8630081300813008


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Epoch 2 Train Loss: 0.4707570855759762


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Epoch 2 Val Loss: 0.17730102823539215 Val Acc: 0.9565040650406504


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Epoch 3 Train Loss: 0.18275941403691548


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Epoch 3 Val Loss: 0.09533474918200889 Val Acc: 0.9723577235772358


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Epoch 4 Train Loss: 0.09039340543034546


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Epoch 4 Val Loss: 0.02187605735213895 Val Acc: 0.9955284552845528


In [27]:
prompts = [
    ("A soccer game with multiple males playing", "Some men are playing a sport."),
    ("An old man with a package poses in front of an advertisement", "A man is standing in front of a billboard."),
    ("A black race car starts up in front of a crowd of people", "A fast car is ready to drive in front of an audience."),
    ("A woman with a green headscarf, blue shirt and a very big grin", "The woman is happy and smiling."),
    ("A group of people are sitting in a movie theater watching a movie", "People are watching a film indoors.")
]

label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}

model.eval()
for premise, hypothesis in prompts:
    encoding = tokenizer(premise, hypothesis, padding='max_length', max_length=128, truncation=True, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_label_id = torch.argmax(logits, dim=1).item()
        predicted_label = label_map[predicted_label_id]
        print(f"Premise: {premise}")
        print(f"Hypothesis: {hypothesis}")
        print(f"Predicted label: {predicted_label}")
        print()


Premise: A soccer game with multiple males playing
Hypothesis: Some men are playing a sport.
Predicted label: neutral

Premise: An old man with a package poses in front of an advertisement
Hypothesis: A man is standing in front of a billboard.
Predicted label: entailment

Premise: A black race car starts up in front of a crowd of people
Hypothesis: A fast car is ready to drive in front of an audience.
Predicted label: neutral

Premise: A woman with a green headscarf, blue shirt and a very big grin
Hypothesis: The woman is happy and smiling.
Predicted label: entailment

Premise: A group of people are sitting in a movie theater watching a movie
Hypothesis: People are watching a film indoors.
Predicted label: neutral

