In [1]:
from ft_datasets.imdb import IMDBDataset
from transformers import AdamW, pipeline
from datasets import load_dataset

from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoModelForCausalLM, AutoTokenizer, BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader
from utils.io_utils import save
import torch

  from .autonotebook import tqdm as notebook_tqdm


## 加载模型


In [2]:
# 加载模型
model_path = '/pubshare/fwk/training_results/imdb_sft_gpt2'
# model_path = 'gpt2'
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [3]:
device = torch.device("cuda:4")
model.to(device) 

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

## 加载数据集

In [4]:
# 加载数据集
data_path = '/pubshare/fwk/huggingface/datasets/imdb/plain_text'
# 加载 IMDb 数据集
dataset = load_dataset(data_path)
# 过滤正面评论
positive_reviews = dataset['train'].filter(lambda x: x['label'] == 1)

train_test_split = positive_reviews.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']
test_dataset = IMDBDataset(test_dataset, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

## 情感分类器

In [5]:
sentiment_model_path = '/pubshare/fwk/huggingface/models/siebert/sentiment-roberta-large-english'
sentiment_classifier = pipeline("sentiment-analysis", model=sentiment_model_path)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [6]:
def evaluate(model, tokenizer, dataloader, sentiment_classifier, device)->tuple[list[float], list[float]]:
    model.eval()
    total_score = 0
    count = 0
    positive_score = []
    negative_score = [] 
    for batch in dataloader:
        print(count)
        count+=1
        inputs = batch["input_ids"][:,:6].to(device)
        attention_mask = batch["attention_mask"][:, :6].to(device)
        with torch.no_grad():
            outputs = model.generate(inputs, attention_mask=attention_mask, max_length=510, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
        for output in outputs:
            generated_text = tokenizer.decode(output, skip_special_tokens=True)
            sentiment = sentiment_classifier(generated_text)[0]
            if sentiment['label'] == 'POSITIVE':
                # total_score += sentiment['score']
                positive_score.append(sentiment['score'])
            else:
                negative_score.append(sentiment['score'])
            # count += 1
    # avg_score = total_score / count if count > 0 else 0
    # avg_pos_score = sum(positive_score) / len(positive_score) if len(positive_score) > 0 else 0
    # avg_neg_score = sum(negative_score) / len(negative_score) if len(negative_score) > 0 else 0
    # return avg_pos_score, avg_neg_score
    return positive_score, negative_score

In [19]:
positive_score, negative_score = evaluate(model, tokenizer, test_loader, sentiment_classifier, device)

0


## 生成文本测试

In [9]:
def generate_positive_review(prefix, model, tokenizer, device, max_length=510):
    model.eval()
    inputs = tokenizer(prefix, return_tensors="pt").to(device)
    outputs = model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_length=max_length, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
    review = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return review

In [10]:
f = open('sft_generate.txt', 'w')
for i in range(10):
    review_tokens = test_dataset[i]
    # review_tokens = tokenizer(review, return_tensors="pt")
    prefix = tokenizer.decode(review_tokens['input_ids'][:6], skip_special_tokens=True)
    generated_review = generate_positive_review(prefix, model, tokenizer, device)
    f.write(f'prefix: {prefix}, generated review: {generated_review}\n')
    f.write('==='*20+'\n')
    # print(f'prefix: {prefix}, generated review: {generated_review}')
f.close()

