In [1]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data import Dataset
from transformers import EarlyStoppingCallback
from torch.nn import CrossEntropyLoss

In [2]:
# Check GPU availability: use CUDA when present, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Load the data: keep only the review text and its sentiment label, drop rows
# where either is missing, and store the label as an integer.
df = (
    pd.read_csv(r"D:\GitHubRepos\is6941-ml-social-media\taptap\data\integrated\lm_cleaned_taptap_reviews.csv")
    .loc[:, ['review_content', 'sentiment']]
    .dropna()
    .assign(sentiment=lambda frame: frame['sentiment'].astype(int))
)

In [4]:
# Split the dataset: 80% train / 20% test, with a fixed seed so the split is
# reproducible across runs.
all_texts = df['review_content'].tolist()
all_labels = df['sentiment'].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=42
)

In [5]:
# Revised dataset class
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenized review texts with their labels.

    All texts are tokenized once at construction time, padded/truncated to a
    fixed length of 256 tokens, and converted to tensors lazily per item.

    Args:
        texts: Sequence of input strings handed to ``tokenizer`` in one call.
        labels: Sequence of labels, aligned index-by-index with ``texts``.
        tokenizer: Callable accepting ``(texts, padding=..., truncation=...,
            max_length=...)`` and returning a mapping that provides
            ``'input_ids'`` and ``'attention_mask'`` per-sample sequences.
    """

    def __init__(self, texts, labels, tokenizer):
        tokenize_options = {
            'padding': 'max_length',  # pad every sample to the same length
            'truncation': True,
            'max_length': 256,
        }
        self.encodings = tokenizer(texts, **tokenize_options)
        self.labels = labels

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of tensors."""
        sample = {
            field: torch.tensor(self.encodings[field][idx])
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [6]:
# Compute inverse-frequency class weights for a weighted CrossEntropyLoss.
#
# CrossEntropyLoss indexes its `weight` tensor by class id, so position 0 must
# hold the weight of label 0 (negative) and position 1 the weight of label 1
# (positive). Counting positives with sum() relies on labels being encoded as
# 0/1, as the original computation already did.
n_total = len(train_labels)
n_positive = sum(train_labels)
n_negative = n_total - n_positive
if n_positive == 0 or n_negative == 0:
    # Without this guard the division below fails with an opaque ZeroDivisionError.
    raise ValueError(
        "Cannot compute class weights: training labels contain only one class "
        f"(positive={n_positive}, negative={n_negative})."
    )
class_weights = torch.tensor(
    [n_total / n_negative,   # weight for class 0 (negative samples)
     n_total / n_positive],  # weight for class 1 (positive samples)
    device=device
)

In [7]:
# Initialise the tokenizer and a 2-label sequence-classification model from the
# same checkpoint.
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The dataset pads every sample to max_length, which requires a pad token.
# If the checkpoint's tokenizer does not define one, reuse EOS for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
# NOTE(review): the checkpoint name indicates a Llama architecture; its
# sequence-classification head uses config.pad_token_id to find the last
# non-padding token and refuses batches larger than 1 when it is undefined.
# Keep the model config in sync with the tokenizer.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id



KeyboardInterrupt: 

In [None]:
# Move the class weights onto the selected device (GPU when available, else CPU).
class_weights = class_weights.to(device=device)