In [29]:
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data import Dataset
from transformers import EarlyStoppingCallback
from torch.nn import CrossEntropyLoss

In [None]:
# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Load data
df = pd.read_csv(r"D:\GitHubRepos\is6941-ml-social-media\taptap\data\integrated\lm_cleaned_taptap_reviews.csv")
df = df[['review_content', 'sentiment']].dropna()
df['sentiment'] = df['sentiment'].astype(int)

In [None]:
# Split dataset
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['review_content'].tolist(),
    df['sentiment'].tolist(),
    test_size=0.2,
    random_state=42
)

In [None]:
# Updated dataset class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.encodings = tokenizer(texts, 
                                 padding='max_length',  # Uniform padding length
                                 truncation=True, 
                                 max_length=256)
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx])
        }

    def __len__(self):
        return len(self.labels)

In [None]:
# Compute class weights
class_weights = torch.tensor(
    [len(train_labels)/sum(train_labels),  # Positive class weight
     len(train_labels)/(len(train_labels)-sum(train_labels))],  # Negative class weight
    device=device
)

In [None]:
# Initialize model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
# Modify model initialization
model = BertForSequenceClassification.from_pretrained(
    "bert-base-chinese",
    num_labels=2,
    # problem_type="single_label_classification",
    # hidden_dropout_prob=0.3,
    # classifier_dropout=0.2
)

# Move class weights to GPU
class_weights = class_weights.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Create datasets (keep CPU tensors)
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
test_dataset = SentimentDataset(test_texts, test_labels, tokenizer)

In [None]:
# Training configuration (automatically handle data to GPU)
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    eval_strategy="epoch",
    fp16=True,  # Automatically enable pin_memory
    dataloader_pin_memory=True,  # Explicitly enable memory pinning
    # learning_rate=3e-5,  # Lower initial learning rate from default 5e-5
    # warmup_ratio=0.1,    # Add learning rate warmup
    # weight_decay=0.01,   # L2 regularization
    # gradient_accumulation_steps=2,   # Gradient accumulation
)

In [None]:
# Custom evaluation function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    print("\nClassification Report:")
    print(classification_report(labels, preds))
    print("Confusion Matrix:")
    print(confusion_matrix(labels, preds))
    return {'accuracy': (preds == labels).mean()}

In [None]:
# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Start training (automatically handle data migration)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3965,0.36537,0.834188
2,0.301,0.380965,0.836939
3,0.1744,0.472908,0.839815



Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.78      0.76      2740
           1       0.88      0.86      0.87      5257

    accuracy                           0.83      7997
   macro avg       0.81      0.82      0.82      7997
weighted avg       0.84      0.83      0.84      7997

Confusion Matrix:
[[2140  600]
 [ 726 4531]]

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.69      0.74      2740
           1       0.85      0.91      0.88      5257

    accuracy                           0.84      7997
   macro avg       0.83      0.80      0.81      7997
weighted avg       0.83      0.84      0.83      7997

Confusion Matrix:
[[1883  857]
 [ 447 4810]]

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.74      0.76      2740
           1       0.87      0.89      0.88      5257

    accuracy        

TrainOutput(global_step=3000, training_loss=0.30004727681477866, metrics={'train_runtime': 466.4451, 'train_samples_per_second': 205.735, 'train_steps_per_second': 6.432, 'total_flos': 1.262459465828352e+16, 'train_loss': 0.30004727681477866, 'epoch': 3.0})

In [None]:
# Final test set evaluation
test_results = trainer.predict(test_dataset)
print("\nFinal Test Set Evaluation:")
compute_metrics(test_results)


Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.74      0.76      2740
           1       0.87      0.89      0.88      5257

    accuracy                           0.84      7997
   macro avg       0.82      0.82      0.82      7997
weighted avg       0.84      0.84      0.84      7997

Confusion Matrix:
[[2037  703]
 [ 578 4679]]

Final Test Set Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.74      0.76      2740
           1       0.87      0.89      0.88      5257

    accuracy                           0.84      7997
   macro avg       0.82      0.82      0.82      7997
weighted avg       0.84      0.84      0.84      7997

Confusion Matrix:
[[2037  703]
 [ 578 4679]]


{'accuracy': 0.8398149305989746}

In [None]:
# --- Save Probabilities for Ensemble ---
print("\nSaving test set prediction probabilities (BERT)...")
# Trainer.predict returns logits in test_results.predictions
# Apply Softmax to get probabilities
logits = torch.tensor(test_results.predictions)
probabilities = torch.softmax(logits, dim=-1).numpy() # Convert to numpy array
np.save(r'D:\GitHubRepos\is6941-ml-social-media\taptap\analytics\predictions\probabilities_bert.npy', probabilities)
print("BERT probabilities saved to predictions/probabilities_bert.npy")


保存测试集预测概率 (BERT)...
BERT 概率已保存到 predictions/probabilities_bert.npy
