In [1]:
import pandas as pd
df=pd.read_csv('../data/moderated_reviews.csv')

In [2]:
df['predicted_label'].value_counts()

predicted_label
Valid                 967
Irrelevant             33
Advertisement           1
Rant_Without_Visit      1
Name: count, dtype: int64

In [3]:
tag_mapping_dict = {
    'Valid': 0,
    'Advertisement': 1,
    'Irrelevant': 2  ,
    'Rant_Without_Visit': 3,
}

df['label'] = df['predicted_label'].map(tag_mapping_dict)
df[df['label']==None]

Unnamed: 0,business_name,text,predicted_label,prediction_reason,label


In [4]:
df_filtered = df.loc[:,['text','label']]
# df_filtered=df_filtered.head(10)

In [5]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
train_df, temp_df = train_test_split(df_filtered, test_size=0.1, random_state=42) 

# val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42) 

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(temp_df)
# val_dataset = Dataset.from_pandas(val_df)
# test_dataset = Dataset.from_pandas(test_df)

my_dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    # 'test': test_dataset
})
my_dataset_dict = my_dataset_dict.remove_columns('__index_level_0__')
print(my_dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 901
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 101
    })
})


In [None]:
# gpu usage
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"

In [8]:
import torch

if torch.cuda.is_available():
    print(f"GPU is available. Device count: {torch.cuda.device_count()}")
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")
    device = torch.device("cuda")
else:
    print("No GPU available, using CPU.")
    device = torch.device("cpu")

GPU is available. Device count: 3
Current device: 0
Device name: NVIDIA L40S


In [9]:
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["text"],truncation=True)


tokenized_datasets = my_dataset_dict.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 901/901 [00:00<00:00, 14422.16 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 8324.16 examples/s]


In [10]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# 1. 加载模型 (此时模型默认在CPU上)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=4)

# 2. 设置训练参数

training_args = TrainingArguments(
    output_dir=f"./results/{checkpoint}",          # 输出目录
    num_train_epochs=3,              # 训练轮次
    per_device_train_batch_size=16,  # 每个GPU的训练批量大小
    per_device_eval_batch_size=64,   # 每个GPU的评估批量大小
    warmup_steps=500,                # 预热步数
    weight_decay=0.01,               # 权重衰减
    logging_dir="./logs",            # 日志目录
    fp16=True,                       # <--- 开启混合精度训练，大幅加速并减少显存占用 (推荐！)
)

# 假设你已经准备好了 tokenized_datasets
# tokenized_datasets['train'], tokenized_datasets['test']

# 3. 初始化 Trainer
trainer = Trainer(
    model=model,                         # 模型会被自动移动到GPU
    args=training_args,                  # 训练参数
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
)

# 4. 开始训练
#    在调用 .train() 时，每一批数据也会被自动发送到GPU
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


TrainOutput(global_step=57, training_loss=1.0191426862750137, metrics={'train_runtime': 11.4883, 'train_samples_per_second': 235.282, 'train_steps_per_second': 4.962, 'total_flos': 247014841607352.0, 'train_loss': 1.0191426862750137, 'epoch': 3.0})

In [11]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)



(101, 4) (101,)


In [12]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)