In [1]:
import pandas as pd
import random
# Fixed seed so the down-sampling and the train/validation split are
# reproducible across kernel restarts. 78 is the value printed by the
# recorded run of this notebook.
seed = 78
random.seed(seed)
print(seed)

78


In [2]:
# Load the review table and keep only rows whose `predicted_label` is
# neither missing nor whitespace-only.
df = pd.read_csv('../data/out/augmented_shuffled.csv')
df = df[df["predicted_label"].notna()]
label_is_blank = df["predicted_label"].str.strip() == ""
df = df[~label_is_blank]
df['predicted_label'].value_counts()

predicted_label
Valid                 7681
Irrelevant             480
Advertisement          172
Rant_Without_Visit     135
Name: count, dtype: int64

In [3]:
# Target number of 'Valid' rows kept after down-sampling.
N_VALID_SAMPLES = 300

# Separate the DataFrame into 'valid' and 'non-valid' rows.
df_valid = df[df['predicted_label'] == 'Valid']
df_other = df[df['predicted_label'] != 'Valid']

# Randomly sample the 'valid' rows. The size is capped at the number of
# available rows because DataFrame.sample raises ValueError when n exceeds
# the population and replace=False.
df_valid_sampled = df_valid.sample(
    n=min(N_VALID_SAMPLES, len(df_valid)),
    random_state=seed,
)

# Concatenate the sampled 'valid' rows with the untouched 'non-valid' rows.
df = pd.concat([df_valid_sampled, df_other])

# Print the class distribution of the updated DataFrame.
print(df['predicted_label'].value_counts())

predicted_label
Irrelevant            480
Valid                 300
Advertisement         172
Rant_Without_Visit    135
Name: count, dtype: int64


In [4]:
# Integer class ids, assigned in this fixed order (Valid=0 ... Rant_Without_Visit=3).
tag_mapping_dict = {
    tag: idx
    for idx, tag in enumerate(
        ['Valid', 'Advertisement', 'Irrelevant', 'Rant_Without_Visit']
    )
}

df['label'] = df['predicted_label'].map(tag_mapping_dict)

# A predicted_label value absent from the mapping maps to NaN; report the
# count and fail fast if there is any.
missing = df['label'].isna()
print("Unmapped labels:", int(missing.sum()))
assert not missing.sum() != 0, "Found unmapped labels in predicted_label"


Unmapped labels: 0


In [5]:
# Preview the first two rows.
df.head(n=2)

Unnamed: 0,business_name,text,predicted_label,prediction_reason,description,category,label
4600,Lao Sze Chuan,Expect a lot of strange and interesting soundi...,Valid,The review provides a genuine dining experienc...,,['Chinese restaurant'],0
6089,Walmart Supercenter,"Corner Of Sullivan And Wax RD, Central, LA--Fo...",Valid,The review provides a genuine shopping experie...,,"['Department store', 'Clothing store', 'Craft ...",0


In [6]:
# Instruction-style preamble describing the review classification task.
# NOTE(review): not referenced by any executable code visible in this
# notebook section; confirm whether it is still needed before removing it.
PROMPT_PREFIX="""
You are a top-tier content moderation expert specializing in the evaluation of Google Maps location reviews. 
Your task is to parse a JSON object containing review data and accurately classify it according to the following policies and rules.
"""

def create_json_from_row(row):
    """
    Serialise the business and review fields of one row into a single string.

    Args:
        row: Object indexable by the column names "business_name",
            "description", "category" and "text".

    Returns:
        The ``str()`` of a dict with the keys ``business_name``,
        ``business_description``, ``business_category`` and ``review_text``,
        in that order. This is Python's dict repr (single-quoted), not the
        output of ``json.dumps``.
    """
    # (output key, source column) pairs, listed in output order.
    field_sources = (
        ("business_name", "business_name"),
        ("business_description", "description"),
        ("business_category", "category"),
        ("review_text", "text"),
    )
    payload = {out_key: row[column] for out_key, column in field_sources}
    return str(payload)

# Keep the untouched review text in a backup column so this cell is
# idempotent: without it, re-running the cell would serialise the already
# serialised `text` column again.
if "review_text_raw" not in df.columns:
    df["review_text_raw"] = df["text"]

# Replace `text` with the serialised business + review payload, always built
# from the raw review text.
df["text"] = df.assign(text=df["review_text_raw"]).apply(create_json_from_row, axis=1)
df["text"]


4600    {'business_name': 'Lao Sze Chuan', 'business_d...
6089    {'business_name': 'Walmart Supercenter', 'busi...
6988    {'business_name': 'China Harbor', 'business_de...
1803    {'business_name': 'Zacatecas Restaurant', 'bus...
3511    {'business_name': 'Saigon Noodles', 'business_...
                              ...                        
8442    {'business_name': 'Shamrock', 'business_descri...
8446    {'business_name': 'Blue Lagoon Spa', 'business...
8448    {'business_name': 'Sunrise Bakery Cafe', 'busi...
8462    {'business_name': 'Key Village Shopping Center...
8471    {'business_name': 'Pizza Palace Express', 'bus...
Name: text, Length: 1087, dtype: object

In [7]:
# Keep only the model input (`text`) and the integer target (`label`).
df_filtered = df[['text', 'label']]

In [8]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Stratified 80/20 split on `label`, seeded with the notebook-level `seed`.
train_df, val_df = train_test_split(
    df_filtered[['text', 'label']],
    test_size=0.2,
    random_state=seed,
    stratify=df_filtered['label'],
)

# Every row must land in exactly one of the two splits.
assert len(train_df) + len(val_df) == len(df_filtered), "split lost or duplicated rows"

# Reset the index before conversion.
# NOTE(review): presumably this keeps the shuffled pandas index from being
# stored as an extra `__index_level_0__` column — confirm.
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

my_dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
})

print(my_dataset_dict)


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 869
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 218
    })
})


In [9]:
# Class distribution of the validation split.
val_df["label"].value_counts()

label
2    96
0    60
1    35
3    27
Name: count, dtype: int64

In [10]:
# GPU usage: restrict this process to a subset of the machine's GPUs.
# NOTE(review): this needs to run before CUDA is first initialised in the
# kernel to take effect — confirm no earlier cell touches the GPU.
import os

DEFAULT_VISIBLE_GPUS = "1,2,3,4,5,6,7"

# setdefault keeps a value already exported by the shell or a job scheduler
# instead of silently overriding it with the hard-coded list.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", DEFAULT_VISIBLE_GPUS)

In [11]:
import torch

# Pick the compute device, reporting what CUDA exposes when it is available.
if not torch.cuda.is_available():
    print("No GPU available, using CPU.")
    device = torch.device("cpu")
else:
    print(f"GPU is available. Device count: {torch.cuda.device_count()}")
    print(f"Current device: {torch.cuda.current_device()}")
    active_index = torch.cuda.current_device()
    print(f"Device name: {torch.cuda.get_device_name(active_index)}")
    device = torch.device("cuda")

GPU is available. Device count: 7
Current device: 0
Device name: NVIDIA L40S


In [12]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Name of the pretrained checkpoint; the tokenizer is loaded from it.
checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"

tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
)

# Fixed task instruction: names the four labels and their priority order.
INSTRUCTION = " ".join([
    "Task: classify review for THIS business.",
    "Labels: Valid, Advertisement, Irrelevant, Rant_Without_Visit.",
    "Priority: Ad > Irrelevant > No-visit rant > Valid.",
])

def tokenize_function(example):
    """
    Tokenize reviews as (instruction, review) pairs.

    Args:
        example: Mapping with a "text" entry. When called through
            ``map(..., batched=True)`` this is a list of strings; a single
            string (non-batched call) is accepted as well.

    Returns:
        The tokenizer output for the pair(s), truncated to at most 256 tokens.
    """
    texts = example["text"]
    if isinstance(texts, str):
        # Non-batched call: len(texts) would be a character count, so pair the
        # single review with a single instruction instead of a replicated list.
        return tokenizer(INSTRUCTION, texts, truncation=True, max_length=256)
    # Batched call: replicate the instruction so both sequences have equal length.
    instr = [INSTRUCTION] * len(texts)
    return tokenizer(instr, texts, truncation=True, max_length=256)


# Tokenize every split in batches; the collator pads each batch later.
tokenized_datasets = my_dataset_dict.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Remove raw text / index columns if any of them are present
# (prompted_text OR text, and any pandas index column).
raw_column_names = ("prompted_text", "text", "__index_level_0__")
cols_to_drop = []
for column_name in tokenized_datasets["train"].column_names:
    if column_name in raw_column_names:
        cols_to_drop.append(column_name)
if len(cols_to_drop) > 0:
    tokenized_datasets = tokenized_datasets.remove_columns(cols_to_drop)



Map: 100%|██████████| 869/869 [00:00<00:00, 5417.97 examples/s]
Map: 100%|██████████| 218/218 [00:00<00:00, 7805.62 examples/s]


In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Derive the label maps from tag_mapping_dict so the size of the
# classification head and the label names stored in the model config stay in
# sync with the labels used to build the dataset.
label2id = dict(tag_mapping_dict)
id2label = {idx: name for name, idx in label2id.items()}

# ignore_mismatched_sizes=True lets the checkpoint's classifier head, which
# has a different number of outputs, be replaced by a newly initialised one.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# Simple evaluation metrics (minimal change)
def compute_metrics(eval_pred):
    """
    Compute accuracy and F1 scores from an evaluation prediction.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with the keys "accuracy", "f1_weighted" and "f1_macro".
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    scores = {}
    scores["accuracy"] = accuracy_score(labels, predicted_ids)
    scores["f1_weighted"] = f1_score(
        labels, predicted_ids, average="weighted", zero_division=0
    )
    scores["f1_macro"] = f1_score(
        labels, predicted_ids, average="macro", zero_division=0
    )
    return scores

# 2. Configure the training arguments.
training_args = TrainingArguments(
    output_dir=f"./results/{checkpoint}",
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
    # Log the training loss every few steps.
    logging_strategy="steps",
    logging_steps=6,
    # Mixed precision only when a CUDA device is present, so the CPU fallback
    # handled by the device-selection cell does not fail here.
    fp16=torch.cuda.is_available(),
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the highest macro F1.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
)

# Assumes tokenized_datasets has already been prepared, with the splits
# tokenized_datasets['train'] and tokenized_datasets['validation'].
train_split = tokenized_datasets['train']
eval_split = tokenized_datasets["validation"]

# 3. Initialise the Trainer.
trainer = Trainer(
    model=model,                      # NOTE(review): original comment says the model is moved to the GPU automatically
    args=training_args,               # training arguments
    train_dataset=train_split,
    eval_dataset=eval_split,
    data_collator=data_collator,
    processing_class=tokenizer,       # the tokenizer is passed via processing_class
    compute_metrics=compute_metrics,  # evaluation metrics
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/torch/nn/parallel/parallel_apply.py", line 99, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/transformers/models/bert/modeling_bert.py", line 1488, in forward
    outputs = self.bert(
              ^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/transformers/models/bert/modeling_bert.py", line 1006, in forward
    encoder_outputs = self.encoder(
                      ^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/transformers/models/bert/modeling_bert.py", line 653, in forward
    layer_outputs = layer_module(
                    ^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/transformers/modeling_layers.py", line 94, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/transformers/models/bert/modeling_bert.py", line 562, in forward
    self_attention_outputs = self.attention(
                             ^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/transformers/models/bert/modeling_bert.py", line 493, in forward
    self_outputs = self.self(
                   ^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/transformers/models/bert/modeling_bert.py", line 385, in forward
    self.key(current_states)
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zhangguibin/anaconda3/envs/dra_ykm/lib/python3.11/site-packages/torch/nn/modules/linear.py", line 125, in forward
    return F.linear(input, self.weight, self.bias)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 44.40 GiB of which 9.69 MiB is free. Process 2425686 has 36.51 GiB memory in use. Process 2915914 has 5.76 GiB memory in use. Including non-PyTorch memory, this process has 2.11 GiB memory in use. Of the allocated memory 1.45 GiB is allocated by PyTorch, and 52.44 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Class id -> label name, in the fixed order used for the report rows.
id2label = {0: "Valid", 1: "Advertisement", 2: "Irrelevant", 3: "Rant_Without_Visit"}
all_labels = [0, 1, 2, 3]
target_names = [id2label[i] for i in all_labels]

# Predict on the validation split and take the argmax class per example.
predictions = trainer.predict(tokenized_datasets["validation"])
logits = predictions.predictions
labels = predictions.label_ids
preds = np.argmax(logits, axis=-1).astype(int)

# Report per-class metrics. Passing `labels=all_labels` fixes the label set
# so the rows stay aligned with target_names even if a class is never predicted.
print(classification_report(
    labels, preds,
    labels=all_labels,
    target_names=target_names,
    zero_division=0,
    digits=3
))

print("Confusion matrix:\n", confusion_matrix(labels, preds, labels=all_labels))



                    precision    recall  f1-score   support

             Valid      0.806     0.833     0.820        60
     Advertisement      0.943     0.943     0.943        35
        Irrelevant      0.914     0.885     0.899        96
Rant_Without_Visit      0.929     0.963     0.945        27

          accuracy                          0.890       218
         macro avg      0.898     0.906     0.902       218
      weighted avg      0.891     0.890     0.890       218

Confusion matrix:
 [[50  1  7  2]
 [ 1 33  1  0]
 [11  0 85  0]
 [ 0  1  0 26]]


In [None]:
import numpy as np
from datasets import Dataset

# Predict on the validation split with the trained model.
predictions = trainer.predict(tokenized_datasets["validation"])
logits, labels = predictions.predictions, predictions.label_ids
preds = np.argmax(logits, axis=-1).astype(int)

# Examples predicted as class 0 are selected for the second stage.
mask = preds == 0
indices_to_stage2 = np.flatnonzero(mask)
stage2_dataset = tokenized_datasets["validation"].select(indices_to_stage2)
print(stage2_dataset)
stage2_labels = labels[mask]

# Persist the selected subset to disk.
save_path = "./stage2_filtered_data"
stage2_dataset.save_to_disk(save_path)




第一阶段预测为 'label=0' 的数据有 62 条。
已成功创建用于第二阶段预测的新数据集：
Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 62
})
正在将数据集保存到 './stage2_filtered_data' ...


Saving the dataset (1/1 shards): 100%|██████████| 62/62 [00:00<00:00, 15490.04 examples/s]

保存成功！





In [None]:
from datasets import load_from_disk

# Reload the dataset saved under ./stage2_filtered_data and show its summary.
load_path = "./stage2_filtered_data"

loaded_stage2_dataset = load_from_disk(
    load_path,
)
print(loaded_stage2_dataset)




In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Class id -> label name; ids follow the list order.
id2label = dict(enumerate(["Valid", "Advertisement", "Irrelevant", "Rant_Without_Visit"]))
all_labels = list(id2label)
target_names = [id2label[label_id] for label_id in all_labels]

# NOTE(review): this evaluates with the stage-1 `trainer`; a dedicated
# `trainer_stage2` is not defined in the visible notebook — confirm which
# model is intended here.
predictions = trainer.predict(loaded_stage2_dataset)
logits, labels = predictions.predictions, predictions.label_ids
preds = np.argmax(logits, axis=-1).astype(int)

# `labels=all_labels` ensures a fixed label set in both reports.
report_text = classification_report(
    labels,
    preds,
    labels=all_labels,
    target_names=target_names,
    zero_division=0,
    digits=3,
)
print(report_text)

confusion = confusion_matrix(labels, preds, labels=all_labels)
print("Confusion matrix:\n", confusion)



                    precision    recall  f1-score   support

             Valid      0.806     0.833     0.820        60
     Advertisement      0.943     0.943     0.943        35
        Irrelevant      0.914     0.885     0.899        96
Rant_Without_Visit      0.929     0.963     0.945        27

          accuracy                          0.890       218
         macro avg      0.898     0.906     0.902       218
      weighted avg      0.891     0.890     0.890       218

Confusion matrix:
 [[50  1  7  2]
 [ 1 33  1  0]
 [11  0 85  0]
 [ 0  1  0 26]]
