In [6]:
import pandas as pd
import random
seed=random.randint(0,100)
print(seed)

71


In [7]:
df=pd.read_csv('../data/label_data_with_cnd/merged_all.csv')
df = df.dropna(subset=["predicted_label"])
df = df[df["predicted_label"].str.strip() != ""]
df['predicted_label'].value_counts()

predicted_label
Valid                 7681
Irrelevant             280
Rant_Without_Visit      22
Advertisement           13
Name: count, dtype: int64

In [8]:
# Separate the DataFrame into 'valid' and 'non-valid' rows.
df_valid = df[df['predicted_label'] == 'Valid']
df_other = df[df['predicted_label'] != 'Valid']

# Randomly sample 300 rows from the 'valid' DataFrame.
df_valid_sampled = df_valid.sample(n=300, random_state=seed)

# Concatenate the sampled 'valid' rows with the 'non-valid' rows.
df = pd.concat([df_valid_sampled, df_other])

# Print the value counts of the updated DataFrame.
print(df['predicted_label'].value_counts())

predicted_label
Valid                 300
Irrelevant            280
Rant_Without_Visit     22
Advertisement          13
Name: count, dtype: int64


In [9]:
tag_mapping_dict = {
    'Valid': 0,
    'Advertisement': 1,
    'Irrelevant': 2,
    'Rant_Without_Visit': 3,
}

df['label'] = df['predicted_label'].map(tag_mapping_dict)


# REPLACE this:
# df[df['label']==None]
# WITH:
missing = df['label'].isna()
print("Unmapped labels:", int(missing.sum()))
assert missing.sum() == 0, "Found unmapped labels in predicted_label"


Unmapped labels: 0


In [14]:
df.head(2)

Unnamed: 0,business_name,text,predicted_label,prediction_reason,name,description,category,index,rating,error,label,review_info_json
2242,Food 4 Less,"The price is right, good location easy access.",Valid,The review describes a genuine experience at t...,Food 4 Less,No-frills supermarket chain stocking a wide se...,"['Grocery store', 'Propane supplier']",,,,0,"{'business_name': 'Food 4 Less', 'business_des..."
3585,Meg-A-Latte | Coffee House,The staff are always friendly and their flavor...,Valid,The review describes a genuine experience at t...,Meg-A-Latte | Coffee House,,['Coffee shop'],,,,0,{'business_name': 'Meg-A-Latte | Coffee House'...


In [None]:
PROMPT_PREFIX="""
You are a top-tier content moderation expert specializing in the evaluation of Google Maps location reviews. 
Your task is to parse a JSON object containing review data and accurately classify it according to the following policies and rules.
"""

def create_json_from_row(row):
    """
    Converts a DataFrame row into a JSON-formatted string.
    """
    json_object = {
        "business_name": row["business_name"],
        "business_description": row["description"],
        "reviewed_category": row["category"],
        "review_text": row["text"],
        # "review_length": len(row["text"]),
    }
    return str(json_object)
    # return PROMPT_PREFIX+str(json_object)

df['text'] = df.apply(create_json_from_row, axis=1)
df['text']


'{\'business_name\': \'Food 4 Less\', \'business_description\': \'No-frills supermarket chain stocking a wide selection of bulk grocery items, produce & more.\', \'reviewed_category\': "[\'Grocery store\', \'Propane supplier\']", \'review_text\': \'{\\\'business_name\\\': \\\'Food 4 Less\\\', \\\'business_description\\\': \\\'No-frills supermarket chain stocking a wide selection of bulk grocery items, produce & more.\\\', \\\'reviewed_category\\\': "[\\\'Grocery store\\\', \\\'Propane supplier\\\']", \\\'review_text\\\': \\\'{\\\\\\\'business_name\\\\\\\': \\\\\\\'Food 4 Less\\\\\\\', \\\\\\\'business_description\\\\\\\': \\\\\\\'No-frills supermarket chain stocking a wide selection of bulk grocery items, produce & more.\\\\\\\', \\\\\\\'reviewed_category\\\\\\\': "[\\\\\\\'Grocery store\\\\\\\', \\\\\\\'Propane supplier\\\\\\\']", \\\\\\\'review_text\\\\\\\': \\\\\\\'The price is right, good location easy access.\\\\\\\'}\\\'}\'}'

In [26]:
df_filtered = df.loc[:, ['text', 'label']]

In [27]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    df_filtered[['text','label']],
    test_size=0.1,
    random_state=seed,
    stratify=df_filtered['label']
)

train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset   = Dataset.from_pandas(val_df.reset_index(drop=True))

my_dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
}).remove_columns([])  # nothing to drop now

print(my_dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 553
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 62
    })
})


In [28]:
val_df['label'].value_counts()

label
0    30
2    28
1     2
3     2
Name: count, dtype: int64

In [29]:
# gpu usage
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"

In [30]:
import torch

if torch.cuda.is_available():
    print(f"GPU is available. Device count: {torch.cuda.device_count()}")
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")
    device = torch.device("cuda")
else:
    print("No GPU available, using CPU.")
    device = torch.device("cpu")

GPU is available. Device count: 3
Current device: 0
Device name: NVIDIA L40S


In [31]:
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

INSTRUCTION = (
    "Task: classify review for THIS business. "
    "Labels: Valid, Advertisement, Irrelevant, Rant_Without_Visit. "
    "Priority: Ad > Irrelevant > No-visit rant > Valid."
)

def tokenize_function(example):
    texts = example["text"]
    # texts is a list when batched=True; replicate the instruction to match length
    instr = [INSTRUCTION] * len(texts)
    return tokenizer(instr, texts, truncation=True, max_length=256)


tokenized_datasets = my_dataset_dict.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# drop any raw columns that may exist (prompted_text OR text, and any index col)
cols_to_drop = [c for c in tokenized_datasets["train"].column_names 
                if c in ["prompted_text", "text", "__index_level_0__"]]
if cols_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(cols_to_drop)



Map: 100%|██████████| 553/553 [00:00<00:00, 2320.13 examples/s]
Map: 100%|██████████| 62/62 [00:00<00:00, 4815.24 examples/s]


In [32]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=4)

# 简单评估指标（最小改动）
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_weighted": f1_score(labels, preds, average="weighted", zero_division=0),
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
    }

# 2. 设置训练参数
training_args = TrainingArguments(
    output_dir=f"./results/{checkpoint}",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
    # Add these two lines to show training loss
    logging_strategy="steps",
    logging_steps=1,  # Log every 50 steps, you can adjust this value
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
)

# 假设你已经准备好了 tokenized_datasets
# tokenized_datasets['train'], tokenized_datasets['test']

# 3. 初始化 Trainer
trainer = Trainer(
    model=model,                                   # 模型会被自动移动到GPU
    args=training_args,                            # 训练参数
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,                           # （替换 processing_class）
    compute_metrics=compute_metrics,               # 增加评估指标
)

# 4. 开始训练
#    在调用 .train() 时，每一批数据也会被自动发送到GPU
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,0.9918,0.956041,0.483871,0.315568,0.163043
2,0.9043,0.872912,0.629032,0.606722,0.323776
3,0.8797,0.875558,0.612903,0.592486,0.316463




TrainOutput(global_step=36, training_loss=1.0138625254233677, metrics={'train_runtime': 14.8343, 'train_samples_per_second': 111.836, 'train_steps_per_second': 2.427, 'total_flos': 218254539589632.0, 'train_loss': 1.0138625254233677, 'epoch': 3.0})

In [33]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

id2label = {0: "Valid", 1: "Advertisement", 2: "Irrelevant", 3: "Rant_Without_Visit"}
all_labels = [0, 1, 2, 3]
target_names = [id2label[i] for i in all_labels]

predictions = trainer.predict(tokenized_datasets["validation"])
logits = predictions.predictions
labels = predictions.label_ids
preds = np.argmax(logits, axis=-1).astype(int)

print(classification_report(
    labels, preds,
    labels=all_labels,              # <-- ensure fixed label set
    target_names=target_names,
    zero_division=0,
    digits=3
))

print("Confusion matrix:\n", confusion_matrix(labels, preds, labels=all_labels))




                    precision    recall  f1-score   support

             Valid      0.629     0.733     0.677        30
     Advertisement      0.000     0.000     0.000         2
        Irrelevant      0.630     0.607     0.618        28
Rant_Without_Visit      0.000     0.000     0.000         2

          accuracy                          0.629        62
         macro avg      0.315     0.335     0.324        62
      weighted avg      0.588     0.629     0.607        62

Confusion matrix:
 [[22  0  8  0]
 [ 1  0  1  0]
 [11  0 17  0]
 [ 1  0  1  0]]
