In [1]:
# --- Colab bootstrap for your repo (GPU + deps + data) ---

REPO_URL = "https://github.com/yyssophie/ML-for-Trustworthy-Location-Reviews.git"
BRANCH = "zhx"
REPO_DIR = "/content/ML-for-Trustworthy-Location-Reviews"
REQUIREMENTS = "requirements.txt"  # keep None to skip
DATA_REL_PATH = "data/label_data_with_cnd"  # adjust if needed

import os, sys, subprocess, importlib, json

def run(cmd):
    print(">>", cmd)
    subprocess.check_call(cmd, shell=True)

# 0) Show system GPU (sanity)
run("nvidia-smi || true")

# 1) Get repo @ latest zhx
if not os.path.isdir(REPO_DIR):
    run(f"git clone -b {BRANCH} {REPO_URL} {REPO_DIR}")
else:
    run(f"cd {REPO_DIR} && git fetch origin && git checkout {BRANCH} && git reset --hard origin/{BRANCH} && git clean -fd && git pull --ff-only")

os.chdir(REPO_DIR)
print("CWD:", os.getcwd())

# 2) Install GPU-enabled PyTorch first (CUDA 12.4)
#    (Colab shows CUDA 12.4 in nvidia-smi; use the cu124 wheel index)
run("pip uninstall -y torch torchvision torchaudio || true")
run("pip install --upgrade --index-url https://download.pytorch.org/whl/cu124 torch torchvision torchaudio")

# 3) Install the rest of your Python deps
if REQUIREMENTS and os.path.isfile(REQUIREMENTS):
    run(f"pip install -r {REQUIREMENTS}")

# 3b) Guard against requirements.txt accidentally overwriting torch to CPU wheel
#     If torch CUDA build missing or not available, reinstall cu124 once more.
import torch
need_fix = (torch.version.cuda is None) or (not torch.cuda.is_available())
print("Torch:", torch.__version__, "| Built with CUDA:", torch.version.cuda, "| cuda.is_available:", torch.cuda.is_available())
if need_fix:
    print("Re-installing CUDA-enabled PyTorch (cu124) to fix mismatch...")
    run("pip uninstall -y torch torchvision torchaudio || true")
    run("pip install --upgrade --index-url https://download.pytorch.org/whl/cu124 torch torchvision torchaudio")
    importlib.invalidate_caches()
    import torch  # re-import
    print("Torch after fix:", torch.__version__, "| Built with CUDA:", torch.version.cuda, "| cuda.is_available:", torch.cuda.is_available())

# 4) Final GPU verification + device selection
try:
    import torch
    details = {
        "torch_version": torch.__version__,
        "torch_cuda_build": torch.version.cuda,
        "cuda_available": torch.cuda.is_available(),
    }
    if torch.cuda.is_available():
        details["device_count"] = torch.cuda.device_count()
        details["current_device"] = torch.cuda.current_device()
        details["device_name"] = torch.cuda.get_device_name(0)
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print("GPU check:", json.dumps(details, indent=2))
except Exception as e:
    print("GPU verification error:", e)
    device = "cpu"

# 5) Make repo importable
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)

# 6) Data directory check + quick peek
DATA_DIR = os.path.join(REPO_DIR, DATA_REL_PATH)
print("DATA_DIR exists:", os.path.isdir(DATA_DIR), "->", DATA_DIR)
if os.path.isdir(DATA_DIR):
    run(f"ls -lah {DATA_DIR} | head -n 20 || true")

# 7) Helpful note: from repo root, use repo-root relative paths, e.g.:
#    pd.read_csv('data/label_data_with_cnd/merged_all.csv')


>> nvidia-smi || true
>> cd /content/ML-for-Trustworthy-Location-Reviews && git fetch origin && git checkout zhx && git reset --hard origin/zhx && git clean -fd && git pull --ff-only
CWD: /content/ML-for-Trustworthy-Location-Reviews
>> pip uninstall -y torch torchvision torchaudio || true
>> pip install --upgrade --index-url https://download.pytorch.org/whl/cu124 torch torchvision torchaudio
>> pip install -r requirements.txt
Torch: 2.6.0+cu124 | Built with CUDA: 12.4 | cuda.is_available: True
GPU check: {
  "torch_version": "2.6.0+cu124",
  "torch_cuda_build": "12.4",
  "cuda_available": true,
  "device_count": 1,
  "current_device": 0,
  "device_name": "Tesla T4"
}
DATA_DIR exists: True -> /content/ML-for-Trustworthy-Location-Reviews/data/label_data_with_cnd
>> ls -lah /content/ML-for-Trustworthy-Location-Reviews/data/label_data_with_cnd | head -n 20 || true


In [2]:
import pandas as pd
import random
seed=random.randint(0,100)
print(seed)

3


In [3]:
df=pd.read_csv('data/label_data_with_cnd/merged_all.csv')
df = df.dropna(subset=["predicted_label"])
df = df[df["predicted_label"].str.strip() != ""]
df['predicted_label'].value_counts()

Unnamed: 0_level_0,count
predicted_label,Unnamed: 1_level_1
Valid,7681
Irrelevant,280
Rant_Without_Visit,22
Advertisement,13


In [4]:
# Separate the DataFrame into 'valid' and 'non-valid' rows.
df_valid = df[df['predicted_label'] == 'Valid']
df_other = df[df['predicted_label'] != 'Valid']

# Randomly sample 300 rows from the 'valid' DataFrame.
df_valid_sampled = df_valid.sample(n=300, random_state=seed)

# Concatenate the sampled 'valid' rows with the 'non-valid' rows.
df = pd.concat([df_valid_sampled, df_other])

# Print the value counts of the updated DataFrame.
print(df['predicted_label'].value_counts())

predicted_label
Valid                 300
Irrelevant            280
Rant_Without_Visit     22
Advertisement          13
Name: count, dtype: int64


In [5]:
tag_mapping_dict = {
    'Valid': 0,
    'Advertisement': 1,
    'Irrelevant': 2,
    'Rant_Without_Visit': 3,
}

df['label'] = df['predicted_label'].map(tag_mapping_dict)


# REPLACE this:
# df[df['label']==None]
# WITH:
missing = df['label'].isna()
print("Unmapped labels:", int(missing.sum()))
assert missing.sum() == 0, "Found unmapped labels in predicted_label"


Unmapped labels: 0


In [6]:
df.head(2)

Unnamed: 0,business_name,text,predicted_label,prediction_reason,name,description,category,index,rating,error,label
1834,Lucky Brand,The customer service was pleasant. The variety...,Valid,The review describes a genuine shopping experi...,Lucky Brand,"Trendy chain known for house-label jeans, a va...","['Clothing store', ""Men's clothing store"", ""Wo...",,,,0
1885,Fleetwood Roller Rink,Old Fashion Family. Took our girls for the fir...,Valid,The review describes a genuine family experien...,Fleetwood Roller Rink,Old school-style roller rink with open skating...,['Roller skating rink'],,,,0


In [7]:
PROMPT_PREFIX="""
You are a top-tier content moderation expert specializing in the evaluation of Google Maps location reviews.
Your task is to parse a JSON object containing review data and accurately classify it according to the following policies and rules.
"""

def create_json_from_row(row):
    """
    Converts a DataFrame row into a JSON-formatted string.
    """
    json_object = {
        "business_name": row["business_name"],
        "business_description": row["description"],
        "reviewed_category": row["category"],
        "review_text": row["text"],
        # "review_length": len(row["text"]),
    }
    return str(json_object)
    # return PROMPT_PREFIX+str(json_object)

df['text'] = df.apply(create_json_from_row, axis=1)
df['text']


Unnamed: 0,text
1834,"{'business_name': 'Lucky Brand', 'business_des..."
1885,"{'business_name': 'Fleetwood Roller Rink', 'bu..."
5292,{'business_name': 'AutoNation Toyota Gulf Free...
3093,"{'business_name': 'Ulta Beauty', 'business_des..."
7930,{'business_name': 'Regal Hamburg Pavilion IMAX...
...,...
7799,"{'business_name': 'Taco Bell', 'business_descr..."
7886,"{'business_name': 'Gourmet Chili', 'business_d..."
7913,"{'business_name': 'Big Lots', 'business_descri..."
7928,{'business_name': 'Key Village Shopping Center...


In [8]:
df_filtered = df.loc[:, ['text', 'label']]

In [9]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    df_filtered[['text','label']],
    test_size=0.1,
    random_state=seed,
    stratify=df_filtered['label']
)

train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset   = Dataset.from_pandas(val_df.reset_index(drop=True))

my_dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
}).remove_columns([])  # nothing to drop now

print(my_dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 553
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 62
    })
})


In [10]:
val_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,30
2,28
1,2
3,2


In [11]:
# gpu usage
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"

In [12]:
import torch

if torch.cuda.is_available():
    print(f"GPU is available. Device count: {torch.cuda.device_count()}")
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")
    device = torch.device("cuda")
else:
    print("No GPU available, using CPU.")
    device = torch.device("cpu")

GPU is available. Device count: 1
Current device: 0
Device name: Tesla T4


In [13]:
import torch
print("Torch:", torch.__version__)
print("Built with CUDA:", torch.version.cuda)   # None => CPU-only build
print("CUDA available?", torch.cuda.is_available())



Torch: 2.6.0+cu124
Built with CUDA: 12.4
CUDA available? True


In [14]:
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

INSTRUCTION = (
    "Task: classify review for THIS business. "
    "Labels: Valid, Advertisement, Irrelevant, Rant_Without_Visit. "
    "Priority: Ad > Irrelevant > No-visit rant > Valid."
)

def tokenize_function(example):
    texts = example["text"]
    # texts is a list when batched=True; replicate the instruction to match length
    instr = [INSTRUCTION] * len(texts)
    return tokenizer(instr, texts, truncation=True, max_length=256)


tokenized_datasets = my_dataset_dict.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# drop any raw columns that may exist (prompted_text OR text, and any index col)
cols_to_drop = [c for c in tokenized_datasets["train"].column_names
                if c in ["prompted_text", "text", "__index_level_0__"]]
if cols_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(cols_to_drop)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/553 [00:00<?, ? examples/s]

Map:   0%|          | 0/62 [00:00<?, ? examples/s]

In [20]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=4)

# 简单评估指标（最小改动）
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_weighted": f1_score(labels, preds, average="weighted", zero_division=0),
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
    }

# 2. 设置训练参数
training_args = TrainingArguments(
    output_dir=f"./results/{checkpoint}",
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
    # Add these two lines to show training loss
    logging_strategy="steps",
    logging_steps=1,  # Log every 50 steps, you can adjust this value
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    report_to=[],
)

# 假设你已经准备好了 tokenized_datasets
# tokenized_datasets['train'], tokenized_datasets['test']

# 3. 初始化 Trainer
trainer = Trainer(
    model=model,                                   # 模型会被自动移动到GPU
    args=training_args,                            # 训练参数
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,                           # （替换 processing_class）
    compute_metrics=compute_metrics,               # 增加评估指标
)

# 4. 开始训练
#    在调用 .train() 时，每一批数据也会被自动发送到GPU
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,1.208,0.890743,0.629032,0.60327,0.322964
2,0.4711,0.774532,0.693548,0.668443,0.3569
3,0.1611,0.807544,0.725806,0.701604,0.375139
4,0.1554,0.70189,0.741935,0.717823,0.383838
5,0.8327,0.806709,0.725806,0.701938,0.375246
6,0.0852,0.81566,0.790323,0.763441,0.408333


TrainOutput(global_step=210, training_loss=0.593023455852554, metrics={'train_runtime': 181.1325, 'train_samples_per_second': 18.318, 'train_steps_per_second': 1.159, 'total_flos': 343641542357568.0, 'train_loss': 0.593023455852554, 'epoch': 6.0})

In [21]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

id2label = {0: "Valid", 1: "Advertisement", 2: "Irrelevant", 3: "Rant_Without_Visit"}
all_labels = [0, 1, 2, 3]
target_names = [id2label[i] for i in all_labels]

predictions = trainer.predict(tokenized_datasets["validation"])
logits = predictions.predictions
labels = predictions.label_ids
preds = np.argmax(logits, axis=-1).astype(int)

print(classification_report(
    labels, preds,
    labels=all_labels,              # <-- ensure fixed label set
    target_names=target_names,
    zero_division=0,
    digits=3
))

print("Confusion matrix:\n", confusion_matrix(labels, preds, labels=all_labels))


                    precision    recall  f1-score   support

             Valid      0.800     0.800     0.800        30
     Advertisement      0.000     0.000     0.000         2
        Irrelevant      0.781     0.893     0.833        28
Rant_Without_Visit      0.000     0.000     0.000         2

          accuracy                          0.790        62
         macro avg      0.395     0.423     0.408        62
      weighted avg      0.740     0.790     0.763        62

Confusion matrix:
 [[24  0  6  0]
 [ 2  0  0  0]
 [ 3  0 25  0]
 [ 1  0  1  0]]
