<a href="https://colab.research.google.com/github/yyssophie/ML-for-Trustworthy-Location-Reviews/blob/main/test_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Colab bootstrap for your repo (GPU + deps + data) ---
# Clones or refreshes the repo, installs CUDA-enabled PyTorch and the project
# requirements, verifies the GPU, and checks that the data directory exists.

# Git remote and branch that the bootstrap checks out.
REPO_URL = "https://github.com/yyssophie/ML-for-Trustworthy-Location-Reviews.git"
BRANCH = "zhx"
# Local checkout; the bootstrap chdir()s here so later relative paths resolve from the repo root.
REPO_DIR = "/content/ML-for-Trustworthy-Location-Reviews"
REQUIREMENTS = "requirements.txt"  # looked up from the repo root; set to None to skip installing it
DATA_REL_PATH = "data/label_data_with_cnd"  # data directory relative to REPO_DIR; adjust if needed

import os, sys, subprocess, importlib, json

def run(cmd):
    """Echo a shell command line, then execute it through the shell.

    Args:
        cmd: Command line string, interpreted by the shell.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status.
    """
    print(">>", cmd)
    subprocess.run(cmd, shell=True, check=True)

# 0) Show system GPU (sanity); "|| true" keeps the bootstrap going if nvidia-smi fails
run("nvidia-smi || true")

# 1) Get repo @ latest BRANCH: refresh an existing checkout in place, otherwise clone it
if os.path.isdir(REPO_DIR):
    sync_steps = [
        f"cd {REPO_DIR}",
        "git fetch origin",
        f"git checkout {BRANCH}",
        f"git reset --hard origin/{BRANCH}",  # discards local commits and edits
        "git clean -fd",                      # removes untracked files and directories
        "git pull --ff-only",
    ]
    run(" && ".join(sync_steps))
else:
    run(f"git clone -b {BRANCH} {REPO_URL} {REPO_DIR}")

# Work from the repo root so repo-relative paths resolve.
os.chdir(REPO_DIR)
print("CWD:", os.getcwd())

# 2) Install GPU-enabled PyTorch first (CUDA 12.4)
#    (Colab shows CUDA 12.4 in nvidia-smi; use the cu124 wheel index)
#    The uninstall is best-effort ("|| true"); the install must succeed.
for torch_cmd in (
    "pip uninstall -y torch torchvision torchaudio || true",
    "pip install --upgrade --index-url https://download.pytorch.org/whl/cu124 torch torchvision torchaudio",
):
    run(torch_cmd)

# 3) Install the rest of your Python deps (skipped when REQUIREMENTS is unset
#    or the file is absent from the current directory)
requirements_present = bool(REQUIREMENTS) and os.path.isfile(REQUIREMENTS)
if requirements_present:
    run(f"pip install -r {REQUIREMENTS}")

# 3b) Guard against requirements.txt accidentally overwriting torch to CPU wheel
#     If torch CUDA build missing or not available, reinstall cu124 once more.
import torch
need_fix = (torch.version.cuda is None) or (not torch.cuda.is_available())
print("Torch:", torch.__version__, "| Built with CUDA:", torch.version.cuda, "| cuda.is_available:", torch.cuda.is_available())
if need_fix:
    print("Re-installing CUDA-enabled PyTorch (cu124) to fix mismatch...")
    run("pip uninstall -y torch torchvision torchaudio || true")
    run("pip install --upgrade --index-url https://download.pytorch.org/whl/cu124 torch torchvision torchaudio")

    # A second `import torch` in this kernel is a no-op: the module is already in
    # sys.modules, so it would keep reporting the old build. Query the freshly
    # installed package from a new interpreter instead, and capture its output
    # so it is printed through this kernel's stdout.
    verify_snippet = (
        "import torch; "
        "print('Torch after fix:', torch.__version__, "
        "'| Built with CUDA:', torch.version.cuda, "
        "'| cuda.is_available:', torch.cuda.is_available())"
    )
    fresh = subprocess.run(
        [sys.executable, "-c", verify_snippet],
        capture_output=True,
        text=True,
    )
    print((fresh.stdout or fresh.stderr).strip())
    print("Restart the runtime and re-run this cell so this kernel loads the reinstalled torch.")

# 4) Final GPU verification + device selection
#    Any failure here is reported and the notebook falls back to the "cpu" string.
try:
    import torch
    has_cuda = torch.cuda.is_available()
    details = {
        "torch_version": torch.__version__,
        "torch_cuda_build": torch.version.cuda,
        "cuda_available": has_cuda,
    }
    if has_cuda:
        details.update(
            device_count=torch.cuda.device_count(),
            current_device=torch.cuda.current_device(),
            device_name=torch.cuda.get_device_name(0),
        )
    device = torch.device("cuda" if has_cuda else "cpu")
    print("GPU check:", json.dumps(details, indent=2))
except Exception as err:
    print("GPU verification error:", err)
    device = "cpu"

# 5) Make repo importable (append once, so re-running the cell does not duplicate the entry)
if REPO_DIR not in sys.path:
    sys.path += [REPO_DIR]

# 6) Data directory check + quick peek at its first entries
DATA_DIR = os.path.join(REPO_DIR, DATA_REL_PATH)
data_dir_found = os.path.isdir(DATA_DIR)
print("DATA_DIR exists:", data_dir_found, "->", DATA_DIR)
if data_dir_found:
    run(f"ls -lah {DATA_DIR} | head -n 20 || true")

# 7) Helpful note: from repo root, use repo-root relative paths, e.g.:
#    pd.read_csv('data/label_data_with_cnd/merged_all.csv')


>> nvidia-smi || true
>> git clone -b zhx https://github.com/yyssophie/ML-for-Trustworthy-Location-Reviews.git /content/ML-for-Trustworthy-Location-Reviews
CWD: /content/ML-for-Trustworthy-Location-Reviews
>> pip uninstall -y torch torchvision torchaudio || true
>> pip install --upgrade --index-url https://download.pytorch.org/whl/cu124 torch torchvision torchaudio
>> pip install -r requirements.txt
Torch: 2.6.0+cu124 | Built with CUDA: 12.4 | cuda.is_available: True
GPU check: {
  "torch_version": "2.6.0+cu124",
  "torch_cuda_build": "12.4",
  "cuda_available": true,
  "device_count": 1,
  "current_device": 0,
  "device_name": "Tesla T4"
}
DATA_DIR exists: True -> /content/ML-for-Trustworthy-Location-Reviews/data/label_data_with_cnd
>> ls -lah /content/ML-for-Trustworthy-Location-Reviews/data/label_data_with_cnd | head -n 20 || true


In [None]:
import pandas as pd
import random

# Fixed seed so that Restart & Run All reproduces the same sampling and splits.
# 81 is the value that was drawn in the recorded run of this notebook.
seed = 81
random.seed(seed)
print(seed)

81


In [None]:
# Load the labelled reviews and keep only rows whose predicted_label is
# present and not blank after stripping whitespace.
df = (
    pd.read_csv('data/label_data_with_cnd/combined_shuffled.csv')
    .dropna(subset=["predicted_label"])
    .loc[lambda frame: frame["predicted_label"].str.strip() != ""]
)
df['predicted_label'].value_counts()

Unnamed: 0_level_0,count
predicted_label,Unnamed: 1_level_1
Valid,7681
Irrelevant,280
Advertisement,172
Rant_Without_Visit,135


In [None]:
# Split rows by whether they carry the majority 'Valid' label.
is_valid = df['predicted_label'].eq('Valid')
df_valid, df_other = df[is_valid], df[~is_valid]

# Downsample 'Valid' to 300 rows (seeded), keep every other class in full.
df_valid_sampled = df_valid.sample(n=300, random_state=seed)
df = pd.concat([df_valid_sampled, df_other])

# Class balance after downsampling.
print(df['predicted_label'].value_counts())

predicted_label
Valid                 300
Irrelevant            280
Advertisement         172
Rant_Without_Visit    135
Name: count, dtype: int64


In [None]:
#########################################
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Stratified 80/20 train/validation split on predicted_label, with fresh 0..n-1 indices
df_train, df_val = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df,
        test_size=0.2,
        random_state=seed,
        stratify=df['predicted_label'],
    )
)

# Build HF DatasetDict from the two frames
raw_datasets = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (("train", df_train), ("validation", df_val))
})

def add_idx(example, idx):
    """Return the extra "__idx__" column holding the index that map() passes for this row."""
    return dict(__idx__=idx)

# Attach "__idx__" to every row of every split
raw_datasets = raw_datasets.map(add_idx, with_indices=True)

print(raw_datasets)
# Columns are the original CSV fields plus "__idx__"; no "label" column exists at this point.
print(raw_datasets["train"].column_names)


Map:   0%|          | 0/709 [00:00<?, ? examples/s]

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['business_name', 'text', 'predicted_label', 'prediction_reason', 'description', 'category', '__idx__'],
        num_rows: 709
    })
    validation: Dataset({
        features: ['business_name', 'text', 'predicted_label', 'prediction_reason', 'description', 'category', '__idx__'],
        num_rows: 178
    })
})
['business_name', 'text', 'predicted_label', 'prediction_reason', 'description', 'category', '__idx__']


In [None]:
# Class name -> integer id used as the training target.
tag_mapping_dict = dict(
    Valid=0,
    Advertisement=1,
    Irrelevant=2,
    Rant_Without_Visit=3,
)

df['label'] = df['predicted_label'].map(tag_mapping_dict)

# .map() yields NaN for any class name missing from the mapping; fail fast if that happens.
missing = df['label'].isna()
n_unmapped = int(missing.sum())
print("Unmapped labels:", n_unmapped)
assert n_unmapped == 0, "Found unmapped labels in predicted_label"


Unmapped labels: 0


In [None]:
# Peek at the first two rows to confirm the new 'label' column.
df.iloc[:2]

Unnamed: 0,business_name,text,predicted_label,prediction_reason,description,category,label
5824,Steep Mountain Teahouse,Visited for the 1st time on game night. It's a...,Valid,The review describes a genuine visit experienc...,,"['Tea house', 'Bubble tea store', 'Cafe', 'Lun...",0
7467,Palmita Mexican Restaurant,If you haven't eaten here and you live near it...,Valid,The review describes a genuine dining experien...,,['Mexican restaurant'],0


In [None]:
# Natural-language task preamble for the review classifier.
# NOTE(review): no active code in the visible cells reads this constant — confirm
# whether it is still needed.
PROMPT_PREFIX="""
You are a top-tier content moderation expert specializing in the evaluation of Google Maps location reviews.
Your task is to parse a JSON object containing review data and accurately classify it according to the following policies and rules.
"""

def create_json_from_row(row):
    """Serialize the business and review fields of one row into a single string.

    The result is the Python ``str()`` of a dict (single-quoted repr), not
    strict JSON. PROMPT_PREFIX is not prepended.

    Args:
        row: Mapping-like row providing "business_name", "description",
            "category" and "text".

    Returns:
        str: The dict repr with keys business_name, business_description,
        business_category and review_text, in that order.
    """
    # (output key, source column) pairs, in output order
    field_sources = (
        ("business_name", "business_name"),
        ("business_description", "description"),
        ("business_category", "category"),
        ("review_text", "text"),
    )
    payload = {out_key: row[src_key] for out_key, src_key in field_sources}
    return str(payload)

# Keep the raw review in its own column so that re-running this cell rebuilds
# 'text' from the original review instead of wrapping an already-serialized
# string inside another dict.
if 'review_text_raw' not in df.columns:
    df['review_text_raw'] = df['text']

df['text'] = df.assign(text=df['review_text_raw']).apply(create_json_from_row, axis=1)
df['text']


Unnamed: 0,text
5824,"{'business_name': 'Steep Mountain Teahouse', '..."
7467,{'business_name': 'Palmita Mexican Restaurant'...
2586,{'business_name': 'Olive Garden Italian Restau...
6427,"{'business_name': 'Draper Crossing', 'business..."
8175,"{'business_name': 'Wing Snack', 'business_desc..."
...,...
8202,"{'business_name': ""Stanley's Fresh Fruits and ..."
8218,"{'business_name': ""Bella's Hair Studio"", 'busi..."
8219,{'business_name': 'HushPuppies Catfish and Sea...
8243,"{'business_name': 'Mr. Submarine', 'business_d..."


In [None]:
# Keep only the model input and its integer target.
df_filtered = df[['text', 'label']]

In [None]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Stratified 90/10 split on the integer label.
model_frame = df_filtered[['text', 'label']]
train_df, val_df = train_test_split(
    model_frame,
    test_size=0.1,
    random_state=seed,
    stratify=model_frame['label'],
)

# Reset indices so from_pandas does not carry the old index along as a column.
train_dataset, val_dataset = (
    Dataset.from_pandas(part.reset_index(drop=True))
    for part in (train_df, val_df)
)

splits = DatasetDict({'train': train_dataset, 'validation': val_dataset})
my_dataset_dict = splits.remove_columns([])  # empty list: nothing to drop now

print(my_dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 798
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 89
    })
})


In [None]:
# Per-class row counts in the validation split.
val_df.loc[:, 'label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,30
2,28
1,17
3,14


In [None]:
# gpu usage
import os

# A hard-coded "1,2,3" mask assumes a multi-GPU host. The recorded Colab run
# reports a single GPU at index 0, which that mask would exclude for any
# process that initializes CUDA after this cell. Only restrict visibility when
# the caller opts in, e.g. by exporting NOTEBOOK_GPU_IDS="1,2,3" on a
# multi-GPU server before starting the kernel.
GPU_IDS = os.environ.get("NOTEBOOK_GPU_IDS", "").strip()
if GPU_IDS:
    os.environ["CUDA_VISIBLE_DEVICES"] = GPU_IDS

In [None]:
import torch

# Pick the compute device, reporting what was found.
if not torch.cuda.is_available():
    print("No GPU available, using CPU.")
    device = torch.device("cpu")
else:
    print(f"GPU is available. Device count: {torch.cuda.device_count()}")
    active_idx = torch.cuda.current_device()
    print(f"Current device: {active_idx}")
    print(f"Device name: {torch.cuda.get_device_name(active_idx)}")
    device = torch.device("cuda")

GPU is available. Device count: 1
Current device: 0
Device name: Tesla T4


In [None]:
import torch

# Report the installed torch build; a CUDA build of None means a CPU-only wheel.
for caption, value in (
    ("Torch:", torch.__version__),
    ("Built with CUDA:", torch.version.cuda),
    ("CUDA available?", torch.cuda.is_available()),
):
    print(caption, value)



Torch: 2.6.0+cu124
Built with CUDA: 12.4
CUDA available? True


In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Hugging Face checkpoint name; the tokenizer here and the classifier loaded
# later are both created from it.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Fixed instruction passed as the first sequence of every (instruction, review)
# pair in tokenize_function, so it is tokenized together with each review.
INSTRUCTION = (
    "Task: classify review for THIS business. "
    "Labels: Valid, Advertisement, Irrelevant, Rant_Without_Visit. "
    "Priority: Ad > Irrelevant > No-visit rant > Valid."
)

def tokenize_function(example):
    """Tokenize a batch as (INSTRUCTION, review text) sequence pairs.

    Args:
        example: Batch mapping whose "text" entry holds the review strings.

    Returns:
        The tokenizer output for the pairs, truncated to at most 256 tokens.
    """
    reviews = example["text"]
    return tokenizer(
        [INSTRUCTION for _ in reviews],  # one copy of the instruction per review
        reviews,
        truncation=True,
        max_length=256,
    )

# Tokenize both splits of the train/validation dataset dict in batches
tokenized_datasets = my_dataset_dict.map(tokenize_function, batched=True)

# Collator built from the same tokenizer; padding is applied when batches are assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Drop only junk columns if they are present (but keep "text" for analysis later)
junk_columns = {"prompted_text", "__index_level_0__"}
cols_to_drop = [
    name for name in tokenized_datasets["train"].column_names
    if name in junk_columns
]
if cols_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(cols_to_drop)

# Expected to still contain "text", "label" and the tokenized features
print("Final columns:", tokenized_datasets["train"].column_names)


Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Map:   0%|          | 0/89 [00:00<?, ? examples/s]

Final columns: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask']


In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Load the checkpoint with a classification head sized for the four classes in
# tag_mapping_dict. The head (classifier.weight / classifier.bias) is not part of
# the checkpoint and is newly initialized, as the load-time warning reports.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=4)

# Simple evaluation metrics (minimal change)
def compute_metrics(eval_pred):
    """Compute accuracy plus weighted and macro F1 from an evaluation result.

    Args:
        eval_pred: Pair of (logits, labels); the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        dict: Keys "accuracy", "f1_weighted" and "f1_macro".
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    def _f1(average):
        # zero_division=0: report 0 instead of warning when the score is undefined
        return f1_score(labels, preds, average=average, zero_division=0)

    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_weighted": _f1("weighted"),
        "f1_macro": _f1("macro"),
    }

# 2. Configure training arguments
import torch  # local import so this cell also works on a fresh kernel

training_args = TrainingArguments(
    output_dir=f"./results/{checkpoint}",
    num_train_epochs=8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
    # Log the training loss at every optimizer step; raise logging_steps for less output
    logging_strategy="steps",
    logging_steps=1,
    # Mixed precision needs a CUDA device; enable it only when one is available
    # so the notebook still runs after falling back to CPU.
    fp16=torch.cuda.is_available(),
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with the
    # highest macro F1 when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    report_to=[],  # no external experiment trackers
)

# Assumes tokenized_datasets has already been prepared;
# the splits used here are tokenized_datasets['train'] and tokenized_datasets['validation'].

# 3. Initialize the Trainer (it moves the model to the GPU automatically)
trainer = Trainer(
    model=model,
    args=training_args,                  # training arguments
    data_collator=data_collator,
    processing_class=tokenizer,          # tokenizer handed over as the processing class
    compute_metrics=compute_metrics,     # adds the evaluation metrics
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# 4. Start training
#    During .train(), each batch is also sent to the GPU automatically
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,0.9686,0.914562,0.516854,0.38863,0.365626
2,0.5712,0.43024,0.842697,0.840448,0.861592
3,0.6971,0.404905,0.842697,0.837224,0.860871
4,0.5969,0.417902,0.88764,0.887302,0.899896
5,0.0764,0.456096,0.865169,0.86393,0.878859
6,0.017,0.493626,0.88764,0.88764,0.904167
7,0.4265,0.501536,0.88764,0.88764,0.904167
8,0.0133,0.512366,0.88764,0.88764,0.904167


TrainOutput(global_step=400, training_loss=0.41609187055611985, metrics={'train_runtime': 193.3728, 'train_samples_per_second': 33.014, 'train_steps_per_second': 2.069, 'total_flos': 637706316087360.0, 'train_loss': 0.41609187055611985, 'epoch': 8.0})

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Integer id -> class name, listed in id order for the report and the matrix.
id2label = {0: "Valid", 1: "Advertisement", 2: "Irrelevant", 3: "Rant_Without_Visit"}
all_labels = sorted(id2label)
target_names = list(map(id2label.__getitem__, all_labels))

# Predict on the validation split; the predicted class is the argmax of the logits.
predictions = trainer.predict(tokenized_datasets["validation"])
logits, labels = predictions.predictions, predictions.label_ids
preds = np.argmax(logits, axis=-1).astype(int)

# Pass the fixed label set so every class gets a row even if it is never predicted.
report = classification_report(
    labels,
    preds,
    labels=all_labels,
    target_names=target_names,
    zero_division=0,
    digits=3,
)
print(report)

print("Confusion matrix:\n", confusion_matrix(labels, preds, labels=all_labels))


                    precision    recall  f1-score   support

             Valid      0.867     0.867     0.867        30
     Advertisement      1.000     1.000     1.000        17
        Irrelevant      0.821     0.821     0.821        28
Rant_Without_Visit      0.929     0.929     0.929        14

          accuracy                          0.888        89
         macro avg      0.904     0.904     0.904        89
      weighted avg      0.888     0.888     0.888        89

Confusion matrix:
 [[26  0  4  0]
 [ 0 17  0  0]
 [ 4  0 23  1]
 [ 0  0  1 13]]


In [None]:
# --- Inspect misclassified samples (robust) ---

import numpy as np
import pandas as pd

val_ds = tokenized_datasets["validation"]

# Positions of wrong predictions, as plain Python ints
mis_tok_idx = np.flatnonzero(labels != preds)
mis_idx = mis_tok_idx.tolist()

# Pull the matching rows from the tokenized validation set in one call
subset = val_ds.select(mis_idx)
texts = subset["text"]

# One row per misclassified sample: position, true vs. predicted class name, input text
mis_df = pd.DataFrame({
    "row_in_tokenized": mis_idx,
    "true_label": [id2label[int(labels[pos])] for pos in mis_idx],
    "pred_label": [id2label[int(preds[pos])] for pos in mis_idx],
    "text": texts,
})

print(f"Total misclassified: {len(mis_df)} / {len(labels)}")
mis_df.head(20)


Total misclassified: 10 / 89


Unnamed: 0,row_in_tokenized,true_label,pred_label,text
0,1,Valid,Irrelevant,"{'business_name': ""Parrot's Cay Tavern & Grill..."
1,27,Irrelevant,Valid,"{'business_name': 'Seagull Book', 'business_de..."
2,29,Irrelevant,Valid,"{'business_name': 'Gourmet Chili', 'business_d..."
3,34,Rant_Without_Visit,Irrelevant,"{'business_name': ""Sportsman's Warehouse"", 'bu..."
4,39,Valid,Irrelevant,{'business_name': 'Daniel Boone National Fores...
5,48,Irrelevant,Valid,"{'business_name': 'Meredith & Son Glass', 'bus..."
6,65,Valid,Irrelevant,"{'business_name': 'Market Basket', 'business_d..."
7,68,Valid,Irrelevant,"{'business_name': ""Arby's"", 'business_descript..."
8,82,Irrelevant,Rant_Without_Visit,"{'business_name': 'Stratford Square Mall', 'bu..."
9,83,Irrelevant,Valid,{'business_name': 'Los Angeles Harley-Davidson...
