<a href="https://colab.research.google.com/github/yyssophie/ML-for-Trustworthy-Location-Reviews/blob/main/test_colab_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- Colab bootstrap for your repo (GPU + deps + data) ---

REPO_URL = "https://github.com/yyssophie/ML-for-Trustworthy-Location-Reviews.git"
BRANCH = "main"
REPO_DIR = "/content/ML-for-Trustworthy-Location-Reviews"
REQUIREMENTS = "requirements.txt"  # keep None to skip
DATA_REL_PATH = "data/out"  # adjust if needed

import os, sys, subprocess, importlib, json

def run(cmd):
    print(">>", cmd)
    subprocess.check_call(cmd, shell=True)

# 0) Show system GPU (sanity)
run("nvidia-smi || true")

# 1) Get repo @ latest zhx
if not os.path.isdir(REPO_DIR):
    run(f"git clone -b {BRANCH} {REPO_URL} {REPO_DIR}")
else:
    run(f"cd {REPO_DIR} && git fetch origin && git checkout {BRANCH} && git reset --hard origin/{BRANCH} && git clean -fd && git pull --ff-only")

os.chdir(REPO_DIR)
print("CWD:", os.getcwd())

# 2) Install GPU-enabled PyTorch first (CUDA 12.4)
#    (Colab shows CUDA 12.4 in nvidia-smi; use the cu124 wheel index)
#    Uninstall torch and related packages first to ensure a clean install.
run("pip uninstall -y torch torchvision torchaudio || true")
# Install torch specifically for CUDA 12.4, avoiding extra features like XLA
run("pip install --upgrade --index-url https://download.pytorch.org/whl/cu124 torch torchvision torchaudio")


# 3) Install the rest of your Python deps
if REQUIREMENTS and os.path.isfile(REQUIREMENTS):
    run(f"pip install -r {REQUIREMENTS}")

# 3b) Guard against requirements.txt accidentally overwriting torch to CPU wheel
#     If torch CUDA build missing or not available, reinstall cu124 once more.
import torch
need_fix = (torch.version.cuda is None) or (not torch.cuda.is_available())
print("Torch:", torch.__version__, "| Built with CUDA:", torch.version.cuda, "| cuda.is_available:", torch.cuda.is_available())
if need_fix:
    print("Re-installing CUDA-enabled PyTorch (cu124) to fix mismatch...")
    run("pip uninstall -y torch torchvision torchaudio || true")
    run("pip install --upgrade --index-url https://download.pytorch.org/whl/cu124 torch torchvision torchaudio")
    importlib.invalidate_caches()
    import torch  # re-import
    print("Torch after fix:", torch.__version__, "| Built with CUDA:", torch.version.cuda, "| cuda.is_available:", torch.cuda.is_available())


# 4) Final GPU verification + device selection
try:
    import torch
    details = {
        "torch_version": torch.__version__,
        "torch_cuda_build": torch.version.cuda,
        "cuda_available": torch.cuda.is_available(),
    }
    if torch.cuda.is_available():
        details["device_count"] = torch.cuda.device_count()
        details["current_device"] = torch.cuda.current_device()
        details["device_name"] = torch.cuda.get_device_name(0)
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print("GPU check:", json.dumps(details, indent=2))
except Exception as e:
    print("GPU verification error:", e)
    device = "cpu"


# 5) Make repo importable
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)

# 6) Data directory check + quick peek
DATA_DIR = os.path.join(REPO_DIR, DATA_REL_PATH)
print("DATA_DIR exists:", os.path.isdir(DATA_DIR), "->", DATA_DIR)
if os.path.isdir(DATA_DIR):
    run(f"ls -lah {DATA_DIR} | head -n 20 || true")

# 7) Helpful note: from repo root, use repo-root relative paths, e.g.:
#    pd.read_csv('data/label_data_with_cnd/merged_all.csv')

>> nvidia-smi || true
>> git clone -b zhx https://github.com/yyssophie/ML-for-Trustworthy-Location-Reviews.git /content/ML-for-Trustworthy-Location-Reviews
CWD: /content/ML-for-Trustworthy-Location-Reviews
>> pip uninstall -y torch torchvision torchaudio || true
>> pip install --upgrade --index-url https://download.pytorch.org/whl/cu124 torch torchvision torchaudio
>> pip install -r requirements.txt
Torch: 2.6.0+cu124 | Built with CUDA: 12.4 | cuda.is_available: True
GPU check: {
  "torch_version": "2.6.0+cu124",
  "torch_cuda_build": "12.4",
  "cuda_available": true,
  "device_count": 1,
  "current_device": 0,
  "device_name": "Tesla T4"
}
DATA_DIR exists: True -> /content/ML-for-Trustworthy-Location-Reviews/data/out
>> ls -lah /content/ML-for-Trustworthy-Location-Reviews/data/out | head -n 20 || true


In [26]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import random
seed=random.randint(0,100)
print(seed)

87


In [27]:
df = pd.read_csv('data/out/augmented_shuffled.csv')
df = df.dropna(subset=["predicted_label"])
df = df[df["predicted_label"].str.strip() != ""]
df['predicted_label'].value_counts()

Unnamed: 0_level_0,count
predicted_label,Unnamed: 1_level_1
Valid,7681
Irrelevant,480
Advertisement,172
Rant_Without_Visit,135


In [32]:
# Separate the DataFrame into 'valid' and 'non-valid' rows.
df_valid = df[df['predicted_label'] == 'Valid']
df_other = df[df['predicted_label'] != 'Valid']

# Randomly sample 300 rows from the 'valid' DataFrame (for the train/val pool).
df_valid_sampled = df_valid.sample(n=300, random_state=seed)

# Keep the remaining 'Valid' rows to be added to the TEST set later.
df_valid_remaining = df_valid.drop(df_valid_sampled.index)

# Pool used for the 80/20 split = sampled 300 Valid + all Others.
# (Keep original indices so we can slice back into df later.)
df_pool = pd.concat([df_valid_sampled, df_other], ignore_index=False)


In [33]:
#########################################
from sklearn.model_selection import train_test_split

# 80/20 stratified split on the pool
df_train_pool, df_test_base = train_test_split(
    df_pool,
    test_size=0.2,
    random_state=seed,
    stratify=df_pool['predicted_label']
)

# TEST = base 20% of pool + ALL remaining Valid (not used in pool)
df_test = pd.concat([df_test_base, df_valid_remaining], ignore_index=False)

# Build a SMALL validation set (<50) from the training pool, then reduce train accordingly
n_val = min(48, max(1, int(round(0.05 * len(df_train_pool)))))  # < 50
df_val_small = df_train_pool.sample(n=n_val, random_state=seed, replace=False)
df_train = df_train_pool.drop(df_val_small.index)

# Keep index lists to slice after we finish creating 'label' and 'text'
train_idx = df_train.index.tolist()
val_idx   = df_val_small.index.tolist()
test_idx  = df_test.index.tolist()

print(f"Train pool size (pre-val): {len(df_train_pool)}")
print(f"Small validation size (<50): {len(val_idx)}")
print(f"Train size: {len(train_idx)}")
print(f"Test size: {len(test_idx)}")


Train pool size (pre-val): 869
Small validation size (<50): 43
Train size: 826
Test size: 7599


In [34]:
tag_mapping_dict = {
    'Valid': 0,
    'Advertisement': 1,
    'Irrelevant': 2,
    'Rant_Without_Visit': 3,
}

df['label'] = df['predicted_label'].map(tag_mapping_dict)

missing = df['label'].isna()
print("Unmapped labels:", int(missing.sum()))
assert missing.sum() == 0, "Found unmapped labels in predicted_label"


Unmapped labels: 0


In [35]:
df.head(2)

Unnamed: 0,business_name,text,predicted_label,prediction_reason,description,category,label
0,Ashton Burger Barn,The food here is great. Their service is great...,Valid,The review provides a genuine assessment of th...,,['Hamburger restaurant'],0
1,Lagoon Amusement Park,My kids and I love this place. Make sure you g...,Valid,The review describes a genuine experience at t...,Seasonal theme park/water park offering thrill...,"['Amusement park', 'Tourist attraction']",0


In [36]:
PROMPT_PREFIX="""
You are a top-tier content moderation expert specializing in the evaluation of Google Maps location reviews.
Your task is to parse a JSON object containing review data and accurately classify it according to the following policies and rules.
"""

def create_json_from_row(row):
    """
    Converts a DataFrame row into a JSON-formatted string.
    """
    json_object = {
        "business_name": row["business_name"],
        "business_description": row["description"],
        "reviewed_category": row["category"],
        "review_text": row["text"],
        # "review_length": len(row["text"]),
    }
    return str(json_object)
    # return PROMPT_PREFIX+str(json_object)

df['text'] = df.apply(create_json_from_row, axis=1)
df['text']


Unnamed: 0,text
0,"{'business_name': 'Ashton Burger Barn', 'busin..."
1,"{'business_name': 'Lagoon Amusement Park', 'bu..."
2,"{'business_name': 'NOLA Restaurant', 'business..."
3,"{'business_name': ""Raising Cane's Chicken Fing..."
4,"{'business_name': 'Latonia Centre', 'business_..."
...,...
8467,"{'business_name': 'Subway', 'business_descript..."
8468,"{'business_name': 'Chuck E. Cheese', 'business..."
8469,"{'business_name': 'Family Thrift Center', 'bus..."
8470,"{'business_name': 'Hampton Inn Moab', 'busines..."


In [37]:
df_filtered = df.loc[:, ['text', 'label']]

In [38]:
from datasets import Dataset, DatasetDict

# Use the split indices computed earlier to slice the finalized df (with 'text' and 'label')
train_df = df_filtered.loc[train_idx].reset_index(drop=True)
val_df   = df_filtered.loc[val_idx].reset_index(drop=True)     # small validation (<50)
test_df  = df_filtered.loc[test_idx].reset_index(drop=True)    # renamed-from-previous "validation"

train_dataset = Dataset.from_pandas(train_df)
val_dataset   = Dataset.from_pandas(val_df)
test_dataset  = Dataset.from_pandas(test_df)

my_dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,  # small validation for training
    'test': test_dataset        # this is the large holdout set
}).remove_columns([])

print(my_dataset_dict)
print("Sizes — train / val / test:", len(train_df), len(val_df), len(test_df))


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 826
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 43
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7599
    })
})
Sizes — train / val / test: 826 43 7599


In [42]:
test_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,7441
2,96
1,35
3,27


In [43]:
# gpu usage
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"

In [44]:
import torch

if torch.cuda.is_available():
    print(f"GPU is available. Device count: {torch.cuda.device_count()}")
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")
    device = torch.device("cuda")
else:
    print("No GPU available, using CPU.")
    device = torch.device("cpu")

GPU is available. Device count: 1
Current device: 0
Device name: Tesla T4


In [45]:
import torch
print("Torch:", torch.__version__)
print("Built with CUDA:", torch.version.cuda)   # None => CPU-only build
print("CUDA available?", torch.cuda.is_available())



Torch: 2.6.0+cu124
Built with CUDA: 12.4
CUDA available? True


In [46]:
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

INSTRUCTION = (
    "Task: classify review for THIS business. "
    "Labels: Valid, Advertisement, Irrelevant, Rant_Without_Visit. "
    "Priority: Ad > Irrelevant > No-visit rant > Valid."
)

def tokenize_function(example):
    texts = example["text"]
    instr = [INSTRUCTION] * len(texts)   # replicate instruction for each sample
    return tokenizer(instr, texts, truncation=True, max_length=256)

# Map over train/validation/test
tokenized_datasets = my_dataset_dict.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Drop only junk columns (but keep "text" for analysis later)
cols_to_drop = [c for c in tokenized_datasets["train"].column_names
                if c in ["prompted_text", "__index_level_0__"]]
if cols_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(cols_to_drop)

print("Final columns:", tokenized_datasets["train"].column_names)


Map:   0%|          | 0/826 [00:00<?, ? examples/s]

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

Map:   0%|          | 0/7599 [00:00<?, ? examples/s]

Final columns: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask']


In [47]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Add padding token if not already defined and resize token embeddings
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_weighted": f1_score(labels, preds, average="weighted", zero_division=0),
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
    }

training_args = TrainingArguments(
    output_dir=f"./results/{checkpoint}",
    num_train_epochs=8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=1,
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets["validation"],  # small validation (<50)
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,1.589,0.930959,0.674419,0.664647,0.611028
2,0.5839,0.475703,0.860465,0.86423,0.888095
3,0.6087,0.409047,0.860465,0.86289,0.866953
4,0.2688,0.326683,0.883721,0.886662,0.909296
5,0.1512,0.388132,0.906977,0.909633,0.926934
6,0.0376,0.359084,0.883721,0.883165,0.904717
7,0.6019,0.332774,0.906977,0.909633,0.926934
8,0.0103,0.327586,0.906977,0.907361,0.923747


TrainOutput(global_step=416, training_loss=0.4721914251791671, metrics={'train_runtime': 332.9955, 'train_samples_per_second': 19.844, 'train_steps_per_second': 1.249, 'total_flos': 666377529947400.0, 'train_loss': 0.4721914251791671, 'epoch': 8.0})

In [48]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

id2label = {0: "Valid", 1: "Advertisement", 2: "Irrelevant", 3: "Rant_Without_Visit"}
all_labels = [0, 1, 2, 3]
target_names = [id2label[i] for i in all_labels]

# Evaluate on TEST (renamed from the previous large 'validation')
predictions = trainer.predict(tokenized_datasets["test"])
logits = predictions.predictions
labels = predictions.label_ids
preds = np.argmax(logits, axis=-1).astype(int)

print(classification_report(
    labels, preds,
    labels=all_labels,
    target_names=target_names,
    zero_division=0,
    digits=3
))

print("Confusion matrix:\n", confusion_matrix(labels, preds, labels=all_labels))


                    precision    recall  f1-score   support

             Valid      0.997     0.840     0.912      7441
     Advertisement      0.427     0.914     0.582        35
        Irrelevant      0.078     0.781     0.142        96
Rant_Without_Visit      0.081     0.889     0.148        27

          accuracy                          0.840      7599
         macro avg      0.396     0.856     0.446      7599
      weighted avg      0.980     0.840     0.898      7599

Confusion matrix:
 [[6249   42  882  268]
 [   3   32    0    0]
 [  15    0   75    6]
 [   0    1    2   24]]


In [19]:
# --- Inspect misclassified samples on TEST ---

import numpy as np
import pandas as pd

test_ds = tokenized_datasets["test"]

# Indices of wrong predictions (convert to Python ints)
mis_tok_idx = np.where(labels != preds)[0]
mis_idx = [int(i) for i in mis_tok_idx]

# Pull what we need from the tokenized TEST set
subset = test_ds.select(mis_idx)  # safer than row-by-row indexing
texts  = subset["text"]

mis_df = pd.DataFrame({
    "row_in_tokenized": mis_idx,
    "true_label": [id2label[int(l)] for l in labels[mis_idx]],
    "pred_label": [id2label[int(p)] for p in preds[mis_idx]],
    "text": texts,
})

print(f"Total misclassified: {len(mis_df)} / {len(labels)}")
mis_df.head(20)
# mis_df.to_csv("misclassified_test.csv", index=False)


Total misclassified: 25 / 847


Unnamed: 0,row_in_tokenized,true_label,pred_label,text
0,80,Rant_Without_Visit,Irrelevant,"{'business_name': 'Sterling Bazaar', 'business..."
1,86,Irrelevant,Valid,"{'business_name': 'East Side Animal Hospital',..."
2,96,Irrelevant,Valid,"{'business_name': 'Stag Barber Shop', 'busines..."
3,110,Irrelevant,Valid,"{'business_name': 'R & G Food Basket', 'busine..."
4,213,Irrelevant,Valid,"{'business_name': 'Stop N Save Liquors', 'busi..."
5,228,Irrelevant,Valid,"{'business_name': 'Roscoe Village Pub', 'busin..."
6,249,Irrelevant,Valid,"{'business_name': 'Sunset Motel', 'business_de..."
7,264,Rant_Without_Visit,Valid,"{'business_name': 'Public Auto Auctions CA', '..."
8,277,Rant_Without_Visit,Valid,"{'business_name': 'Jack in the Box', 'business..."
9,362,Irrelevant,Valid,"{'business_name': 'Siegen Lane Marketplace', '..."
