# **Problem Statement 1**  
### **Filtering the Noise: ML for Trustworthy Location Reviews**  
**Team 3Pandas** *(Tran Ha My, Diane Teo Min Xuan, Ng Yuen Ning)*  

---

## **Problem Statement**  
Design and implement an **ML-based system** to evaluate the **quality** and **relevancy** of Google location reviews. The system should:  

- **Gauge review quality:** Detect spam, advertisements, irrelevant content, and rants from users who have likely never visited the location.  
- **Assess relevancy:** Determine whether the content of a review is genuinely related to the location being reviewed.  
- **Enforce policies:** Automatically flag or filter out reviews that violate the following example policies:  
  - No advertisements or promotional content.  
  - No irrelevant content (e.g., reviews about unrelated topics).  
  - No rants or complaints from users who have not visited the place (can be inferred from content, metadata, or other signals).  

---

## **Motivation & Impact**  
- **For Users:** Increases trust in location-based reviews, leading to better decision-making.  
- **For Businesses:** Ensures fair representation and reduces the impact of malicious or irrelevant reviews.  
- **For Platforms:** Automates moderation, reduces manual workload, and enhances platform credibility.  

---

## **Data Sources**  

| **Data Sources**       | **Details** |
|-------------------------|-------------|
| **Public Datasets**    | - **Google Review Data:** Open datasets containing Google location reviews (e.g., [Google Local Reviews on Kaggle](https://www.kaggle.com/datasets/denizbilginn/google-maps-restaurant-reviews))<br>- **Google Local review data:** [UCSD Public Dataset](https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal/)<br>- **Alternative Sources:** Yelp, TripAdvisor, or other open review datasets for supplementary training. |
| **Student-Crawled Data** | - Students are encouraged to crawl additional reviews from Google Maps (in compliance with Google's terms of service).<br>- **Example:** [Scraping Google Reviews (YouTube)](https://www.youtube.com/watch?v=LYMdZ7W9bWQ) |


### Dependencies

In [107]:
import yaml
import os
import json

# ! pip install tldextract
import re
import tldextract

from transformers import pipeline
from tqdm import tqdm

# ! pip install textblob
from textblob import TextBlob
import pandas as pd

import torch
from transformers import pipeline

from sklearn.model_selection import train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import numpy as np

import torch
from torch.utils.data import IterableDataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
import torch.nn as nn 
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import precision_recall_fscore_support, average_precision_score



### 1. Load Data

In [83]:
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)


labeled_input_folder = config['labeled_input']

full_df = pd.read_csv('data/labeled/all_reviews_with_labels_normalised.csv')
full_df.isnull().sum()

review_text               0
rating                  253
has_photo                 0
author_name               0
user_review_count       253
business_name             0
category                  0
source                    0
review_id                 0
comprehensive_review      0
is_ad                     0
is_relevant               0
is_rant                   0
is_legit                  0
dtype: int64

In [84]:
# Save as JSON
output_json_path = os.path.join(labeled_input_folder, "full_df.json")
full_df.to_json(output_json_path, orient="records", lines=True, force_ascii=False)
print(f"JSON file saved to: {output_json_path}")

# Save as Parquet
output_parquet_path = os.path.join(labeled_input_folder, "full_df.parquet")
full_df.to_parquet(output_parquet_path, index=False)
print(f"Parquet file saved to: {output_parquet_path}")

JSON file saved to: data/labeled\full_df.json
Parquet file saved to: data/labeled\full_df.parquet


In [85]:
to_clean_df = full_df.dropna(subset=['review_text', 'is_ad', 'is_relevant', 'is_rant', 'is_legit'])

to_clean_df.head()

Unnamed: 0,review_text,rating,has_photo,author_name,user_review_count,business_name,category,source,review_id,comprehensive_review,is_ad,is_relevant,is_rant,is_legit
0,Love the convenience of this neighborhood carw...,4.0,False,Doug Schmidt,1.0,"Auto Spa Speedy Wash - Harvester, MO",['Car wash'],google,1001,"[Business] Auto Spa Speedy Wash - Harvester, M...",False,True,False,True
1,"2 bathrooms (for a large 2 story building), 1 ...",2.0,False,Duf Duftopia,1.0,Kmart,"['Discount store', 'Appliance store', 'Baby st...",google,1002,[Business] Kmart | [Category] ['Discount store...,True,True,True,False
2,My favorite pizza shop hands down!,5.0,False,Andrew Phillips,1.0,Papa’s Pizza,"['Pizza restaurant', 'Chicken wings restaurant...",google,1003,[Business] Papa’s Pizza | [Category] ['Pizza r...,False,True,False,True
3,BOTCHED INSTRUMENT REPAIR IS COSTING US HUNDRE...,1.0,False,Julie Heiland,1.0,The Music Place,['Musical instrument store'],google,1004,[Business] The Music Place | [Category] ['Musi...,False,True,True,False
4,Very unprofessional!!!!!,1.0,False,Alan Khasanov,1.0,Park Motor Cars Inc,['Used car dealer'],google,1005,[Business] Park Motor Cars Inc | [Category] ['...,False,True,True,False


In [86]:
print(to_clean_df.shape)
print(to_clean_df.isnull().sum())

(11920, 14)
review_text               0
rating                  253
has_photo                 0
author_name               0
user_review_count       253
business_name             0
category                  0
source                    0
review_id                 0
comprehensive_review      0
is_ad                     0
is_relevant               0
is_rant                   0
is_legit                  0
dtype: int64


In [87]:
# Save as JSON
output_json_path = os.path.join(labeled_input_folder, "to_clean_df.json")
to_clean_df.to_json(output_json_path, orient="records", lines=True, force_ascii=False)
print(f"JSON file saved to: {output_json_path}")

# Save as Parquet
output_parquet_path = os.path.join(labeled_input_folder, "to_clean_df.parquet")
to_clean_df.to_parquet(output_parquet_path, index=False)
print(f"Parquet file saved to: {output_parquet_path}")

JSON file saved to: data/labeled\to_clean_df.json
Parquet file saved to: data/labeled\to_clean_df.parquet


### 2. Pre-Process Datafames

##### 2.1 Cleaning Functions

In [88]:
def normalize_whitespace(text):
    return re.sub(r'\s+', ' ', text).strip()

def clean_urls(text):
    url_pattern = re.compile(r'https?://[^\s]+')
    urls = url_pattern.findall(text)
    domains = [tldextract.extract(u).domain for u in urls]
    text_cleaned = url_pattern.sub(' '.join(domains), text)
    return text_cleaned

def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text)
    text = clean_urls(text)
    text = normalize_whitespace(text)
    return text

##### 2.2 Compute Basic Signals

In [89]:
def compute_basic_signals(text):
    url_count = len(re.findall(r'https?://\S+', text))
    phone_count = len(re.findall(r'\+?\d[\d\s-]{7,}\d', text))
    caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
    return url_count, phone_count, caps_ratio

##### 2.3 Sentiment Analysis

In [90]:
def add_textblob_sentiment(df, text_col="review_text", positive_threshold=0.9, negative_threshold=-0.9):
    def get_sentiment(text):
        if pd.isna(text) or not isinstance(text, str) or text.strip() == "":
            return 0.0, 0.0
        try:
            analysis = TextBlob(text)
            return analysis.sentiment.polarity, analysis.sentiment.subjectivity
        except Exception:
            return 0.0, 0.0

    sentiment_results = df[text_col].apply(get_sentiment)
    df["sentiment_polarity"], df["sentiment_subjectivity"] = zip(*sentiment_results)

    df["is_extreme_sentiment"] = df["sentiment_polarity"].apply(
        lambda x: 1 if x >= positive_threshold or x <= negative_threshold else 0
    )

    return df

##### Apply to Dataframe

In [91]:
def preprocess_reviews(df):
    df["clean_text"] = df["review_text"].apply(clean_text)
    signals = df["clean_text"].apply(compute_basic_signals)
    df["url_count"], df["phone_count"], df["caps_ratio"] = zip(*signals)
    return df

cleaned_df = preprocess_reviews(to_clean_df)
cleaned_df.head()

Unnamed: 0,review_text,rating,has_photo,author_name,user_review_count,business_name,category,source,review_id,comprehensive_review,is_ad,is_relevant,is_rant,is_legit,clean_text,url_count,phone_count,caps_ratio
0,Love the convenience of this neighborhood carw...,4.0,False,Doug Schmidt,1.0,"Auto Spa Speedy Wash - Harvester, MO",['Car wash'],google,1001,"[Business] Auto Spa Speedy Wash - Harvester, M...",False,True,False,True,Love the convenience of this neighborhood carw...,0,0,0.02
1,"2 bathrooms (for a large 2 story building), 1 ...",2.0,False,Duf Duftopia,1.0,Kmart,"['Discount store', 'Appliance store', 'Baby st...",google,1002,[Business] Kmart | [Category] ['Discount store...,True,True,True,False,"2 bathrooms (for a large 2 story building), 1 ...",0,0,0.016949
2,My favorite pizza shop hands down!,5.0,False,Andrew Phillips,1.0,Papa’s Pizza,"['Pizza restaurant', 'Chicken wings restaurant...",google,1003,[Business] Papa’s Pizza | [Category] ['Pizza r...,False,True,False,True,My favorite pizza shop hands down!,0,0,0.029412
3,BOTCHED INSTRUMENT REPAIR IS COSTING US HUNDRE...,1.0,False,Julie Heiland,1.0,The Music Place,['Musical instrument store'],google,1004,[Business] The Music Place | [Category] ['Musi...,False,True,True,False,BOTCHED INSTRUMENT REPAIR IS COSTING US HUNDRE...,0,0,0.042589
4,Very unprofessional!!!!!,1.0,False,Alan Khasanov,1.0,Park Motor Cars Inc,['Used car dealer'],google,1005,[Business] Park Motor Cars Inc | [Category] ['...,False,True,True,False,Very unprofessional!!!!!,0,0,0.041667


### 3. Train-Test Split with Multi-Label Stratification

In [92]:
meta_cols = ["url_count","phone_count","caps_ratio","rating","has_photo","user_review_count"]
label_cols = ["is_ad","is_relevant","is_rant","is_legit"]

X = cleaned_df.drop(columns=label_cols)
y = cleaned_df[label_cols].values

mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, test_idx = next(mskf.split(X, y))

train_df = cleaned_df.iloc[train_idx].reset_index(drop=True)
test_df = cleaned_df.iloc[test_idx].reset_index(drop=True)

### 4. Tokenisation

In [93]:
def simple_tokenize(text):
    text = str(text).lower()
    tokens = re.findall(r'\b[a-z]+\b', text)
    return tokens

train_df['tokens'] = train_df['clean_text'].apply(simple_tokenize)
test_df['tokens'] = test_df['clean_text'].apply(simple_tokenize)

In [94]:
print(train_df.shape)
print(test_df.shape)

(9536, 19)
(2384, 19)


### Yuen Ning's model

In [125]:
df = cleaned_df.copy()
label_cols = ["is_ad", "is_relevant", "is_rant", "is_legit"]
meta_cols = ["url_count","phone_count","caps_ratio"]

df[label_cols] = df[label_cols].astype(float)

train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)

val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")


Train: 8344, Val: 1788, Test: 1788


In [129]:
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.texts = df["clean_text"].tolist()
        self.meta = df[meta_cols].fillna(0).to_numpy(dtype=np.float32)
        self.labels = df[label_cols].to_numpy(dtype=np.float32)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Handle if idx is a list/array (batch of indices)
        if isinstance(idx, (list, np.ndarray)):
            return [self[i] for i in idx]

        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "meta": torch.tensor(self.meta[idx], dtype=torch.float),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float)
        }

def collate_fn(batch):
    return {
        "input_ids": torch.stack([b["input_ids"] for b in batch]),
        "attention_mask": torch.stack([b["attention_mask"] for b in batch]),
        "meta": torch.stack([b["meta"] for b in batch]),
        "labels": torch.stack([b["labels"] for b in batch])
    }


In [134]:
model_name = "Qwen/Qwen2-0.5B"  # lightweight Qwen
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B", use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_cols))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2-0.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Qwen2ForSequenceClassification(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Q

In [130]:
batch_size = 8  # adjust for memory constraints
train_dataset = ReviewDataset(train_df, tokenizer)
test_dataset = ReviewDataset(test_df, tokenizer)

train_dataset = ReviewDataset(train_df, tokenizer, max_len=128)
val_dataset = ReviewDataset(val_df, tokenizer, max_len=128)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader   = torch.utils.data.DataLoader(val_dataset, batch_size=8)
test_loader  = torch.utils.data.DataLoader(test_dataset, batch_size=8)


In [131]:
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = nn.BCEWithLogitsLoss()


In [135]:
epochs = 3

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} Train Loss: {total_loss/len(train_loader):.4f}")


ValueError: Cannot handle batch sizes > 1 if no padding token is defined.

In [95]:
# =======================
# CONFIG
# =======================
label_cols = ["is_ad", "is_relevant", "is_rant", "is_legit"]
meta_cols = ["url_count", "phone_count", "caps_ratio", "rating", "user_review_count"]  # adjust based on available
max_len = 128
batch_size = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [99]:
df = cleaned_df
df[label_cols] = df[label_cols].astype(float)

X = df.drop(columns=label_cols)
y = df[label_cols].values

mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, test_idx = next(mskf.split(X, y))

train_df = df.iloc[train_idx]
test_df = df.iloc[test_idx]

# Then split train into train/val
train_idx2, val_idx = next(MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
                           .split(train_df.drop(columns=label_cols), train_df[label_cols].values))
val_df = train_df.iloc[val_idx]
train_df = train_df.iloc[train_idx2]


In [100]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_batch(df):
    encodings = tokenizer(
        df["clean_text"].tolist(),
        truncation=True,
        padding="max_length",
        max_length=max_len,
        return_tensors="pt"
    )
    meta = torch.tensor(df[meta_cols].fillna(0).values, dtype=torch.float32)
    labels = torch.tensor(df[label_cols].values, dtype=torch.float32)
    return encodings, meta, labels

In [101]:
class ReviewDataset(Dataset):
    def __init__(self, encodings, meta, labels):
        self.encodings = encodings
        self.meta = meta
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "meta": self.meta[idx],
            "labels": self.labels[idx]
        }

train_enc, train_meta, train_labels = tokenize_batch(train_df)
val_enc, val_meta, val_labels = tokenize_batch(val_df)
test_enc, test_meta, test_labels = tokenize_batch(test_df)

train_dataset = ReviewDataset(train_enc, train_meta, train_labels)
val_dataset = ReviewDataset(val_enc, val_meta, val_labels)
test_dataset = ReviewDataset(test_enc, test_meta, test_labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [104]:
class ReviewGuardModel(nn.Module):
    def __init__(self, backbone="prajjwal1/bert-tiny", meta_dim=len(meta_cols)):
        super().__init__()
        self.enc = AutoModel.from_pretrained(backbone)
        hid = self.enc.config.hidden_size
        self.meta_net = nn.Sequential(nn.Linear(meta_dim, 64), nn.ReLU(), nn.Dropout(0.1), nn.Linear(64, 64), nn.ReLU())
        self.fuse = nn.Linear(hid + 64, hid)
        self.cls = nn.Linear(hid, len(label_cols))
    def forward(self, input_ids, attention_mask, meta, labels=None):
        x = self.enc(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:,0]
        m = self.meta_net(meta)
        z = torch.relu(self.fuse(torch.cat([x, m], dim=1)))
        logits = self.cls(z)
        loss = None
        if labels is not None:
            loss_f = nn.BCEWithLogitsLoss()
            loss = loss_f(logits, labels)
        return {"loss": loss, "logits": logits}

model = ReviewGuardModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)


In [105]:
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        meta = batch["meta"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, meta=meta, labels=labels)
        loss = outputs["loss"]
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} Train Loss: {total_loss/len(train_loader):.4f}")


Epoch 1 Train Loss: 0.3287


KeyboardInterrupt: 

In [66]:
pos_weights = torch.tensor([29.5641, 0.0374, 12.0096, 0.1678], dtype=torch.float32).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weights)

In [None]:
model_name = "prajjwal1/bert-tiny"  # tiny BERT for CPU
max_len = 64
batch_size = 2
accum_steps = 4  # gradient accumulation
epochs = 3
meta_cols = ["url_count","phone_count","caps_ratio","user_review_count"]
label_cols = ["is_ad","is_relevant","is_rant","is_legit"]

device = torch.device("cpu") 

tokenizer = AutoTokenizer.from_pretrained(model_name)


In [68]:
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.texts = df["clean_text"].tolist()
        self.meta = df[meta_cols].fillna(0).to_numpy(dtype=np.float32)
        self.labels = df[label_cols].to_numpy(dtype=np.float32)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "meta": torch.tensor(self.meta[idx], dtype=torch.float),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float)
        }

class ReviewGuardModel(nn.Module):
    def __init__(self, backbone=model_name, meta_dim=len(meta_cols)):
        super().__init__()
        self.enc = AutoModel.from_pretrained(backbone)
        hid = self.enc.config.hidden_size
        self.meta_net = nn.Sequential(
            nn.Linear(meta_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 64),
            nn.ReLU()
        )
        self.fuse = nn.Linear(hid + 64, hid)
        self.cls = nn.Linear(hid, len(label_cols))

    def forward(self, input_ids, attention_mask, meta, labels=None):
        x = self.enc(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
        m = self.meta_net(meta)
        z = torch.relu(self.fuse(torch.cat([x, m], dim=1)))
        logits = self.cls(z)
        loss = None
        if labels is not None:
            loss_f = nn.BCEWithLogitsLoss()
            loss = loss_f(logits, labels)
        return {"loss": loss, "logits": logits}


In [69]:
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny", use_fast=True)
train_dataset = ReviewDataset(train_df, tokenizer, max_len=128)
test_dataset = ReviewDataset(test_df, tokenizer, max_len=128)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


In [70]:
model = ReviewGuardModel().to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

In [71]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        meta = batch["meta"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, meta=meta)
        logits = outputs["logits"]
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1} Train Loss: {total_loss/len(train_loader):.4f}")


Epoch 1 Train Loss: 0.6894
Epoch 2 Train Loss: 0.6114
Epoch 3 Train Loss: 0.5194


In [72]:
def tune_thresholds(model, val_loader):
    model.eval()
    all_logits, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            meta = batch["meta"].to(device)
            labels = batch["labels"].cpu().numpy()
            
            logits = model(input_ids=input_ids, attention_mask=attention_mask, meta=meta)["logits"].cpu().numpy()
            all_logits.append(logits)
            all_labels.append(labels)
    
    all_logits = np.vstack(all_logits)
    all_labels = np.vstack(all_labels)
    
    thresholds = []
    for i in range(all_labels.shape[1]):
        best_thresh = 0.5
        best_f1 = 0.0
        for t in np.arange(0.1, 0.9, 0.05):
            preds = (1 / (1 + np.exp(-all_logits[:, i])) >= t).astype(int)
            f1 = f1_score(all_labels[:, i], preds)
            if f1 > best_f1:
                best_f1 = f1
                best_thresh = t
        thresholds.append(best_thresh)
    return thresholds


In [None]:
val_dataset = ReviewDataset(val_df)  # val_df is your validation DataFrame
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

thresholds = tune_thresholds(model, val_loader)

model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        meta = batch["meta"].to(device)
        labels = batch["labels"].cpu().numpy()
        
        logits = model(input_ids=input_ids, attention_mask=attention_mask, meta=meta)["logits"].cpu().numpy()
        preds = np.array([(1 / (1 + np.exp(-logits[:, i])) >= thresholds[i]).astype(int) 
                          for i in range(logits.shape[1])]).T
        all_preds.append(preds)
        all_labels.append(labels)

all_preds = np.vstack(all_preds)
all_labels = np.vstack(all_labels)


NameError: name 'val_loader' is not defined

In [59]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for i, batch in enumerate(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        meta = batch["meta"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, meta=meta, labels=labels)
        loss = outputs["loss"] / accum_steps
        loss.backward()

        if (i + 1) % accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item() * accum_steps

    print(f"Epoch {epoch+1} Train Loss: {total_loss/len(train_loader):.4f}")

Epoch 1 Train Loss: 0.4324
Epoch 2 Train Loss: 0.2908
Epoch 3 Train Loss: 0.2578


In [60]:
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        meta = batch["meta"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask, meta=meta)["logits"]
        probs = torch.sigmoid(logits).cpu().numpy()
        all_preds.append(probs)
        all_labels.append(labels.cpu().numpy())

all_preds = np.vstack(all_preds)
all_labels = np.vstack(all_labels)

# Metrics per label
metrics = {}
for i, name in enumerate(label_cols):
    ap = average_precision_score(all_labels[:, i], all_preds[:, i])
    prec, rec, f1, _ = precision_recall_fscore_support(all_labels[:, i], (all_preds[:, i]>=0.5).astype(int), zero_division=0)
    metrics[f"{name}_ap"] = ap
    metrics[f"{name}_prec"] = prec[0]
    metrics[f"{name}_rec"] = rec[0]
    metrics[f"{name}_f1"] = f1[0]

# Micro and macro F1
metrics["micro_f1"] = precision_recall_fscore_support(all_labels, (all_preds>=0.5).astype(int), average="micro", zero_division=0)[2]
metrics["macro_f1"] = precision_recall_fscore_support(all_labels, (all_preds>=0.5).astype(int), average="macro", zero_division=0)[2]

print("Evaluation metrics:", metrics)

Evaluation metrics: {'is_ad_ap': 0.03154354450228174, 'is_ad_prec': 0.9668624161073825, 'is_ad_rec': 1.0, 'is_ad_f1': 0.983152058008104, 'is_relevant_ap': 0.971026856084488, 'is_relevant_prec': 0.0, 'is_relevant_rec': 0.0, 'is_relevant_f1': 0.0, 'is_rant_ap': 0.22790569494194376, 'is_rant_prec': 0.923238255033557, 'is_rant_rec': 1.0, 'is_rant_f1': 0.9600872410032716, 'is_legit_ap': 0.9069841912615952, 'is_legit_prec': 0.0, 'is_legit_rec': 0.0, 'is_legit_f1': 0.0, 'micro_f1': 0.9263607257203842, 'macro_f1': 0.47608377941128566}


code graveyard

In [40]:
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
max_len = 128

class ReviewDataset(IterableDataset):
    def __init__(self, df, tokenizer, max_len, meta_cols, label_cols):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.meta_cols = meta_cols
        self.label_cols = label_cols

    def __iter__(self):
        for _, row in self.df.iterrows():
            enc = self.tokenizer(
                row["clean_text"],
                truncation=True,
                padding="max_length",
                max_length=self.max_len,
                return_tensors="pt"
            )
            meta = torch.tensor([row[c] for c in self.meta_cols], dtype=torch.float32)
            labels = torch.tensor([row[c] for c in self.label_cols], dtype=torch.float32)

            yield {
                "input_ids": enc["input_ids"].squeeze(0),
                "attention_mask": enc["attention_mask"].squeeze(0),
                "meta": meta,
                "labels": labels
            }

train_dataset = ReviewDataset(train_df, tokenizer, max_len, meta_cols, label_cols)
test_dataset = ReviewDataset(test_df, tokenizer, max_len, meta_cols, label_cols)

train_loader = DataLoader(train_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)


In [30]:
class ReviewGuardModel(nn.Module):
    def __init__(self, backbone="prajjwal1/bert-tiny", meta_dim=6):
        super().__init__()
        self.enc = AutoModel.from_pretrained(backbone)
        hid = self.enc.config.hidden_size
        self.meta_net = nn.Sequential(nn.Linear(meta_dim, 32), nn.ReLU(), nn.Dropout(0.1), nn.Linear(32, 32), nn.ReLU())
        self.fuse = nn.Linear(hid + 32, hid)
        self.cls = nn.Linear(hid, len(label_cols))

    def forward(self, input_ids, attention_mask, meta, labels=None):
        x = self.enc(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:,0]
        m = self.meta_net(meta)
        z = torch.relu(self.fuse(torch.cat([x, m], dim=1)))
        logits = self.cls(z)
        loss = None
        if labels is not None:
            loss = nn.BCEWithLogitsLoss()(logits, labels)
        return {"loss": loss, "logits": logits}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ReviewGuardModel(meta_dim=len(meta_cols)).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optional: freeze transformer backbone for first few epochs
freeze_backbone = True
if freeze_backbone:
    for param in model.enc.parameters():
        param.requires_grad = False

optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)

epochs = 3
accum_steps = 4  # simulate larger batch by accumulating gradients
scaler = GradScaler()

for epoch in range(epochs):
    model.train()
    total_loss = 0
    batch_count = 0

    for i, batch in enumerate(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        meta = batch["meta"].to(device)
        labels = batch["labels"].to(device)

        with autocast():  # mixed precision
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, meta=meta, labels=labels)
            loss = outputs["loss"] / accum_steps

        scaler.scale(loss).backward()

        if (i + 1) % accum_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += loss.item() * accum_steps
        batch_count += 1

    print(f"Epoch {epoch+1} Train Loss: {total_loss / batch_count:.4f}")

# Optional: unfreeze backbone for fine-tuning
if freeze_backbone:
    for param in model.enc.parameters():
        param.requires_grad = True


  scaler = GradScaler()
  with autocast():  # mixed precision


KeyboardInterrupt: 

In [None]:
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        meta = batch["meta"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask, meta=meta)["logits"]
        probs = torch.sigmoid(logits).cpu().numpy()
        all_preds.append(probs)
        all_labels.append(labels.cpu().numpy())

import numpy as np
all_preds = np.vstack(all_preds)
all_labels = np.vstack(all_labels)
preds_bin = (all_preds >= 0.5).astype(int)

for i, label in enumerate(label_cols):
    f1 = f1_score(all_labels[:,i], preds_bin[:,i])
    prec = precision_score(all_labels[:,i], preds_bin[:,i])
    rec = recall_score(all_labels[:,i], preds_bin[:,i])
    print(f"{label}: F1={f1:.3f}, Precision={prec:.3f}, Recall={rec:.3f}")
