In [None]:
# Model: mDeBERTa-v3-base
# Load model directly
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch, re, joblib
import numpy as np
from huggingface_hub import hf_hub_download
from scipy.sparse import csr_matrix, hstack

# Constants
# Hub repository that holds both the fine-tuned classifier and the meta-classifier file.
MODEL_NAME = "kiankiat/loc-review-classification-model"
# NOTE(review): DEVICE is computed here but not referenced again in this cell;
# the model and the encoded inputs are never moved to it.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and the sequence-classification model from the Hub repository
tokenizer_model = AutoTokenizer.from_pretrained(MODEL_NAME)
classifier_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Load the meta_classifier (downloaded from the same repository)
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts from a repository you trust.
meta_path = hf_hub_download(repo_id=MODEL_NAME, filename="meta_classifier.joblib")
meta = joblib.load(meta_path)

# Example review record used to demonstrate inference.
# "text", "company_name", "category" and "stars" are read by the code below;
# "review_date" is not read anywhere in this cell.
sample = {
  "company_name": "McDonald's",
  "review_date": "2025-08-29",
  "text": "The new Spicy Tomato McChicken Set is wonderful for my wallet. The potato pops also go really well with it.",
  "stars": 5,
  "category": "food and beverages"
}

# Fixed vocabulary for the presence features built by text_to_array.
# Order matters: position i of the feature vector corresponds to vocab[i].
# Entries are lowercase and apostrophe-free (e.g. "ive", "dont", "youre"), matching
# the normalisation text_to_array applies to the review before the lookup.
# NOTE(review): presumably this list must stay identical to the one used to produce the
# features the meta-classifier was trained on — confirm before editing.
vocab = [
    "the","i","of","was","to","a","for","in","is","it","that","at","you","my","on","with","but","this","about","its",
    "and","we","me","they","are","out","their","an","our","not","been","if","service","like","also","had","so","as",
    "your","all","have","ive","from","even","here","very","just","food","never","place","were","there","amazing",
    "honestly","experience","be","good","by","get","how","people","while","staff","new","say","heard","time","friend",
    "call","which","check","up","dont","or","more","code","can","great","deals","absolutely","youre","has","meanwhile",
    "when","discount","one","told","these","really","recently","exclusive","some","visit","crypto","what","im","no",
    "only","us","them","offer","any","best","now","would","recommend","singapore","care","weather","clinic","unbeatable",
    "got","where","will","help","loved","life","too","offers","looking","did","discovered","miss","day","off","cash",
    "well","made","highly","local","nothing","spent","love","www","ever","friendly","she","than",
    "services","back","quick","over","restaurant","nice","definitely","go","always","other","bar","last"
]

def remove_punct(text):
    """Delete every character matched by the punctuation/digit class from ``text``.

    Any falsy ``text`` (``None``, ``""``) yields the empty string.
    """
    if not text:
        return ""
    # NOTE(review): inside the class, the trailing ``—-−`` appears to be parsed as a
    # code-point range (em dash through minus sign) rather than three literal dashes,
    # so additional symbols in that range are stripped too. Left unchanged because
    # the stored features were presumably built with this exact pattern — confirm.
    stripped = re.sub(r"[\'\"’.,:&@!#\-\(\)0-9–—-−]", "", text)
    return stripped

def remove_escape_chars(text: str) -> str:
    """Replace control whitespace with spaces, collapse whitespace runs, and trim.

    Returns ``""`` when ``text`` is ``None``.
    """
    if text is not None:
        # Newlines, tabs, carriage returns, form feeds and vertical tabs become spaces ...
        spaced = re.sub(r'[\n\t\r\f\v]', ' ', text)
        # ... then every whitespace run shrinks to one space and the ends are trimmed.
        return re.sub(r'\s+', ' ', spaced).strip()
    return ""

def lowercase(text):
    """Return ``text`` lower-cased by calling its ``lower()`` method."""
    lowered = text.lower()
    return lowered

def text_to_array(review: str):
    """Encode ``review`` as one float per ``vocab`` entry.

    Each position is ``1.0`` when the matching vocabulary word occurs among the
    review's whitespace-separated tokens after normalisation, else ``0.0``.
    A ``None`` review produces an all-zero vector of ``len(vocab)``.
    """
    if review is None:
        return [0.0] * len(vocab)

    # Normalisation order: strip punctuation -> lowercase -> collapse whitespace.
    normalized = remove_punct(review)
    normalized = lowercase(normalized)
    normalized = remove_escape_chars(normalized)
    present = set(normalized.split())

    encoding = []
    for word in vocab:
        encoding.append(1.0 if word in present else 0.0)
    return encoding

# Store the vocabulary-presence vector (values are 0.0/1.0) on the sample under "tfidf_score".
sample["tfidf_score"] = text_to_array(sample["text"])

def tokenize_for_inference(datarow):
    """Encode one review record as a (review text, POI descriptor) sentence pair.

    The POI descriptor embeds the stripped company name plus ``[CAT_...]`` and
    ``[RATING_...]`` markers built from the record's category and stars.
    A ``None`` text is encoded as the empty string. Returns the tokenizer output
    as PyTorch tensors, truncated to 256 tokens.
    """
    cat, rating = datarow["category"], datarow["stars"]
    company = datarow["company_name"].strip()
    poi = f"POI: {company} [CAT_{cat}] [RATING_{rating}]"

    raw_text = datarow["text"]
    text = "" if raw_text is None else str(raw_text).strip()

    # return_tensors="pt" so the result can be passed directly to the model
    return tokenizer_model(
        text,
        poi,
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )

# Tokenize the sample record; `inputs` is unpacked into the classifier call below.
inputs = tokenize_for_inference(sample)

def tfidf_row(vec):
    """Convert a flat sequence of numbers into a 1 x n float32 CSR matrix."""
    dense = np.asarray(vec, dtype=np.float32)
    single_row = dense.reshape(1, -1)
    return csr_matrix(single_row)

# Two-stage prediction: the transformer produces class probabilities, which are
# appended to the vocabulary-presence features and passed to the meta-classifier.
with torch.inference_mode():
    logits = classifier_model(**inputs).logits
    probs = torch.softmax(logits, dim=-1)
    # Convert to a NumPy array explicitly instead of relying on csr_matrix to coerce
    # a torch tensor; the implicit path fails for tensors that are not on the CPU,
    # and the meta-classifier training cell also uses .cpu().numpy().
    probs_np = probs.cpu().numpy()
    X_tfidf = tfidf_row(sample["tfidf_score"])
    # Column order is [vocabulary features || transformer probabilities].
    X_meta = hstack([X_tfidf, csr_matrix(probs_np)], format="csr")
    pred = meta.predict(X_meta)[0]

print("Transformer probs:", probs.tolist())
print("Meta prediction:", pred)

Transformer probs: [[0.0052802711725234985, 0.00024143325572367758, 0.9944782257080078]]
Meta prediction: 2


In [None]:
# Training Script
import os, math, torch, evaluate, csv, json, time
import numpy as np

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset, DatasetDict

# Constants
MODEL_NAME = "microsoft/mdeberta-v3-base"  # base checkpoint that is fine-tuned below
# NOTE(review): DEVICE and CATEGORIES are not referenced again in this training cell;
# the category marker tokens are derived from the dataset itself.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NO_OF_LABELS = 3
CATEGORIES = ["Food and Beverages", "Education", "Healthcare", "Retail", "Arts", "Hotels"]
DATA_FILE_PATH = "/content/sample_data/withtfidf.jsonl"
OUTPUT_DIR = "/content/sample_data/loc-review-classification-model-v2"
MAX_LENGTH = 256  # truncation length passed to the tokenizer
SEED = 42  # used for the train/validation split and by TrainingArguments

# Label Mappings
# id2label is handed to the model config; label2id is its inverse.
id2label = {0: "spam", 1: "irrelevant", 2: "relevant"}
label2id = {value: key for key, value in id2label.items()}

# Timer starts here, before the model and dataset are loaded, so the elapsed time
# reported after training covers loading and preprocessing as well.
start = time.time()
# Load model to train
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# A sequence-classification head with NO_OF_LABELS outputs is placed on top of the base checkpoint.
classification_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NO_OF_LABELS,
    id2label=id2label,
    label2id=label2id
)

# Load dataset
# The whole JSONL file is loaded as one split named "trainval"; it is divided into
# train/validation further down.
DATA_FILES = {
    "trainval": DATA_FILE_PATH
}
raw = load_dataset("json", data_files=DATA_FILES)["trainval"]

# Fail fast when the dataset lacks any of the columns this notebook expects.
required_headers = {"company_name", "review_date", "text", "stars", "category", "label"}
missing = required_headers - set(raw.features.keys())
if missing:
    # The exception was previously constructed but never raised, so the check was a no-op.
    raise ValueError(f"Dataset missing required keys: {missing}")

# Special tokens
# One marker per category present in the data plus one per star rating 1-5;
# construct_pair embeds these markers in the POI segment of every example.
# Reading the column once avoids decoding every field of every row in a Python loop.
categories = set(raw["category"])

# Add all categories (sorted so the token order is deterministic)
special_tokens = [f"[CAT_{c}]" for c in sorted(categories)]

# Add all ratings
special_tokens += [f"[RATING_{r}]" for r in range(1, 6)]

tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

# Added tokens receive ids at the end of the tokenizer's vocabulary. If that now
# exceeds the model's embedding matrix, grow the matrix so those ids are valid;
# when the checkpoint already has enough rows it is left untouched.
if len(tokenizer) > classification_model.get_input_embeddings().num_embeddings:
    classification_model.resize_token_embeddings(len(tokenizer))

# Preprocessing
def construct_pair(datarow):
    """Tokenize one dataset row as a (review text, POI descriptor) pair with its label.

    The POI descriptor combines the stripped company name with ``[CAT_...]`` and
    ``[RATING_...]`` markers. A ``None`` text is encoded as the empty string.
    The row's ``label`` is stored as an int under ``"labels"`` in the encoding.
    """
    cat, rating = datarow["category"], datarow["stars"]
    company = datarow["company_name"].strip()
    poi = f"POI: {company} [CAT_{cat}] [RATING_{rating}]"

    raw_text = datarow["text"]
    text = "" if raw_text is None else str(raw_text).strip()

    encoded = tokenizer(text, poi, truncation=True, max_length=MAX_LENGTH)
    encoded["labels"] = int(datarow["label"])
    return encoded

# Tokenize every row; the original columns are dropped so only what construct_pair
# returns (tokenizer outputs plus "labels") remains.
processed = raw.map(construct_pair, remove_columns=raw.column_names)

# Splitting dataset
# "labels" is class-encoded first so it can be used as the stratification column
# (presumably stratify_by_column requires a ClassLabel feature — confirm).
processed = processed.class_encode_column("labels")
# 80/20 split, stratified on the label and seeded for repeatability.
splits = processed.train_test_split(test_size=0.2, seed=SEED, stratify_by_column="labels")
# The held-out 20% is used as the validation set.
dataset = DatasetDict({
    "train": splits["train"],
    "validation": splits["test"]
})

# Evaluation
# Metric objects are loaded once at module level and reused by compute_metrics.
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1 from ``(logits, labels)``.

    Predictions are the argmax over the last logits axis. Returns a dict with the
    keys ``accuracy``, ``precision``, ``recall_macro`` and ``f1_macro``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    def _macro(metric, key):
        # Macro average: every class contributes equally.
        return metric.compute(predictions=preds, references=labels, average="macro")[key]

    results = {}
    results["accuracy"] = accuracy.compute(predictions=preds, references=labels)["accuracy"]
    results["precision"] = _macro(precision, "precision")
    results["recall_macro"] = _macro(recall, "recall")
    results["f1_macro"] = _macro(f1, "f1")
    return results

# Training
# Evaluation and checkpointing both happen once per epoch, and the best checkpoint is
# reloaded when training ends.
# NOTE(review): no metric_for_best_model is given, so "best" presumably falls back to
# validation loss — confirm that is the intended selection criterion.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    seed=SEED,
    report_to="none"  # disable experiment-tracker integrations
)

# Pads each batch dynamically; construct_pair tokenizes without padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): the captured output shows a warning pointing at this Trainer(...) call;
# possibly the `tokenizer=` argument is deprecated in the installed transformers
# version — confirm against its release notes.
trainer = Trainer(
    model=classification_model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()
end = time.time()
# `start` was taken before the model and dataset were loaded, so this covers the
# whole cell rather than trainer.train() alone.
elapsed_time = (end - start) / 3600  # seconds -> hours
# The value is in hours; the message previously labelled it as seconds.
print(f"Time to train model: {elapsed_time:.2f} hours")
# Persist the model and the tokenizer (including the added special tokens) together.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Training complete. Best model saved to:", OUTPUT_DIR)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall Macro,F1 Macro
1,0.1542,0.0616,0.98525,0.982018,0.972638,0.97711
2,0.0587,0.069751,0.986006,0.984969,0.973287,0.978815


Time to train model: 0.19 seconds


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ssification-model/training_args.bin: 100%|##########| 5.84kB / 5.84kB            

  ...ssification-model/model.safetensors:   2%|1         | 16.8MB / 1.12GB            

  ...view-classification-model/spm.model: 100%|##########| 4.31MB / 4.31MB            

  ...classification-model/tokenizer.json: 100%|##########| 16.4MB / 16.4MB            

Training complete. Best model saved to: /content/sample_data/loc-review-classification-model


In [None]:
# LogisticRegression Classifier with TF-IDF

import json
import numpy as np
from datasets import load_dataset
from scipy.sparse import csr_matrix, vstack, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import joblib

# Fine-tuned classifier on the Hub; its probabilities become meta-classifier features.
MODEL_NAME = "kiankiat/loc-review-classification-model"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DATA_FILE_PATH = "/content/sample_data/withtfidf.jsonl"
SEED = 42
MAX_LENGTH = 256  # tokenizer truncation length
BATCH = 64  # rows per forward pass in probs_from_transformer
TF_IDF_COL = "tfidf_score"  # column holding the JSON-encoded feature vector

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Model is moved to DEVICE and put in eval mode; it is used for inference only here.
classification_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE).eval()

DATA_FILES = {
    "trainval": DATA_FILE_PATH
}
raw = load_dataset("json", data_files=DATA_FILES)["trainval"]
# Class-encode the label so it can serve as the stratification column.
raw = raw.class_encode_column("label")
# NOTE(review): same file, seed and test_size as the training cell; whether the two
# splits contain exactly the same rows is not verified here. If they do, the
# transformer has already been trained on train_ds, so its probabilities on that
# split may be more confident than on unseen data — confirm this is acceptable.
splits = raw.train_test_split(test_size=0.2, seed=SEED, stratify_by_column="label")
train_ds, val_ds = splits["train"], splits["test"]

def tfidf_row_from_json_string(s):
    """Parse a JSON-encoded list of numbers into a 1 x n float32 CSR matrix."""
    values = json.loads(s)
    dense = np.asarray(values, dtype=np.float32)
    return csr_matrix(dense.reshape(1, -1))

def build_tfidf_matrix(dataset, col=TF_IDF_COL):
    """Stack the JSON-encoded vectors in ``dataset[col]`` into one CSR matrix.

    Each entry of the column becomes one row, in dataset order.
    """
    sparse_rows = []
    for encoded in dataset[col]:
        sparse_rows.append(tfidf_row_from_json_string(encoded))
    return vstack(sparse_rows, format="csr")

# Sparse feature matrices for the two splits, one row per example in split order.
Xtr_tfidf = build_tfidf_matrix(train_ds)
Xva_tfidf = build_tfidf_matrix(val_ds)

def build_poi(company, category, stars):
    """Build the POI descriptor string used as the second segment of the tokenizer pair.

    ``company`` is converted with ``str()`` and stripped; ``category`` and ``stars``
    are interpolated into the ``[CAT_...]`` and ``[RATING_...]`` markers.
    """
    poi = f"POI: {str(company).strip()} [CAT_{category}] [RATING_{stars}]"
    return poi

@torch.inference_mode()
def probs_from_transformer(ds):
    """Return the classifier's softmax probabilities for every row of ``ds``.

    Rows are processed in batches of ``BATCH``. Each row is encoded as a
    (review text, POI descriptor) pair. A ``None`` text becomes ``""`` and text is
    stripped, matching the preprocessing used when the classifier was trained.

    Args:
        ds: dataset supporting ``len()`` and slice indexing that yields a mapping
            with "text", "company_name", "category" and "stars" columns.

    Returns:
        A NumPy array with one row per example and one column per class, in
        dataset order. An empty dataset yields an array with zero rows.
    """
    out = []
    for i in range(0, len(ds), BATCH):
        # Slice the dataset once per batch instead of re-reading whole columns
        # on every iteration.
        batch = ds[i:i + BATCH]
        t = ["" if x is None else str(x).strip() for x in batch["text"]]
        p = [
            build_poi(c, g, s)
            for c, g, s in zip(batch["company_name"], batch["category"], batch["stars"])
        ]
        enc = tokenizer(t, p, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt").to(DEVICE)
        logits = classification_model(**enc).logits
        out.append(torch.softmax(logits, dim=-1).cpu().numpy())
    if not out:
        # np.vstack rejects an empty list; return a well-shaped empty result instead.
        return np.empty((0, classification_model.config.num_labels), dtype=np.float32)
    return np.vstack(out)

p_train = probs_from_transformer(train_ds)
p_val   = probs_from_transformer(val_ds)

# Stack features: [TFIDF || probs]
# The dense probabilities are wrapped as CSR and the result is kept in CSR format,
# the same layout the inference cell builds for a single example.
Xtr_meta = hstack([Xtr_tfidf, csr_matrix(p_train)], format="csr")
Xva_meta = hstack([Xva_tfidf, csr_matrix(p_val)], format="csr")
y_tr = np.array(train_ds["label"])
y_va = np.array(val_ds["label"])

# Meta-classifier
# saga is a stochastic solver, so it is seeded with SEED to make the fit repeatable.
# multi_class is no longer passed: "auto" is the default and the parameter is
# deprecated in recent scikit-learn releases.
meta = LogisticRegression(max_iter=1000, n_jobs=-1, solver="saga", random_state=SEED)
meta.fit(Xtr_meta, y_tr)

# Report per-class and macro metrics on the held-out split.
pred = meta.predict(Xva_meta)
print(classification_report(y_va, pred, digits=4))
print("Macro F1:", f1_score(y_va, pred, average="macro"))

# Persist the fitted meta-classifier for the inference cell to load.
joblib.dump(meta, "/content/sample_data/meta_classifier.joblib")
print("Done!")

1
2




              precision    recall  f1-score   support

           0     0.9738    0.9275    0.9501       400
           1     1.0000    0.9992    0.9996      1217
           2     0.9723    0.9912    0.9817      1027

    accuracy                         0.9852      2644
   macro avg     0.9820    0.9726    0.9771      2644
weighted avg     0.9853    0.9852    0.9851      2644

Macro F1: 0.9771103074491533
Done!


TFIDF

In [None]:
# Environment setup for Spark, run as shell commands through IPython's `!` syntax.
!sudo apt update
# Headless Java 8 JDK; install output is discarded.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link: https://www.apache.org/dyn/closer.lua/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
# NOTE(review): the SPARK_HOME assignment further down this cell is commented out, so
# this tarball may be unused in favour of the pip-installed pyspark — confirm.
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
# NOTE(review): none of these pip installs pin a version.
!pip install -q findspark
!pip install pyspark
!pip install py4j

import os
import sys
# The explicit JAVA_HOME / SPARK_HOME assignments are disabled, so findspark.init()
# below has to locate Spark on its own (presumably the pip-installed pyspark — confirm).
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"


import findspark
findspark.init()
# The return value of find() is discarded; it is not the last expression of the
# cell, so it is not displayed either.
findspark.find()

import pyspark

from pyspark.sql import DataFrame, SparkSession
from typing import List
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [None]:
import re
import math
import numpy

# `sets` was a Python 2 standard-library module and does not exist on Python 3,
# where the original `import sets, math` raised before `math` (or anything after
# it) was imported. The import is guarded so the cell runs on Python 3; nothing
# visible in this notebook uses the name.
try:
    import sets
except ImportError:
    sets = None

from pyspark.sql.functions import lit
from pyspark.sql import SQLContext
# NOTE(review): the wildcard import hides which names come from pyspark.mllib;
# kept as-is because later cells may rely on it.
from pyspark.mllib import *
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# SparkSession is imported in the previous cell. getOrCreate() reuses an active
# session if one exists.
sparkSession = SparkSession.builder.appName("SVM notebook").getOrCreate()
sc = sparkSession.sparkContext

In [None]:
import re

# Fixed vocabulary for the presence features built by text_to_array.
# Order matters: position i of the feature vector corresponds to vocab[i].
# Entries are lowercase and apostrophe-free (e.g. "ive", "dont", "youre"), matching
# the normalisation text_to_array applies to the review before the lookup.
# NOTE(review): this list is defined a second time in the inference cell at the top
# of the notebook; the two copies must stay identical.
vocab = [
    "the","i","of","was","to","a","for","in","is","it","that","at","you","my","on","with","but","this","about","its",
    "and","we","me","they","are","out","their","an","our","not","been","if","service","like","also","had","so","as",
    "your","all","have","ive","from","even","here","very","just","food","never","place","were","there","amazing",
    "honestly","experience","be","good","by","get","how","people","while","staff","new","say","heard","time","friend",
    "call","which","check","up","dont","or","more","code","can","great","deals","absolutely","youre","has","meanwhile",
    "when","discount","one","told","these","really","recently","exclusive","some","visit","crypto","what","im","no",
    "only","us","them","offer","any","best","now","would","recommend","singapore","care","weather","clinic","unbeatable",
    "got","where","will","help","loved","life","too","offers","looking","did","discovered","miss","day","off","cash",
    "well","made","highly","local","nothing","spent","love","www","ever","friendly","she","than",
    "services","back","quick","over","restaurant","nice","definitely","go","always","other","bar","last"
]


def remove_punct(text):
    """Delete every character matched by the punctuation/digit class from ``text``.

    Any falsy ``text`` (``None``, ``""``) yields the empty string.
    """
    if not text:
        return ""
    # NOTE(review): inside the class, the trailing ``—-−`` appears to be parsed as a
    # code-point range (em dash through minus sign) rather than three literal dashes,
    # so additional symbols in that range are stripped too. Left unchanged because
    # the stored features were presumably built with this exact pattern — confirm.
    stripped = re.sub(r"[\'\"’.,:&@!#\-\(\)0-9–—-−]", "", text)
    return stripped

def remove_escape_chars(text: str) -> str:
    """Replace control whitespace with spaces, collapse whitespace runs, and trim.

    Returns ``""`` when ``text`` is ``None``.
    """
    if text is not None:
        # Newlines, tabs, carriage returns, form feeds and vertical tabs become spaces ...
        spaced = re.sub(r'[\n\t\r\f\v]', ' ', text)
        # ... then every whitespace run shrinks to one space and the ends are trimmed.
        return re.sub(r'\s+', ' ', spaced).strip()
    return ""

def lowercase(text):
    """Return ``text`` lower-cased by calling its ``lower()`` method."""
    lowered = text.lower()
    return lowered

def text_to_array(review: str):
    """Encode ``review`` as one float per ``vocab`` entry.

    Each position is ``1.0`` when the matching vocabulary word occurs among the
    review's whitespace-separated tokens after normalisation, else ``0.0``.
    A ``None`` review produces an all-zero vector of ``len(vocab)``.
    """
    if review is None:
        return [0.0] * len(vocab)

    # Normalisation order: strip punctuation -> lowercase -> collapse whitespace.
    normalized = remove_punct(review)
    normalized = lowercase(normalized)
    normalized = remove_escape_chars(normalized)
    present = set(normalized.split())

    encoding = []
    for word in vocab:
        encoding.append(1.0 if word in present else 0.0)
    return encoding

# Example usage: print the 0.0/1.0 presence vector for one sentence.
# NOTE(review): this binds `sample` to a plain string, whereas the inference cell at
# the top of the notebook binds the same name to a dict; running the cells out of
# order will leave `sample` with the wrong type for the other cell.
sample = "The service was great, but the food was not very good."
print(text_to_array(sample))


[1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5
