In [1]:
import pandas as pd
import random
# Fixed seed: it feeds both the Valid-row sampling and the stratified
# train/validation split, so a constant value makes every "Restart & Run All"
# produce the same split (a random, unrecorded seed cannot be reproduced).
seed = 42

In [2]:
# Load the augmented reviews and keep only rows that carry a usable label:
# first drop rows whose `predicted_label` is missing, then rows where it is
# blank after stripping whitespace.
df = (
    pd.read_csv('../data/out/augmented_shuffled.csv')
    .dropna(subset=["predicted_label"])
    .loc[lambda frame: frame["predicted_label"].str.strip() != ""]
)

# Class balance of the remaining rows.
df['predicted_label'].value_counts()

predicted_label
Valid                 7681
Irrelevant             480
Advertisement          172
Rant_Without_Visit     135
Name: count, dtype: int64

In [3]:
# Partition the rows by whether they are labelled 'Valid'.
is_valid = df['predicted_label'] == 'Valid'
df_valid = df[is_valid]
df_other = df[~is_valid]

# Down-sample the 'Valid' class to 300 randomly chosen rows.
df_valid_sampled = df_valid.sample(n=300, random_state=seed)

# Pool = the 300 sampled 'Valid' rows followed by every non-'Valid' row.
# Original index labels are kept so rows can still be looked up in `df`.
# NOTE(review): the train/validation split further down is built from
# `df_filtered` (all rows of `df`), not from `df_pool` - confirm whether this
# pool is still meant to be used.
df_pool = pd.concat((df_valid_sampled, df_other))

In [4]:
# Integer class id for every label string (ids follow the listed order, from 0).
tag_mapping_dict = {
    tag: class_id
    for class_id, tag in enumerate(
        ('Valid', 'Advertisement', 'Irrelevant', 'Rant_Without_Visit')
    )
}

df['label'] = df['predicted_label'].map(tag_mapping_dict)

# `.map` yields NaN for any label string that is absent from the mapping;
# report how many there are and stop if there is at least one.
missing = df['label'].isna()
n_unmapped = int(missing.sum())
print("Unmapped labels:", n_unmapped)
assert n_unmapped == 0, "Found unmapped labels in predicted_label"


Unmapped labels: 0


In [5]:
# Show the first two rows to check the frame after the `label` column was added.
df.head(2)

Unnamed: 0,business_name,text,predicted_label,prediction_reason,description,category,label
0,Ashton Burger Barn,The food here is great. Their service is great...,Valid,The review provides a genuine assessment of th...,,['Hamburger restaurant'],0
1,Lagoon Amusement Park,My kids and I love this place. Make sure you g...,Valid,The review describes a genuine experience at t...,Seasonal theme park/water park offering thrill...,"['Amusement park', 'Tourist attraction']",0


In [6]:
# Natural-language prompt header.
# NOTE(review): in the visible cells this constant is only referenced from a
# commented-out `return` in `create_json_from_row`, so it currently has no effect.
PROMPT_PREFIX="""
You are a top-tier content moderation expert specializing in the evaluation of Google Maps location reviews. 
Your task is to parse a JSON object containing review data and accurately classify it according to the following policies and rules.
"""

def create_json_from_row(row):
    """
    Serialize the model-facing fields of one row into a single string.

    Reads `business_name`, `description`, `category` and `text` from `row`
    and returns `str()` of a dict with the keys `business_name`,
    `business_description`, `business_category` and `review_text`, in that
    order. The result is therefore a Python dict repr (single-quoted keys),
    not strict JSON. PROMPT_PREFIX is not prepended.
    """
    # (output key, source column) pairs, in output order.
    field_sources = (
        ("business_name", "business_name"),
        ("business_description", "description"),
        ("business_category", "category"),
        ("review_text", "text"),
    )
    payload = {out_key: row[src_key] for out_key, src_key in field_sources}
    return str(payload)

# Replace `text` with the serialized model input for every row.
# The untouched review text is copied to `review_text_raw` the first time this
# cell runs, and serialization always reads from that copy. Without it, running
# the cell a second time would feed the already-serialized string back in as
# `review_text` and nest it one level deeper on every re-run.
if 'review_text_raw' not in df.columns:
    df['review_text_raw'] = df['text']

df['text'] = (
    df.assign(text=df['review_text_raw'])
    .apply(create_json_from_row, axis=1)
)
df['text']


0       {'business_name': 'Ashton Burger Barn', 'busin...
1       {'business_name': 'Lagoon Amusement Park', 'bu...
2       {'business_name': 'NOLA Restaurant', 'business...
3       {'business_name': "Raising Cane's Chicken Fing...
4       {'business_name': 'Latonia Centre', 'business_...
                              ...                        
8467    {'business_name': 'Subway', 'business_descript...
8468    {'business_name': 'Chuck E. Cheese', 'business...
8469    {'business_name': 'Family Thrift Center', 'bus...
8470    {'business_name': 'Hampton Inn Moab', 'busines...
8471    {'business_name': 'Pizza Palace Express', 'bus...
Name: text, Length: 8468, dtype: object

In [7]:
# Keep only the model input (`text`) and the integer target (`label`).
# NOTE(review): every row of `df` is selected here; the down-sampled `df_pool`
# built earlier is not used - confirm that training on the full, imbalanced
# set is intended.
df_filtered = df.loc[:, ['text', 'label']]

In [8]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Stratified 80/20 split so each class keeps the same share in both parts.
train_df, val_df = train_test_split(
    df_filtered[['text', 'label']],
    test_size=0.2,
    random_state=seed,
    stratify=df_filtered['label'],
)

# Sanity check: the two parts together must account for every input row.
assert len(train_df) + len(val_df) == len(df_filtered), "split lost or duplicated rows"

# Reset the index first so the original pandas index is not carried into the
# datasets as an extra column.
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

# The former trailing `.remove_columns([])` dropped nothing and was removed.
my_dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
})

print(my_dataset_dict)


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6774
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1694
    })
})


In [9]:
# Per-class row counts in the validation split (keys are the integer class ids).
val_df['label'].value_counts()

label
0    1537
2      96
1      34
3      27
Name: count, dtype: int64

In [10]:
# Limit the GPUs visible to this process to physical devices 1-7.
# This runs before `torch` is imported in the following cell.
import os
os.environ.update(CUDA_VISIBLE_DEVICES="1,2,3,4,5,6,7")

In [11]:
import torch

# Choose the compute device once, then report what was found.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f"GPU is available. Device count: {torch.cuda.device_count()}")
    active_idx = torch.cuda.current_device()
    print(f"Current device: {active_idx}")
    print(f"Device name: {torch.cuda.get_device_name(active_idx)}")
else:
    print("No GPU available, using CPU.")

GPU is available. Device count: 7
Current device: 0
Device name: NVIDIA L40S


In [12]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Pretrained checkpoint to fine-tune, and the tokenizer loaded from that same checkpoint.
checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Fixed task description. `tokenize_function` passes it as the first sequence
# of every tokenized pair, with the serialized review as the second sequence.
INSTRUCTION = (
    "Task: classify review for THIS business. "
    "Labels: Valid, Advertisement, Irrelevant, Rant_Without_Visit. "
    "Priority: Ad > Irrelevant > No-visit rant > Valid."
)

def tokenize_function(example):
    """
    Tokenize a batch as (INSTRUCTION, review text) sequence pairs.

    `example["text"]` is expected to be a list of strings, as supplied by
    `Dataset.map(..., batched=True)`. The instruction is repeated once per
    review so both sequences have the same length. Pairs are truncated to at
    most 256 tokens; padding is left to the data collator.
    """
    reviews = example["text"]
    instructions = [INSTRUCTION for _ in range(len(reviews))]
    return tokenizer(
        instructions,
        reviews,
        truncation=True,
        max_length=256,
    )


# Tokenize both splits in batches; padding happens per batch in the collator.
tokenized_datasets = my_dataset_dict.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Remove whichever raw (non-tensor) columns are actually present: the prompt or
# review text and a leftover pandas index column.
raw_column_names = ("prompted_text", "text", "__index_level_0__")
cols_to_drop = [
    column
    for column in tokenized_datasets["train"].column_names
    if column in raw_column_names
]
if len(cols_to_drop) > 0:
    tokenized_datasets = tokenized_datasets.remove_columns(cols_to_drop)



Map: 100%|██████████| 6774/6774 [00:01<00:00, 6694.13 examples/s]
Map: 100%|██████████| 1694/1694 [00:00<00:00, 2024.55 examples/s]


In [13]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Load the checkpoint with a 4-class classification head. The checkpoint's own
# classifier has 5 outputs (see the load warning below), so
# `ignore_mismatched_sizes=True` lets a newly initialized 4-output head replace it.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=4,ignore_mismatched_sizes=True)


def compute_metrics(eval_pred):
    """
    Turn a (logits, labels) evaluation pair into summary metrics.

    The predicted class is the arg-max over the last axis of the logits.
    Returns a dict with `accuracy`, `f1_weighted` and `f1_macro`; both F1
    scores use `zero_division=0`.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    accuracy = accuracy_score(gold, predicted)
    weighted_f1 = f1_score(gold, predicted, average="weighted", zero_division=0)
    macro_f1 = f1_score(gold, predicted, average="macro", zero_division=0)

    return {
        "accuracy": accuracy,
        "f1_weighted": weighted_f1,
        "f1_macro": macro_f1,
    }

# Fine-tuning configuration.
training_args = TrainingArguments(
    output_dir=f"./results/{checkpoint}",
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
    # Log the training loss every 6 optimizer steps.
    logging_strategy="steps",
    logging_steps=6,
    # fp16 mixed precision is only usable on a CUDA device. Enable it only when
    # one is present, so the CPU fallback chosen earlier in the notebook still
    # trains instead of failing here.
    fp16=torch.cuda.is_available(),
    # Evaluate and checkpoint once per epoch, then restore the checkpoint with
    # the highest macro-F1 when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
)




# Collect everything the Trainer needs, build it, then run fine-tuning.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Left as the last expression so the notebook displays the returned TrainOutput.
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,0.2214,0.180278,0.953955,0.944672,0.683054
2,0.1412,0.162177,0.963991,0.959514,0.832023
3,0.074,0.134996,0.968713,0.964677,0.892395
4,0.0511,0.135353,0.966942,0.966207,0.903513
5,0.0356,0.123908,0.969303,0.96723,0.89293
6,0.081,0.137194,0.971074,0.969076,0.900558




TrainOutput(global_step=366, training_loss=0.16426894569494685, metrics={'train_runtime': 268.3679, 'train_samples_per_second': 151.449, 'train_steps_per_second': 1.364, 'total_flos': 5330289919234176.0, 'train_loss': 0.16426894569494685, 'epoch': 6.0})

In [14]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Class ids, in order, with their display names.
id2label = dict(enumerate(("Valid", "Advertisement", "Irrelevant", "Rant_Without_Visit")))
all_labels = [0, 1, 2, 3]
target_names = [id2label[class_id] for class_id in all_labels]

# Run the fitted trainer over the validation split and take the arg-max class
# of each row's logits as its prediction.
predictions = trainer.predict(tokenized_datasets["validation"])
logits, labels = predictions.predictions, predictions.label_ids
preds = np.argmax(logits, axis=-1).astype(int)


# print(classification_report(
#     labels, preds,
#     labels=all_labels,              # <-- ensure fixed label set
#     target_names=target_names,
#     zero_division=0,
#     digits=3
# ))

# print("Confusion matrix:\n", confusion_matrix(labels, preds, labels=all_labels))



In [15]:
import numpy as np
from datasets import Dataset

# NOTE(review): these four statements repeat the prediction step of the
# preceding cell on the same validation split, so they recompute the same
# `predictions`, `logits`, `labels` and `preds`.
predictions = trainer.predict(tokenized_datasets["validation"])
logits = predictions.predictions
labels = predictions.label_ids
preds = np.argmax(logits, axis=-1).astype(int)




In [16]:
from datasets import load_from_disk

# Reload a dataset that was previously saved to local disk and print its summary.
# NOTE(review): no visible cell writes "./stage2_filtered_data"; presumably it
# is produced by another notebook or run - confirm it exists before running this.
load_path = "./stage2_filtered_data"
loaded_stage2_dataset = load_from_disk(load_path)
print(loaded_stage2_dataset)

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 13
})


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

id2label = {0: "Valid", 1: "Advertisement", 2: "Irrelevant", 3: "Rant_Without_Visit"}
all_labels = [0, 1, 2, 3]
target_names = [id2label[i] for i in all_labels]

# Score the reloaded stage-2 subset with `trainer`. `trainer_stage2` is not
# defined in any visible cell, so calling it raises NameError on a fresh
# kernel, and its result was never read.
stage2_predictions = trainer.predict(loaded_stage2_dataset)

# Stage-2 outputs get their own names so the validation-split `predictions`,
# `logits` and `labels` are not overwritten by this 13-row subset.
stage2_logits = stage2_predictions.predictions
stage2_labels = stage2_predictions.label_ids
preds_stage2 = np.argmax(stage2_logits, axis=-1).astype(int)
# print(classification_report(
#     stage2_labels, preds_stage2,
#     labels=all_labels,              # <-- ensure fixed label set
#     target_names=target_names,
#     zero_division=0,
#     digits=3
# ))

# print("Confusion matrix:\n", confusion_matrix(stage2_labels, preds_stage2, labels=all_labels))



In [18]:
# Planned cascade: rows predicted 'Valid' (class 0) would be overridden by the
# stage-2 predictions. The override itself is disabled below, so `final_preds`
# is currently just a copy of `preds`.
mask = (preds == 0)
final_preds = np.copy(preds)
mapped_preds_stage2 = preds_stage2
# final_preds[mask] = mapped_preds_stage2


from sklearn.metrics import classification_report

target_names = ['Valid', 'Advertisement', 'Irrelevant', 'Rant_Without_Visit']

# Take the ground truth straight from the validation frame instead of a shared
# `labels` variable, so it always lines up row-for-row with `preds` (both come
# from the validation split in its original order).
val_labels = val_df['label'].to_numpy()
assert len(val_labels) == len(preds), "validation labels and predictions differ in length"

# Passing the full list of class ids keeps the report aligned with
# `target_names` even if a class is absent from both labels and predictions.
print(classification_report(
    val_labels,
    preds,
    labels=list(range(len(target_names))),
    target_names=target_names,
))

                    precision    recall  f1-score   support

             Valid       0.98      0.99      0.98      1537
     Advertisement       1.00      0.94      0.97        34
        Irrelevant       0.76      0.69      0.72        96
Rant_Without_Visit       1.00      0.89      0.94        27

          accuracy                           0.97      1694
         macro avg       0.93      0.88      0.90      1694
      weighted avg       0.97      0.97      0.97      1694

