In [3]:
!pip install vaderSentiment textstat

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Collecting textstat
  Downloading textstat-0.7.10-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading textstat-0.7.10-py3-none-any.whl (239 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.2/239.2 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m96.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, vaderSentiment, textstat
Successfully installed pyphen-0.17.2 textstat-0.7.10 vaderSentiment-3.3.2


In [4]:
import pandas as pd
import numpy as np
import json
import ast  # For safely evaluating string-formatted lists
import re   # --- NEW --- For regular expressions
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm import tqdm

# --- NEW: Import libraries for feature engineering ---
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import textstat

from google.colab import drive
drive.mount('/content/drive')

# Initialize tqdm to work with pandas' apply method
tqdm.pandas()

# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

# --- NEW: Helper functions for feature engineering ---
analyzer = SentimentIntensityAnalyzer()
CTA_WORDS = ['visit', 'check out', 'try our', 'don\'t miss', 'shop now']
OFFER_WORDS = ['deal', 'offer', 'discount', 'promotion', 'free', 'sale']

def get_sentiment(text):
    return analyzer.polarity_scores(str(text))['compound']

def get_caps_ratio(text):
    text = str(text)
    if not text: return 0
    upper_chars = sum(1 for c in text if c.isupper())
    alpha_chars = sum(1 for c in text if c.isalpha())
    return upper_chars / alpha_chars if alpha_chars > 0 else 0

def has_keyword(text, keywords):
    return 1 if any(word in str(text).lower() for word in keywords) else 0

def get_readability(text):
    return textstat.flesch_kincaid_grade(str(text))

# --- Original helper functions ---
def parse_and_flatten_misc_data(data):
    if not isinstance(data, str): return []
    try:
        misc_dict = json.loads(data)
        flat_list = []
        for key, values in misc_dict.items():
            if isinstance(values, list):
                for value in values:
                    flat_list.append(f"{key.strip()}:{value.strip()}")
        return flat_list
    except (json.JSONDecodeError, TypeError): return []

def parse_categories(data):
    if not isinstance(data, str): return []
    try: return ast.literal_eval(data)
    except (ValueError, SyntaxError): return []

# =============================================================================
# MAIN DATA PREPARATION SCRIPT
# =============================================================================

def prepare_data(input_csv_path, output_csv_path):
    print("Step 1: Loading raw data...")
    df = pd.read_csv(input_csv_path)

    print("Step 2: Encoding target variable 'predicted_classification'...")
    target_mapping = {'relevant': 0, 'spam': 1, 'rant': 2, 'advertisement': 3}
    df['target_class'] = df['predicted_classification'].map(target_mapping)
    df.dropna(subset=['target_class'], inplace=True)
    df['target_class'] = df['target_class'].astype(int)

    print("Step 3: Encoding 'user_id', 'gmap_id', and 'price'...")
    for col in ['user_id', 'gmap_id', 'price']:
        encoder = LabelEncoder()
        df[f'{col}_encoded'] = encoder.fit_transform(df[col].astype(str))

    print("Step 4: Parsing and creating tag vocabularies...")
    df['misc_tags_flat'] = df['misc_data'].progress_apply(parse_and_flatten_misc_data)
    df['category_tags_list'] = df['categories'].progress_apply(parse_categories)
    all_misc_tags = set(tag for tag_list in df['misc_tags_flat'] for tag in tag_list)
    all_category_tags = set(tag for tag_list in df['category_tags_list'] for tag in tag_list)
    all_tags = sorted(list(all_misc_tags.union(all_category_tags)))
    tag_to_id = {tag: i + 1 for i, tag in enumerate(all_tags)}
    df['misc_tags_encoded'] = df['misc_tags_flat'].progress_apply(lambda tags: [tag_to_id.get(tag, 0) for tag in tags])
    df['category_tags_encoded'] = df['category_tags_list'].progress_apply(lambda tags: [tag_to_id.get(tag, 0) for tag in tags])

    # --- NEW: STEP 5 - FEATURE ENGINEERING ---
    print("Step 5a: Engineering Text Meta-Features...")
    df['sentiment'] = df['text'].progress_apply(get_sentiment)
    df['caps_ratio'] = df['text'].progress_apply(get_caps_ratio)
    df['readability_grade'] = df['text'].progress_apply(get_readability)
    df['has_cta'] = df['text'].progress_apply(lambda text: has_keyword(text, CTA_WORDS))
    df['has_offer'] = df['text'].progress_apply(lambda text: has_keyword(text, OFFER_WORDS))

    print("Step 5b: Engineering User Behavioral Features...")
    user_stats = df.groupby('user_id')['rating'].agg(['mean', 'std', 'count']).rename(columns={
        'mean': 'user_avg_rating', 'std': 'user_std_rating', 'count': 'user_review_count'})
    df = df.merge(user_stats, on='user_id', how='left')
    df.fillna({'user_std_rating': 0}, inplace=True) # Fill NaN for users with only 1 review
    df['is_5_star_only_user'] = ((df['user_std_rating'] == 0) & (df['user_avg_rating'] == 5.0)).astype(int)

    # --- UPDATED: STEP 6 - NORMALIZE ALL NUMERICAL FEATURES ---
    print("Step 6: Normalizing all numerical features...")
    numerical_cols = [
        'avg_rating', 'num_of_reviews', 'pics_count',
        'sentiment', 'caps_ratio', 'readability_grade', 'has_cta', 'has_offer',
        'user_avg_rating', 'user_std_rating', 'user_review_count', 'is_5_star_only_user'
    ]
    # Ensure all columns exist, add placeholder if not
    for col in numerical_cols:
        if col not in df.columns:
            df[col] = 0
            print(f"Warning: Column '{col}' not found. Added as a placeholder.")

    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # --- UPDATED: STEP 7 - FINALIZE AND SAVE ---
    print("Step 7: Selecting final columns and saving the processed data...")
    final_columns = [
        'text', 'response_text', 'user_id_encoded', 'gmap_id_encoded', 'price_id_encoded',
        'category_tags_encoded', 'misc_tags_encoded', 'target_class'
    ] + numerical_cols # Add all the new numerical features

    # Ensure all required columns exist
    for col in final_columns:
        if col not in df.columns:
            df[col] = 0 if 'encoded' in col or 'target' in col else ''
            print(f"Warning: Column '{col}' not found. Added as a placeholder.")

    processed_df = df[final_columns]
    processed_df.to_csv(output_csv_path, index=False)

    print("\nData preparation complete!")
    print(f"Processed data saved to: {output_csv_path}")

    # --- STEP 8: Print Vocabulary Sizes for Model Configuration ---
    print("\n--- COPY THESE VALUES INTO YOUR PYTORCH SCRIPT CONFIGURATION ---")
    print(f"NUM_USERS = {df['user_id_encoded'].nunique()}")
    print(f"NUM_GMAP_IDS = {df['gmap_id_encoded'].nunique()}")
    print(f"NUM_PRICE_TIERS = {df['price_id_encoded'].nunique()}")
    print(f"NUM_CATEGORIES_TAGS = {len(tag_to_id) + 1}")
    print(f"NUM_MISC_TAGS = {len(tag_to_id) + 1}")
    print("------------------------------------------------------------------\n")

if __name__ == '__main__':
    # <<< CHANGE THIS >>> to the path of your raw data file
    INPUT_CSV = '/content/drive/MyDrive/my_predictions_on_wah.csv'

    # <<< IMPORTANT: CHANGE THIS >>> to a new output file name
    OUTPUT_CSV = '/content/drive/MyDrive/processed_reviews_v2.csv'

    prepare_data(INPUT_CSV, OUTPUT_CSV)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Step 1: Loading raw data...
Step 2: Encoding target variable 'predicted_classification'...
Step 3: Encoding 'user_id', 'gmap_id', and 'price'...
Step 4: Parsing and creating tag vocabularies...


100%|██████████| 57539/57539 [00:00<00:00, 65067.43it/s]
100%|██████████| 57539/57539 [00:00<00:00, 84885.70it/s]
100%|██████████| 57539/57539 [00:00<00:00, 471918.05it/s]
100%|██████████| 57539/57539 [00:00<00:00, 1229806.81it/s]


Step 5a: Engineering Text Meta-Features...


100%|██████████| 57539/57539 [00:08<00:00, 6762.25it/s]
100%|██████████| 57539/57539 [00:00<00:00, 78239.93it/s]
100%|██████████| 57539/57539 [00:04<00:00, 12640.30it/s]
100%|██████████| 57539/57539 [00:00<00:00, 235309.55it/s]
100%|██████████| 57539/57539 [00:00<00:00, 206301.52it/s]


Step 5b: Engineering User Behavioral Features...
Step 6: Normalizing all numerical features...
Step 7: Selecting final columns and saving the processed data...

Data preparation complete!
Processed data saved to: /content/drive/MyDrive/processed_reviews_v2.csv

--- COPY THESE VALUES INTO YOUR PYTORCH SCRIPT CONFIGURATION ---
NUM_USERS = 55172
NUM_GMAP_IDS = 37333
NUM_PRICE_TIERS = 1
NUM_CATEGORIES_TAGS = 201
NUM_MISC_TAGS = 201
------------------------------------------------------------------



In [6]:
# =============================================================================
# 1. IMPORTS & SETUP
# =============================================================================
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, cohen_kappa_score
from tqdm.notebook import tqdm
import ast

from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torchvision import ops

from google.colab import drive
drive.mount('/content/drive')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# =============================================================================
# 2. CONFIGURATION
# =============================================================================
# ### --- Model Hyperparameters --- ###
USER_EMB_DIM = 32
GMAP_EMB_DIM = 64
PRICE_EMB_DIM = 4
TAG_EMB_DIM = 16
BERT_MODEL_NAME = 'bert-base-uncased'
HIDDEN_DIM = 256
NUM_NUMERICAL_FEATURES = 12

# ### --- Dataset & Vocabulary Sizes --- ###
NUM_USERS = 55172
NUM_GMAP_IDS = 37333
NUM_PRICE_TIERS = 1
NUM_CATEGORIES_TAGS = 201
NUM_MISC_TAGS = 201
NUM_CLASSES = 4
CLASS_NAMES = ['relevant', 'spam', 'rant', 'advertisement']

# ### --- Training Parameters --- ###
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 15
PATIENCE = 3
USE_FOCAL_LOSS = True


# =============================================================================
# 3. TOWER DEFINITIONS (The "Sub-Models")
# =============================================================================
class TextTower(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state[:, 0, :]

class UserTower(nn.Module):
    def __init__(self, num_users, embedding_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
    def forward(self, user_ids):
        return self.user_embedding(user_ids)

class BusinessTower(nn.Module):
    def __init__(self, num_gmap_ids, num_prices, num_cat_tags, num_misc_tags, num_numerical_features):
        super().__init__()
        self.gmap_embedding = nn.Embedding(num_gmap_ids, GMAP_EMB_DIM)
        self.price_embedding = nn.Embedding(num_prices, PRICE_EMB_DIM)
        self.category_embedding_bag = nn.EmbeddingBag(num_cat_tags, TAG_EMB_DIM, mode='mean', padding_idx=0)
        self.misc_embedding_bag = nn.EmbeddingBag(num_misc_tags, TAG_EMB_DIM, mode='mean', padding_idx=0)
        combined_dim = GMAP_EMB_DIM + PRICE_EMB_DIM + TAG_EMB_DIM + TAG_EMB_DIM + num_numerical_features
        self.mlp = nn.Sequential(nn.Linear(combined_dim, 128), nn.ReLU(), nn.Linear(128, 64))
    def forward(self, gmap_ids, price_ids, cat_tags, misc_tags, numerical_features):
        gmap_vec = self.gmap_embedding(gmap_ids)
        price_vec = self.price_embedding(price_ids)
        cat_vec = self.category_embedding_bag(cat_tags)
        misc_vec = self.misc_embedding_bag(misc_tags)
        combined_vec = torch.cat([gmap_vec, price_vec, cat_vec, misc_vec, numerical_features], dim=1)
        return self.mlp(combined_vec)


# =============================================================================
# 4. THE MAIN MODEL (The Manager)
# =============================================================================
class MainModel(nn.Module):
    def __init__(self, num_numerical_features):
        super().__init__()
        self.text_tower = TextTower(BERT_MODEL_NAME)
        self.user_tower = UserTower(NUM_USERS, USER_EMB_DIM)
        self.business_tower = BusinessTower(NUM_GMAP_IDS, NUM_PRICE_TIERS, NUM_CATEGORIES_TAGS, NUM_MISC_TAGS, num_numerical_features)
        text_output_dim, user_output_dim, business_output_dim = 768, USER_EMB_DIM, 64
        self.attention = nn.MultiheadAttention(embed_dim=text_output_dim,kdim=user_output_dim + business_output_dim,vdim=user_output_dim + business_output_dim,num_heads=8,batch_first=True)
        classifier_input_dim = text_output_dim + text_output_dim
        self.classifier = nn.Sequential(nn.Linear(classifier_input_dim, HIDDEN_DIM),nn.ReLU(),nn.Dropout(0.5),nn.Linear(HIDDEN_DIM, NUM_CLASSES))
    def forward(self, batch):
        text_vec = self.text_tower(batch['input_ids'], batch['attention_mask'])
        user_vec = self.user_tower(batch['user_id'])
        business_vec = self.business_tower(batch['gmap_id'], batch['price_id'],batch['category_tags'], batch['misc_tags'],batch['numerical_features'])
        query = text_vec.unsqueeze(1)
        key_value = torch.cat([user_vec, business_vec], dim=1).unsqueeze(1)
        attn_output, _ = self.attention(query, key_value, key_value)
        attn_output = attn_output.squeeze(1)
        final_vec = torch.cat([text_vec, attn_output], dim=1)
        return self.classifier(final_vec)


# =============================================================================
# 5. DATASET & DATALOADER
# =============================================================================
MAX_LEN = 128
MAX_TAGS = 10
class ReviewDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.df = dataframe
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        inputs = self.tokenizer.encode_plus(str(row['text']), add_special_tokens=True, max_length=MAX_LEN, padding='max_length', truncation=True, return_tensors='pt')
        category_tags = row.get('category_tags', [])[:MAX_TAGS]
        category_tags_padded = category_tags + [0] * (MAX_TAGS - len(category_tags))
        misc_tags = row.get('misc_tags', [])[:MAX_TAGS]
        misc_tags_padded = misc_tags + [0] * (MAX_TAGS - len(misc_tags))
        numerical_features_tensor = torch.tensor([
            row['avg_rating'], row['num_of_reviews'], row['pics_count'],
            row['sentiment'], row['caps_ratio'], row['readability_grade'],
            row['has_cta'], row['has_offer'],
            row['user_avg_rating'], row['user_std_rating'],
            row['user_review_count'], row['is_5_star_only_user']
        ], dtype=torch.float)
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'user_id': torch.tensor(row['user_id_encoded'], dtype=torch.long),
            'gmap_id': torch.tensor(row['gmap_id_encoded'], dtype=torch.long),
            'price_id': torch.tensor(row['price_id_encoded'], dtype=torch.long),
            'category_tags': torch.tensor(category_tags_padded, dtype=torch.long),
            'misc_tags': torch.tensor(misc_tags_padded, dtype=torch.long),
            'numerical_features': numerical_features_tensor,
            'target': torch.tensor(row['target_class'], dtype=torch.long)
        }


# =============================================================================
# 6. TRAINING & EVALUATION LOOPS
# =============================================================================
def train_epoch(model, dataloader, optimizer, device, scheduler, class_weights_tensor):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        batch = {k: v.to(device) for k, v in batch.items()}
        targets = batch.pop('target')
        outputs = model(batch)
        if USE_FOCAL_LOSS:
            targets_one_hot = nn.functional.one_hot(targets, num_classes=NUM_CLASSES).float()
            unweighted_loss = ops.sigmoid_focal_loss(outputs, targets_one_hot, alpha=-1, gamma=2, reduction='none')
            weights_for_batch = class_weights_tensor[targets]
            weighted_loss = unweighted_loss * weights_for_batch.unsqueeze(1)
            loss = weighted_loss.mean()
        else:
            loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
            loss = loss_fn(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_epoch(model, dataloader, device, class_weights_tensor):
    model.eval()
    total_loss = 0
    all_preds, all_targets = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            targets = batch.pop('target')
            outputs = model(batch)
            if USE_FOCAL_LOSS:
                targets_one_hot = nn.functional.one_hot(targets, num_classes=NUM_CLASSES).float()
                unweighted_loss = ops.sigmoid_focal_loss(outputs, targets_one_hot, alpha=-1, gamma=2, reduction='none')
                weights_for_batch = class_weights_tensor[targets]
                weighted_loss = unweighted_loss * weights_for_batch.unsqueeze(1)
                loss = weighted_loss.mean()
            else:
                loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
                loss = loss_fn(outputs, targets)
            total_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
    avg_loss = total_loss / len(dataloader)
    report = classification_report(all_targets, all_preds, target_names=CLASS_NAMES, digits=4, zero_division=0)
    kappa = cohen_kappa_score(all_targets, all_preds)
    return avg_loss, report, kappa

# =============================================================================
# 7. MAIN EXECUTION
# =============================================================================
if __name__ == '__main__':
    print("Loading feature-engineered data...")
    try:
        df = pd.read_csv('/content/drive/MyDrive/processed_reviews_v2.csv')
    except FileNotFoundError:
        print("ERROR: 'processed_reviews_v2.csv' not found.")
        print("Please run the 'prepare_data_v2.py' script first.")
        exit()

    if 'category_tags_encoded' in df.columns:
        df['category_tags'] = df['category_tags_encoded'].apply(ast.literal_eval)
    if 'misc_tags_encoded' in df.columns:
        df['misc_tags'] = df['misc_tags_encoded'].apply(ast.literal_eval)
    print("Data loaded and tag columns converted successfully.")

    class_counts = df['target_class'].value_counts().sort_index().values
    total_samples = float(sum(class_counts))
    class_weights = [np.log(total_samples / count) for count in class_counts]
    CLASS_WEIGHTS = [w / sum(class_weights) * NUM_CLASSES for w in class_weights]
    print(f"Calculated Class Weights: {CLASS_WEIGHTS}")
    class_weights_tensor = torch.tensor(CLASS_WEIGHTS, dtype=torch.float).to(device)

    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['target_class'])
    print(f"Training on {len(train_df)} samples, validating on {len(val_df)} samples.")
    train_dataset = ReviewDataset(train_df, tokenizer)
    val_dataset = ReviewDataset(val_df, tokenizer)
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    model = MainModel(num_numerical_features=NUM_NUMERICAL_FEATURES).to(device)

    # --- UPDATED: GRADUAL UNFREEZING ---
    print("Freezing all BERT layers initially...")
    for param in model.text_tower.bert.parameters():
        param.requires_grad = False

    # Unfreeze the last 2 layers of the BERT encoder
    # BERT's main layers are in `bert.encoder.layer` which is a list of 12 layers
    num_layers_to_unfreeze = 2
    print(f"Unfreezing the top {num_layers_to_unfreeze} BERT layers...")
    for i in range(num_layers_to_unfreeze):
        for param in model.text_tower.bert.encoder.layer[-(i+1)].parameters():
            param.requires_grad = True

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    total_steps = len(train_dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    epochs_no_improve = 0
    best_val_loss = float('inf')

    print("Starting training with new features and unfrozen layers...")
    for epoch in range(EPOCHS):
        avg_train_loss = train_epoch(model, dataloader, optimizer, device, scheduler, class_weights_tensor)
        avg_val_loss, report, kappa = evaluate_epoch(model, val_dataloader, device, class_weights_tensor)

        print(f"\n--- Epoch {epoch + 1}/{EPOCHS} ---")
        print(f"Average Training Loss: {avg_train_loss:.4f}")
        print(f"Average Validation Loss: {avg_val_loss:.4f}")
        print(f"Cohen's Kappa: {kappa:.4f}")
        print("Validation Classification Report:")
        print(report)

        if avg_val_loss < best_val_loss:
            print("Validation loss improved. Saving model to 'best_model_state.bin'...")
            torch.save(model.state_dict(), 'best_model_state.bin')
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            print(f"Validation loss did not improve. Counter: {epochs_no_improve}/{PATIENCE}")

        if epochs_no_improve == PATIENCE:
            print("Early stopping triggered.")
            break
        print("---------------------------------\n")

    print("Training finished.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
Loading feature-engineered data...
Data loaded and tag columns converted successfully.
Calculated Class Weights: [np.float64(0.04551597792290425), np.float64(0.6028185668604471), np.float64(1.4374671720206607), np.float64(1.9141982831959878)]
Training on 46031 samples, validating on 11508 samples.
Freezing pre-trained BERT layers...
Starting training with new features...


Training:   0%|          | 0/2877 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/720 [00:00<?, ?it/s]


--- Epoch 1/15 ---
Average Training Loss: 0.0106
Average Validation Loss: 0.0079
Cohen's Kappa: 0.5800
Validation Classification Report:
               precision    recall  f1-score   support

     relevant     0.9846    0.8600    0.9181      9877
         spam     0.5033    0.9520    0.6585      1520
         rant     0.5000    0.0326    0.0612        92
advertisement     0.0000    0.0000    0.0000        19

     accuracy                         0.8641     11508
    macro avg     0.4970    0.4611    0.4094     11508
 weighted avg     0.9155    0.8641    0.8754     11508

Validation loss improved. Saving model to 'best_model_state.bin'...
---------------------------------



Training:   0%|          | 0/2877 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/720 [00:00<?, ?it/s]


--- Epoch 2/15 ---
Average Training Loss: 0.0080
Average Validation Loss: 0.0069
Cohen's Kappa: 0.5785
Validation Classification Report:
               precision    recall  f1-score   support

     relevant     0.9892    0.8493    0.9139      9877
         spam     0.5350    0.9513    0.6848      1520
         rant     0.1296    0.4565    0.2019        92
advertisement     0.0000    0.0000    0.0000        19

     accuracy                         0.8583     11508
    macro avg     0.4134    0.5643    0.4502     11508
 weighted avg     0.9207    0.8583    0.8765     11508

Validation loss improved. Saving model to 'best_model_state.bin'...
---------------------------------



Training:   0%|          | 0/2877 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [12]:
import pandas as pd
import numpy as np
import json
import ast
import re
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import textstat

tqdm.pandas()

# =============================================================================
# HELPER FUNCTIONS (Unchanged)
# =============================================================================
analyzer = SentimentIntensityAnalyzer()
CTA_WORDS = ['visit', 'check out', 'try our', 'don\'t miss', 'shop now']
OFFER_WORDS = ['deal', 'offer', 'discount', 'promotion', 'free', 'sale']

def get_sentiment(text):
    return analyzer.polarity_scores(str(text))['compound']

def get_caps_ratio(text):
    text = str(text)
    if not text: return 0
    upper_chars = sum(1 for c in text if c.isupper())
    alpha_chars = sum(1 for c in text if c.isalpha())
    return upper_chars / alpha_chars if alpha_chars > 0 else 0

def has_keyword(text, keywords):
    return 1 if any(word in str(text).lower() for word in keywords) else 0

def get_readability(text):
    return textstat.flesch_kincaid_grade(str(text))

def get_exclamation_count(text):
    return str(text).count('!')

def get_word_count(text):
    return len(str(text).split())

def get_unique_word_ratio(text):
    text = str(text)
    if not text: return 0
    words = text.lower().split()
    if not words: return 0
    return len(set(words)) / len(words)

def get_category_keyword_match(row):
    text = str(row['text']).lower()
    categories = row.get('category_tags_list', [])
    if not categories: return 0
    match_count = sum(1 for cat in categories if cat.lower() in text)
    return match_count

def parse_and_flatten_misc_data(data):
    if not isinstance(data, str): return []
    try:
        misc_dict = json.loads(data)
        return [f"{key.strip()}:{value.strip()}" for key, values in misc_dict.items() if isinstance(values, list) for value in values]
    except (json.JSONDecodeError, TypeError): return []

def parse_categories(data):
    if not isinstance(data, str): return []
    try: return ast.literal_eval(data)
    except (ValueError, SyntaxError): return []

# =============================================================================
# MAIN DATA PREPARATION SCRIPT (DEFINITIVELY REORDERED LOGIC)
# =============================================================================
def prepare_data(input_csv_path, output_csv_path):
    print("Step 1: Loading raw data...")
    df = pd.read_csv(input_csv_path)

    print("Step 2: Encoding target variable...")
    target_mapping = {'relevant': 0, 'spam': 1, 'rant': 2, 'advertisement': 3}
    df['target_class'] = df['predicted_classification'].map(target_mapping)
    df.dropna(subset=['target_class'], inplace=True)
    df['target_class'] = df['target_class'].astype(int)

    print("Step 3: Parsing complex text fields...")
    df['misc_tags_flat'] = df['misc_data'].progress_apply(parse_and_flatten_misc_data)
    df['category_tags_list'] = df['categories'].progress_apply(parse_categories)

    # --- NEW LOGIC ORDER: STEP 4 - Perform simple ID encodings FIRST ---
    print("Step 4: Encoding 'user_id', 'gmap_id', and 'price' columns...")
    for col in ['user_id', 'gmap_id', 'price']:
        encoder = LabelEncoder()
        df[f'{col}_encoded'] = encoder.fit_transform(df[col].astype(str))

    print("Step 5: Engineering all new features...")
    # Text Meta-Features
    df['sentiment'] = df['text'].progress_apply(get_sentiment)
    df['caps_ratio'] = df['text'].progress_apply(get_caps_ratio)
    df['readability_grade'] = df['text'].progress_apply(get_readability)
    df['has_cta'] = df['text'].progress_apply(lambda text: has_keyword(text, CTA_WORDS))
    df['has_offer'] = df['text'].progress_apply(lambda text: has_keyword(text, OFFER_WORDS))
    df['exclamation_count'] = df['text'].progress_apply(get_exclamation_count)
    df['word_count'] = df['text'].progress_apply(get_word_count)
    df['unique_word_ratio'] = df['text'].progress_apply(get_unique_word_ratio)
    df['category_keyword_match'] = df.progress_apply(get_category_keyword_match, axis=1)

    # User Behavioral Features
    print("Step 5b: Engineering User Behavioral Features...")
    user_stats = df.groupby('user_id')['rating'].agg(['mean', 'std', 'count']).rename(columns={
        'mean': 'user_avg_rating', 'std': 'user_std_rating', 'count': 'user_review_count'})
    # Merge the stats back into the main dataframe
    df = df.merge(user_stats, on='user_id', how='left')
    df.fillna({'user_std_rating': 0}, inplace=True)
    df['is_5_star_only_user'] = ((df['user_std_rating'] == 0) & (df['user_avg_rating'] == 5.0)).astype(int)

    # Tag Encodings
    print("Step 5c: Encoding tag features...")
    all_tags = set(tag for tag_list in df['misc_tags_flat'] for tag in tag_list).union(
                 set(tag for tag_list in df['category_tags_list'] for tag in tag_list))
    tag_to_id = {tag: i + 1 for i, tag in enumerate(sorted(list(all_tags)))}
    df['misc_tags_encoded'] = df['misc_tags_flat'].progress_apply(lambda tags: [tag_to_id.get(tag, 0) for tag in tags])
    df['category_tags_encoded'] = df['category_tags_list'].progress_apply(lambda tags: [tag_to_id.get(tag, 0) for tag in tags])

    print("Step 6: Normalizing all numerical features...")
    numerical_cols = [
        'avg_rating', 'num_of_reviews', 'pics_count', 'sentiment', 'caps_ratio',
        'readability_grade', 'has_cta', 'has_offer', 'user_avg_rating', 'user_std_rating',
        'user_review_count', 'is_5_star_only_user', 'exclamation_count', 'word_count',
        'unique_word_ratio', 'category_keyword_match'
    ]
    for col in numerical_cols:
        if col not in df.columns: df[col] = 0
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    print("Step 7: Selecting final columns and saving...")
    # FIX IS HERE: Changed 'price_id_encoded' to 'price_encoded'
    final_columns = [
        'text', 'response_text', 'user_id_encoded', 'gmap_id_encoded', 'price_encoded',
        'category_tags_encoded', 'misc_tags_encoded', 'target_class'
    ] + numerical_cols

    # Ensure all selected columns exist before trying to create the new DataFrame
    final_columns = [col for col in final_columns if col in df.columns]

    processed_df = df[final_columns]
    processed_df.to_csv(output_csv_path, index=False)

    print(f"\nData preparation complete! Processed data saved to: {output_csv_path}")
    print("\n--- COPY THESE VALUES INTO YOUR PYTORCH SCRIPT CONFIGURATION ---")
    print(f"NUM_USERS = {df['user_id_encoded'].nunique()}")
    print(f"NUM_GMAP_IDS = {df['gmap_id_encoded'].nunique()}")
    print(f"NUM_PRICE_TIERS = {df['price_encoded'].nunique()}") # Corrected this line as well for consistency
    print(f"NUM_CATEGORIES_TAGS = {len(tag_to_id) + 1}")
    print(f"NUM_MISC_TAGS = {len(tag_to_id) + 1}")
    print("------------------------------------------------------------------\n")

if __name__ == '__main__':
    # Make sure to update these paths to your actual file locations
    INPUT_CSV = '/content/drive/MyDrive/my_predictions_on_wah.csv'
    OUTPUT_CSV = '/content/drive/MyDrive/processed_reviews_v3.csv'
    prepare_data(INPUT_CSV, OUTPUT_CSV)

Step 1: Loading raw data...
Step 2: Encoding target variable...
Step 3: Parsing complex text fields...


100%|██████████| 57539/57539 [00:00<00:00, 110711.72it/s]
100%|██████████| 57539/57539 [00:00<00:00, 82638.10it/s]


Step 4: Encoding 'user_id', 'gmap_id', and 'price' columns...
Step 5: Engineering all new features...


100%|██████████| 57539/57539 [00:08<00:00, 6681.25it/s]
100%|██████████| 57539/57539 [00:00<00:00, 78304.69it/s]
100%|██████████| 57539/57539 [00:03<00:00, 17821.32it/s]
100%|██████████| 57539/57539 [00:00<00:00, 235294.63it/s]
100%|██████████| 57539/57539 [00:00<00:00, 205921.65it/s]
100%|██████████| 57539/57539 [00:00<00:00, 887832.87it/s]
100%|██████████| 57539/57539 [00:00<00:00, 378625.16it/s]
100%|██████████| 57539/57539 [00:00<00:00, 199782.50it/s]
100%|██████████| 57539/57539 [00:00<00:00, 116483.59it/s]


Step 5b: Engineering User Behavioral Features...
Step 5c: Encoding tag features...


100%|██████████| 57539/57539 [00:00<00:00, 471842.39it/s]
100%|██████████| 57539/57539 [00:00<00:00, 1177329.47it/s]


Step 6: Normalizing all numerical features...
Step 7: Selecting final columns and saving...

Data preparation complete! Processed data saved to: /content/drive/MyDrive/processed_reviews_v3.csv

--- COPY THESE VALUES INTO YOUR PYTORCH SCRIPT CONFIGURATION ---
NUM_USERS = 55172
NUM_GMAP_IDS = 37333
NUM_PRICE_TIERS = 9
NUM_CATEGORIES_TAGS = 201
NUM_MISC_TAGS = 201
------------------------------------------------------------------



In [1]:
# =============================================================================
# 1. IMPORTS & SETUP
# =============================================================================
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, cohen_kappa_score
from tqdm.notebook import tqdm
import ast
import os

from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torchvision import ops

# Set environment variable for more precise CUDA error debugging
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

from google.colab import drive
drive.mount('/content/drive')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# =============================================================================
# 2. CONFIGURATION
# =============================================================================
# ### --- Model Hyperparameters --- ###
USER_EMB_DIM = 32
GMAP_EMB_DIM = 64
PRICE_EMB_DIM = 4
TAG_EMB_DIM = 16
BERT_MODEL_NAME = 'bert-base-uncased'
HIDDEN_DIM = 256
NUM_NUMERICAL_FEATURES = 16

# ### --- Dataset & Vocabulary Sizes --- ###
# These values MUST be >= (max_id_in_data + 1)
# The script will verify these for you. You may need to update them based on the output.
NUM_USERS = 55172
NUM_GMAP_IDS = 37333
NUM_PRICE_TIERS = 9
NUM_CATEGORIES_TAGS = 201 # <-- This is likely the problem. The script will tell you the correct value.
NUM_MISC_TAGS = 201       # <-- This is likely the problem. The script will tell you the correct value.
NUM_CLASSES = 4
CLASS_NAMES = ['relevant', 'spam', 'rant', 'advertisement']

# ### --- Training Parameters --- ###
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 15
PATIENCE = 3
USE_FOCAL_LOSS = True


# =============================================================================
# 3. TOWER DEFINITIONS (The "Sub-Models")
# =============================================================================
class TextTower(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state[:, 0, :]

class UserTower(nn.Module):
    def __init__(self, num_users, embedding_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
    def forward(self, user_ids):
        return self.user_embedding(user_ids)

class BusinessTower(nn.Module):
    def __init__(self, num_gmap_ids, num_prices, num_cat_tags, num_misc_tags, num_numerical_features):
        super().__init__()
        self.gmap_embedding = nn.Embedding(num_gmap_ids, GMAP_EMB_DIM)
        self.price_embedding = nn.Embedding(num_prices, PRICE_EMB_DIM)
        self.category_embedding_bag = nn.EmbeddingBag(num_cat_tags, TAG_EMB_DIM, mode='mean', padding_idx=0)
        self.misc_embedding_bag = nn.EmbeddingBag(num_misc_tags, TAG_EMB_DIM, mode='mean', padding_idx=0)
        combined_dim = GMAP_EMB_DIM + PRICE_EMB_DIM + TAG_EMB_DIM + TAG_EMB_DIM + num_numerical_features
        self.mlp = nn.Sequential(nn.Linear(combined_dim, 128), nn.ReLU(), nn.Linear(128, 64))
    def forward(self, gmap_ids, price_ids, cat_tags, misc_tags, numerical_features):
        gmap_vec = self.gmap_embedding(gmap_ids)
        price_vec = self.price_embedding(price_ids)
        cat_vec = self.category_embedding_bag(cat_tags)
        misc_vec = self.misc_embedding_bag(misc_tags)
        combined_vec = torch.cat([gmap_vec, price_vec, cat_vec, misc_vec, numerical_features], dim=1)
        return self.mlp(combined_vec)


# =============================================================================
# 4. THE MAIN MODEL (The Manager)
# =============================================================================
class MainModel(nn.Module):
    def __init__(self, num_numerical_features):
        super().__init__()
        self.text_tower = TextTower(BERT_MODEL_NAME)
        self.user_tower = UserTower(NUM_USERS, USER_EMB_DIM)
        self.business_tower = BusinessTower(NUM_GMAP_IDS, NUM_PRICE_TIERS, NUM_CATEGORIES_TAGS, NUM_MISC_TAGS, num_numerical_features)
        text_output_dim, user_output_dim, business_output_dim = 768, USER_EMB_DIM, 64
        self.attention = nn.MultiheadAttention(embed_dim=text_output_dim,kdim=user_output_dim + business_output_dim,vdim=user_output_dim + business_output_dim,num_heads=8,batch_first=True)
        classifier_input_dim = text_output_dim + text_output_dim
        self.classifier = nn.Sequential(nn.Linear(classifier_input_dim, HIDDEN_DIM),nn.ReLU(),nn.Dropout(0.5),nn.Linear(HIDDEN_DIM, NUM_CLASSES))
    def forward(self, batch):
        text_vec = self.text_tower(batch['input_ids'], batch['attention_mask'])
        user_vec = self.user_tower(batch['user_id'])
        business_vec = self.business_tower(batch['gmap_id'], batch['price_id'],batch['category_tags'], batch['misc_tags'],batch['numerical_features'])
        query = text_vec.unsqueeze(1)
        key_value = torch.cat([user_vec, business_vec], dim=1).unsqueeze(1)
        attn_output, _ = self.attention(query, key_value, key_value)
        attn_output = attn_output.squeeze(1)
        final_vec = torch.cat([text_vec, attn_output], dim=1)
        return self.classifier(final_vec)


# =============================================================================
# 5. DATASET & DATALOADER
# =============================================================================
MAX_LEN = 128
MAX_TAGS = 10
class ReviewDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.df = dataframe
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        inputs = self.tokenizer.encode_plus(str(row['text']), add_special_tokens=True, max_length=MAX_LEN, padding='max_length', truncation=True, return_tensors='pt')
        # Use already parsed lists from main script
        category_tags = row['category_tags'][:MAX_TAGS]
        category_tags_padded = category_tags + [0] * (MAX_TAGS - len(category_tags))
        misc_tags = row['misc_tags'][:MAX_TAGS]
        misc_tags_padded = misc_tags + [0] * (MAX_TAGS - len(misc_tags))

        numerical_features_tensor = torch.tensor([
            row['avg_rating'], row['num_of_reviews'], row['pics_count'],
            row['sentiment'], row['caps_ratio'], row['readability_grade'],
            row['has_cta'], row['has_offer'], row['user_avg_rating'],
            row['user_std_rating'], row['user_review_count'],
            row['is_5_star_only_user'], row['exclamation_count'],
            row['word_count'], row['unique_word_ratio'],
            row['category_keyword_match']
        ], dtype=torch.float)

        return {
            'input_ids': inputs['input_ids'].squeeze(0), 'attention_mask': inputs['attention_mask'].squeeze(0),
            'user_id': torch.tensor(row['user_id_encoded'], dtype=torch.long), 'gmap_id': torch.tensor(row['gmap_id_encoded'], dtype=torch.long),
            'price_id': torch.tensor(row['price_encoded'], dtype=torch.long),
            'category_tags': torch.tensor(category_tags_padded, dtype=torch.long),
            'misc_tags': torch.tensor(misc_tags_padded, dtype=torch.long), 'numerical_features': numerical_features_tensor,
            'target': torch.tensor(row['target_class'], dtype=torch.long)
        }


# =============================================================================
# 6. TRAINING & EVALUATION LOOPS
# =============================================================================
def train_epoch(model, dataloader, optimizer, device, scheduler, class_weights_tensor):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        batch = {k: v.to(device) for k, v in batch.items()}
        targets = batch.pop('target')
        outputs = model(batch)
        if USE_FOCAL_LOSS:
            targets_one_hot = nn.functional.one_hot(targets, num_classes=NUM_CLASSES).float()
            unweighted_loss = ops.sigmoid_focal_loss(outputs, targets_one_hot, alpha=-1, gamma=2, reduction='none')
            weights_for_batch = class_weights_tensor[targets]
            weighted_loss = unweighted_loss * weights_for_batch.unsqueeze(1)
            loss = weighted_loss.mean()
        else:
            loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
            loss = loss_fn(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_epoch(model, dataloader, device, class_weights_tensor):
    model.eval()
    total_loss = 0
    all_preds, all_targets = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            targets = batch.pop('target')
            outputs = model(batch)
            if USE_FOCAL_LOSS:
                targets_one_hot = nn.functional.one_hot(targets, num_classes=NUM_CLASSES).float()
                unweighted_loss = ops.sigmoid_focal_loss(outputs, targets_one_hot, alpha=-1, gamma=2, reduction='none')
                weights_for_batch = class_weights_tensor[targets]
                weighted_loss = unweighted_loss * weights_for_batch.unsqueeze(1)
                loss = weighted_loss.mean()
            else:
                loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
                loss = loss_fn(outputs, targets)
            total_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
    avg_loss = total_loss / len(dataloader)
    report = classification_report(all_targets, all_preds, target_names=CLASS_NAMES, digits=4, zero_division=0)
    kappa = cohen_kappa_score(all_targets, all_preds)
    return avg_loss, report, kappa

# =============================================================================
# 7. MAIN EXECUTION
# =============================================================================
if __name__ == '__main__':
    print("Loading feature-engineered data...")
    try:
        df = pd.read_csv('/content/drive/MyDrive/processed_reviews_v3.csv')
    except FileNotFoundError:
        print("ERROR: 'processed_reviews_v3.csv' not found.")
        print("Please run the 'prepare_data' script first.")
        exit()

    # Convert string representations of lists to actual lists
    if 'category_tags_encoded' in df.columns:
        df['category_tags'] = df['category_tags_encoded'].apply(ast.literal_eval)
    if 'misc_tags_encoded' in df.columns:
        df['misc_tags'] = df['misc_tags_encoded'].apply(ast.literal_eval)
    print("Tag columns parsed successfully.")

    # --- NEW & IMPROVED: DATA INTEGRITY VERIFICATION BLOCK ---
    print("\n--- Verifying Data Integrity for All Embedding Layers ---")
    error_found = False
    # Check simple ID columns
    for col, config_val, name in [('user_id_encoded', NUM_USERS, 'NUM_USERS'),
                                  ('gmap_id_encoded', NUM_GMAP_IDS, 'NUM_GMAP_IDS'),
                                  ('price_encoded', NUM_PRICE_TIERS, 'NUM_PRICE_TIERS')]:
        max_val = df[col].max()
        if max_val >= config_val:
            print(f"!! CONFIG ERROR in '{name}': Max data index is {max_val}, but config size is {config_val}.")
            print(f"   -> Suggestion: Change '{name}' to at least {max_val + 1}.")
            error_found = True
        else:
            print(f"'{name}' OK. (Max Index: {max_val}, Config Size: {config_val})")

    # Check tag columns (which are lists of lists)
    max_cat_tag = df['category_tags'].explode().max()
    max_misc_tag = df['misc_tags'].explode().max()

    if max_cat_tag >= NUM_CATEGORIES_TAGS:
        print(f"!! CONFIG ERROR in 'NUM_CATEGORIES_TAGS': Max tag ID is {max_cat_tag}, but config size is {NUM_CATEGORIES_TAGS}.")
        print(f"   -> Suggestion: Change 'NUM_CATEGORIES_TAGS' and 'NUM_MISC_TAGS' to at least {max(max_cat_tag, max_misc_tag) + 1}.")
        error_found = True
    else:
        print(f"'NUM_CATEGORIES_TAGS' OK. (Max Index: {max_cat_tag}, Config Size: {NUM_CATEGORIES_TAGS})")

    if max_misc_tag >= NUM_MISC_TAGS:
        print(f"!! CONFIG ERROR in 'NUM_MISC_TAGS': Max tag ID is {max_misc_tag}, but config size is {NUM_MISC_TAGS}.")
        print(f"   -> Suggestion: Change 'NUM_CATEGORIES_TAGS' and 'NUM_MISC_TAGS' to at least {max(max_cat_tag, max_misc_tag) + 1}.")
        error_found = True
    else:
        print(f"'NUM_MISC_TAGS' OK. (Max Index: {max_misc_tag}, Config Size: {NUM_MISC_TAGS})")

    if error_found:
        print("\n!! Errors found. Please update your CONFIGURATION section and re-run. Exiting. !!")
        exit()
    print("-------------------------------------------------------\n")
    # --- END OF VERIFICATION BLOCK ---

    class_counts = df['target_class'].value_counts().sort_index().values
    total_samples = float(sum(class_counts))
    class_weights = [np.log(total_samples / count) for count in class_counts]
    CLASS_WEIGHTS = [w / sum(class_weights) * NUM_CLASSES for w in class_weights]
    print(f"Calculated Class Weights: {CLASS_WEIGHTS}")
    class_weights_tensor = torch.tensor(CLASS_WEIGHTS, dtype=torch.float).to(device)

    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['target_class'])
    print(f"Training on {len(train_df)} samples, validating on {len(val_df)} samples.")
    train_dataset = ReviewDataset(train_df, tokenizer)
    val_dataset = ReviewDataset(val_df, tokenizer)
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    model = MainModel(num_numerical_features=NUM_NUMERICAL_FEATURES).to(device)

    print("Unfreezing the top 2 BERT layers...")
    for param in model.text_tower.bert.parameters():
        param.requires_grad = False
    num_layers_to_unfreeze = 2
    for i in range(num_layers_to_unfreeze):
        for param in model.text_tower.bert.encoder.layer[-(i+1)].parameters():
            param.requires_grad = True

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    total_steps = len(train_dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    epochs_no_improve = 0
    best_val_loss = float('inf')

    print("Starting training with FINAL features and unfrozen layers...")
    for epoch in range(EPOCHS):
        avg_train_loss = train_epoch(model, train_dataloader, optimizer, device, scheduler, class_weights_tensor)
        avg_val_loss, report, kappa = evaluate_epoch(model, val_dataloader, device, class_weights_tensor)

        print(f"\n--- Epoch {epoch + 1}/{EPOCHS} ---")
        print(f"Average Training Loss: {avg_train_loss:.4f}")
        print(f"Average Validation Loss: {avg_val_loss:.4f}")
        print(f"Cohen's Kappa: {kappa:.4f}")
        print("Validation Classification Report:")
        print(report)

        if avg_val_loss < best_val_loss:
            print("Validation loss improved. Saving model to 'best_model_state.bin'...")
            torch.save(model.state_dict(), 'best_model_state.bin')
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            print(f"Validation loss did not improve. Counter: {epochs_no_improve}/{PATIENCE}")

        if epochs_no_improve == PATIENCE:
            print("Early stopping triggered.")
            break
        print("---------------------------------\n")

    print("Training finished.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
Loading feature-engineered data...
Tag columns parsed successfully.

--- Verifying Data Integrity for All Embedding Layers ---
'NUM_USERS' OK. (Max Index: 55171, Config Size: 55172)
'NUM_GMAP_IDS' OK. (Max Index: 37332, Config Size: 37333)
'NUM_PRICE_TIERS' OK. (Max Index: 8, Config Size: 9)
'NUM_CATEGORIES_TAGS' OK. (Max Index: nan, Config Size: 201)
'NUM_MISC_TAGS' OK. (Max Index: 200, Config Size: 201)
-------------------------------------------------------

Calculated Class Weights: [np.float64(0.04551597792290425), np.float64(0.6028185668604471), np.float64(1.4374671720206607), np.float64(1.9141982831959878)]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Training on 46031 samples, validating on 11508 samples.
Unfreezing the top 2 BERT layers...
Starting training with FINAL features and unfrozen layers...


Training:   0%|          | 0/2877 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/720 [00:00<?, ?it/s]


--- Epoch 1/15 ---
Average Training Loss: 0.0070
Average Validation Loss: 0.0049
Cohen's Kappa: 0.6537
Validation Classification Report:
               precision    recall  f1-score   support

     relevant     0.9941    0.8805    0.9339      9877
         spam     0.6257    0.9691    0.7605      1520
         rant     0.1531    0.6739    0.2495        92
advertisement     0.0000    0.0000    0.0000        19

     accuracy                         0.8891     11508
    macro avg     0.4432    0.6309    0.4860     11508
 weighted avg     0.9370    0.8891    0.9039     11508

Validation loss improved. Saving model to 'best_model_state.bin'...
---------------------------------



Training:   0%|          | 0/2877 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/720 [00:00<?, ?it/s]


--- Epoch 2/15 ---
Average Training Loss: 0.0047
Average Validation Loss: 0.0041
Cohen's Kappa: 0.7009
Validation Classification Report:
               precision    recall  f1-score   support

     relevant     0.9930    0.9051    0.9470      9877
         spam     0.7229    0.9474    0.8200      1520
         rant     0.1567    0.7935    0.2616        92
advertisement     0.0851    0.2105    0.1212        19

     accuracy                         0.9087     11508
    macro avg     0.4894    0.7141    0.5375     11508
 weighted avg     0.9491    0.9087    0.9234     11508

Validation loss improved. Saving model to 'best_model_state.bin'...
---------------------------------



Training:   0%|          | 0/2877 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/720 [00:00<?, ?it/s]


--- Epoch 3/15 ---
Average Training Loss: 0.0037
Average Validation Loss: 0.0049
Cohen's Kappa: 0.7950
Validation Classification Report:
               precision    recall  f1-score   support

     relevant     0.9890    0.9484    0.9683      9877
         spam     0.8146    0.9395    0.8726      1520
         rant     0.2642    0.7065    0.3846        92
advertisement     0.0789    0.1579    0.1053        19

     accuracy                         0.9440     11508
    macro avg     0.5367    0.6881    0.5827     11508
 weighted avg     0.9587    0.9440    0.9495     11508

Validation loss did not improve. Counter: 1/3
---------------------------------



Training:   0%|          | 0/2877 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/720 [00:00<?, ?it/s]


--- Epoch 4/15 ---
Average Training Loss: 0.0029
Average Validation Loss: 0.0040
Cohen's Kappa: 0.7416
Validation Classification Report:
               precision    recall  f1-score   support

     relevant     0.9927    0.9211    0.9556      9877
         spam     0.7596    0.9586    0.8476      1520
         rant     0.2457    0.7826    0.3740        92
advertisement     0.0455    0.3158    0.0795        19

     accuracy                         0.9240     11508
    macro avg     0.5109    0.7445    0.5642     11508
 weighted avg     0.9544    0.9240    0.9352     11508

Validation loss improved. Saving model to 'best_model_state.bin'...
---------------------------------



Training:   0%|          | 0/2877 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/720 [00:00<?, ?it/s]


--- Epoch 5/15 ---
Average Training Loss: 0.0024
Average Validation Loss: 0.0045
Cohen's Kappa: 0.7486
Validation Classification Report:
               precision    recall  f1-score   support

     relevant     0.9934    0.9248    0.9578      9877
         spam     0.7602    0.9553    0.8466      1520
         rant     0.2108    0.8043    0.3341        92
advertisement     0.0577    0.1579    0.0845        19

     accuracy                         0.9266     11508
    macro avg     0.5055    0.7106    0.5558     11508
 weighted avg     0.9548    0.9266    0.9367     11508

Validation loss did not improve. Counter: 1/3
---------------------------------



Training:   0%|          | 0/2877 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/720 [00:00<?, ?it/s]


--- Epoch 6/15 ---
Average Training Loss: 0.0020
Average Validation Loss: 0.0067
Cohen's Kappa: 0.8026
Validation Classification Report:
               precision    recall  f1-score   support

     relevant     0.9890    0.9554    0.9719      9877
         spam     0.8632    0.9092    0.8856      1520
         rant     0.2232    0.7935    0.3484        92
advertisement     0.1026    0.2105    0.1379        19

     accuracy                         0.9467     11508
    macro avg     0.5445    0.7171    0.5860     11508
 weighted avg     0.9648    0.9467    0.9541     11508

Validation loss did not improve. Counter: 2/3
---------------------------------



Training:   0%|          | 0/2877 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/720 [00:00<?, ?it/s]


--- Epoch 7/15 ---
Average Training Loss: 0.0016
Average Validation Loss: 0.0062
Cohen's Kappa: 0.7917
Validation Classification Report:
               precision    recall  f1-score   support

     relevant     0.9916    0.9442    0.9673      9877
         spam     0.7933    0.9546    0.8665      1520
         rant     0.2735    0.6630    0.3873        92
advertisement     0.0980    0.2632    0.1429        19

     accuracy                         0.9422     11508
    macro avg     0.5391    0.7063    0.5910     11508
 weighted avg     0.9582    0.9422    0.9480     11508

Validation loss did not improve. Counter: 3/3
Early stopping triggered.
Training finished.


In [2]:
# --- ADD THIS SNIPPET TO SAVE THE FINAL MODEL ---

print("Training finished.")

# Define the path for the final model
FINAL_MODEL_SAVE_PATH = '/content/drive/MyDrive/BERTBERT1.bin'

print(f"Saving final model to {FINAL_MODEL_SAVE_PATH}...")
# Save the model's learned weights (the state dictionary)
torch.save(model.state_dict(), FINAL_MODEL_SAVE_PATH)
print("Final model saved successfully.")

# --- END OF SNIPPET ---

# ... rest of your script for evaluating on the test set ...

Training finished.
Saving final model to /content/drive/MyDrive/final_model_epoch_{EPOCHS}.bin...
Final model saved successfully.
