In [None]:
!pip install vaderSentiment textstat

import pandas as pd
import numpy as np
import json
import ast
import re
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import textstat

from google.colab import drive
drive.mount('/content/drive')

tqdm.pandas()

# =============================================================================
# HELPER FUNCTIONS (Unchanged)
# =============================================================================
analyzer = SentimentIntensityAnalyzer()
CTA_WORDS = ['visit', 'check out', 'try our', 'don\'t miss', 'shop now']
OFFER_WORDS = ['deal', 'offer', 'discount', 'promotion', 'free', 'sale']

def get_sentiment(text):
    return analyzer.polarity_scores(str(text))['compound']

def get_caps_ratio(text):
    text = str(text)
    if not text: return 0
    upper_chars = sum(1 for c in text if c.isupper())
    alpha_chars = sum(1 for c in text if c.isalpha())
    return upper_chars / alpha_chars if alpha_chars > 0 else 0

def has_keyword(text, keywords):
    return 1 if any(word in str(text).lower() for word in keywords) else 0

def get_readability(text):
    return textstat.flesch_kincaid_grade(str(text))

def get_exclamation_count(text):
    return str(text).count('!')

def get_word_count(text):
    return len(str(text).split())

def get_unique_word_ratio(text):
    text = str(text)
    if not text: return 0
    words = text.lower().split()
    if not words: return 0
    return len(set(words)) / len(words)

def get_category_keyword_match(row):
    text = str(row['text']).lower()
    categories = row.get('category_tags_list', [])
    if not categories: return 0
    match_count = sum(1 for cat in categories if cat.lower() in text)
    return match_count

def parse_and_flatten_misc_data(data):
    if not isinstance(data, str): return []
    try:
        misc_dict = json.loads(data)
        return [f"{key.strip()}:{value.strip()}" for key, values in misc_dict.items() if isinstance(values, list) for value in values]
    except (json.JSONDecodeError, TypeError): return []

def parse_categories(data):
    if not isinstance(data, str): return []
    try: return ast.literal_eval(data)
    except (ValueError, SyntaxError): return []

# =============================================================================
# MAIN DATA PREPARATION SCRIPT (v4 with Temporal Features)
# =============================================================================
def prepare_data(input_csv_path, output_csv_path):
    print("Step 1: Loading raw data...")
    df = pd.read_csv(input_csv_path)

    print("Step 2: Encoding target variable...")
    target_mapping = {'relevant': 0, 'spam': 1, 'rant': 2, 'advertisement': 3}
    df['target_class'] = df['predicted_classification'].map(target_mapping)
    df.dropna(subset=['target_class'], inplace=True)
    df['target_class'] = df['target_class'].astype(int)

    print("Step 3: Parsing complex text fields...")
    df['misc_tags_flat'] = df['misc_data'].progress_apply(parse_and_flatten_misc_data)
    df['category_tags_list'] = df['categories'].progress_apply(parse_categories)

    print("Step 4: Engineering text-based and user-based features...")
    # Text Meta-Features
    df['sentiment'] = df['text'].progress_apply(get_sentiment)
    df['caps_ratio'] = df['text'].progress_apply(get_caps_ratio)
    df['readability_grade'] = df['text'].progress_apply(get_readability)
    df['has_cta'] = df['text'].progress_apply(lambda text: has_keyword(text, CTA_WORDS))
    df['has_offer'] = df['text'].progress_apply(lambda text: has_keyword(text, OFFER_WORDS))
    df['exclamation_count'] = df['text'].progress_apply(get_exclamation_count)
    df['word_count'] = df['text'].progress_apply(get_word_count)
    df['unique_word_ratio'] = df['text'].progress_apply(get_unique_word_ratio)
    df['category_keyword_match'] = df.progress_apply(get_category_keyword_match, axis=1)

    # User Behavioral Features
    user_stats = df.groupby('user_id')['rating'].agg(['mean', 'std', 'count']).rename(columns={
        'mean': 'user_avg_rating', 'std': 'user_std_rating', 'count': 'user_review_count'})
    df = df.merge(user_stats, on='user_id', how='left')
    df.fillna({'user_std_rating': 0}, inplace=True)
    df['is_5_star_only_user'] = ((df['user_std_rating'] == 0) & (df['user_avg_rating'] == 5.0)).astype(int)

    # --- NEW: STEP 5 - Engineering Temporal Features ---
    print("Step 5: Engineering temporal features...")
    # Convert timestamp columns to datetime objects, coercing errors to NaT (Not a Time)
    df['review_timestamp_dt'] = pd.to_datetime(df['review_timestamp'], errors='coerce')
    df['response_time_dt'] = pd.to_datetime(df['response_time'], errors='coerce')

    # Feature 1 & 2: Hour of Day and Day of Week
    df['hour_of_day'] = df['review_timestamp_dt'].dt.hour
    df['day_of_week'] = df['review_timestamp_dt'].dt.dayofweek

    # Feature 3: Time since user's last review
    df.sort_values(by=['user_id', 'review_timestamp_dt'], inplace=True)
    df['time_since_last_review_seconds'] = df.groupby('user_id')['review_timestamp_dt'].diff().dt.total_seconds()
    # Fill missing values for first reviews with a neutral value (the median)
    median_time_gap = df['time_since_last_review_seconds'].median()
    df['time_since_last_review_seconds'].fillna(median_time_gap, inplace=True)

    # Feature 4: Review-to-Response Delay
    df['response_delay_hours'] = (df['response_time_dt'] - df['review_timestamp_dt']).dt.total_seconds() / 3600
    # Fill missing responses with a special value to indicate "no response"
    df['response_delay_hours'].fillna(-1, inplace=True)

    print("Step 6: Encoding all categorical IDs and tags...")
    for col in ['user_id', 'gmap_id', 'price']:
        encoder = LabelEncoder()
        # Creates columns like 'user_id_encoded', 'gmap_id_encoded', 'price_encoded'
        df[f'{col}_encoded'] = encoder.fit_transform(df[col].astype(str))

    all_tags = set(tag for tag_list in df['misc_tags_flat'] for tag in tag_list).union(
                 set(tag for tag_list in df['category_tags_list'] for tag in tag_list))
    tag_to_id = {tag: i + 1 for i, tag in enumerate(sorted(list(all_tags)))}
    df['misc_tags_encoded'] = df['misc_tags_flat'].progress_apply(lambda tags: [tag_to_id.get(tag, 0) for tag in tags])
    df['category_tags_encoded'] = df['category_tags_list'].progress_apply(lambda tags: [tag_to_id.get(tag, 0) for tag in tags])

    print("Step 7: Normalizing all numerical features...")
    numerical_cols = [
        'avg_rating', 'num_of_reviews', 'pics_count', 'sentiment', 'caps_ratio',
        'readability_grade', 'has_cta', 'has_offer', 'user_avg_rating', 'user_std_rating',
        'user_review_count', 'is_5_star_only_user', 'exclamation_count', 'word_count',
        'unique_word_ratio', 'category_keyword_match', 'hour_of_day', 'day_of_week',
        'time_since_last_review_seconds', 'response_delay_hours'
    ]
    for col in numerical_cols:
        if col not in df.columns: df[col] = 0
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    print("Step 8: Selecting final columns and saving...")
    # --- FIX 1: Corrected 'price_id_encoded' to 'price_encoded' ---
    final_columns = [
        'text', 'response_text', 'user_id_encoded', 'gmap_id_encoded', 'price_encoded',
        'category_tags_encoded', 'misc_tags_encoded', 'target_class'
    ] + numerical_cols

    processed_df = df[final_columns]
    processed_df.to_csv(output_csv_path, index=False)

    print(f"\nData preparation complete! Processed data saved to: {output_csv_path}")
    print("\n--- COPY THESE VALUES INTO YOUR PYTORCH SCRIPT CONFIGURATION ---")
    print(f"NUM_USERS = {df['user_id_encoded'].nunique()}")
    print(f"NUM_GMAP_IDS = {df['gmap_id_encoded'].nunique()}")
    # --- FIX 2: Corrected 'price_id_encoded ' to 'price_encoded' ---
    print(f"NUM_PRICE_TIERS = {df['price_encoded'].nunique()}")
    print(f"NUM_CATEGORIES_TAGS = {len(tag_to_id) + 1}")
    print(f"NUM_MISC_TAGS = {len(tag_to_id) + 1}")
    print("------------------------------------------------------------------\n")

if __name__ == '__main__':
    INPUT_CSV = '/content/drive/MyDrive/my_predictions_on_wah.csv'
    OUTPUT_CSV = '/content/drive/MyDrive/processed_reviews_v4.csv'
    prepare_data(INPUT_CSV, OUTPUT_CSV)

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Collecting textstat
  Downloading textstat-0.7.10-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading textstat-0.7.10-py3-none-any.whl (239 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.2/239.2 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m96.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, vaderSentiment, textstat
Successfully installed pyphen-0.17.2 textstat-0.7.10 vaderSentiment-3.3.2
Mounted at /content/drive
Step 1

100%|██████████| 57539/57539 [00:00<00:00, 111417.69it/s]
100%|██████████| 57539/57539 [00:00<00:00, 70198.11it/s]


Step 4: Engineering text-based and user-based features...


100%|██████████| 57539/57539 [00:08<00:00, 6697.74it/s]
100%|██████████| 57539/57539 [00:00<00:00, 78877.66it/s]
100%|██████████| 57539/57539 [00:04<00:00, 12415.33it/s]
100%|██████████| 57539/57539 [00:00<00:00, 236946.16it/s]
100%|██████████| 57539/57539 [00:00<00:00, 207169.29it/s]
100%|██████████| 57539/57539 [00:00<00:00, 915632.70it/s]
100%|██████████| 57539/57539 [00:00<00:00, 402782.60it/s]
100%|██████████| 57539/57539 [00:00<00:00, 203095.42it/s]
100%|██████████| 57539/57539 [00:00<00:00, 113007.90it/s]


Step 5: Engineering temporal features...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['time_since_last_review_seconds'].fillna(median_time_gap, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['response_delay_hours'].fillna(-1, inplace=True)


Step 6: Encoding all categorical IDs and tags...


100%|██████████| 57539/57539 [00:00<00:00, 346309.35it/s]
100%|██████████| 57539/57539 [00:00<00:00, 235972.64it/s]


Step 7: Normalizing all numerical features...
Step 8: Selecting final columns and saving...

Data preparation complete! Processed data saved to: /content/drive/MyDrive/processed_reviews_v4.csv

--- COPY THESE VALUES INTO YOUR PYTORCH SCRIPT CONFIGURATION ---
NUM_USERS = 55172
NUM_GMAP_IDS = 37333
NUM_PRICE_TIERS = 9
NUM_CATEGORIES_TAGS = 201
NUM_MISC_TAGS = 201
------------------------------------------------------------------



In [2]:
# =============================================================================
# 1. IMPORTS & SETUP
# =============================================================================
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, SequentialSampler
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, cohen_kappa_score
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm
import ast

from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torchvision import ops

from google.colab import drive
drive.mount('/content/drive')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# =============================================================================
# 2. CONFIGURATION
# =============================================================================
# ### --- Model Hyperparameters --- ###
USER_EMB_DIM = 32
GMAP_EMB_DIM = 64
PRICE_EMB_DIM = 4
TAG_EMB_DIM = 16
BERT_MODEL_NAME = 'bert-base-uncased'
HIDDEN_DIM = 256
PCA_DIM = 128
NUM_NUMERICAL_FEATURES = 20

# ### --- Training Parameters --- ###
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 15
PATIENCE = 3
USE_FOCAL_LOSS = True
CLASS_NAMES = ['relevant', 'spam', 'rant', 'advertisement']
NUM_CLASSES = len(CLASS_NAMES)


# =============================================================================
# 3. TOWER DEFINITIONS (The "Sub-Models")
# =============================================================================
class TextTower(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state[:, 0, :]

class UserTower(nn.Module):
    def __init__(self, num_users, embedding_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
    def forward(self, user_ids):
        return self.user_embedding(user_ids)

class BusinessTower(nn.Module):
    def __init__(self, num_gmap_ids, num_prices, num_cat_tags, num_misc_tags, num_numerical_features):
        super().__init__()
        self.gmap_embedding = nn.Embedding(num_gmap_ids, GMAP_EMB_DIM)
        self.price_embedding = nn.Embedding(num_prices, PRICE_EMB_DIM)
        self.category_embedding_bag = nn.EmbeddingBag(num_cat_tags, TAG_EMB_DIM, mode='mean', padding_idx=0)
        self.misc_embedding_bag = nn.EmbeddingBag(num_misc_tags, TAG_EMB_DIM, mode='mean', padding_idx=0)
        combined_dim = GMAP_EMB_DIM + PRICE_EMB_DIM + TAG_EMB_DIM + TAG_EMB_DIM + num_numerical_features
        self.mlp = nn.Sequential(nn.Linear(combined_dim, 128), nn.ReLU(), nn.Linear(128, 64))
    def forward(self, gmap_ids, price_ids, cat_tags, misc_tags, numerical_features):
        gmap_vec = self.gmap_embedding(gmap_ids)
        price_vec = self.price_embedding(price_ids)
        cat_vec = self.category_embedding_bag(cat_tags)
        misc_vec = self.misc_embedding_bag(misc_tags)
        combined_vec = torch.cat([gmap_vec, price_vec, cat_vec, misc_vec, numerical_features], dim=1)
        return self.mlp(combined_vec)


# =============================================================================
# 4. THE MAIN MODEL (The Manager)
# =============================================================================
class MainModel(nn.Module):
    def __init__(self, num_numerical_features, num_users, num_gmap_ids, num_price_tiers, num_tags):
        super().__init__()
        self.text_tower = TextTower(BERT_MODEL_NAME)
        # --- The PCA layer will be added dynamically after it's trained ---
        self.pca_transform = nn.Identity() # Placeholder
        self.user_tower = UserTower(num_users, USER_EMB_DIM)
        self.business_tower = BusinessTower(num_gmap_ids, num_price_tiers, num_tags, num_tags, num_numerical_features)

        user_output_dim, business_output_dim = USER_EMB_DIM, 64
        self.attention = nn.MultiheadAttention(embed_dim=PCA_DIM, kdim=user_output_dim + business_output_dim, vdim=user_output_dim + business_output_dim, num_heads=8, batch_first=True)

        classifier_input_dim = PCA_DIM + PCA_DIM
        self.classifier = nn.Sequential(nn.Linear(classifier_input_dim, HIDDEN_DIM), nn.ReLU(), nn.Dropout(0.5), nn.Linear(HIDDEN_DIM, NUM_CLASSES))

    def forward(self, batch):
        text_vec_raw = self.text_tower(batch['input_ids'], batch['attention_mask'])
        text_vec = self.pca_transform(text_vec_raw)

        user_vec = self.user_tower(batch['user_id'])
        business_vec = self.business_tower(batch['gmap_id'], batch['price_id'], batch['category_tags'], batch['misc_tags'], batch['numerical_features'])

        query = text_vec.unsqueeze(1)
        key_value = torch.cat([user_vec, business_vec], dim=1).unsqueeze(1)
        attn_output, _ = self.attention(query, key_value, key_value)
        attn_output = attn_output.squeeze(1)

        final_vec = torch.cat([text_vec, attn_output], dim=1)
        return self.classifier(final_vec)


# =============================================================================
# 5. DATASET & DATALOADER
# =============================================================================
MAX_LEN = 128
MAX_TAGS = 10
class ReviewDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.df = dataframe
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        inputs = self.tokenizer.encode_plus(str(row['text']), add_special_tokens=True, max_length=MAX_LEN, padding='max_length', truncation=True, return_tensors='pt')

        category_tags = row.get('category_tags_list', [])[:MAX_TAGS]
        category_tags_padded = category_tags + [0] * (MAX_TAGS - len(category_tags))
        misc_tags = row.get('misc_tags_list', [])[:MAX_TAGS]
        misc_tags_padded = misc_tags + [0] * (MAX_TAGS - len(misc_tags))

        numerical_features_tensor = torch.tensor([
            row['avg_rating'], row['num_of_reviews'], row['pics_count'],
            row['sentiment'], row['caps_ratio'], row['readability_grade'],
            row['has_cta'], row['has_offer'], row['user_avg_rating'],
            row['user_std_rating'], row['user_review_count'],
            row['is_5_star_only_user'], row['exclamation_count'],
            row['word_count'], row['unique_word_ratio'],
            row['category_keyword_match'], row['hour_of_day'], row['day_of_week'],
            row['time_since_last_review_seconds'], row['response_delay_hours']
        ], dtype=torch.float)

        return {
            'input_ids': inputs['input_ids'].squeeze(0), 'attention_mask': inputs['attention_mask'].squeeze(0),
            'user_id': torch.tensor(row['user_id_encoded'], dtype=torch.long), 'gmap_id': torch.tensor(row['gmap_id_encoded'], dtype=torch.long),
            'price_id': torch.tensor(row['price_encoded'], dtype=torch.long), 'category_tags': torch.tensor(category_tags_padded, dtype=torch.long),
            'misc_tags': torch.tensor(misc_tags_padded, dtype=torch.long), 'numerical_features': numerical_features_tensor,
            'target': torch.tensor(row['target_class'], dtype=torch.long)
        }


# =============================================================================
# 6. TRAINING & EVALUATION LOOPS
# =============================================================================
def train_epoch(model, dataloader, optimizer, device, scheduler, class_weights_tensor):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        batch = {k: v.to(device) for k, v in batch.items()}
        targets = batch.pop('target')
        outputs = model(batch)
        if USE_FOCAL_LOSS:
            targets_one_hot = nn.functional.one_hot(targets, num_classes=NUM_CLASSES).float()
            unweighted_loss = ops.sigmoid_focal_loss(outputs, targets_one_hot, alpha=-1, gamma=2, reduction='none')
            weights_for_batch = class_weights_tensor[targets]
            weighted_loss = unweighted_loss * weights_for_batch.unsqueeze(1)
            loss = weighted_loss.mean()
        else:
            loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
            loss = loss_fn(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_epoch(model, dataloader, device, class_weights_tensor):
    model.eval()
    total_loss = 0
    all_preds, all_targets = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            targets = batch.pop('target')
            outputs = model(batch)
            if USE_FOCAL_LOSS:
                targets_one_hot = nn.functional.one_hot(targets, num_classes=NUM_CLASSES).float()
                unweighted_loss = ops.sigmoid_focal_loss(outputs, targets_one_hot, alpha=-1, gamma=2, reduction='none')
                weights_for_batch = class_weights_tensor[targets]
                weighted_loss = unweighted_loss * weights_for_batch.unsqueeze(1)
                loss = weighted_loss.mean()
            else:
                loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
                loss = loss_fn(outputs, targets)
            total_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
    avg_loss = total_loss / len(dataloader)
    report = classification_report(all_targets, all_preds, target_names=CLASS_NAMES, digits=4, zero_division=0)
    kappa = cohen_kappa_score(all_targets, all_preds)
    return avg_loss, report, kappa

# =============================================================================
# 7. MAIN EXECUTION
# =============================================================================
if __name__ == '__main__':
    print("Loading FINAL feature-engineered data (v4)...")
    try:
        df = pd.read_csv('/content/drive/MyDrive/processed_reviews_v4.csv')
    except FileNotFoundError:
        print("ERROR: 'processed_reviews_v4.csv' not found.")
        print("Please run the 'prepare_data_v4.py' script first.")
        exit()

    print("\n--- Dynamically Configuring Vocabulary Sizes ---")
    df['category_tags_list'] = df['category_tags_encoded'].apply(ast.literal_eval)
    df['misc_tags_list'] = df['misc_tags_encoded'].apply(ast.literal_eval)
    NUM_USERS = df['user_id_encoded'].max() + 1
    NUM_GMAP_IDS = df['gmap_id_encoded'].max() + 1
    NUM_PRICE_TIERS = df['price_encoded'].max() + 1
    max_cat_tag = df['category_tags_list'].explode().max()
    max_misc_tag = df['misc_tags_list'].explode().max()
    max_cat_tag = 0 if pd.isna(max_cat_tag) else max_cat_tag
    max_misc_tag = 0 if pd.isna(max_misc_tag) else max_misc_tag
    NUM_TAGS = int(max(max_cat_tag, max_misc_tag)) + 1
    print(f"Determined NUM_USERS: {NUM_USERS}, NUM_GMAP_IDS: {NUM_GMAP_IDS}, NUM_PRICE_TIERS: {NUM_PRICE_TIERS}, NUM_TAGS: {NUM_TAGS}")
    print("----------------------------------------------\n")

    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['target_class'])

    # --- FIX: RESTRUCTURED TO LOAD BERT ONLY ONCE ---

    # --- Step 1: Initialize the main model (but BERT weights are not yet on GPU)
    model = MainModel(
        num_numerical_features=NUM_NUMERICAL_FEATURES,
        num_users=NUM_USERS,
        num_gmap_ids=NUM_GMAP_IDS,
        num_price_tiers=NUM_PRICE_TIERS,
        num_tags=NUM_TAGS
    )

    # --- Step 2: Pre-compute embeddings for PCA using the model's text tower
    print("--- Step 1/3: Pre-computing BERT embeddings for PCA ---")
    pca_prep_dataset = ReviewDataset(train_df, tokenizer)
    pca_prep_loader = DataLoader(pca_prep_dataset, batch_size=BATCH_SIZE*2, sampler=SequentialSampler(pca_prep_dataset))

    # Move only the text tower to the GPU for this step
    text_tower_for_pca = model.text_tower.to(device)
    text_tower_for_pca.eval()

    all_embeddings = []
    with torch.no_grad():
        for batch in tqdm(pca_prep_loader, desc="Generating Embeddings"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            embeddings = text_tower_for_pca(input_ids, attention_mask)
            all_embeddings.append(embeddings.cpu().numpy())

    all_embeddings = np.concatenate(all_embeddings, axis=0)
    # Move the text tower back to the CPU to free GPU memory
    text_tower_for_pca.to('cpu')

    # --- Step 3: Train PCA
    print("\n--- Step 2/3: Training PCA on embeddings ---")
    pca = PCA(n_components=PCA_DIM)
    pca.fit(all_embeddings)
    print(f"PCA Trained. Explained variance ratio: {pca.explained_variance_ratio_.sum():.4f}")

    # --- Step 4: Create the PCA layer and add it to the model
    bert_output_dim = 768
    pca_layer = nn.Linear(bert_output_dim, PCA_DIM, bias=True)
    pca_layer.weight.data = torch.tensor(pca.components_, dtype=torch.float)
    pca_layer.bias.data = torch.tensor(-np.dot(pca.mean_, pca.components_.T), dtype=torch.float)
    for param in pca_layer.parameters():
        param.requires_grad = False

    model.pca_transform = pca_layer # Replace placeholder with the real, trained layer
    print("PCA transformation layer created and injected into the model.")
    print("------------------------------------------\n")

    # --- Step 5: Now move the entire, complete model to the GPU for training
    model.to(device)

    # --- Step 6: Create final dataloaders and proceed with training
    print(f"--- Step 3/3: Creating final dataloaders ---")
    print(f"Training on {len(train_df)} samples, validating on {len(val_df)} samples.")
    train_dataset = ReviewDataset(train_df, tokenizer)
    val_dataset = ReviewDataset(val_df, tokenizer)
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    class_counts = df['target_class'].value_counts().sort_index().values
    total_samples = float(sum(class_counts))
    class_weights = [np.log(total_samples / count) for count in class_counts]
    CLASS_WEIGHTS = [w / sum(class_weights) * NUM_CLASSES for w in class_weights]
    print(f"Calculated Class Weights: {CLASS_WEIGHTS}")
    class_weights_tensor = torch.tensor(CLASS_WEIGHTS, dtype=torch.float).to(device)

    print("Unfreezing the top 2 BERT layers...")
    for param in model.text_tower.bert.parameters():
        param.requires_grad = False
    num_layers_to_unfreeze = 2
    for i in range(num_layers_to_unfreeze):
        for param in model.text_tower.bert.encoder.layer[-(i+1)].parameters():
            param.requires_grad = True

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    total_steps = len(train_dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    epochs_no_improve = 0
    best_val_loss = float('inf')

    print("\nStarting training with PCA and unfrozen layers...")
    for epoch in range(EPOCHS):
        avg_train_loss = train_epoch(model, train_dataloader, optimizer, device, scheduler, class_weights_tensor)
        avg_val_loss, report, kappa = evaluate_epoch(model, val_dataloader, device, class_weights_tensor)

        print(f"\n--- Epoch {epoch + 1}/{EPOCHS} ---")
        print(f"Average Training Loss: {avg_train_loss:.4f}")
        print(f"Average Validation Loss: {avg_val_loss:.4f}")
        print(f"Cohen's Kappa: {kappa:.4f}")
        print("Validation Classification Report:")
        print(report)

        if avg_val_loss < best_val_loss:
            print("Validation loss improved. Saving model to 'best_model_state.bin'...")
            torch.save(model.state_dict(), 'best_model_state.bin')
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            print(f"Validation loss did not improve. Counter: {epochs_no_improve}/{PATIENCE}")

        if epochs_no_improve == PATIENCE:
            print("Early stopping triggered.")
            break
        print("---------------------------------\n")

    print("Training finished.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
Loading FINAL feature-engineered data (v4)...

--- Dynamically Configuring Vocabulary Sizes ---
Determined NUM_USERS: 55172, NUM_GMAP_IDS: 37333, NUM_PRICE_TIERS: 9, NUM_TAGS: 201
----------------------------------------------



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

--- Step 1/3: Pre-computing BERT embeddings for PCA ---


Generating Embeddings:   0%|          | 0/1439 [00:00<?, ?it/s]


--- Step 2/3: Training PCA on embeddings ---
PCA Trained. Explained variance ratio: 0.8543
PCA transformation layer created and injected into the model.
------------------------------------------

--- Step 3/3: Creating final dataloaders ---
Training on 46031 samples, validating on 11508 samples.
Calculated Class Weights: [np.float64(0.04551597792290425), np.float64(0.6028185668604471), np.float64(1.4374671720206607), np.float64(1.9141982831959878)]
Unfreezing the top 2 BERT layers...

Starting training with PCA and unfrozen layers...


Training:   0%|          | 0/2877 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/720 [00:00<?, ?it/s]


--- Epoch 1/15 ---
Average Training Loss: 0.0071
Average Validation Loss: 0.0051
Cohen's Kappa: 0.7187
Validation Classification Report:
               precision    recall  f1-score   support

     relevant     0.9889    0.9193    0.9528      9877
         spam     0.7457    0.9316    0.8283      1520
         rant     0.1475    0.6848    0.2428        92
advertisement     0.0000    0.0000    0.0000        19

     accuracy                         0.9175     11508
    macro avg     0.4705    0.6339    0.5060     11508
 weighted avg     0.9484    0.9175    0.9291     11508

Validation loss improved. Saving model to 'best_model_state.bin'...
---------------------------------



Training:   0%|          | 0/2877 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/720 [00:00<?, ?it/s]


--- Epoch 2/15 ---
Average Training Loss: 0.0047
Average Validation Loss: 0.0044
Cohen's Kappa: 0.6767
Validation Classification Report:
               precision    recall  f1-score   support

     relevant     0.9924    0.8958    0.9416      9877
         spam     0.7442    0.9322    0.8277      1520
         rant     0.1667    0.7065    0.2697        92
advertisement     0.0470    0.7368    0.0883        19

     accuracy                         0.8989     11508
    macro avg     0.4876    0.8179    0.5318     11508
 weighted avg     0.9514    0.8989    0.9198     11508

Validation loss improved. Saving model to 'best_model_state.bin'...
---------------------------------



Training:   0%|          | 0/2877 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/720 [00:00<?, ?it/s]


--- Epoch 3/15 ---
Average Training Loss: 0.0038
Average Validation Loss: 0.0042
Cohen's Kappa: 0.7393
Validation Classification Report:
               precision    recall  f1-score   support

     relevant     0.9920    0.9252    0.9574      9877
         spam     0.7616    0.9355    0.8397      1520
         rant     0.1895    0.7065    0.2989        92
advertisement     0.1047    0.4737    0.1714        19

     accuracy                         0.9241     11508
    macro avg     0.5119    0.7602    0.5668     11508
 weighted avg     0.9537    0.9241    0.9353     11508

Validation loss improved. Saving model to 'best_model_state.bin'...
---------------------------------



Training:   0%|          | 0/2877 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/720 [00:00<?, ?it/s]


--- Epoch 4/15 ---
Average Training Loss: 0.0032
Average Validation Loss: 0.0046
Cohen's Kappa: 0.6776
Validation Classification Report:
               precision    recall  f1-score   support

     relevant     0.9935    0.8922    0.9401      9877
         spam     0.7438    0.9533    0.8356      1520
         rant     0.2429    0.6522    0.3540        92
advertisement     0.0339    0.7895    0.0649        19

     accuracy                         0.8982     11508
    macro avg     0.5035    0.8218    0.5487     11508
 weighted avg     0.9529    0.8982    0.9202     11508

Validation loss did not improve. Counter: 1/3
---------------------------------



Training:   0%|          | 0/2877 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/720 [00:00<?, ?it/s]


--- Epoch 5/15 ---
Average Training Loss: 0.0026
Average Validation Loss: 0.0049
Cohen's Kappa: 0.7419
Validation Classification Report:
               precision    recall  f1-score   support

     relevant     0.9931    0.9219    0.9562      9877
         spam     0.7317    0.9671    0.8331      1520
         rant     0.2338    0.5870    0.3344        92
advertisement     0.0808    0.4211    0.1356        19

     accuracy                         0.9244     11508
    macro avg     0.5099    0.7243    0.5648     11508
 weighted avg     0.9510    0.9244    0.9336     11508

Validation loss did not improve. Counter: 2/3
---------------------------------



Training:   0%|          | 0/2877 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/720 [00:00<?, ?it/s]


--- Epoch 6/15 ---
Average Training Loss: 0.0022
Average Validation Loss: 0.0055
Cohen's Kappa: 0.7809
Validation Classification Report:
               precision    recall  f1-score   support

     relevant     0.9917    0.9410    0.9657      9877
         spam     0.7919    0.9461    0.8621      1520
         rant     0.2641    0.6630    0.3777        92
advertisement     0.1011    0.4737    0.1667        19

     accuracy                         0.9387     11508
    macro avg     0.5372    0.7559    0.5930     11508
 weighted avg     0.9580    0.9387    0.9460     11508

Validation loss did not improve. Counter: 3/3
Early stopping triggered.
Training finished.


In [3]:
# --- ADD THIS SNIPPET TO SAVE THE FINAL MODEL ---

print("Training finished.")

# Define the path for the final model
FINAL_MODEL_SAVE_PATH = '/content/drive/MyDrive/pca.bin'

print(f"Saving final model to {FINAL_MODEL_SAVE_PATH}...")
# Save the model's learned weights (the state dictionary)
torch.save(model.state_dict(), FINAL_MODEL_SAVE_PATH)
print("Final model saved successfully.")

# --- END OF SNIPPET ---

  # ... rest of your script for evaluating on the test set ..

Training finished.
Saving final model to /content/drive/MyDrive/pca.bin...
Final model saved successfully.
