In [None]:
## CELL 1: SETUP AND INSTALLATION (FINAL)

# We are forcing the installation of missing components to avoid errors
print("--- Installing Libraries (Finalized Set) ---")
!pip install transformers datasets accelerate pandas numpy scikit-learn pypdf
# We ensure the necessary tokenizer components and compatibility layers are installed
!pip install sentencepiece protobuf tiktoken tf-keras

import pandas as pd
import numpy as np
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    pipeline
)
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import pypdf
import os
import torch.nn.functional as F

print("\n--- Setup Complete. Libraries Loaded. ---")

In [None]:
## CELL 2: DATA LOADING AND PREPARATION (CLIMATE-FEVER FINAL FIX)

# Checkpoint name shared by the classifier's tokenizer and model further down.
MODEL_CLS = "distilroberta-base"

# --- 1. Load the CLIMATE-FEVER dataset ---
print("--- Loading CLIMATE-FEVER Dataset ---")
raw_datasets_cls = load_dataset("climate_fever")

# Choose the split to work on: a split literally named 'claim' wins; otherwise
# the first split the loader returned is used.
# NOTE(review): confirm which split names this dataset actually exposes.
available_splits = list(raw_datasets_cls.keys())
data_split_name = "claim" if "claim" in raw_datasets_cls else available_splits[0]

print(f"Using dataset split: {data_split_name}")

# --- 2. Filter and Map to Binary Labels ---
def map_to_binary_fever(example):
    """Add a binary ``labels`` field derived from ``claim_label``.

    ``claim_label == 1`` becomes ``labels = 1`` and ``claim_label == 0``
    becomes ``labels = 0``; every other value is marked ``-1`` so the caller
    can filter it out. The input mapping is mutated in place and returned.
    """
    raw_label = example['claim_label']
    example['labels'] = 1 if raw_label == 1 else (0 if raw_label == 0 else -1)
    return example

# Derive the binary labels, drop rows marked -1, and keep only the claim text
# (renamed to 'text') next to the new 'labels' column.
full_claims_dataset = (
    raw_datasets_cls[data_split_name]
    .map(map_to_binary_fever)
    .filter(lambda row: row['labels'] != -1)
    .remove_columns(['claim_label', 'claim_id', 'evidences'])
    .rename_column('claim', 'text')
)


# --- 3. Split, Tokenize, and Sample ---
# The 80/20 split is done on a pandas frame with scikit-learn, then each part
# is converted back into a `Dataset`.
df_full = full_claims_dataset.to_pandas()
df_train, df_test = train_test_split(df_full, random_state=42, test_size=0.2)

# Despite the "tokenized_" prefix these still hold raw text; tokenization
# happens later in this cell.
tokenized_train_cls, tokenized_test_cls = (
    Dataset.from_pandas(frame, preserve_index=False) for frame in (df_train, df_test)
)

# Cap the working subsets at 1500 training and 300 test rows (or fewer when
# the split is smaller than the cap).
N_TRAIN = min(1500, len(tokenized_train_cls))
N_TEST = min(300, len(tokenized_test_cls))

small_train_dataset_cls = tokenized_train_cls.shuffle(seed=42).select(range(N_TRAIN))
small_test_dataset_cls = tokenized_test_cls.shuffle(seed=42).select(range(N_TEST))

# Tokenize and set PyTorch format
tokenizer_cls = AutoTokenizer.from_pretrained(MODEL_CLS)


def tokenize_function_cls(examples):
    """Tokenize the ``text`` field of a batch with ``tokenizer_cls``.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded rows have the same length.
    """
    encoding_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer_cls(examples["text"], **encoding_options)

# Tokenize both subsets in batches and drop the raw 'text' column afterwards.
small_train_dataset_cls, small_test_dataset_cls = (
    subset.map(tokenize_function_cls, batched=True).remove_columns(['text'])
    for subset in (small_train_dataset_cls, small_test_dataset_cls)
)

# Expose only the model inputs and the labels, as torch tensors.
required_cols = ['input_ids', 'attention_mask', 'labels']
for prepared_subset in (small_train_dataset_cls, small_test_dataset_cls):
    prepared_subset.set_format(type='torch', columns=required_cols)

# Two-class classifier on top of MODEL_CLS; attention outputs are switched on
# in the model config.
model_cls = AutoModelForSequenceClassification.from_pretrained(MODEL_CLS, num_labels=2, output_attentions=True)

print(f"\nModel: {MODEL_CLS} loaded.")
print(f"Training Claims: {len(small_train_dataset_cls)} (High Quality)")
print("Classification Data Prepared. **RERUN CELL 3 NOW!**")

In [None]:
## NECESSARY STEP BEFORE CELL 3 (If you haven't run this recently)
import os
from google.colab import drive

# Folder used for model checkpoints; the same path is assigned again in
# Cell 3 and Cell 7, so keep the three in sync.
SAVE_PATH = '/content/drive/MyDrive/NLP_Climate_Awareness_System'

# Mount before creating the folder: SAVE_PATH lives under the mount point.
drive.mount('/content/drive')
os.makedirs(SAVE_PATH, exist_ok=True)

In [None]:
# Setup optimizer
# AdamW has to be imported in this cell: the notebook otherwise imports it
# only in Cell 3, which runs after this one, so a fresh "Restart & Run All"
# would stop here with a NameError.
from torch.optim import AdamW

# NOTE: Cell 3 creates its own optimizer (lr=1e-5) and rebinds `optimizer`,
# so the instance built here is only used if Cell 3's setup is skipped.
optimizer = AdamW(model_cls.parameters(), lr=2e-5) # 2e-5 is 0.00002

In [None]:
## CELL 3: CUSTOM PYTORCH TRAINING LOOP (FINAL OPTIMIZATION)

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import os
from transformers import AutoModelForSequenceClassification # Needed for loading best model

# --- Environment and Device Setup ---
# Checkpoint directory on Google Drive; Drive is expected to be mounted already.
SAVE_PATH = '/content/drive/MyDrive/NLP_Climate_Awareness_System'

# Fail fast with a readable hint when Cell 2 has not defined the datasets yet.
try:
    train_dataset = small_train_dataset_cls
    test_dataset = small_test_dataset_cls
except NameError:
    print("FATAL ERROR: Dataset variables not found. Please run Cell 2 first.")
    raise

# Allocator setting for PyTorch's CUDA memory management.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Use the GPU when one is visible, otherwise stay on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model_cls.to(device)
# --- DataLoaders (Remains the same) ---
def collate_fn(batch):
    """Combine a list of per-example dicts into one batch dict.

    ``input_ids`` and ``attention_mask`` tensors are stacked along a new
    leading dimension; ``labels`` are gathered into a single tensor. The
    returned dict has exactly those three keys, in that order.
    """
    collated = {
        field: torch.stack([sample[field] for sample in batch])
        for field in ('input_ids', 'attention_mask')
    }
    collated['labels'] = torch.tensor([sample['labels'] for sample in batch])
    return collated

# Both loaders share the batch size and collator; only the training loader
# shuffles its examples.
shared_loader_options = {'batch_size': 4, 'collate_fn': collate_fn}
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **shared_loader_options)

# Setup optimizer
# AdamW over every classifier parameter with a deliberately small learning
# rate (1e-5) for the multi-epoch run below.
optimizer = AdamW(model_cls.parameters(), lr=1e-5)

# Training function
def train_epoch(model, loader, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Each batch must provide ``input_ids``, ``attention_mask`` and ``labels``;
    they are moved to ``device`` and passed to ``model``, whose output is
    expected to expose a ``loss`` attribute. The mean is taken over
    ``len(loader)`` batches.
    """
    model.train()
    running_loss = 0
    for batch in tqdm(loader, desc="Training"):
        model_inputs = {
            field: batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        }
        optimizer.zero_grad()
        batch_loss = model(**model_inputs).loss
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(loader)

# Evaluation function
def evaluate(model, loader, device):
    """Score ``model`` on ``loader`` and return ``(accuracy, macro_f1)``.

    Runs in eval mode with gradients disabled. The predicted class of each
    example is the argmax over the last dimension of the model's logits;
    predictions and reference labels are collected on the CPU and scored with
    scikit-learn.
    """
    from sklearn.metrics import accuracy_score, f1_score
    model.eval()
    predicted = []
    expected = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)
            logits = model(input_ids=token_ids, attention_mask=mask).logits
            predicted.extend(logits.argmax(dim=-1).cpu().numpy())
            expected.extend(targets.cpu().numpy())
    return (
        accuracy_score(expected, predicted),
        f1_score(expected, predicted, average='macro'),
    )

# --- TRAINING LOOP WITH EARLY STOPPING AND CHECKPOINTS ---
print("--- Starting Training (Optimized with Early Stopping & LR) ---")
num_epochs = 20 # Allow more time for learning
patience = 3     # Stop if F1 score doesn't improve for 3 epochs
# Start below any attainable F1 so the first epoch always writes a checkpoint.
# With a 0.0 start and the strict `>` below, a run whose F1 never rose above
# zero would save nothing, and the reload after this loop would then pick up
# whatever an earlier run left in SAVE_PATH.
best_f1 = float('-inf')
epochs_no_improve = 0
should_stop = False

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    # Train
    train_loss = train_epoch(model_cls, train_loader, optimizer, device)
    print(f"Training Loss: {train_loss:.4f}")

    # Evaluate
    accuracy, f1 = evaluate(model_cls, test_loader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation F1-Score: {f1:.4f}")

    # Checkpoint and Early Stopping Logic (based on F1-Score)
    if f1 > best_f1:
        best_f1 = f1
        epochs_no_improve = 0
        print(f"--> NEW BEST F1-SCORE: {best_f1:.4f}. Saving model checkpoint to Drive...")
        # Save the best model state (model weights/config and tokenizer files)
        model_cls.save_pretrained(SAVE_PATH)
        tokenizer_cls.save_pretrained(SAVE_PATH)
    else:
        epochs_no_improve += 1
        print(f"F1 not improved. Patience left: {patience - epochs_no_improve}")
        if epochs_no_improve >= patience:
            # Report and leave the loop right here, so the message is printed
            # even when patience runs out on the final scheduled epoch.
            should_stop = True
            print(f"\nStopping early after {patience} epochs with no improvement.")
            break

# --- Final Evaluation (Load Best Model) ---
print("\n--- Finalizing Results ---")
# Replace the in-memory model with the checkpoint stored at SAVE_PATH and
# place it on the active device before scoring it once more.
model_cls = AutoModelForSequenceClassification.from_pretrained(SAVE_PATH).to(device)

final_accuracy, final_f1 = evaluate(model_cls, test_loader, device)

print(
    "\n--- FINAL OPTIMIZED RESULTS ---",
    f"Best Accuracy: {final_accuracy:.4f}",
    f"Best F1-Score (Saved Checkpoint): {final_f1:.4f}",
    sep="\n",
)

In [None]:
## CELL 3.5: SUMMARIZER INITIALIZATION (FIX)

# Checkpoint used by the summarization pipeline.
SUM_MODEL = "facebook/bart-large-cnn"

print(f"--- Initializing Summarizer Pipeline ({SUM_MODEL}) ---")
try:
    # Build the summarizer once at notebook level so later cells can reuse it.
    # device=0 selects the first GPU when CUDA is available, -1 keeps it on CPU.
    summarizer = pipeline(
        "summarization",
        model=SUM_MODEL,
        tokenizer=SUM_MODEL,
        device=0 if torch.cuda.is_available() else -1
    )
except Exception as e:
    # Report the failure, then re-raise: continuing would leave `summarizer`
    # undefined and make the next cell fail with an unrelated NameError.
    print(f"FATAL ERROR: Could not load Summarizer. Check internet connection. Details: {e}")
    raise
else:
    print("Summarizer pipeline loaded successfully.")

In [None]:
## CELL 4: POLICY SUMMARIZATION (BART INFERENCE)

# Placeholder Policy Document (Use a long text to demonstrate summarization)
# The degree sign is written as the escape \u00b0 so the text handed to the
# model contains a real degree sign instead of the mis-decoded two-character
# sequence that was here before.
policy_text_example = (
    "The Intergovernmental Panel on Climate Change (IPCC) released its Sixth Assessment Report (AR6) detailing the urgency of global warming. The report emphasizes that to limit temperature rise to 1.5\u00b0C above pre-industrial levels, global greenhouse gas emissions must peak before 2025 and be reduced by 43% by 2030. Furthermore, the report stresses the need for extensive investments in both solar and wind energy production, alongside nature-based solutions like reforestation. The economic transition, while challenging, is highlighted as having multiple co-benefits, including improved air quality and reduced reliance on volatile global fossil fuel markets."
)

# Generate Summary
print("--- Abstractive Policy Summarization (BART) ---")
# The pipeline returns a list with one dict per input; the summary text is
# stored under the 'summary_text' key.
generated_summary = summarizer(
    policy_text_example,
    max_length=200,
    min_length=60,  # Increased min length for detailed policy summary
    do_sample=False
)[0]['summary_text']

print("\n[Original Policy Snippet (IPCC Report)]")
print(policy_text_example)
print("\n[Generated Policy Summary (Abstractive)]")
print(f"**{generated_summary}**")

In [None]:
## CELL 5: VISUALIZATION (ATTENTION MECHANISMS)

# Reload the checkpoint stored at SAVE_PATH with attention outputs enabled.
# Any failure is reported and the model object currently bound to `model_cls`
# is used instead, so the visualization below can still run.
try:
    best_model_cls = AutoModelForSequenceClassification.from_pretrained(
        SAVE_PATH, output_attentions=True
    ).to(device)
    print("Best trained model loaded from Drive for visualization.")
except Exception as load_error:
    print(f"Error loading best model from Drive: {load_error}. Falling back to last model in memory.")
    best_model_cls = model_cls

def visualize_attention_key_terms(text_input, model, tokenizer):
    """Return up to five tokens that the first (classification) token attends to most.

    The text is tokenized and run through ``model`` with attention outputs
    enabled. From the last layer's attention, the weights of the first token's
    query row are averaged over all heads and used to rank the remaining
    tokens; the first and last token are excluded as special tokens.

    Args:
        text_input: The claim to analyse.
        model: A model whose output exposes ``attentions`` and which has a
            ``device`` attribute. Per the Transformers documentation each
            attention tensor is (batch, heads, query, key).
        tokenizer: The tokenizer matching ``model``.

    Returns:
        A list of at most five token strings, highest attention first, with
        the word-start marker removed.
    """
    # Byte-level BPE vocabularies (as used by RoBERTa) prefix a token that
    # starts a new word with U+0120. It is written as an escape so the marker
    # cannot be corrupted by a file re-encoding.
    word_start_marker = '\u0120'

    # Tokenize input and move to device
    inputs = tokenizer(text_input, return_tensors="pt", truncation=True).to(model.device)

    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)

    # Last layer, first (only) sequence of the batch -> (heads, query, key)
    last_layer_attention = outputs.attentions[-1][0]

    # Average over heads, then take row 0: how strongly the first token attends
    # to every other token. The slice drops the first and last special tokens.
    cls_attention = last_layer_attention.mean(dim=0)[0, 1:-1]

    # Tokens aligned with the attention slice above
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])[1:-1]

    # Rank tokens by attention weight, highest first
    ranked = sorted(
        zip(tokens, cls_attention.cpu().tolist()),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Keep alphabetic tokens and word-start tokens, strip the marker, and skip
    # anything that is empty once the marker is removed.
    key_terms = []
    for token, _weight in ranked:
        if not (token.isalpha() or token.startswith(word_start_marker)):
            continue
        cleaned = token.replace(word_start_marker, '')
        if cleaned:
            key_terms.append(cleaned)

    return key_terms[:5]

# Demo input for the misinformation detector: a deliberately contentious claim.
sample_claim = "The carbon tax is a hoax and an attack on small businesses, not climate change."

key_words = visualize_attention_key_terms(sample_claim, best_model_cls, tokenizer_cls)

# Report the claim together with its highest-ranked tokens.
print(
    "\n--- Key Term Visualization (Attention Proxy) ---",
    f"Input: {sample_claim}",
    f"Top 5 Key Terms (Words that drove the classification decision): {key_words}",
    sep="\n",
)

In [None]:
## CELL 6: FINAL VERIFICATION TEST

# The best model (best_model_cls) is assumed to be loaded in Cell 5.

# Define the explicit label mapping
# The status emoji are written as escapes (green circle / red circle) so they
# cannot be corrupted by a file re-encoding.
ID2LABEL = {0: "Factual/Pro-Climate \U0001F7E2", 1: "Anti-Climate/Misinformation \U0001F534"}

def run_single_test_with_threshold(claim, model, tokenizer, threshold=0.10):
    """Classify one claim, flagging it when the class-1 probability exceeds ``threshold``.

    Args:
        claim: Text to classify.
        model: Sequence classifier whose output exposes ``logits`` for two
            classes and which has a ``device`` attribute.
        tokenizer: Tokenizer matching ``model``.
        threshold: Class-1 (misinformation) probability above which the claim
            is flagged. Defaults to 0.10, the value previously hard-coded.

    Returns:
        ``(label, score)``: the ``ID2LABEL`` entry of the chosen class and the
        softmax probability of that class.
    """
    inputs = tokenizer(claim, return_tensors="pt", truncation=True, padding=True).to(model.device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax over the class dimension turns the logits into probabilities
    probabilities = F.softmax(outputs.logits, dim=1)
    misinfo_prob = probabilities[0, 1].item()

    # A low threshold trades precision for recall on the misinformation class
    if misinfo_prob > threshold:
        return ID2LABEL[1], misinfo_prob
    return ID2LABEL[0], probabilities[0, 0].item()

print("\n--- FINAL VERIFICATION TEST (Using Best Model) ---")

# Three hand-written probes; the comment on each states the expected verdict.
test_claims = [
    # 1. Economic Alarm (Should be flagged as Misinformation)
    "The Green New Deal will instantly destroy 90% of US manufacturing jobs and trigger a massive recession.",
    # 2. Denial Claim (Should be flagged as Misinformation)
    "Climate change isn't real because the Arctic ice extent has actually been recovering since the cold winter of 2012.",
    # 3. Clear Factual Claim (Should be Factual)
    "Global sea levels rose by an average of 3.7 millimeters per year between 2006 and 2018.",
]

# Classify each probe and print a 60-character preview with the verdict.
for test_number, claim in enumerate(test_claims, start=1):
    label, score = run_single_test_with_threshold(claim, best_model_cls, tokenizer_cls)
    print(f"\nTest {test_number}: {claim[:60]}...")
    print(f"  Prediction: {label}")
    print(f"  Confidence: {score:.4f}")

In [None]:
## CELL 7: SAVE TRAINED MODEL TO GOOGLE DRIVE

from google.colab import drive
import os
import torch

# 1. Mount Google Drive
print("--- Mounting Google Drive ---")
drive.mount('/content/drive')

# 2. Define the save path
# This will create a folder named 'NLP_Climate_Awareness_System' in your Drive.
SAVE_PATH = '/content/drive/MyDrive/NLP_Climate_Awareness_System'
os.makedirs(SAVE_PATH, exist_ok=True)

# 3. Save the model and tokenizer weights
# NOTE: this writes whatever `model_cls` currently refers to into the same
# folder the training loop uses for its best checkpoint.
print(f"\nSaving FINAL TRAINED MODEL to: {SAVE_PATH}...")

# Ensure the model is in CPU mode before saving large files (optional, but safer)
model_cls.cpu()

# Save the model and the tokenizer
model_cls.save_pretrained(SAVE_PATH)
tokenizer_cls.save_pretrained(SAVE_PATH)

# The party-popper emoji is written as an escape so it cannot be corrupted by
# a file re-encoding (it was previously stored as mis-decoded bytes).
print("\n\U0001F389 MODEL AND TOKENIZER SAVED SUCCESSFULLY!")
print("You can now safely close your Colab session.")

# Move the model back to GPU (optional, if you plan to run Cell 4/5 again)
if torch.cuda.is_available():
    model_cls.to(torch.device('cuda'))