<a href="https://colab.research.google.com/github/thegoodgamer14/tech-spec-extractor/blob/main/mlm_adaptation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
print("Setting up...")
!pip install datasets transformers torch accelerate torchviz graphviz scikit-learn -q

Setting up...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━

In [2]:
import torch
import glob
import re
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)
from google.colab import drive
import logging
import random
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from torchviz import make_dot
import matplotlib.pyplot as plt
import os

In [3]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
print("Setup complete.")

Setup complete.


In [4]:
print("Configuring paths and hyperparameters...")
# --- Paths ---
DRIVE_MOUNT_POINT = '/content/drive'
DRIVE_OUTPUT_BASE = f'{DRIVE_MOUNT_POINT}/MyDrive/tech-spec-extractor/data/processed'

# Input text file directories
MECH_TEXT_FOLDER = f'{DRIVE_OUTPUT_BASE}/mechanical-text'
ELEC_TEXT_FOLDER = f'{DRIVE_OUTPUT_BASE}/electrical-text'

# Output directories
MLM_OUTPUT_DIR_DRIVE = f'{DRIVE_OUTPUT_BASE}/results_mlm_adaptation'
FINAL_ADAPTED_MODEL_DIR_DRIVE = f'{DRIVE_OUTPUT_BASE}/adapted_distilbert_me'

# --- Model & Training Hyperparameters ---
BASE_MODEL_FOR_MLM = "distilbert-base-uncased"  # Original model to adapt
MLM_LEARNING_RATE = 5e-5
MLM_BATCH_SIZE = 8       # Adjust based on GPU memory
MLM_NUM_EPOCHS = 5       # Increased to 5 as requested
MLM_WEIGHT_DECAY = 0.01
MLM_PROBABILITY = 0.15   # Standard masking probability for MLM
MAX_SEQ_LENGTH = 512     # Max sequence length for tokenizer
MLM_LOGGING_STEPS = 100
MLM_SAVE_STEPS = 500     # Save checkpoint frequency

# Create output directories
os.makedirs(MLM_OUTPUT_DIR_DRIVE, exist_ok=True)
os.makedirs(FINAL_ADAPTED_MODEL_DIR_DRIVE, exist_ok=True)

print(f"Mechanical Text Folder: {MECH_TEXT_FOLDER}")
print(f"Electrical Text Folder: {ELEC_TEXT_FOLDER}")
print(f"MLM Output Dir: {MLM_OUTPUT_DIR_DRIVE}")
print(f"Final Adapted Model Dir: {FINAL_ADAPTED_MODEL_DIR_DRIVE}")
print(f"Base Model: {BASE_MODEL_FOR_MLM}")
print("Configuration complete.")

Configuring paths and hyperparameters...
Mechanical Text Folder: /content/drive/MyDrive/tech-spec-extractor/data/processed/mechanical-text
Electrical Text Folder: /content/drive/MyDrive/tech-spec-extractor/data/processed/electrical-text
MLM Output Dir: /content/drive/MyDrive/tech-spec-extractor/data/processed/results_mlm_adaptation
Final Adapted Model Dir: /content/drive/MyDrive/tech-spec-extractor/data/processed/adapted_distilbert_me
Base Model: distilbert-base-uncased
Configuration complete.


In [5]:
# === Block 3: Mount Google Drive ===
print("Mounting Google Drive...")
try:
    # Simple mount without force_remount to avoid the common error
    drive.mount(DRIVE_MOUNT_POINT, force_remount=True)
    logging.info("Google Drive mounted successfully.")

    # Verify input folders exist
    path_issues = False
    if not os.path.isdir(MECH_TEXT_FOLDER):
        logging.warning(f"Mechanical text folder not found: {MECH_TEXT_FOLDER}")
        path_issues = True
    if not os.path.isdir(ELEC_TEXT_FOLDER):
        logging.warning(f"Electrical text folder not found: {ELEC_TEXT_FOLDER}")
        path_issues = True

    # Create output directories if they don't exist
    os.makedirs(MLM_OUTPUT_DIR_DRIVE, exist_ok=True)
    os.makedirs(FINAL_ADAPTED_MODEL_DIR_DRIVE, exist_ok=True)

    if path_issues:
        logging.warning("Please check your folder paths and try again.")
    else:
        logging.info("All input folders found.")

except Exception as e:
    logging.error(f"Error mounting Google Drive or verifying folders: {e}")
    # If there's an error with "already contains files", explain solution
    if "Mountpoint must not already contain files" in str(e):
        print("\nTRY THIS: In Colab menu, go to 'Runtime' > 'Factory reset runtime' to clean up, then run again.")
    raise e
print("Drive mounted.")

Mounting Google Drive...
Mounted at /content/drive
Drive mounted.


In [6]:
print("Loading and processing text files...")

def basic_text_cleaning(text):
    """Perform basic cleaning on extracted text"""
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove very short lines that might be artifacts
    lines = [line for line in text.split("\n") if len(line.strip()) > 10]
    return "\n".join(lines).strip()

Loading and processing text files...


In [7]:
def load_text_files(folder, doc_type):
    """Load and process text files from a folder"""
    text_files = glob.glob(os.path.join(folder, "*.txt"))

    docs = []

    for file_path in tqdm(text_files, desc=f"Loading {doc_type} documents"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()

            # Basic cleaning
            text = basic_text_cleaning(text)

            if len(text.split()) > 20:  # Only include if has substantial content
                docs.append({
                    "text": text,
                    "doc_type": doc_type,
                    "file_name": os.path.basename(file_path)
                })
            else:
                logging.warning(f"Skipping {os.path.basename(file_path)} due to insufficient content.")

        except Exception as e:
            logging.warning(f"Error processing {file_path}: {e}")

    return docs

In [8]:
try:
    # Load mechanical and electrical documents
    mechanical_docs = load_text_files(MECH_TEXT_FOLDER, "mechanical")
    electrical_docs = load_text_files(ELEC_TEXT_FOLDER, "electrical")

    # Combine all documents
    all_docs = mechanical_docs + electrical_docs
    random.shuffle(all_docs)  # Shuffle to mix mechanical and electrical docs

    logging.info(f"Loaded {len(mechanical_docs)} mechanical documents and {len(electrical_docs)} electrical documents.")
    logging.info(f"Total documents for training: {len(all_docs)}")

    if len(all_docs) == 0:
        raise ValueError("No valid documents found. Please check your text files.")

except Exception as e:
    logging.error(f"Error loading text files: {e}")
    raise e

Loading mechanical documents:   0%|          | 0/30 [00:00<?, ?it/s]

Loading electrical documents:   0%|          | 0/27 [00:00<?, ?it/s]

In [9]:
print("Creating dataset for MLM training...")

def chunk_long_documents(docs, max_length=5000, overlap=500):
    """Split long documents into chunks with some overlap"""
    chunked_docs = []

    for doc in docs:
        text = doc["text"]
        doc_type = doc["doc_type"]
        file_name = doc["file_name"]

        # If text is short enough, keep as is
        if len(text) <= max_length:
            chunked_docs.append(doc)
        else:
            # Split into chunks with overlap
            words = text.split()
            chunks = []

            for i in range(0, len(words), max_length - overlap):
                chunk = " ".join(words[i:i + max_length])
                if len(chunk.split()) > 50:  # Only include substantial chunks
                    chunks.append(chunk)

            # Create new document entries for each chunk
            for i, chunk in enumerate(chunks):
                chunked_docs.append({
                    "text": chunk,
                    "doc_type": doc_type,
                    "file_name": f"{file_name}_chunk{i+1}"
                })

    return chunked_docs


Creating dataset for MLM training...


In [10]:
print("Creating dataset for MLM training...")

def chunk_long_documents(docs, max_length=5000, overlap=500):
    """Split long documents into chunks with some overlap"""
    chunked_docs = []

    for doc in docs:
        text = doc["text"]
        doc_type = doc["doc_type"]
        file_name = doc["file_name"]

        # If text is short enough, keep as is
        if len(text) <= max_length:
            chunked_docs.append(doc)
        else:
            # Split into chunks with overlap
            words = text.split()
            chunks = []

            for i in range(0, len(words), max_length - overlap):
                chunk = " ".join(words[i:i + max_length])
                if len(chunk.split()) > 50:  # Only include substantial chunks
                    chunks.append(chunk)

            # Create new document entries for each chunk
            for i, chunk in enumerate(chunks):
                chunked_docs.append({
                    "text": chunk,
                    "doc_type": doc_type,
                    "file_name": f"{file_name}_chunk{i+1}"
                })

    return chunked_docs

Creating dataset for MLM training...


In [11]:
try:
    # Chunk long documents if needed
    chunked_docs = chunk_long_documents(all_docs)
    logging.info(f"After chunking: {len(chunked_docs)} document segments for training")

    # Extract just the texts for the dataset
    train_texts = [doc["text"] for doc in chunked_docs]

    # Create a Hugging Face dataset
    me_dataset = Dataset.from_dict({"text": train_texts})
    logging.info(f"Created dataset with {len(me_dataset)} samples")

except Exception as e:
    logging.error(f"Error creating dataset: {e}")
    raise e

In [12]:
logging.info(f"Loading base model ({BASE_MODEL_FOR_MLM}) and tokenizer for MLM...")
try:
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_FOR_MLM)
    model = AutoModelForMaskedLM.from_pretrained(BASE_MODEL_FOR_MLM)

    # Check if GPU is available and move model
    if torch.cuda.is_available():
        model.to('cuda')
        logging.info("Model moved to GPU.")
    else:
        logging.info("GPU not available, using CPU.")
except Exception as e:
    logging.error(f"Error loading model or tokenizer: {e}")
    raise e
logging.info("Base model and tokenizer loaded.")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [26]:
try:
    # Create a simple function to evaluate MLM perplexity
    def evaluate_mlm_perplexity(model, dataset, tokenizer):
      """Evaluate perplexity (lower is better) of MLM model on a dataset"""
      try:
          # Force CUDA if available
          device = 'cuda' if torch.cuda.is_available() else 'cpu'
          print(f"Using device: {device}")

          model.eval()
          model = model.to(device)  # Ensure model is on the correct device
          total_loss = 0
          total_tokens = 0

          # Create a small dataloader for evaluation
          eval_subset = dataset.select(range(min(20, len(dataset))))

          with torch.no_grad():
              # Process each example individually
              for i in range(len(eval_subset)):
                  example = eval_subset[i]

                  # Tokenize on CPU first
                  encoding = tokenizer(example['text'], return_tensors='pt',
                                      padding='max_length', truncation=True,
                                      max_length=MAX_SEQ_LENGTH)

                  # Move everything to device CONSISTENTLY
                  input_ids = encoding["input_ids"].to(device)

                  # Create attention mask if not present
                  if "attention_mask" not in encoding:
                      attention_mask = torch.ones_like(input_ids, device=device)
                  else:
                      attention_mask = encoding["attention_mask"].to(device)

                  # Labels are a copy of input_ids (AFTER moving to device)
                  labels = input_ids.clone()

                  # Create random array on the same device
                  rand = torch.rand(input_ids.shape, device=device)

                  # Special token IDs for masking
                  cls_token_id = tokenizer.cls_token_id if tokenizer.cls_token_id is not None else 101
                  sep_token_id = tokenizer.sep_token_id if tokenizer.sep_token_id is not None else 102
                  pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

                  # Create mask using tensors all on same device
                  mask_arr = ((rand < 0.15) &
                            (input_ids != cls_token_id) &
                            (input_ids != sep_token_id) &
                            (input_ids != pad_token_id))

                  # Get indices of masked tokens
                  masked_indices = mask_arr.nonzero(as_tuple=True)

                  if len(masked_indices[0]) > 0:  # Only process if there are tokens to mask
                      # Create a copy of input_ids for masked version
                      masked_inputs = input_ids.clone()

                      # Apply mask token
                      masked_inputs[masked_indices] = tokenizer.mask_token_id

                      # Forward pass (all tensors on same device)
                      outputs = model(input_ids=masked_inputs,
                                    attention_mask=attention_mask,
                                    labels=labels)

                      loss = outputs.loss
                      # Count only masked tokens for loss
                      masked_count = mask_arr.sum().item()
                      if masked_count > 0:  # Avoid division by zero
                          total_loss += loss.item() * masked_count
                          total_tokens += masked_count

          # Calculate perplexity (exp of average loss)
          if total_tokens > 0:
              perplexity = torch.exp(torch.tensor(total_loss / total_tokens))
              return perplexity.item()
          else:
              print("Warning: No tokens were masked during evaluation")
              return 100.0  # Default value

      except Exception as e:
          print(f"Detailed error in evaluate_mlm_perplexity: {str(e)}")
          import traceback
          traceback.print_exc()
          return 1000.0  # Return a high perplexity value as fallback

    # Get baseline perplexity
    baseline_perplexity = evaluate_mlm_perplexity(model, me_dataset, tokenizer)
    print(f"\n=== Baseline Model Metrics ===")
    print(f"MLM Perplexity: {baseline_perplexity:.2f} (lower is better)")
    print(f"This is the model's performance on your technical domain before adaptation.\n")

    # Optionally, try to visualize model architecture
    try:
        print("\n=== Model Architecture Visualization ===")
        # Create a simple diagram of the model structure
        dummy_input = tokenizer("This is a sample text", return_tensors="pt")

        # Check if we can use CUDA for the dummy forward pass
        if torch.cuda.is_available():
            dummy_input = {k: v.to('cuda') for k, v in dummy_input.items()}

        # Create a simple visualization of top-level components
        print("Model top-level architecture:")
        for name, child in model.named_children():
            print(f"- {name}: {child.__class__.__name__}")

        # Optionally, generate a basic model summary table
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

        print(f"\nModel: {model.__class__.__name__}")
        print(f"Total parameters: {total_params:,}")
        print(f"Trainable parameters: {trainable_params:,}")
        print(f"Non-trainable parameters: {total_params - trainable_params:,}")

        # Note: A full graphical visualization with torchviz is too complex for transformer models
        print("\nFull graphical visualization skipped - transformer models generate extremely large diagrams.")

    except Exception as e:
        print(f"Could not visualize model architecture: {e}")

except Exception as e:
    logging.error(f"Error evaluating baseline model: {e}")

Using device: cuda

=== Baseline Model Metrics ===
MLM Perplexity: 1.89 (lower is better)
This is the model's performance on your technical domain before adaptation.


=== Model Architecture Visualization ===
Model top-level architecture:
- activation: GELUActivation
- distilbert: DistilBertModel
- vocab_transform: Linear
- vocab_layer_norm: LayerNorm
- vocab_projector: Linear
- mlm_loss_fct: CrossEntropyLoss

Model: DistilBertForMaskedLM
Total parameters: 66,985,530
Trainable parameters: 66,985,530
Non-trainable parameters: 0

Full graphical visualization skipped - transformer models generate extremely large diagrams.


In [27]:
logging.info("Splitting data and tokenizing the dataset for MLM training...")
try:
    # Split dataset into train and test (70:30 split)
    train_texts, test_texts = train_test_split(
        train_texts,
        test_size=0.3,
        random_state=42
    )

    # Create train and test datasets
    train_dataset = Dataset.from_dict({"text": train_texts})
    test_dataset = Dataset.from_dict({"text": test_texts})

    logging.info(f"Dataset split into {len(train_dataset)} training samples and {len(test_dataset)} test samples")

    # Define tokenization function for MLM
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=MAX_SEQ_LENGTH,
            padding=False,  # We'll dynamically pad in the data collator
            return_special_tokens_mask=True  # Needed for MLM
        )

    # Tokenize the train dataset
    tokenized_train_dataset = train_dataset.map(
        tokenize_function,
        batched=True,
        num_proc=1,  # Increase if on a powerful machine
        remove_columns=["text"]  # Remove original text column
    )

    # Tokenize the test dataset
    tokenized_test_dataset = test_dataset.map(
        tokenize_function,
        batched=True,
        num_proc=1,
        remove_columns=["text"]
    )

    logging.info("Tokenization complete.")

    # Filter out very short sequences if any existed
    tokenized_train_dataset = tokenized_train_dataset.filter(
        lambda example: len(example['input_ids']) > 20
    )
    tokenized_test_dataset = tokenized_test_dataset.filter(
        lambda example: len(example['input_ids']) > 20
    )

    logging.info(f"Final dataset sizes after filtering:")
    logging.info(f"  - Training: {len(tokenized_train_dataset)} samples")
    logging.info(f"  - Testing:  {len(tokenized_test_dataset)} samples")

except Exception as e:
    logging.error(f"Error tokenizing dataset: {e}")
    raise e

Map:   0%|          | 0/155 [00:00<?, ? examples/s]

Map:   0%|          | 0/67 [00:00<?, ? examples/s]

Filter:   0%|          | 0/155 [00:00<?, ? examples/s]

Filter:   0%|          | 0/67 [00:00<?, ? examples/s]

In [28]:
logging.info("Setting up Data Collator for Language Modeling...")
# This handles dynamic padding and random token masking
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=MLM_PROBABILITY
)
logging.info("Data collator set up.")

In [29]:
logging.info("Configuring Training Arguments for MLM...")
training_args = TrainingArguments(
    output_dir=MLM_OUTPUT_DIR_DRIVE,
    overwrite_output_dir=True,
    num_train_epochs=MLM_NUM_EPOCHS,
    per_device_train_batch_size=MLM_BATCH_SIZE,
    per_device_eval_batch_size=MLM_BATCH_SIZE,
    gradient_accumulation_steps=4,  # Accumulate gradients to handle small batch sizes
    save_strategy="epoch",
    save_steps=MLM_SAVE_STEPS,
    save_total_limit=2,             # Keep only the last 2 checkpoints
    learning_rate=MLM_LEARNING_RATE,
    weight_decay=MLM_WEIGHT_DECAY,
    logging_dir=f'{MLM_OUTPUT_DIR_DRIVE}/logs',
    logging_steps=MLM_LOGGING_STEPS,
    eval_strategy="epoch",           # Evaluate after each epoch
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    report_to="none",                # Disable wandb/tensorboard reporting
    dataloader_drop_last=False,      # Use all data
    dataloader_num_workers=2,        # Increase for faster data loading
    load_best_model_at_end=True,     # Load the best model at the end of training
    metric_for_best_model="eval_loss" # Use evaluation loss to determine best model
)
logging.info("MLM Training Arguments configured.")

In [30]:
logging.info("Instantiating Trainer for MLM...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
)
logging.info("Trainer instantiated.")

# Start training
logging.info(f"Starting continued MLM pre-training for {MLM_NUM_EPOCHS} epochs...")
try:
    train_result = trainer.train()
    logging.info("MLM Training finished.")

    # Log training metrics
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    # Evaluate the model on the test set
    eval_results = trainer.evaluate()
    logging.info(f"Final evaluation results: {eval_results}")
    trainer.save_metrics("eval", eval_results)

    # Calculate perplexity on test set
    perplexity = np.exp(eval_results["eval_loss"])
    logging.info(f"Test set perplexity: {perplexity:.2f}")

    # Compare with baseline
    improvement = ((baseline_perplexity - perplexity) / baseline_perplexity) * 100
    print("\n=== Training Results ===")
    print(f"Baseline perplexity: {baseline_perplexity:.2f}")
    print(f"After adaptation perplexity: {perplexity:.2f}")
    print(f"Improvement: {improvement:.2f}% ({'better' if improvement > 0 else 'worse'})")

except Exception as e:
    logging.error(f"Error during MLM training: {e}")
    logging.error("If you're seeing CUDA out of memory errors, try reducing MLM_BATCH_SIZE.")
    raise e

Epoch,Training Loss,Validation Loss
1,No log,2.675809
2,No log,2.614072
3,No log,2.638262
4,No log,2.573059
5,No log,2.560773


There were missing keys in the checkpoint model loaded: ['vocab_projector.weight'].


***** train metrics *****
  epoch                    =        5.0
  total_flos               =    95679GF
  train_loss               =     2.6817
  train_runtime            = 0:00:50.09
  train_samples_per_second =      15.47
  train_steps_per_second   =      0.499



=== Training Results ===
Baseline perplexity: 1.89
After adaptation perplexity: 12.57
Improvement: -566.60% (worse)


In [31]:
logging.info(f"Saving the final MLM-adapted model to: {FINAL_ADAPTED_MODEL_DIR_DRIVE}")
try:
    # Save model and tokenizer
    trainer.save_model(FINAL_ADAPTED_MODEL_DIR_DRIVE)
    tokenizer.save_pretrained(FINAL_ADAPTED_MODEL_DIR_DRIVE)
    logging.info("MLM-adapted model and tokenizer saved successfully.")

except Exception as e:
    logging.error(f"Error saving final model: {e}")
    raise e

In [32]:
print("Performing detailed evaluation on test dataset...")
try:
    saved_files = os.listdir(FINAL_ADAPTED_MODEL_DIR_DRIVE)
    print(f"Files saved in model directory: {saved_files}")

    required_files = ["pytorch_model.bin", "config.json", "tokenizer.json", "tokenizer_config.json"]
    missing_files = [f for f in required_files if f not in saved_files]

    if missing_files:
        print(f"Warning: Some important files are missing: {missing_files}")
    else:
        print("All essential model files are present!")

    # Load the saved model for testing
    print("\n=== Testing Final Adapted Model on Test Dataset ===")
    adapted_model = AutoModelForMaskedLM.from_pretrained(FINAL_ADAPTED_MODEL_DIR_DRIVE)
    adapted_tokenizer = AutoTokenizer.from_pretrained(FINAL_ADAPTED_MODEL_DIR_DRIVE)

    # Move model to appropriate device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    adapted_model.to(device)

    # Create test data collator
    test_data_collator = DataCollatorForLanguageModeling(
        tokenizer=adapted_tokenizer,
        mlm=True,
        mlm_probability=MLM_PROBABILITY
    )

    # Create test trainer
    test_trainer = Trainer(
        model=adapted_model,
        args=TrainingArguments(
            output_dir="/tmp/test_output",
            per_device_eval_batch_size=MLM_BATCH_SIZE,
            report_to="none",
        ),
        data_collator=test_data_collator,
    )

    # Evaluate on test dataset
    test_results = test_trainer.evaluate(eval_dataset=tokenized_test_dataset)

    # Calculate perplexity
    test_perplexity = np.exp(test_results["eval_loss"])

    # Compare with baseline
    improvement = ((baseline_perplexity - test_perplexity) / baseline_perplexity) * 100

    print("\n=== Evaluation Results ===")
    print(f"Test Loss: {test_results['eval_loss']:.4f}")
    print(f"Test Perplexity: {test_perplexity:.2f}")
    print(f"Baseline Perplexity: {baseline_perplexity:.2f}")
    print(f"Improvement: {improvement:.2f}% ({'better' if improvement > 0 else 'worse'})")

    # Generate sample predictions to verify model behavior
    print("\n=== Sample Predictions ===")
    # Select a few examples from test dataset
    sample_count = min(3, len(test_dataset))
    for i in range(sample_count):
        text = test_dataset[i]["text"]

        # Tokenize the sample
        tokens = adapted_tokenizer.tokenize(text[:100])  # First 100 chars

        # Select a random position to mask (avoiding special tokens)
        valid_positions = [j for j, token in enumerate(tokens)
                           if not token.startswith('[') and not token.endswith(']')]

        if valid_positions:
            mask_pos = random.choice(valid_positions)
            original_token = tokens[mask_pos]
            tokens[mask_pos] = adapted_tokenizer.mask_token

            # Convert back to text
            masked_text = adapted_tokenizer.convert_tokens_to_string(tokens)

            # Make prediction
            inputs = adapted_tokenizer(masked_text, return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = adapted_model(**inputs)

            # Get predictions for masked token
            mask_token_index = torch.where(inputs["input_ids"] == adapted_tokenizer.mask_token_id)[1]
            logits = outputs.logits
            mask_token_logits = logits[0, mask_token_index, :]

            # Get top 3 predictions
            top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist()
            top_3_tokens = [adapted_tokenizer.convert_ids_to_tokens(token_id) for token_id in top_3_tokens]

            print(f"Sample {i+1}:")
            print(f"Text with [MASK]: {masked_text}")
            print(f"Original token: '{original_token}'")
            print(f"Top 3 predictions: {top_3_tokens}")
            print()

except Exception as e:
    print(f"Error during evaluation: {e}")

print("\n--- MLM Adaptation Complete ---")
print(f"The adapted DistilBERT model is saved at: {FINAL_ADAPTED_MODEL_DIR_DRIVE}")

Performing detailed evaluation on test dataset...
Files saved in model directory: ['config.json', 'model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'vocab.txt', 'tokenizer.json', 'training_args.bin']

=== Testing Final Adapted Model on Test Dataset ===



=== Evaluation Results ===
Test Loss: 2.5849
Test Perplexity: 13.26
Baseline Perplexity: 1.89
Improvement: -603.14% (worse)

=== Sample Predictions ===
Sample 1:
Text with [MASK]: of reflective sheeting, shall be prepared as follows : i. blank panel shall be given a preliminary [MASK]
Original token: 'c'
Top 3 predictions: ['.', 'examination', 'drawing']

Sample 2:
Text with [MASK]: less than 11 [MASK] 0 % at seven days, per astm c642. table 4 - 9 : submittal for approval requirements for ad
Original token: '.'
Top 3 predictions: ['.', '-', '–']

Sample 3:
Text with [MASK]: [MASK] consideration in sizing final circuits and other distribution circuits. moreover, certain appli
Original token: 'into'
Top 3 predictions: ['special', 'for', 'additional']


--- MLM Adaptation Complete ---
The adapted DistilBERT model is saved at: /content/drive/MyDrive/tech-spec-extractor/data/processed/adapted_distilbert_me
