# Baseline NER Model Training

This notebook trains and evaluates a baseline NER model: a feedforward neural network with word embeddings, imported from the `baseline_model` module under `../src`.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import json
import tensorflow as tf
from datetime import datetime

# Add src directory to path
sys.path.append('../src')

from baseline_model import BaselineNERModel, create_baseline_model, train_baseline_model
from data_preprocessing import NERDataProcessor
from evaluation import NERModelEvaluator, evaluate_predictions
from utils import load_results, save_results

# Seed the NumPy and TensorFlow generators with one shared value
seed_value = 42
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Keras-level seeding helper, followed by a short report of the runtime
tf.keras.utils.set_random_seed(seed_value)
tf_version = tf.__version__
print(f"TensorFlow version: {tf_version}")
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPU available: {gpu_devices}")

## 1. Load Preprocessed Data

In [None]:
# Load preprocessed data
print("Loading preprocessed data...")

# Load the numpy arrays. np.load on an .npz archive returns a file-backed
# object; the context manager closes it as soon as the six arrays are read,
# instead of leaving the handle open for the lifetime of the kernel.
with np.load('../results/processed_data.npz') as data:
    X_train = data['X_train']
    X_val = data['X_val']
    X_test = data['X_test']
    y_train = data['y_train']
    y_val = data['y_val']
    y_test = data['y_test']

# Fail fast if any split has a different number of inputs and label rows;
# every later cell indexes X_* and y_* in lockstep.
for split_name, X_split, y_split in (
    ('train', X_train, y_train),
    ('val', X_val, y_val),
    ('test', X_test, y_test),
):
    assert len(X_split) == len(y_split), (
        f"{split_name} split is misaligned: {len(X_split)} inputs vs {len(y_split)} label rows"
    )

# Load metadata (vocabulary size, tag count, max sequence length, id->tag map)
metadata = load_results('../results/preprocessing_metadata.json')

print("Data loaded successfully!")
print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")
print(f"Vocabulary size: {metadata['vocab_size']:,}")
print(f"Number of tags: {metadata['num_tags']}")
print(f"Max sequence length: {metadata['max_sequence_length']}")

## 2. Create Baseline Model

In [None]:
# Tunable hyperparameters of the baseline network
model_params = dict(embedding_dim=100, hidden_dim=128, dropout_rate=0.3)

print("Creating baseline model with parameters:")
for name in model_params:
    print(f"  {name}: {model_params[name]}")

# Data-dependent sizes come from the preprocessing metadata; the tunable
# hyperparameters above are merged in and everything is passed by keyword.
model_kwargs = {
    'vocab_size': metadata['vocab_size'],
    'num_tags': metadata['num_tags'],
    'max_sequence_length': metadata['max_sequence_length'],
    **model_params,
}
baseline_model = create_baseline_model(**model_kwargs)

print("\nBaseline model created successfully!")

In [None]:
# Build the underlying network, then print its layer-by-layer summary
baseline_model.build_model()
print("Model Architecture:")
architecture_summary = baseline_model.get_model_summary()
print(architecture_summary)

## 3. Train the Baseline Model

In [None]:
# Training parameters
training_params = {
    'epochs': 30,
    'batch_size': 32,
    'patience': 10
}

print("Training parameters:")
for param, value in training_params.items():
    print(f"  {param}: {value}")

# Make sure the checkpoint directory exists before training starts, so a
# fresh checkout does not fail when the model file is written.
model_save_path = '../models/baseline_model.h5'
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

print("\nStarting training...")
start_time = datetime.now()

# Train the model; the returned dict is read below for 'epochs_trained',
# 'best_val_accuracy' and 'best_val_loss'.
training_results = baseline_model.train(
    X_train, y_train,
    X_val, y_val,
    model_save_path=model_save_path,
    **training_params
)

end_time = datetime.now()
# Wall-clock duration of the train() call, in seconds
training_time = (end_time - start_time).total_seconds()

print(f"\nTraining completed in {training_time:.1f} seconds ({training_time/60:.1f} minutes)")
print(f"Epochs trained: {training_results['epochs_trained']}")
print(f"Best validation accuracy: {training_results['best_val_accuracy']:.4f}")
print(f"Best validation loss: {training_results['best_val_loss']:.4f}")

In [None]:
# Plot training history.
# The output directory is created first: writing the figure fails if
# '../results/visualizations' does not exist yet (e.g. on a fresh checkout).
history_plot_path = '../results/visualizations/baseline_training_history.png'
os.makedirs(os.path.dirname(history_plot_path), exist_ok=True)
baseline_model.plot_training_history(save_path=history_plot_path)

## 4. Evaluate the Baseline Model

In [None]:
# Score the trained model on the held-out test split
print("Evaluating baseline model on test set...")

# Loss and accuracy as reported by the model wrapper
test_metrics = baseline_model.evaluate(X_test, y_test)
test_loss = test_metrics['test_loss']
test_accuracy = test_metrics['test_accuracy']
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Predictions for the whole test set, reused by every analysis cell below
print("\nGenerating predictions...")
y_pred = baseline_model.predict(X_test)
print(f"Predictions shape: {y_pred.shape}")

In [None]:
# Token-, sequence- and entity-level evaluation via the project evaluator
print("Performing comprehensive evaluation...")

# The metadata mapping has keys that need int() conversion; rebuild it with integer ids
id_to_tag = {int(tag_id): tag for tag_id, tag in metadata['id_to_tag'].items()}
evaluator = NERModelEvaluator(id_to_tag)

# Run the full evaluation once; the result dict is sliced per level below
evaluation_results = evaluator.evaluate_model(y_test, y_pred, X_test)

print("\nEvaluation Results:")
print("=" * 50)

# Token-level metrics
token_metrics = evaluation_results['token_level']
for label, key in (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1-Score", 'f1_score'),
):
    print(f"Token-level {label}: {token_metrics[key]:.4f}")

# Sequence-level metrics
seq_metrics = evaluation_results['sequence_level']
exact_matches = seq_metrics['exact_matches']
total_sequences = seq_metrics['total_sequences']
print(f"\nSequence-level Accuracy: {seq_metrics['sequence_accuracy']:.4f}")
print(f"Exact Matches: {exact_matches}/{total_sequences}")

# Entity-level metrics
entity_metrics = evaluation_results['entity_level']
print(f"\nEntity-level Precision: {entity_metrics['precision']:.4f}")
for label, key in (("Recall", 'recall'), ("F1-Score", 'f1_score')):
    print(f"Entity-level {label}: {entity_metrics[key]:.4f}")

In [None]:
# Break the evaluation down by individual tag
per_tag_metrics = evaluation_results['per_tag']
print("\nPer-Tag Performance:")
print("=" * 30)

# One row per tag, best F1 first
per_tag_df = (
    pd.DataFrame(per_tag_metrics)
    .transpose()
    .sort_values(by='f1_score', ascending=False)
)

print(per_tag_df.round(4))

# Chart of the per-tag F1 scores
evaluator.plot_per_tag_metrics(per_tag_metrics, metric='f1_score')

In [None]:
# Render the confusion matrix computed during the comprehensive evaluation
confusion_matrix = evaluation_results['confusion_matrix']
matrix_title = "Baseline Model - Confusion Matrix"
evaluator.plot_confusion_matrix(confusion_matrix, title=matrix_title)

In [None]:
# Flatten true and predicted sequences with the evaluator's own helper
# (a private method, so this cell is coupled to the evaluator's internals)
y_true_flat, y_pred_flat = (
    evaluator._flatten_sequences(sequences) for sequences in (y_test, y_pred)
)

# Text report over the flattened labels
classification_report = evaluator.generate_classification_report(y_true_flat, y_pred_flat)
for line in ("Detailed Classification Report:", "=" * 40, classification_report):
    print(line)

## 5. Error Analysis

In [None]:
# Analyze prediction errors
print("Error Analysis:")
print("=" * 20)

pad_tag_id = 0       # assumes tag id 0 marks padding in y_test — TODO confirm against preprocessing
max_tags_shown = 20  # longer sequences are truncated in the printout
examples_per_group = 5


def _print_tag_comparison(seq_idx, acc):
    """Print the true and predicted tags of one test sequence, position-aligned.

    Both rows are filtered with the same mask (the non-padding positions of
    the true sequence), so the n-th tag in each row refers to the same token.
    Filtering the predictions by their own value instead would drop real
    predictions of id 0 and shift every later tag out of alignment.
    """
    true_seq = y_test[seq_idx]
    pred_seq = y_pred[seq_idx]
    keep = true_seq != pad_tag_id

    # .get() keeps the printout working if an id is missing from id_to_tag
    # (e.g. the model predicts the padding id on a real token).
    true_tags = [id_to_tag.get(int(t), f"<{int(t)}>") for t in true_seq[keep]]
    pred_tags = [id_to_tag.get(int(t), f"<{int(t)}>") for t in pred_seq[keep]]

    print(f"\nSequence {seq_idx} (Accuracy: {acc:.2f}):")
    print(f"True:  {' '.join(true_tags[:max_tags_shown])}{'...' if len(true_tags) > max_tags_shown else ''}")
    print(f"Pred:  {' '.join(pred_tags[:max_tags_shown])}{'...' if len(pred_tags) > max_tags_shown else ''}")


# Per-sequence accuracy over non-padding positions, as (test index, accuracy)
sequence_accuracies = []
for i, (true_seq, pred_seq) in enumerate(zip(y_test, y_pred)):
    non_pad_mask = true_seq != pad_tag_id
    if non_pad_mask.sum() > 0:  # skip sequences that are entirely padding
        seq_acc = (true_seq[non_pad_mask] == pred_seq[non_pad_mask]).mean()
        sequence_accuracies.append((i, seq_acc))

# Sort ascending by accuracy: worst sequences first
sequence_accuracies.sort(key=lambda x: x[1])

print("\nWorst performing sequences (lowest accuracy):")
for seq_idx, acc in sequence_accuracies[:examples_per_group]:
    _print_tag_comparison(seq_idx, acc)

# Perfect predictions carry no error information, so show the best sequences
# that still contain at least one mistake. Filtering over the whole list
# (not just the top five) keeps this section from coming out empty when the
# top of the ranking is all perfect.
imperfect_sequences = [(seq_idx, acc) for seq_idx, acc in sequence_accuracies if acc < 1.0]

print("\nBest performing sequences (highest accuracy):")
for seq_idx, acc in reversed(imperfect_sequences[-examples_per_group:]):
    _print_tag_comparison(seq_idx, acc)

In [None]:
# Tally every (true tag -> predicted tag) confusion on non-padding tokens
error_counts = {}
total_errors = 0

for row in range(len(y_test)):
    true_seq = y_test[row]
    pred_seq = y_pred[row]

    # A position is an error when it is not padding and the labels disagree
    non_pad_mask = true_seq != 0
    error_mask = (true_seq != pred_seq) & non_pad_mask

    # Visit only the mismatching positions, in left-to-right order
    for pos in np.flatnonzero(error_mask):
        true_tag = id_to_tag[true_seq[pos]]
        pred_tag = id_to_tag[pred_seq[pos]]
        error_key = f"{true_tag} -> {pred_tag}"
        error_counts[error_key] = error_counts.get(error_key, 0) + 1
        total_errors += 1

# Rank confusions by frequency and list the top fifteen
print(f"\nMost Common Prediction Errors (Total: {total_errors:,}):")
sorted_errors = sorted(error_counts.items(), key=lambda item: item[1], reverse=True)
for rank, (error, count) in enumerate(sorted_errors[:15], start=1):
    percentage = (count / total_errors) * 100
    print(f"{rank:2d}. {error}: {count:,} ({percentage:.1f}%)")

## 6. Save Results

In [None]:
# Assemble the report section by section, then combine into one dict
model_info = {
    'model_type': 'baseline_feedforward',
    'architecture': 'feedforward_with_embeddings',
    'parameters': model_params,
    'training_params': training_params,
    'training_time_seconds': training_time,
    # Falls back to 0 when the wrapper holds no built model
    'total_parameters': baseline_model.model.count_params() if baseline_model.model else 0
}

# Only the three aggregate levels are nested here; per-tag metrics get their own key
level_metrics = {
    level: evaluation_results[level]
    for level in ('token_level', 'sequence_level', 'entity_level')
}

error_analysis = {
    'total_errors': total_errors,
    'most_common_errors': dict(sorted_errors[:10])
}

run_metadata = {
    'timestamp': datetime.now().isoformat(),
    'data_splits': {
        'train_size': len(X_train),
        'val_size': len(X_val),
        'test_size': len(X_test)
    }
}

baseline_results = {
    'model_info': model_info,
    'training_results': training_results,
    'test_metrics': test_metrics,
    'evaluation_results': level_metrics,
    'per_tag_metrics': evaluation_results['per_tag'],
    'error_analysis': error_analysis,
    'metadata': run_metadata
}

# Write the report through the project helper
results_path = '../results/baseline_results.json'
save_results(baseline_results, results_path)
print(f"Baseline model results saved to: {results_path}")

# Keep the raw test inputs, labels and predictions for later analysis
predictions_path = '../results/baseline_predictions.npz'
prediction_arrays = {'y_true': y_test, 'y_pred': y_pred, 'X_test': X_test}
np.savez_compressed(predictions_path, **prediction_arrays)
print(f"Baseline predictions saved to: {predictions_path}")

## 7. Summary

In [None]:
# Final recap of architecture, cost, headline metrics and saved artifacts.
# Each lookup that can legitimately be empty is guarded so the summary
# still prints at the end of a full run.
print("Baseline Model Summary:")
print("=" * 30)

# Same fallback as the results dict: report 0 when no model has been built
total_parameters = baseline_model.model.count_params() if baseline_model.model is not None else 0

print("🏗️  Architecture: Feedforward Neural Network with Embeddings")
print(f"📊 Parameters: {total_parameters:,}")
print(f"⏱️  Training Time: {training_time/60:.1f} minutes")
print(f"📈 Epochs: {training_results['epochs_trained']}")

print("\n🎯 Performance Metrics:")
print(f"   • Token Accuracy: {token_metrics['accuracy']:.3f}")
print(f"   • Token F1-Score: {token_metrics['f1_score']:.3f}")
print(f"   • Sequence Accuracy: {seq_metrics['sequence_accuracy']:.3f}")
print(f"   • Entity F1-Score: {entity_metrics['f1_score']:.3f}")

print("\n🔍 Key Insights:")
# max()/min() raise on an empty mapping, so only rank tags when there are any
if per_tag_metrics:
    best_tag = max(per_tag_metrics.keys(), key=lambda x: per_tag_metrics[x]['f1_score'])
    worst_tag = min(per_tag_metrics.keys(), key=lambda x: per_tag_metrics[x]['f1_score'])
    print(f"   • Best performing tag: {best_tag} (F1: {per_tag_metrics[best_tag]['f1_score']:.3f})")
    print(f"   • Worst performing tag: {worst_tag} (F1: {per_tag_metrics[worst_tag]['f1_score']:.3f})")
# sorted_errors is empty when every non-padding token was predicted correctly
if sorted_errors:
    print(f"   • Most common error: {sorted_errors[0][0]} ({sorted_errors[0][1]:,} occurrences)")
else:
    print("   • Most common error: none (no token-level errors found)")

print("\n✅ Model and results saved successfully!")
print("📁 Files saved:")
print("   • Model: ../models/baseline_model.h5")
print("   • Results: ../results/baseline_results.json")
print("   • Predictions: ../results/baseline_predictions.npz")
print("   • Training plot: ../results/visualizations/baseline_training_history.png")

## Conclusion

The baseline NER model has been successfully trained and evaluated:

**Model Characteristics:**
- Simple feedforward architecture with word embeddings
- No context awareness between words
- Limited understanding of sequence dependencies

**Expected Limitations:**
- Poor handling of unseen words
- Difficulty with entity boundaries
- No consideration of word order or context

**Next Steps:**
- Compare with the advanced BiLSTM model
- Analyze areas for improvement
- Use insights to guide advanced model development