# Data Exploration for NER Dataset

This notebook explores the Named Entity Recognition (NER) dataset to understand its structure, distribution, and characteristics.

In [None]:
# Import required libraries
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Add src directory to path
sys.path.append('../src')

from data_preprocessing import analyze_dataset
from utils import (
    extract_entities,
    load_dataset, get_unique_tags, get_tag_distribution,
    plot_tag_distribution, print_dataset_info,
    get_sentence_length_stats, plot_sentence_length_distribution
)

# Set style for plots
# Start from matplotlib's stock style, then switch the seaborn color palette to "husl".
plt.style.use('default')
sns.set_palette("husl")

# Display settings
# max_columns=None removes the column limit so wide frames are shown in full;
# max_rows=20 keeps long frames from flooding the cell output.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)

## 1. Load and Basic Inspection

In [None]:
# Load the dataset
# NOTE: the path is relative, so it resolves against the kernel's working
# directory (presumably the notebook's own folder — confirm before running elsewhere).
data_path = '../data/ner_dataset.csv'
# `df` is the frame every later cell in this notebook reads from.
df = load_dataset(data_path)

print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
# Left as a bare last expression so the notebook renders the preview as a table.
df.head(10)

In [None]:
# Basic dataset information
# Delegates to the project helper imported from utils; what exactly it prints
# is defined there (not visible in this notebook).
print_dataset_info(df)

In [None]:
# Per-column null counts, followed by the dtype pandas assigned to each column
null_counts = df.isna().sum()
print("Missing values per column:")
print(null_counts)

column_dtypes = df.dtypes
print("\nData types:")
print(column_dtypes)

## 2. NER Tags Analysis

In [None]:
# Tag inventory and per-tag counts, both produced by the project helpers.
# `unique_tags` and `tag_distribution` are reused by later cells.
unique_tags = get_unique_tags(df)
tag_distribution = get_tag_distribution(df)

print(f"Number of unique NER tags: {len(unique_tags)}")
print(f"Unique tags: {unique_tags}")

# List tags from most to least frequent, with each tag's share of all tokens
total_tokens = len(df)
ranked_tags = sorted(tag_distribution.items(), key=lambda item: item[1], reverse=True)

print("\nTag distribution:")
for tag_name, tag_count in ranked_tags:
    share = (tag_count / total_tokens) * 100
    print(f"{tag_name}: {tag_count:,} ({share:.2f}%)")

In [None]:
# Plot tag distribution
# Uses the `tag_distribution` mapping computed in the previous cell; the second
# argument is presumably the chart title — confirm against utils.plot_tag_distribution.
plot_tag_distribution(tag_distribution, "Distribution of NER Tags")

In [None]:
# Analyze entity types (B- and I- tags)
# An entity type is whatever follows the "B-"/"I-" prefix; a bare "B-" or "I-"
# with nothing after it is ignored.
entity_tags = [tag for tag in unique_tags if tag.startswith(('B-', 'I-'))]
entity_types = set(tag[2:] for tag in entity_tags if len(tag) > 2)

print(f"Number of entity types: {len(entity_types)}")
print(f"Entity types: {sorted(entity_types)}")

# Count entities by type.
# One value_counts() pass replaces two full-frame boolean scans per entity type;
# .get(..., 0) covers types that only ever appear with one of the two prefixes.
tag_counts = df['Tag'].value_counts()
entity_type_counts = {}
for entity_type in sorted(entity_types):  # sorted -> reproducible row order
    b_count = int(tag_counts.get(f'B-{entity_type}', 0))
    i_count = int(tag_counts.get(f'I-{entity_type}', 0))
    entity_type_counts[entity_type] = {'B': b_count, 'I': i_count, 'Total': b_count + i_count}

# Naming the columns explicitly keeps 'Total' present even when no entity tags
# exist, so sort_values() cannot raise KeyError on an empty table.
entity_df = pd.DataFrame.from_dict(
    entity_type_counts, orient='index', columns=['B', 'I', 'Total']
)
# A stable sort leaves tied types in alphabetical order instead of an arbitrary one.
entity_df = entity_df.sort_values('Total', ascending=False, kind='stable')
print("\nEntity type distribution:")
print(entity_df)

In [None]:
# Grouped bar chart: B- versus I- token counts for each entity type,
# in the row order of `entity_df`.
fig, ax = plt.subplots(figsize=(12, 6))
bar_width = 0.35
positions = np.arange(len(entity_df))

# Each pair of bars straddles its tick: B- to the left, I- to the right
ax.bar(positions - bar_width / 2, entity_df['B'], bar_width, label='B- (Beginning)', alpha=0.8)
ax.bar(positions + bar_width / 2, entity_df['I'], bar_width, label='I- (Inside)', alpha=0.8)

ax.set_xlabel('Entity Types')
ax.set_ylabel('Count')
ax.set_title('Distribution of B- and I- Tags by Entity Type')
ax.set_xticks(positions)
ax.set_xticklabels(entity_df.index, rotation=45)
ax.legend()
fig.tight_layout()
plt.show()

## 3. Sentence Analysis

In [None]:
# Summary statistics of sentence length from the project helper.
# `sentence_stats` is read again by the summary cell at the end of the notebook.
sentence_stats = get_sentence_length_stats(df)

print("Sentence length statistics:")
for stat_name, stat_value in sentence_stats.items():
    # Turn a key such as "mean_length" into the label "Mean Length"
    label = stat_name.replace('_', ' ').title()
    print(f"{label}: {stat_value:.2f}")

In [None]:
# Plot sentence length distribution
# Delegates to the utils helper, which receives the full token-level frame;
# how it derives sentence lengths is defined there (not visible here).
plot_sentence_length_distribution(df)

In [None]:
# Analyze some sample sentences
# Shows the first five sentence IDs (in order of first appearance) with their
# tokens, tags and the entities extract_entities() recovers from them.
# extract_entities comes from the notebook's top import cell rather than being
# re-imported on every loop iteration.
print("Sample sentences with their tags:")
for sent_id in df['Sentence #'].unique()[:5]:
    sent_data = df[df['Sentence #'] == sent_id]
    words = sent_data['Word'].tolist()
    tags = sent_data['Tag'].tolist()

    print(f"\n{sent_id}:")
    # str() keeps join() from raising TypeError if a cell is not a string
    # (for example a token that was parsed as NaN when the CSV was read).
    print(f"Words: {' '.join(map(str, words))}")
    print(f"Tags:  {' '.join(map(str, tags))}")

    # Extract entities from this sentence
    entities = extract_entities(words, tags)
    if entities:
        print(f"Entities: {entities}")

## 4. Vocabulary Analysis

In [None]:
# Occurrences of each distinct token, most frequent first.
# `word_counts` is reused by the following vocabulary cells.
word_counts = df['Word'].value_counts()
vocab_size = len(word_counts)

print(f"Total vocabulary size: {vocab_size}")
print("Most frequent words:")
print(word_counts.iloc[:20])

print("\nLeast frequent words (sample):")
print(word_counts.iloc[-10:])

In [None]:
# Bar chart of the 30 most frequent tokens, labelled with the tokens themselves
top_words = word_counts.head(30)
bar_positions = range(len(top_words))

fig, ax = plt.subplots(figsize=(15, 6))
ax.bar(bar_positions, top_words.values)
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
ax.set_title('Top 30 Most Frequent Words')
ax.set_xticks(bar_positions)
# Right-aligned, rotated labels so neighbouring tokens do not overlap
ax.set_xticklabels(top_words.index, rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Analyze word frequency distribution
# freq_counts maps "number of occurrences" -> "how many distinct words occur
# that many times", ordered by occurrence count.
freq_counts = word_counts.value_counts().sort_index()

# Bucket the vocabulary by occurrence count using value comparisons on
# word_counts. Integer slices such as freq_counts[2:6] are applied by POSITION
# on an integer-indexed Series, so they select the wrong frequencies (frequency 2
# is skipped entirely, and any gap in the observed frequencies shifts every bucket).
words_once = int((word_counts == 1).sum())
words_2_to_5 = int(word_counts.between(2, 5).sum())    # between() is inclusive on both ends
words_6_to_10 = int(word_counts.between(6, 10).sum())
words_over_10 = int((word_counts > 10).sum())

print("Word frequency distribution:")
print(f"Words appearing once: {words_once}")
print(f"Words appearing 2-5 times: {words_2_to_5}")
print(f"Words appearing 6-10 times: {words_6_to_10}")
print(f"Words appearing >10 times: {words_over_10}")

# Plot frequency of frequencies
# Log-scaled y axis: most words are rare, so a linear axis would hide the tail.
plt.figure(figsize=(12, 6))
plt.hist(word_counts.values, bins=50, edgecolor='black', alpha=0.7)
plt.xlabel('Word Frequency')
plt.ylabel('Number of Words')
plt.title('Distribution of Word Frequencies')
plt.yscale('log')
plt.show()

## 5. Comprehensive Dataset Analysis

In [None]:
# Run the project's all-in-one analysis and pretty-print whatever it returns:
# dict-valued sections are expanded one entry per line, anything else is printed as-is.
analysis_results = analyze_dataset(df)

print("Comprehensive Dataset Analysis:")
print("=" * 50)

for section, payload in analysis_results.items():
    print(f"\n{section.replace('_', ' ').title()}:")
    if not isinstance(payload, dict):
        print(f"  {payload}")
        continue
    for field, field_value in payload.items():
        print(f"  {field}: {field_value}")

## 6. Data Quality Assessment

In [None]:
# Check for potential data quality issues
# Two tagging checks are run over a sample of sentences:
#   1. the same B- tag repeated on adjacent tokens, and
#   2. I- tags that do not continue a B- tag of the same entity type.
MAX_SENTENCES_TO_CHECK = 1000  # only the first N sentence IDs are inspected
MAX_EXAMPLES_SHOWN = 5         # print at most this many examples per check

print("Data Quality Assessment:")
print("=" * 30)

# Collect each sampled sentence's tag list in ONE grouped pass. Filtering the
# whole frame once per sentence (and again for the second check) cost
# 2 * MAX_SENTENCES_TO_CHECK full scans. sort=False keeps first-appearance order.
sample_ids = df['Sentence #'].unique()[:MAX_SENTENCES_TO_CHECK]
sample_rows = df[df['Sentence #'].isin(sample_ids)]
tags_by_sentence = {
    sent_id: sent_tags.tolist()
    for sent_id, sent_tags in sample_rows.groupby('Sentence #', sort=False)['Tag']
}

# Check for consecutive B- tags of the same type (potential issue)
consecutive_b_issues = 0
for sent_id, tags in tags_by_sentence.items():
    for current_tag, next_tag in zip(tags, tags[1:]):
        # Equal tags where the first starts with "B-" means both are the same B- tag
        if current_tag.startswith('B-') and current_tag == next_tag:
            consecutive_b_issues += 1
            if consecutive_b_issues <= MAX_EXAMPLES_SHOWN:
                print(f"Consecutive B- tags in {sent_id}: {current_tag} -> {next_tag}")

print(f"\nFound {consecutive_b_issues} potential consecutive B- tag issues in first {MAX_SENTENCES_TO_CHECK} sentences")

# Check for I- tags without preceding B- tags
orphan_i_issues = 0
for sent_id, tags in tags_by_sentence.items():
    # Entity type of the span currently open, or None when no span is open.
    # A span opens at B-X and stays open only across directly following I-X tags,
    # which is the same condition as scanning backwards through I-X tags for a B-X.
    open_type = None
    for i, tag in enumerate(tags):
        if tag.startswith('B-'):
            open_type = tag[2:]
        elif tag.startswith('I-'):
            if tag[2:] != open_type:
                orphan_i_issues += 1
                if orphan_i_issues <= MAX_EXAMPLES_SHOWN:
                    print(f"Orphan I- tag in {sent_id}: {tag} at position {i}")
                # An orphan does not open a span, so following I- tags are orphans too
                open_type = None
        else:
            open_type = None

print(f"Found {orphan_i_issues} potential orphan I- tag issues in first {MAX_SENTENCES_TO_CHECK} sentences")

## 7. Summary and Insights

In [None]:
# Final roll-up of the figures computed earlier in the notebook
# (relies on `entity_types`, `sentence_stats` and `tag_distribution` from previous cells).
def _rate(value, high_cutoff, medium_cutoff, labels=('High', 'Medium', 'Low')):
    """Return labels[0] if value > high_cutoff, labels[1] if value > medium_cutoff, else labels[2]."""
    if value > high_cutoff:
        return labels[0]
    if value > medium_cutoff:
        return labels[1]
    return labels[2]


token_total = df.shape[0]
sentence_total = df['Sentence #'].nunique()
vocab_total = df['Word'].nunique()
tag_total = df['Tag'].nunique()
type_names = sorted(entity_types)
mean_length = sentence_stats['mean_length']

print("Dataset Summary and Key Insights:")
print("=" * 40)

print(f"📊 Dataset Size: {token_total:,} tokens across {sentence_total:,} sentences")
print(f"📝 Vocabulary: {vocab_total:,} unique words")
print(f"🏷️  NER Tags: {tag_total} unique tags")
print(f"🎯 Entity Types: {len(type_names)} types ({', '.join(type_names)})")

print("\n📏 Sentence Lengths:")
print(f"   • Average: {mean_length:.1f} words")
print(f"   • Range: {sentence_stats['min_length']} - {sentence_stats['max_length']} words")
print(f"   • Median: {sentence_stats['median_length']:.1f} words")

# Share of tokens tagged "O" (outside any entity) versus everything else
o_percentage = (tag_distribution['O'] / len(df)) * 100
entity_percentage = 100 - o_percentage
print("\n🔍 Tag Distribution:")
print(f"   • Non-entity tokens (O): {o_percentage:.1f}%")
print(f"   • Entity tokens: {entity_percentage:.1f}%")

# Coarse qualitative labels derived from fixed thresholds
size_label = _rate(token_total, 100000, 10000, labels=('large', 'medium', 'small'))
density_label = _rate(entity_percentage, 30, 15)
richness_label = _rate(vocab_total, 20000, 5000)
complexity_label = _rate(mean_length, 25, 15)

print("\n💡 Key Insights:")
print(f"   • This is a {size_label} dataset")
print(f"   • Entity density: {density_label}")
print(f"   • Vocabulary richness: {richness_label}")
print(f"   • Sentence complexity: {complexity_label}")

## Conclusion

This exploration provides a comprehensive understanding of the NER dataset:

1. **Dataset Scale**: Large-scale dataset suitable for training robust NER models
2. **Entity Diversity**: Multiple entity types with varying frequencies
3. **Text Complexity**: Diverse sentence lengths and vocabulary
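4. **Data Quality**: Generally well-formatted IOB2 tagging with minimal issues (note that the tag-consistency checks above inspect only the first 1,000 sentences, not the full dataset)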

The insights from this exploration will guide the preprocessing steps and model architecture decisions in subsequent notebooks.