# EDA

In [None]:
import json
import pandas as pd
import numpy as np
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
import torch

# Seed the three random number generators used in this notebook (Python's
# `random`, NumPy and PyTorch) with the same fixed value for reproducibility
import random
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

print("Libraries imported successfully!")

In [None]:
# Load training data from a JSON Lines file (one JSON document per line)
train_data = []
with open('/root/train.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        # A blank or whitespace-only line is not valid JSON and would make
        # json.loads raise, so skip it instead of aborting the whole load.
        if line.strip():
            train_data.append(json.loads(line))

print(f"Total training documents: {len(train_data)}")
# Preview only when something was loaded; indexing an empty list would raise
# IndexError. The preview is capped at the first 500 characters.
if train_data:
    print("\nSample document:")
    print(json.dumps(train_data[0], indent=2)[:500])

In [None]:
import re
from collections import defaultdict, Counter

# Honorific prefix at the start of a span; on a JOB_TITLE it hints that the
# span may actually be a person name.
_HONORIFIC_PREFIX_RE = re.compile(r'^(Mr\.|Mrs\.|Ms\.|Dr\.)')
# Span consisting of digits only; implausible for a NAME.
_DIGITS_ONLY_RE = re.compile(r'^\d+$')


def _find_pattern_issue(text, label):
    """Return the issue message if (text, label) breaks a format heuristic, else None."""
    # a. An e-mail address must contain "@"
    if label == 'EMAIL' and '@' not in text:
        return 'Email tanpa "@"'
    # b. A phone number must contain at least one digit
    if label == 'PHONE' and not any(ch.isdigit() for ch in text):
        return 'Phone tanpa angka'
    # c. A username should not contain inner spaces
    if label == 'USERNAME' and ' ' in text.strip():
        return 'Username berspasi'
    # d. A job title starting with an honorific is possibly a mislabelled name
    if label == 'JOB_TITLE' and _HONORIFIC_PREFIX_RE.match(text):
        return 'Job Title diawali Gelar (Potensi Nama)'
    # e. A name made of digits only is possibly a phone number or a date
    if label == 'NAME' and _DIGITS_ONLY_RE.match(text):
        return 'Nama berupa Angka'
    return None


def audit_labels_memory(data):
    """Print a label-consistency audit for an in-memory list of annotated documents.

    Two reports are printed:

    1. Ambiguity: entity texts that occur with more than one label, sorted by
       total occurrences (top 15 shown).
    2. Suspicious entries: entities whose text breaks a simple format rule for
       its label (see ``_find_pattern_issue``), grouped by issue with up to
       five examples per issue.

    Args:
        data: Iterable of dicts, each with a ``document_id`` and an
            ``entities`` list whose items carry ``text`` and ``label``.

    Returns:
        None. Results are printed; tables are rendered with ``display``, which
        is not imported in this cell (presumably the IPython/Jupyter built-in).
    """
    print("Sedang menganalisis konsistensi label di memori...")

    # {text: Counter({label: occurrences})}
    text_label_counts = defaultdict(Counter)
    # One row per entity that violates a format heuristic
    suspicious_entries = []

    # 1. Scan every entity once, collecting both statistics
    for doc in data:
        doc_id = doc['document_id']

        for ent in doc['entities']:
            text = ent['text']
            label = ent['label']

            text_label_counts[text][label] += 1

            issue = _find_pattern_issue(text, label)
            if issue is not None:
                suspicious_entries.append({
                    'Text': text, 'Label': label, 'Issue': issue, 'Doc_ID': doc_id
                })

    # 2. Keep only texts that were annotated with more than one label
    ambiguous_data = [
        {
            'Text': text,
            'Conflicts': ", ".join(f"{lbl} ({cnt})" for lbl, cnt in counts.items()),
            'Total_Count': sum(counts.values()),
        }
        for text, counts in text_label_counts.items()
        if len(counts) > 1
    ]

    # 3. Print the reports. The emoji are written as escapes so the source
    #    stays ASCII; the original literals were mojibake (UTF-8 bytes
    #    mis-decoded as cp1252) and printed as garbage.

    # A. Ambiguity report
    print("\n" + "="*60)
    print("\U0001F6A9 LAPORAN 1: AMBIGUITAS LABEL (Top 15 Kasus)")
    print("Teks yang SAMA persis tapi dilabeli BERBEDA di dokumen lain.")
    print("="*60)

    if ambiguous_data:
        df_ambig = pd.DataFrame(ambiguous_data).sort_values('Total_Count', ascending=False)
        display(df_ambig.head(15))
    else:
        print("\u2705 Tidak ditemukan ambiguitas label.")

    # B. Suspicious-data report
    print("\n" + "="*60)
    print("\U0001F6A9 LAPORAN 2: DATA MENCURIGAKAN (Heuristics Check)")
    print("Label yang melanggar aturan format umum.")
    print("="*60)

    if suspicious_entries:
        df_suspicious = pd.DataFrame(suspicious_entries)

        # One summary section per distinct issue
        for issue in df_suspicious['Issue'].unique():
            subset = df_suspicious[df_suspicious['Issue'] == issue]
            print(f"\n--- Masalah: {issue} (Total: {len(subset)}) ---")
            display(subset[['Text', 'Label', 'Doc_ID']].head(5))
    else:
        print("\u2705 Tidak ditemukan data yang melanggar pola umum.")

# Run the audit on the train_data variable that is already loaded in memory
audit_labels_memory(train_data)

# Modelling

## 1. Install Required Libraries

In [None]:
!pip install transformers datasets accelerate seqeval -q

## 2. Import Libraries

In [None]:
import json
import pandas as pd
import numpy as np
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
import torch

# Seed the three random number generators used in this notebook (Python's
# `random`, NumPy and PyTorch) with the same fixed value for reproducibility
import random
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

print("Libraries imported successfully!")

## 3. Define Label Schema and Configuration

In [None]:
# PII tag inventory in BIO format: "O" marks text outside any entity, and each
# entity type contributes a "B-" (beginning) tag followed by an "I-" (inside) tag.
labels = ["O"] + [
    f"{prefix}-{entity_type}"
    for entity_type in (
        "NAME", "DATE", "EMAIL", "PHONE", "JOB_TITLE", "ADDRESS", "USERNAME",
    )
    for prefix in ("B", "I")
]

# Two-way lookup between tag strings and the integer class ids
id2label = dict(enumerate(labels))
label2id = {tag: idx for idx, tag in id2label.items()}

print(f"Number of labels: {len(labels)}")
print(f"Labels: {labels}")

## 4. Load and Explore Data

In [None]:
# Load training data from a JSON Lines file (one JSON document per line)
train_data = []
with open('/root/train.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        # A blank or whitespace-only line is not valid JSON and would make
        # json.loads raise, so skip it instead of aborting the whole load.
        if line.strip():
            train_data.append(json.loads(line))

print(f"Total training documents: {len(train_data)}")
# Preview only when something was loaded; indexing an empty list would raise
# IndexError. The preview is capped at the first 500 characters.
if train_data:
    print("\nSample document:")
    print(json.dumps(train_data[0], indent=2)[:500])

## 5. Prepare Data for Token Classification

We need to convert character-level annotations to token-level annotations that BERT can understand.

In [None]:
def create_character_labels(text, entities):
    """Build one BIO tag per character of ``text``.

    Args:
        text: The document text.
        entities: Iterable of dicts with integer ``start`` / ``end`` character
            offsets (``end`` exclusive) and a ``label`` string.

    Returns:
        A list of ``len(text)`` strings. The first character of each entity is
        tagged ``B-<label>``, its remaining characters ``I-<label>``, and every
        other character ``O``. When entities overlap, later entities overwrite
        earlier ones.

    Spans are clamped to the bounds of ``text``, and spans that are empty
    after clamping are skipped. Without this, a negative ``start`` would
    silently tag characters counted from the end of the text and an ``end``
    past the text would raise ``IndexError``.
    """
    text_len = len(text)
    char_labels = ['O'] * text_len

    for entity in entities:
        start = max(entity['start'], 0)
        end = min(entity['end'], text_len)
        if start >= end:
            # Nothing of this span lies inside the text
            continue

        label = entity['label']

        # First character gets B- (Beginning) ...
        char_labels[start] = f'B-{label}'
        # ... and the remaining characters get I- (Inside)
        char_labels[start + 1:end] = [f'I-{label}'] * (end - start - 1)

    return char_labels

# Sanity-check the function on the first training document
sample_text = train_data[0]['full_text']
sample_entities = train_data[0]['entities']
char_labels = create_character_labels(sample_text, sample_entities)

print(f"Text length: {len(sample_text)}")
print(f"Labels length: {len(char_labels)}")
# Every entity contributes exactly one B- tag, so counting B- tags counts
# entities. The previous version counted every non-"O" character under the
# "entities found" heading, which reported labelled characters instead.
print(f"\nSample entities found: {sum(tag.startswith('B-') for tag in char_labels)}")
print(f"Labeled characters: {sum(tag != 'O' for tag in char_labels)}")

## 6. Initialize Tokenizer

In [None]:
# Load the tokenizer for the pretrained checkpoint named by MODEL_NAME
# (the same name is used further down to load the model weights)
MODEL_NAME = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print(f"Tokenizer loaded: {MODEL_NAME}")
print(f"Vocab size: {tokenizer.vocab_size}")

## 7. Tokenization and Label Alignment

We need to align character-level labels with token-level labels for BERT.

In [None]:
def tokenize_and_align_labels(text, entities, tokenizer, max_length=512):
    """
    Encode ``text`` and attach one integer label per token.

    The text is tagged per character first, then tokenized with offset
    mapping. Each token takes the label id of the character at its start
    offset; tokens whose offsets are (0, 0) (special tokens such as
    [CLS]/[SEP]) get -100 so the loss ignores them.

    Returns the tokenizer's encoding with ``offset_mapping`` removed and a
    ``labels`` list added.
    """
    per_char_tags = create_character_labels(text, entities)

    encoding = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_offsets_mapping=True,
    )

    # The offsets are only needed for alignment, not for training, so take
    # them out of the encoding while reading them.
    token_spans = encoding.pop('offset_mapping')

    encoding['labels'] = [
        -100 if (tok_start == 0 and tok_end == 0)
        else label2id[per_char_tags[tok_start]]
        for tok_start, tok_end in token_spans
    ]

    return encoding

# Smoke-test the alignment on the first training document
sample_encoding = tokenize_and_align_labels(
    train_data[0]['full_text'], 
    train_data[0]['entities'], 
    tokenizer
)

# The two "shape" values printed below are list lengths (number of tokens)
print(f"Input IDs shape: {len(sample_encoding['input_ids'])}")
print(f"Labels shape: {len(sample_encoding['labels'])}")
# Show the first 20 token strings for a visual check
print(f"Sample tokens: {tokenizer.convert_ids_to_tokens(sample_encoding['input_ids'][:20])}")

## 8. Prepare Dataset

Convert all training data to tokenized format and create train/validation split.

In [None]:
# Encode every training document into token ids plus aligned labels
print("Tokenizing training data...")
tokenized_data = []

for i, doc in enumerate(train_data):
    # Progress message every 1000 documents
    if not i % 1000:
        print(f"Processing document {i}/{len(train_data)}")

    tokenized_data.append(
        tokenize_and_align_labels(
            text=doc['full_text'],
            entities=doc['entities'],
            tokenizer=tokenizer,
            max_length=512,
        )
    )

print(f"\nTokenized {len(tokenized_data)} documents")

In [None]:
# Hold out 10% of the tokenized documents for validation (fixed random_state)
from sklearn.model_selection import train_test_split

train_encodings, val_encodings = train_test_split(
    tokenized_data, test_size=0.1, random_state=42
)

# Build column-oriented HuggingFace datasets that keep only the three fields
# listed here: input_ids, attention_mask and labels
train_dataset = Dataset.from_dict({
    column: [enc[column] for enc in train_encodings]
    for column in ('input_ids', 'attention_mask', 'labels')
})

val_dataset = Dataset.from_dict({
    column: [enc[column] for enc in val_encodings]
    for column in ('input_ids', 'attention_mask', 'labels')
})

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

## 9. Initialize Model

In [None]:
# Load the pretrained checkpoint with a token-classification head sized to the
# label list; the id<->label maps are passed in as part of the model config
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

print(f"Model loaded: {MODEL_NAME}")
print(f"Number of parameters: {model.num_parameters():,}")

## 10. Define Evaluation Metrics

In [None]:
def compute_metrics(eval_pred):
    """
    Score predictions with seqeval precision, recall and F1.

    ``eval_pred`` unpacks into (scores, label ids). The predicted class is the
    argmax over axis 2 of the scores. Positions whose gold label id is -100
    are dropped before both sequences are converted to tag strings.
    """
    scores, gold_ids = eval_pred
    pred_ids = np.argmax(scores, axis=2)

    gold_tags = []
    pred_tags = []

    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions that carry a real label (-100 means "ignore")
        kept = [
            (id2label[gold_id], id2label[pred_id])
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != -100
        ]
        gold_tags.append([gold for gold, _ in kept])
        pred_tags.append([pred for _, pred in kept])

    return {
        'precision': precision_score(gold_tags, pred_tags),
        'recall': recall_score(gold_tags, pred_tags),
        'f1': f1_score(gold_tags, pred_tags),
    }

print("Metrics function defined successfully!")

## 11. Configure Training Arguments

In [None]:
# Configure training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint once per epoch; the two strategies match, which
    # load_best_model_at_end below relies on
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_dir='./logs',
    logging_steps=100,
    # After training, reload the checkpoint with the highest "f1"
    # (the key returned by compute_metrics)
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Keep at most two checkpoints on disk
    save_total_limit=2,
    report_to="none",  # Disable wandb/tensorboard
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
)

print("Training arguments configured!")

## 12. Initialize Trainer

In [None]:
# Data collator for dynamic padding (pads each batch at collation time; the
# examples themselves were tokenized with padding=False)
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True
)

# Initialize Trainer
# NOTE(review): newer transformers releases appear to rename the `tokenizer`
# argument of Trainer to `processing_class` - confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("Trainer initialized successfully!")

## 13. Train the Model

In [None]:
# Fine-tune the model using the trainer configured above
print("Starting training...")
trainer.train()

print("\nTraining completed!")

## 14. Evaluate on Validation Set

In [None]:
# Evaluate the model on the eval_dataset given to the Trainer (the validation split)
eval_results = trainer.evaluate()

print("Validation Results:")
# NOTE(review): the :.4f format assumes every reported value is numeric - confirm
for key, value in eval_results.items():
    print(f"  {key}: {value:.4f}")

## 15. Load Test Data

In [None]:
# Load test data from a JSON Lines file (one JSON document per line)
test_data = []
with open('/root/test.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        # A blank or whitespace-only line is not valid JSON and would make
        # json.loads raise, so skip it instead of aborting the whole load.
        if line.strip():
            test_data.append(json.loads(line))

print(f"Total test documents: {len(test_data)}")
# Preview only when something was loaded; indexing an empty list would raise
# IndexError.
if test_data:
    print(f"Sample test document ID: {test_data[0]['document_id']}")

## 16. Define Prediction Function

Convert model predictions back to character-level offsets for submission.

In [None]:
def predict_entities(text, model, tokenizer):
    """
    Predict PII entities in ``text`` and return them with character offsets.

    Args:
        text: The document text.
        model: Token-classification model; it must expose ``device`` and
            return an object with ``logits`` when called with the encoding.
        tokenizer: Tokenizer that supports ``return_offsets_mapping``.

    Returns:
        A list of dicts with ``label``, ``start``, ``end`` (end exclusive) and
        ``text``, in order of appearance.

    Notes:
        * Input is truncated to 512 tokens, so text beyond that limit yields
          no predictions.
        * Decoding rules: ``B-`` opens a new entity; ``I-`` of the same type
          extends the open entity; anything else closes it. An ``I-`` with no
          open entity is ignored.
        * Fix: an ``I-`` tag whose type differs from the open entity now
          closes that entity. It used to be skipped silently, which left the
          entity open so that a later matching ``I-`` stretched it across the
          unrelated tokens in between.
    """
    # Tokenize
    encoding = tokenizer(
        text,
        truncation=True,
        max_length=512,
        return_offsets_mapping=True,
        return_tensors="pt"
    )

    # The batch holds exactly one sequence. Index it explicitly instead of
    # squeeze(), which would also collapse any other size-1 dimension.
    offset_mapping = encoding.pop('offset_mapping')[0].tolist()

    # Move to same device as model
    device = model.device
    encoding = {k: v.to(device) for k, v in encoding.items()}

    # Get predictions (no gradients are needed for inference)
    with torch.no_grad():
        outputs = model(**encoding)
    predictions = torch.argmax(outputs.logits, dim=-1)[0].tolist()

    # Convert token predictions to entities
    entities = []
    current_entity = None

    for pred_id, (start_char, end_char) in zip(predictions, offset_mapping):
        # Skip special tokens, which carry the offset (0, 0)
        if start_char == 0 and end_char == 0:
            continue

        # "B-NAME" -> ("B", "NAME"); "O" -> ("O", "")
        prefix, _, label = id2label[pred_id].partition('-')

        if prefix == 'B':
            # Save previous entity if one is open, then start a new one
            if current_entity:
                entities.append(current_entity)
            current_entity = {
                'label': label,
                'start': start_char,
                'end': end_char,
                'text': text[start_char:end_char]
            }

        elif prefix == 'I' and current_entity and label == current_entity['label']:
            # Continue current entity
            current_entity['end'] = end_char
            current_entity['text'] = text[current_entity['start']:end_char]

        else:
            # 'O', an I- tag of a different type, or an I- tag with nothing
            # open: close whatever entity is open
            if current_entity:
                entities.append(current_entity)
                current_entity = None

    # Don't forget the last entity
    if current_entity:
        entities.append(current_entity)

    return entities

print("Prediction function defined successfully!")

## 17. Generate Predictions on Test Set

In [None]:
# Run inference over the whole test set, one document at a time
print("Generating predictions on test set...")
model.eval()

all_predictions = []

for i, doc in enumerate(test_data):
    # Progress message every 100 documents
    if not i % 100:
        print(f"Processing document {i}/{len(test_data)}")

    document_id = doc['document_id']

    # Flatten the per-document entities into one row per predicted span,
    # tagged with the id of the document it came from
    all_predictions.extend(
        {
            'document_id': document_id,
            'label': entity['label'],
            'start_offset': entity['start'],
            'end_offset': entity['end'],
        }
        for entity in predict_entities(doc['full_text'], model, tokenizer)
    )

print(f"\nTotal predictions made: {len(all_predictions)}")

## 18. Create Submission File

In [None]:
# =============================================================================
# 18. Create Submission File (Match row_id exactly with sample_submission)
# =============================================================================
# The output has exactly one row per row of sample_submission.csv. For each
# document, its predictions (sorted by start offset) are written into that
# document's sample rows in order of appearance.

from collections import defaultdict

# Load sample submission
sample_sub = pd.read_csv('/root/sample_submission.csv')

print(f"Sample submission rows: {len(sample_sub)}")
print(f"Total predictions made: {len(all_predictions)}")

# Group predictions by document_id for O(1) lookup
predictions_by_doc = defaultdict(list)
for pred in all_predictions:
    predictions_by_doc[pred['document_id']].append({
        'start_offset': pred['start_offset'],
        'end_offset': pred['end_offset'],
        'label': pred['label']
    })

# Sort predictions within each document by start_offset
for doc_id in predictions_by_doc:
    predictions_by_doc[doc_id].sort(key=lambda x: x['start_offset'])

print(f"Documents with predictions: {len(predictions_by_doc)}")

# Track current index for each document (how many of its predictions have
# already been written to the submission)
doc_pred_index = defaultdict(int)

# Create submission matching sample_submission structure
submission_data = []
filled_count = 0

for _, row in sample_sub.iterrows():
    doc_id = row['document_id']
    idx = doc_pred_index[doc_id]
    
    # The membership test comes first so that the lookup below does not insert
    # an empty list into the defaultdict for documents without predictions
    if doc_id in predictions_by_doc and idx < len(predictions_by_doc[doc_id]):
        # We have a prediction for this slot
        pred = predictions_by_doc[doc_id][idx]
        submission_data.append({
            'row_id': row['row_id'],
            'document_id': doc_id,
            'start_offset': pred['start_offset'],
            'end_offset': pred['end_offset'],
            'label': pred['label']
        })
        doc_pred_index[doc_id] += 1
        filled_count += 1
    else:
        # No prediction - use placeholder (dummy prediction)
        # NOTE(review): the placeholder is a NAME span over characters 0-1 and
        # is written as if it were a real prediction - confirm that the scorer
        # expects this.
        submission_data.append({
            'row_id': row['row_id'],
            'document_id': doc_id,
            'start_offset': 0,
            'end_offset': 1,
            'label': 'NAME'  
        })

# NOTE(review): when a document has more predictions than sample rows, the
# surplus predictions are never written and are not reported below.
submission_df = pd.DataFrame(submission_data)

# Save submission
submission_df.to_csv('submission.csv', index=False)

print(f"\nSubmission file created: submission.csv")
print(f"Submission shape: {submission_df.shape}")
print(f"Predictions filled: {filled_count}")
print(f"Placeholder rows: {len(submission_df) - filled_count}")

print(f"\nLabel distribution:")
print(submission_df['label'].value_counts())

## 19. Save Model (Optional)

In [None]:
# Save the trained model and tokenizer into the same directory
model.save_pretrained("./bert-pii-model")
tokenizer.save_pretrained("./bert-pii-model")

print("Model and tokenizer saved to './bert-pii-model'")