# Task 2: CoNLL Format Dataset Labeling
Creating labeled dataset for NER training with BIO tagging

In [4]:
# Setup and imports
import sys
import pandas as pd
sys.path.append('../src')

import pandas as pd
from preprocessing.conll_labeler import CoNLLLabeler
from preprocessing.text_preprocessor import AmharicTextPreprocessor

In [5]:
# Load processed data from Task 1
processed_df = pd.read_csv("../data/processed/processed_telegram_data.csv")
print(f"Loaded {len(processed_df)} processed messages")

Loaded 5800 processed messages


In [6]:
# Initialize preprocessor and labeler
preprocessor = AmharicTextPreprocessor()
labeler = CoNLLLabeler()
print("Preprocessor and labeler initialized")

Preprocessor and labeler initialized


In [9]:
# Prepare sample for labeling (50 messages)
# sample_df = preprocessor.prepare_for_labeling(processed_df, sample_size=50)
sample_df = processed_df[['id', 'channel', 'text', 'tokens']]
print(f"Selected {len(sample_df)} messages for labeling")
print("Sample messages:")
for i, text in enumerate(sample_df['text'].head(3)):
    print(f"{i+1}. {text[:80]}...")

Selected 5800 messages for labeling
Sample messages:
1. 4 1 304 500 ዋጋ፦ ብር ውስን ፍሬ ነው ያለው መገናኛ መሰረት ደፋር ሞል ሁለተኛ ፎቅ ቢሮ ቁ 05 06 0902660722 ...
2. 6 የጫማ ማስቀመጫ ባለ ስድስት ደረጃ ቦታ ቆጣቢ ሲዘረጋ 27 27 86 ስፋት ከጠንካራ ፕላስቲክ የተሰራ ገጠማ የማይፈልግ 6 ጥ...
3. 6 የጫማ ማስቀመጫ ባለ ስድስት ደረጃ ቦታ ቆጣቢ ሲዘረጋ 27 27 86 ስፋት ከጠንካራ ፕላስቲክ የተሰራ ገጠማ የማይፈልግ 6 ጥ...


In [11]:
# Create labeled dataset
messages = sample_df['text'].tolist()
labeled_data = labeler.create_extended_dataset(messages, target_size=50)
print(f"Created labeled dataset with {len(labeled_data)} sentences")

Created labeled dataset with 50 sentences


In [12]:
# Display sample labeled data
print("Sample Labeled Data (CoNLL Format):")
for i, (message, tokens, labels) in enumerate(labeled_data[:3]):
    print(f"\nSample {i+1}: {message[:60]}...")
    print("Tokens and Labels:")
    for token, label in zip(tokens[:10], labels[:10]):
        print(f"  {token:<15} {label}")
    if len(tokens) > 10:
        print(f"  ... ({len(tokens)-10} more tokens)")

Sample Labeled Data (CoNLL Format):

Sample 1: ሰላም! የሕፃናት ጠርሙስ ዋጋ 150 ብር ነው። ቦሌ አካባቢ ነው።...
Tokens and Labels:
  ሰላም             O
  !               O
  የሕፃናት           B-Product
  ጠርሙስ            I-Product
  ዋጋ              B-PRICE
  150             B-PRICE
  ብር              I-PRICE
  ነው።             O
  ቦሌ              B-LOC
  አካባቢ            O
  ... (1 more tokens)

Sample 2: አዲስ አበባ ውስጥ የሚሸጥ ልብስ በ 200 ብር...
Tokens and Labels:
  አዲስ             B-LOC
  አበባ             I-LOC
  ውስጥ             O
  የሚሸጥ            O
  ልብስ             B-Product
  በ               B-PRICE
  200             B-PRICE
  ብር              I-PRICE

Sample 3: መርካቶ ውስጥ ጫማ 300 ብር...
Tokens and Labels:
  መርካቶ            B-LOC
  ውስጥ             O
  ጫማ              B-Product
  300             B-PRICE
  ብር              I-PRICE


In [13]:
# Entity statistics
all_labels = [label for _, _, labels in labeled_data for label in labels]
entity_counts = {}
for label in all_labels:
    if label != 'O':
        entity_type = label.split('-')[1] if '-' in label else label
        entity_counts[entity_type] = entity_counts.get(entity_type, 0) + 1

print("Entity Statistics:")
for entity_type, count in entity_counts.items():
    print(f"  {entity_type}: {count} mentions")
print(f"  Total entities: {sum(entity_counts.values())}")
print(f"  O (non-entity) labels: {all_labels.count('O')}")

Entity Statistics:
  Product: 17 mentions
  PRICE: 100 mentions
  LOC: 14 mentions
  Total entities: 131
  O (non-entity) labels: 1528


In [14]:
# Validate BIO tagging consistency
valid_count = 0
for message, tokens, labels in labeled_data:
    if labeler.validate_labels(tokens, labels):
        valid_count += 1

print(f"BIO Validation Results:")
print(f"  Valid sequences: {valid_count}/{len(labeled_data)}")
print(f"  Validation rate: {valid_count/len(labeled_data)*100:.1f}%")

BIO Validation Results:
  Valid sequences: 50/50
  Validation rate: 100.0%


In [15]:
# Save in CoNLL format
conll_path = "../data/labeled/ethiopian_ner_dataset.txt"
labeler.save_conll_format(labeled_data, conll_path)
print(f"CoNLL dataset saved to: {conll_path}")

[32m2025-08-21 18:45:48.047[0m | [1mINFO    [0m | [36mpreprocessing.conll_labeler[0m:[36msave_conll_format[0m:[36m147[0m - [1mCoNLL format data saved to ../data/labeled/ethiopian_ner_dataset.txt[0m


CoNLL dataset saved to: ../data/labeled/ethiopian_ner_dataset.txt


In [16]:
# Verify saved data by loading it back
loaded_data = labeler.load_conll_format(conll_path)
print(f"Verification: Loaded {len(loaded_data)} sentences from saved file")
print("Dataset ready for NER model training!")

[32m2025-08-21 18:45:49.681[0m | [1mINFO    [0m | [36mpreprocessing.conll_labeler[0m:[36mload_conll_format[0m:[36m181[0m - [1mLoaded 50 sentences from ../data/labeled/ethiopian_ner_dataset.txt[0m


Verification: Loaded 50 sentences from saved file
Dataset ready for NER model training!
