In [2]:


import pandas as pd
import os
import re
import ast
from pathlib import Path

class AmharicNERProcessor:
    """Prepare Amharic messages for NER labeling and export them as CoNLL.

    The workflow has two stages:

    1. ``prepare_labeling_data`` samples cleaned messages, tokenizes them and
       pre-fills rule-based BIO labels (PRICE / PRODUCT / LOC) into a CSV.
    2. ``convert_to_conll`` turns that CSV into a tab-separated CoNLL file
       with one ``token<TAB>label`` pair per line and a blank line after
       every message.

    ``full_pipeline`` runs both stages back to back.
    """

    # A token is either a run of word / Ethiopic-block characters or a single
    # non-word, non-space character. Ethiopic punctuation (U+1361-U+1368)
    # lies inside the \u1200-\u137F range, so it stays glued to adjacent
    # letters; detect_entities relies on that to recognise 'ዋጋ፦'.
    _TOKEN_PATTERN = re.compile(r'[\w\u1200-\u137F]+|[^\w\s]')

    # Trigger words for the rule-based pre-labeling.
    PRODUCT_KEYWORDS = frozenset(
        ['ልብስ', 'ስልክ', 'መጽሐፍ', 'ኮምፒዩተር', 'ወተት', 'ጠርሙስ']
    )
    LOCATION_KEYWORDS = frozenset(
        ['አድራሻ', 'መገናኛ', 'ቢሮ', 'ፎቅ', 'ከተማ', 'ደፋር']
    )

    def __init__(self):
        """Initialize the processor with file paths"""
        # Set up paths using pathlib
        self.base_dir = Path('../data')
        self.processed_path = self.base_dir / 'processed/cleaned_messages.csv'
        self.labeling_path = self.base_dir / 'labeled/labeled_data.csv'
        # Kept as a plain string so it is reported exactly as written here.
        self.conll_path = '../CoNLL/amharic_ner.conll'

        # Create directories if they don't exist
        (self.base_dir / 'labeled').mkdir(parents=True, exist_ok=True)

    def tokenize_amharic(self, text):
        """
        Tokenize Amharic text while preserving special characters
        Args:
            text (str): Amharic text to tokenize
        Returns:
            list: List of tokens; empty for non-string or blank input
        """
        if not isinstance(text, str) or not text.strip():
            return []

        # Neither alternative of the pattern can match whitespace, so the
        # matches need no further stripping or filtering.
        return self._TOKEN_PATTERN.findall(text)

    @staticmethod
    def _tag_keyword_spans(tokens, labels, keywords, entity):
        """Tag each keyword as B-<entity> and the tokens after it as I-<entity>.

        The continuation stops at the first digit token or at the first token
        that already belongs to another entity. If the keyword itself sat
        inside another span (it carried an ``I-`` label), the rest of that
        span is taken over as well; otherwise its ``I-`` tags would be left
        without a matching ``B-`` tag and the BIO sequence would be invalid.
        ``labels`` is modified in place.
        """
        begin_tag = f'B-{entity}'
        inside_tag = f'I-{entity}'
        for i, token in enumerate(tokens):
            if token not in keywords:
                continue
            displaced = labels[i]
            labels[i] = begin_tag
            absorbable = {'O'}
            if displaced.startswith('I-'):
                absorbable.add(displaced)
            j = i + 1
            while (j < len(tokens) and labels[j] in absorbable
                   and not tokens[j].isdigit()):
                labels[j] = inside_tag
                j += 1

    def detect_entities(self, tokens):
        """
        Auto-detect potential entities to assist with labeling
        Args:
            tokens (list): List of tokenized words
        Returns:
            list: One BIO label per token ('O', 'B-'/'I-' + PRICE, PRODUCT
                or LOC). These are heuristic suggestions meant to be
                corrected by a human annotator.
        """
        labels = ['O'] * len(tokens)

        # Price patterns (e.g., "500 ብር")
        for i, token in enumerate(tokens):
            if token == 'ብር' and i > 0 and tokens[i-1].isdigit():
                labels[i-1] = 'B-PRICE'
                labels[i] = 'I-PRICE'
            elif token.startswith('ዋጋ፦'):
                labels[i] = 'B-PRICE'

        # Products first, then locations: a location keyword found inside a
        # product span starts a new LOC span from that point on.
        self._tag_keyword_spans(tokens, labels, self.PRODUCT_KEYWORDS, 'PRODUCT')
        self._tag_keyword_spans(tokens, labels, self.LOCATION_KEYWORDS, 'LOC')

        return labels

    def prepare_labeling_data(self, sample_size=50):
        """
        Prepare data for labeling with automatic entity detection

        Reads ``self.processed_path`` (columns ``amharic_text`` and
        ``channel`` are required), keeps messages longer than 20 characters,
        samples them, and writes ``original_text`` / ``tokens`` / ``labels``
        to ``self.labeling_path``. Tokens and labels are stored as the
        ``str()`` of a Python list so convert_to_conll can parse them back.

        Sampling note: channels are visited in order of first appearance and
        a fixed-seed 30% sample is drawn from each until at least
        ``sample_size`` rows have been collected. Later channels may
        therefore be absent, and fewer than ``sample_size`` rows may be
        produced when 30% of the data is not enough.

        Args:
            sample_size (int): Maximum number of samples to prepare
        Returns:
            bool: True if successful, False otherwise (the error is printed)
        """
        try:
            # Load and filter data
            df = pd.read_csv(self.processed_path)
            df = df[df['amharic_text'].str.len() > 20].copy()

            if len(df) < sample_size:
                print(f"Warning: Only {len(df)} samples available")
                sample_size = len(df)

            # Per-channel sampling (see the sampling note in the docstring)
            samples = []
            for channel in df['channel'].unique():
                channel_samples = df[df['channel'] == channel].sample(
                    frac=0.3, random_state=42
                )
                samples.extend(channel_samples.to_dict('records'))
                if len(samples) >= sample_size:
                    break

            # Tokenize and pre-label every selected message
            rows = []
            for sample in samples[:sample_size]:
                tokens = self.tokenize_amharic(sample['amharic_text'])
                rows.append({
                    'original_text': sample['amharic_text'],
                    'tokens': str(tokens),
                    'labels': str(self.detect_entities(tokens)),
                })

            # Save to CSV, making sure the target directory exists even when
            # the paths were changed after construction.
            Path(self.labeling_path).parent.mkdir(parents=True, exist_ok=True)
            pd.DataFrame(
                rows, columns=['original_text', 'tokens', 'labels']
            ).to_csv(self.labeling_path, index=False)

            print(f"✅ Prepared {len(rows)} samples at {self.labeling_path}")
            return True

        except Exception as e:
            # Boundary handler: callers only look at the boolean result.
            print(f"❌ Error preparing labeling data: {e}")
            return False

    def convert_to_conll(self):
        """
        Convert labeled data to CONLL format

        Reads the ``tokens`` and ``labels`` columns of ``self.labeling_path``
        and writes ``token<TAB>label`` lines to ``self.conll_path``, with an
        empty line after each message. Rows whose cells cannot be parsed as
        Python literals, are not sequences, or differ in length are skipped
        and reported. A missing column is treated as a failure.

        Returns:
            bool: True if successful, False otherwise (the error is printed)
        """
        try:
            # Load labeled data
            df = pd.read_csv(self.labeling_path)

            # Convert to CONLL format
            conll_lines = []
            token_count = 0
            skipped_rows = 0
            for raw_tokens, raw_labels in zip(df['tokens'], df['labels']):
                try:
                    tokens = ast.literal_eval(raw_tokens)
                    labels = ast.literal_eval(raw_labels)
                except (ValueError, SyntaxError, TypeError):
                    # Malformed or empty (NaN) cell
                    skipped_rows += 1
                    continue

                if (not isinstance(tokens, (list, tuple))
                        or not isinstance(labels, (list, tuple))
                        or len(tokens) != len(labels)):
                    skipped_rows += 1
                    continue

                conll_lines.extend(
                    f"{token}\t{label}" for token, label in zip(tokens, labels)
                )
                conll_lines.append("")  # Empty line between sentences
                token_count += len(tokens)

            # Save CONLL file; the output directory may not exist yet
            Path(self.conll_path).parent.mkdir(parents=True, exist_ok=True)
            with open(self.conll_path, 'w', encoding='utf-8') as f:
                f.write("\n".join(conll_lines))

            print(f"✅ Successfully created CONLL file at {self.conll_path}")
            if skipped_rows:
                print(f"Warning: skipped {skipped_rows} malformed rows")
            print(f"Total tokens: {token_count}")
            return True

        except Exception as e:
            # Boundary handler: callers only look at the boolean result.
            print(f"❌ Error converting to CONLL: {e}")
            return False

    def full_pipeline(self, sample_size=50):
        """Run the complete pipeline from preparation to CONLL conversion

        Args:
            sample_size (int): Passed through to prepare_labeling_data
        Returns:
            bool: True only if both stages succeeded
        """
        print("Starting Amharic NER processing pipeline...")

        if not self.prepare_labeling_data(sample_size):
            return False

        if not self.convert_to_conll():
            return False

        print("\nPipeline completed successfully!")
        return True


if __name__ == "__main__":
    # Entry point: build the processor and run both stages on a sample of
    # up to 50 messages.
    processor = AmharicNERProcessor()
    success = processor.full_pipeline(50)

    # Report where the artifacts were written, or point at the errors that
    # the pipeline stages already printed.
    if not success:
        print("\nProcessing failed. Please check error messages.")
    else:
        print("\nFinal outputs:")
        print(f"- Labeled data: {processor.labeling_path}")
        print(f"- CONLL format: {processor.conll_path}")





Starting Amharic NER processing pipeline...
✅ Prepared 50 samples at ..\data\labeled\labeled_data.csv
✅ Successfully created CONLL file at ../CoNLL/amharic_ner.conll
Total tokens: 1607

Pipeline completed successfully!

Final outputs:
- Labeled data: ..\data\labeled\labeled_data.csv
- CONLL format: ../CoNLL/amharic_ner.conll
