In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re
import emoji

In [31]:

# Read the exported Telegram messages, discarding rows with no 'Message' value
DATA_PATH = 'C:/Users/Administrator/Documents/kifiya/Week_5/telegram_data.csv'
df = pd.read_csv(DATA_PATH).dropna(subset=['Message'])

# The tokenizer and the token-classification model come from the same checkpoint
NER_CHECKPOINT = "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"
tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)

# Wrap both in a ready-to-call NER pipeline
nlp = pipeline("ner", tokenizer=tokenizer, model=model)

# Text-cleaning helpers (emoji and symbol removal)
def remove_emoji(text):
    """Return ``text`` with every emoji replaced by the empty string.

    Anything that is not a ``str`` (for example a NaN cell) is handed back
    unchanged so the function can be applied to a whole column safely.
    """
    if not isinstance(text, str):
        return text
    return emoji.replace_emoji(text, replace='')

# Matches runs of characters that are NOT ASCII letters/digits, Ethiopic
# syllables (U+1200-U+135A) or whitespace.  The upper bound is U+135A rather
# than U+1350 so that the syllables U+1351-U+135A are kept as letters.
# Ethiopic punctuation and numerals (U+1360 and above) are still stripped.
_SYMBOLS_RE = re.compile(r'[^A-Za-z0-9\u1200-\u135A\s]+')


def remove_symbols(text):
    """Strip punctuation and other symbols from ``text``.

    Keeps ASCII letters and digits, Ethiopic syllables and whitespace;
    every other character is deleted.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or ``text`` itself when it is not a ``str``
        (for example a NaN cell).
    """
    if not isinstance(text, str):
        return text
    return _SYMBOLS_RE.sub('', text)

# Clean the 'Message' column in two passes: emojis first, then symbols
messages_without_emoji = df['Message'].apply(remove_emoji)
df['Message'] = messages_without_emoji.apply(remove_symbols)


# Function to map NER model results to CoNLL format labels

# Model tag -> task tag.  ORG is assumed to stand in for Product and MISC for
# prices; any tag not listed here is treated as 'O'.
_NER_TO_CONLL = {
    'B-LOC': 'B-LOC',
    'I-LOC': 'I-LOC',
    'B-ORG': 'B-Product',
    'I-ORG': 'I-Product',
    'B-MISC': 'B-PRICE',
    'I-MISC': 'I-PRICE',
}


def map_ner_to_conll(ner_results, text):
    """Project sub-word NER predictions onto whitespace-separated tokens.

    Args:
        ner_results: Iterable of prediction dicts.  Each must have the keys
            ``'entity'`` and ``'word'``; ``'start'``/``'end'`` character
            offsets into ``text`` are used when present and not ``None``.
        text: The message the predictions were produced for.

    Returns:
        A ``(tokens, labels)`` tuple of equal-length lists, where ``tokens``
        is ``text`` split on whitespace and ``labels`` holds one tag per
        token (``'O'`` when no mapped entity touches the token).
    """
    # Tokenise with a regex so each token's character span is known.
    token_matches = list(re.finditer(r'\S+', text))
    tokens = [match.group() for match in token_matches]
    labels = ['O'] * len(tokens)

    for entity in ner_results:
        label = _NER_TO_CONLL.get(entity['entity'])
        if label is None:
            # Unmapped tags must not overwrite a label set by another piece.
            continue

        start = entity.get('start')
        end = entity.get('end')
        if start is not None and end is not None:
            # Offsets available: tag exactly the token(s) the piece overlaps.
            hits = [
                i for i, match in enumerate(token_matches)
                if match.start() < end and start < match.end()
            ]
        else:
            # No offsets: fall back to substring matching on the piece text,
            # after removing the WordPiece ('##') and SentencePiece (U+2581)
            # markers that never occur in the raw text.
            piece = entity['word'].replace('##', '').replace('\u2581', '').strip()
            if not piece:
                # An empty string is a substring of every token.
                continue
            hits = [i for i, token in enumerate(tokens) if piece in token]

        for i in hits:
            # The first piece of a word decides its tag, so a later I- piece
            # cannot replace the B- tag of the same token.
            if labels[i] == 'O':
                labels[i] = label

    return tokens, labels

# Function to write labeled data to CoNLL format
def write_conll_format(df, output_file, nlp_pipeline, max_messages=50):
    """Label messages with ``nlp_pipeline`` and write them in CoNLL format.

    Each message becomes a run of ``"<token> <label>"`` lines followed by
    one blank line.

    Args:
        df: DataFrame with a ``'Message'`` column.
        output_file: Path of the file to create (overwritten if it exists).
        nlp_pipeline: Callable taking a message string and returning the
            NER predictions that ``map_ner_to_conll`` consumes.
        max_messages: How many leading rows of ``df`` to consider; ``None``
            means all rows.  Defaults to 50.
    """
    messages = df['Message']
    if max_messages is not None:
        messages = messages.head(max_messages)

    with open(output_file, 'w', encoding='utf-8') as f:
        for message in messages:
            if pd.isna(message):
                continue
            # Cleaning can leave a message empty; skip it rather than run the
            # model on nothing and emit an empty sentence.
            if isinstance(message, str) and not message.strip():
                continue

            # Apply NER model to message
            ner_results = nlp_pipeline(message)

            # Map the NER results to CoNLL format tokens and labels
            tokens, labels = map_ner_to_conll(ner_results, message)

            f.writelines(
                f"{token} {label}\n" for token, label in zip(tokens, labels)
            )
            f.write("\n")  # Blank line between messages

# Destination of the CoNLL-formatted labels
output_file = 'C:/Users/Administrator/Documents/kifiya/Week_5/labeled_data_conll.txt'

# Label the messages with the NER pipeline and save the result
write_conll_format(df=df, output_file=output_file, nlp_pipeline=nlp)

print(f"Labeled data saved to {output_file}")




Labeled data saved to C:/Users/Administrator/Documents/kifiya/Week_5/labeled_data_conll.txt
