In [64]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re
import emoji

## Load the dataset

In [65]:
# Load the dataset
df = pd.read_csv('C:/Users/Administrator/Documents/kifiya/Week_5/telegram_data.csv')

df = df.dropna(subset=['Message'])
display(df.head())

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
2,Fashion tera,@Fashiontera,3964,〰️〰️〰️〰️〰️ \nCLASICO\n🔸🔸🔸🔸🔸🔸\nPrice 2200( No ...,2024-09-24 14:34:31+00:00,photos\@Fashiontera_3964.jpg
4,Fashion tera,@Fashiontera,3962,〰️〰️〰️〰️〰️〰️〰️\nPuma \nMade in Vietnam \n🔸🔸🔸🔸🔸...,2024-09-18 15:26:06+00:00,photos\@Fashiontera_3962.jpg
8,Fashion tera,@Fashiontera,3950,New year Discount \n🌼🌼🌼🌼🌼🌼🌼🌼🌼\n〰️〰️〰️〰️〰️ \n🔸🔸...,2024-09-08 16:08:15+00:00,photos\@Fashiontera_3950.jpg
19,Fashion tera,@Fashiontera,3939,〰️〰️〰️〰️〰️ \nSkechers \nMade in Vietnam \n🔸🔸🔸🔸...,2024-09-08 15:45:52+00:00,photos\@Fashiontera_3939.jpg
31,Fashion tera,@Fashiontera,3915,〰️〰️〰️〰️〰️〰️〰️\nReebok \nMade in Vietnam \n🔸🔸🔸...,2024-09-01 19:38:25+00:00,photos\@Fashiontera_3915.jpg


## Load the tokenizer and model for NER

In [66]:
# Load the tokenizer and model for NER
tokenizer = AutoTokenizer.from_pretrained("masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0")
model = AutoModelForTokenClassification.from_pretrained("masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0")

# Set up NER pipeline
nlp = pipeline("ner", model=model, tokenizer=tokenizer)



## clean the text 

In [67]:
# Function to clean the text (remove emojis, symbols, etc.)
def remove_emoji(text):
    if isinstance(text, str):
        return emoji.replace_emoji(text, replace='')
    return text

def remove_symbols(text):
    if isinstance(text, str):
        return re.sub(r'[^A-Za-z0-9ሀ-ፐ\s]+', '', text)
    return text

# Apply cleaning functions to 'Message' column
df['Message'] = df['Message'].apply(remove_emoji).apply(remove_symbols)
display(df.head())

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
2,Fashion tera,@Fashiontera,3964,\nCLASICO\n\nPrice 2200 No gift box\nFree De...,2024-09-24 14:34:31+00:00,photos\@Fashiontera_3964.jpg
4,Fashion tera,@Fashiontera,3962,\nPuma \nMade in Vietnam \n\nSize 404143\nPric...,2024-09-18 15:26:06+00:00,photos\@Fashiontera_3962.jpg
8,Fashion tera,@Fashiontera,3950,New year Discount \n\n \n\nInbox Hiwe5266\nስልክ...,2024-09-08 16:08:15+00:00,photos\@Fashiontera_3950.jpg
19,Fashion tera,@Fashiontera,3939,\nSkechers \nMade in Vietnam \n\nSize 4243\nP...,2024-09-08 15:45:52+00:00,photos\@Fashiontera_3939.jpg
31,Fashion tera,@Fashiontera,3915,\nReebok \nMade in Vietnam \n\nSize 404142\nPr...,2024-09-01 19:38:25+00:00,photos\@Fashiontera_3915.jpg


## map NER model results to CoNLL format labels

In [68]:
# Function to map NER model results to CoNLL format labels
def map_ner_to_conll(ner_results, tokens):
    labels = ['O'] * len(tokens)  # Initialize labels as 'O' (Outside)
    
    for entity in ner_results:
        word = entity['word'].replace('##', '')  # Remove subword artifacts from NER results
        entity_type = entity['entity']  # Extract entity type
        
        # Define mapping for NER entity types
        label = 'O'  # Default label
        if entity_type == 'B-LOC':
            label = 'B-LOC'
        elif entity_type == 'I-LOC':
            label = 'I-LOC'
        elif entity_type == 'B-ORG':
            label = 'B-PRODUCT'
        elif entity_type == 'I-ORG':
            label = 'I-PRODUCT'
        elif entity_type == 'B-MISC':
            label = 'B-PRICE'
        elif entity_type == 'I-MISC':
            label = 'I-PRICE'
        
        # Apply NER labels to matching tokens
        for i, token in enumerate(tokens):
            if word in token:
                labels[i] = label

    return labels

# Custom labeling function to identify prices and locations
def custom_label_prices_locations(tokens):
    labels = ['O'] * len(tokens)  # Initialize labels as 'O'
    
    for i, token in enumerate(tokens):
        # Label prices (numbers, ETB, ብር)
        if re.match(r'^\d+(\.\d{1,2})?$', token) or 'ETB' in token or 'ዋጋ' in token or '$' in token or 'ብር' in token:
            labels[i] = 'I-PRICE'
        # Label locations (e.g., Addis Ababa, ቦሌ)
        elif any(loc in token for loc in ['Addis Ababa', 'ለቡ', 'ቦሌ', 'ሜክሲኮ']):
            labels[i] = 'I-LOC'
    
    return labels

# Function to combine both NER and custom labels
def combine_labels(ner_labels, custom_labels):
    final_labels = []
    
    for ner_label, custom_label in zip(ner_labels, custom_labels):
        if ner_label != 'O':  # NER label takes precedence
            final_labels.append(ner_label)
        else:
            final_labels.append(custom_label)  # Otherwise, use custom label
    
    return final_labels

## process a message with both NER and custom methods

In [69]:

# Function to process a message with both NER and custom methods
def process_message(message, nlp_pipeline):
    tokens = re.findall(r'\S+', message)  # Tokenize the message
    
    # Apply NER model
    ner_results = nlp_pipeline(message)
    ner_labels = map_ner_to_conll(ner_results, tokens)
    
    # Apply custom labeling
    custom_labels = custom_label_prices_locations(tokens)
    
    # Combine both label sets
    final_labels = combine_labels(ner_labels, custom_labels)
    
    # Return tokens with their combined labels
    labeled_tokens = [f"{token} {label}" for token, label in zip(tokens, final_labels)]
    return "\n".join(labeled_tokens)

# Apply the combined processing to each message
df['Labeled_Message'] = df['Message'].apply(lambda msg: process_message(msg, nlp))



In [70]:
df['Labeled_Message'] 

2       CLASICO O\nPrice O\n2200 I-PRICE\nNo O\ngift O...
4       Puma O\nMade O\nin O\nVietnam O\nSize O\n40414...
8       New O\nyear O\nDiscount O\nInbox O\nHiwe5266 O...
19      Skechers O\nMade O\nin O\nVietnam O\nSize O\n4...
31      Reebok I-PRODUCT\nMade I-PRODUCT\nin O\nVietna...
                              ...                        
2612                                        Sold O\nout O
2613    Nikon O\nD9 O\nDigital O\nCamera O\nPrice O\n1...
2614    Vans O\nLeather O\nMade O\nin O\nVietnam O\nSi...
2615    Samsung O\nTV O\nCurved O\nFull O\nHD O\n55 I-...
2616    Rebook O\nMade O\nin O\nVietnam O\nSize O\n41 ...
Name: Labeled_Message, Length: 1920, dtype: object

## Save the final labeled data

In [71]:

# Save the final labeled data to a CoNLL-style file
output_file_combined = 'C:/Users/Administrator/Documents/kifiya/Week_5/labeled_data_conll.txt'
with open(output_file_combined, 'w', encoding='utf-8') as f:
    for index, row in df.iterrows():
        f.write(f"{row['Labeled_Message']}\n\n")

print(f"labeled data saved to {output_file_combined}")


labeled data saved to C:/Users/Administrator/Documents/kifiya/Week_5/labeled_data_conll.txt
