In [17]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re
import emoji

## Load the dataset

In [18]:
# Load the dataset
df = pd.read_csv('C:/Users/Administrator/Documents/kifiya/Week_5/telegram_data.csv')

df = df.dropna(subset=['Message'])
display(df.head())

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
2,Fashion tera,@Fashiontera,3964,〰️〰️〰️〰️〰️ \nCLASICO\n🔸🔸🔸🔸🔸🔸\nPrice 2200( No ...,2024-09-24 14:34:31+00:00,photos\@Fashiontera_3964.jpg
4,Fashion tera,@Fashiontera,3962,〰️〰️〰️〰️〰️〰️〰️\nPuma \nMade in Vietnam \n🔸🔸🔸🔸🔸...,2024-09-18 15:26:06+00:00,photos\@Fashiontera_3962.jpg
8,Fashion tera,@Fashiontera,3950,New year Discount \n🌼🌼🌼🌼🌼🌼🌼🌼🌼\n〰️〰️〰️〰️〰️ \n🔸🔸...,2024-09-08 16:08:15+00:00,photos\@Fashiontera_3950.jpg
19,Fashion tera,@Fashiontera,3939,〰️〰️〰️〰️〰️ \nSkechers \nMade in Vietnam \n🔸🔸🔸🔸...,2024-09-08 15:45:52+00:00,photos\@Fashiontera_3939.jpg
31,Fashion tera,@Fashiontera,3915,〰️〰️〰️〰️〰️〰️〰️\nReebok \nMade in Vietnam \n🔸🔸🔸...,2024-09-01 19:38:25+00:00,photos\@Fashiontera_3915.jpg


## Load the tokenizer and model for NER

In [19]:
# Load the tokenizer and model for NER
tokenizer = AutoTokenizer.from_pretrained("mbeukman/xlm-roberta-base-finetuned-amharic-finetuned-ner-amharic")
model = AutoModelForTokenClassification.from_pretrained("mbeukman/xlm-roberta-base-finetuned-amharic-finetuned-ner-amharic")

# Set up NER pipeline
nlp = pipeline("ner", model=model, tokenizer=tokenizer)



## clean the text 

In [20]:
# Function to clean the text (remove emojis, symbols, etc.)
def remove_emoji(text):
    if isinstance(text, str):
        return emoji.replace_emoji(text, replace='')
    return text

def remove_symbols(text):
    if isinstance(text, str):
        return re.sub(r'[^A-Za-z0-9ሀ-ፐ\s]+', '', text)
    return text

# Apply cleaning functions to 'Message' column
df['Message'] = df['Message'].apply(remove_emoji).apply(remove_symbols)
display(df.head())

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
2,Fashion tera,@Fashiontera,3964,\nCLASICO\n\nPrice 2200 No gift box\nFree De...,2024-09-24 14:34:31+00:00,photos\@Fashiontera_3964.jpg
4,Fashion tera,@Fashiontera,3962,\nPuma \nMade in Vietnam \n\nSize 404143\nPric...,2024-09-18 15:26:06+00:00,photos\@Fashiontera_3962.jpg
8,Fashion tera,@Fashiontera,3950,New year Discount \n\n \n\nInbox Hiwe5266\nስልክ...,2024-09-08 16:08:15+00:00,photos\@Fashiontera_3950.jpg
19,Fashion tera,@Fashiontera,3939,\nSkechers \nMade in Vietnam \n\nSize 4243\nP...,2024-09-08 15:45:52+00:00,photos\@Fashiontera_3939.jpg
31,Fashion tera,@Fashiontera,3915,\nReebok \nMade in Vietnam \n\nSize 404142\nPr...,2024-09-01 19:38:25+00:00,photos\@Fashiontera_3915.jpg


## map NER model results to CoNLL format labels

In [21]:
# Function to map NER model results to CoNLL format labels
def map_ner_to_conll(ner_results, tokens):
    labels = ['O'] * len(tokens)  # Initialize labels as 'O' (Outside)
    
    for entity in ner_results:
        word = entity['word'].replace('##', '')  # Remove subword artifacts from NER results
        entity_type = entity['entity']  # Extract entity type
        
        # Define mapping for NER entity types
        label = 'O'  # Default label
        if entity_type == 'B-LOC':
            label = 'B-LOC'
        elif entity_type == 'I-LOC':
            label = 'I-LOC'
        elif entity_type == 'B-ORG':
            label = 'B-PRODUCT'
        elif entity_type == 'I-ORG':
            label = 'I-PRODUCT'
        elif entity_type == 'B-MISC':
            label = 'B-PRICE'
        elif entity_type == 'I-MISC':
            label = 'I-PRICE'
        
        # Apply NER labels to matching tokens
        for i, token in enumerate(tokens):
            if word in token:
                labels[i] = label

    return labels

# Custom labeling function to identify prices and locations
def custom_label_prices_locations(tokens):
    labels = ['O'] * len(tokens)  # Initialize labels as 'O'
    
    # Define price patterns and location names
    price_patterns = [r'^\d*(00|.*50)(\.\d{1,2})?$', 'ETB', 'ዋጋ', '\$', 'ብር', 'Birr']
    
    product_patterns=['sketchers', 'Nike', 'Adidas','Rebook','samsung','Samsung','Vans','Nikon', 'Nike','Puma',
                      'Adidas','Lacosta', 'Rolex','New','Allstar','Vapor','Sketchers','FILA','CK','TRAZER',
                      'Jordan','Womens','Human','Couple','Original','Victorias','BURBERRY','OFFER','Fila','2TB',
                      'CLASICO','Men','Balenciaga','Shose','CASENT','NIKE','Nike','Airforce','ROLEX','LOUIS','CYBER',
                      'Speed','speed','AIR','Air','Skacher','Time','All','Fitron','FITRON','EMPORIO','CK',
                      'CHANEL','Skechers','Sketcher','NB','Old','old','OLD','FENDI','SPEED','BRAND','Brand',
                      'BALENCIAGA','GUCCI','CHEKICH','GIORGIO','Jordan','JORDAN', 'Vest','European','Fur','VIGUER',
                      'Quality', 'QUALITY','SVETSEON','Couple','COUPLE','High','HIGH','Under','ADIDAS','VANS','Sun',
                      'Rolex','LEBRON','Lebron','Yezzy','ALEXANDER','XO','Jacket','55','HURACHE','Clark','Hermes','VM','RADO','Apple',
                      'Fendi','Police','Champion','Gucci','Stan','Calvin','SWISH','SKMEL','FOR','Cr','Military','VEST','YEEZY','DIESEL','chekich']
    
    locations = ['Addis','Ababa', 'ቦሌ', 'ሜክሲኮ', 'ለቡ', 'Mekelle', 'Adama', 'Gondar', 'ለቡ','መዳህኒዓለም', 
                 'መገናኛ', 'አበባ', 'ሀይሎች','ጦር', 'ድሪም', 'ታወር','205','አዲስ', 'ቁጥር', "ቢሮ", "ፎቅ", "2ተኛ" ]
    
    # First, process the specific tokens you provided with custom labels
    custom_tokens = {
          
        "አድራሻ"  : "B-LOC",       # Beginning of a location
        "Price": "B-PRICE",         #Beginning of a price
        "Prices": "B-PRICE",       #Beginning of a price
        "Free" : "O",
        "Delivery":"O",
        "Inbox" :"O",
        "Hiwe5266": "O",
        "ስልክ":"O",
        "ፋሽን":"O",
        "ተራ":"O",
        "Fashion":"O",
        "Tera":"O",
        "New" : "O",
        "year" :"O",
        "Discount": "O",
        "me" : "O",
        "httpsvmtiktokcomZM2yHbMPH" : "O",
        "contact" : "O",
        "sold" : "O",
        "out" : "O",
        "Sold" : "O",
        "Call" : "O",
        "call" : "O",
        "more" : "O",
        "info" : "O",
        "as" : "O",
        "Anyone" : "O",
        "who" : "O",
        "want" : "O",
        "new" : "O",
        "Original" : "O",
        "BIG" : "O",
        "DISCOUNT" : "O",
        "ብዛት" : "O",
        "ለምትወስዱ" : "O",
        "ልዩ" : "O",
        "ቅናሽ" : "O",
        "አለዉ" : "O",
        "ባሉበት" : "O",
        "እናደርሳለን" : "O",
        "ጫማ" : "O",
        "ለመግዛት" : "O",
        "መርካቶ" : "O",
        "እየሄዱ" : "O",
        "ደክመዋል" : "O",
        "እንግዲያውስ" : "O",
        "ቻናላችንን" : "O",
        "በመቀላቀለ" : "O",
        "የፈለጉትን" : "O",
        "ይዘዙን" : "O",
        "ባሉበት" : "O",
        "እናመጣለን" : "O",
        "httpstmejoinchatAAAAAEYRIOB5Tt7gKGGjA" : "O",
        "Enkuan": "O",
        "le": "O",
        "berhan": "O",
        "meswkelu": "O",
        "beselam": "O",
        "adersachu": "O",
               

    }
 
    # Apply labels based on the tokens
    for i, token in enumerate(tokens):
        # Check if token is in the custom list
        if token in custom_tokens:
            labels[i] = custom_tokens[token]
        # Check if token matches very long numbers (10 digits or more)
        elif re.match(r'^\d{10,}$', token):
            labels[i] = 'O'  # Label long numbers as 'O'
        # Label prices (e.g., numbers, ETB, Birr, $, etc.)
        elif any(pro in token for pro in product_patterns):
            labels[i] = 'B-PRODUCT'
        elif any(re.match(pattern, token) for pattern in price_patterns):
            labels[i] = 'I-PRICE'
        # Label locations (predefined locations)
        elif any(loc in token for loc in locations):
            labels[i] = 'I-LOC'
        # Label other tokens as I-PRODUCT
        else:
            labels[i] = 'I-PRODUCT'
    
    return labels 

# Function to combine both NER and custom labels
def combine_labels(ner_labels, custom_labels):
    final_labels = []
    
    for ner_label, custom_label in zip(ner_labels, custom_labels):
        if ner_label != 'O':  # NER label takes precedence
            final_labels.append(ner_label)
        else:
            final_labels.append(custom_label)  # Otherwise, use custom label
    
    return final_labels


## process a message with both NER and custom methods

In [22]:

# Function to process a message with both NER and custom methods
def process_message(message, nlp_pipeline):
    tokens = re.findall(r'\S+', message)  # Tokenize the message
    
    # Apply NER model
    ner_results = nlp_pipeline(message)
    ner_labels = map_ner_to_conll(ner_results, tokens)
    
    # Apply custom labeling
    custom_labels = custom_label_prices_locations(tokens)
    
    # Combine both label sets
    final_labels = combine_labels(ner_labels, custom_labels)
    
    # Return tokens with their combined labels
    labeled_tokens = [f"{token} {label}" for token, label in zip(tokens, final_labels)]
    return "\n".join(labeled_tokens)

# Apply the combined processing to each message
df['Labeled_Message'] = df['Message'].apply(lambda msg: process_message(msg, nlp))

In [23]:
df['Labeled_Message'] 

2       CLASICO B-PRODUCT\nPrice B-PRICE\n2200 I-PRICE...
4       Puma B-PRODUCT\nMade I-PRODUCT\nin I-PRODUCT\n...
8       New O\nyear O\nDiscount O\nInbox O\nHiwe5266 O...
19      Skechers B-PRODUCT\nMade I-PRODUCT\nin I-PRODU...
31      Reebok I-PRODUCT\nMade I-PRODUCT\nin I-PRODUCT...
                              ...                        
2612                                        Sold O\nout O
2613    Nikon B-PRODUCT\nD9 I-PRODUCT\nDigital I-PRODU...
2614    Vans B-PRODUCT\nLeather I-PRODUCT\nMade I-PROD...
2615    Samsung B-PRODUCT\nTV I-PRODUCT\nCurved I-PROD...
2616    Rebook B-PRODUCT\nMade I-PRODUCT\nin I-PRODUCT...
Name: Labeled_Message, Length: 1920, dtype: object

## Save the final labeled data

In [24]:

# Save the final labeled data to a CoNLL-style file
output_file_combined = 'C:/Users/Administrator/Documents/kifiya/Week_5/labeled_data_conll.txt'
with open(output_file_combined, 'w', encoding='utf-8') as f:
    for index, row in df.iterrows():
        f.write(f"{row['Labeled_Message']}\n\n")


print(f"labeled data saved to {output_file_combined}")  


labeled data saved to C:/Users/Administrator/Documents/kifiya/Week_5/labeled_data_conll.txt
