# Data Procesing and Labeling


In [1]:
import pandas as pd
import os
notebook_dir = os.getcwd()
parent_path=os.path.dirname(notebook_dir)

os.chdir(parent_path)

In [2]:
from script.data_processor_labler import Processor

**Instance of the imported class**

In [3]:
processor=Processor()

**Load the data which was scrapped from**

In [4]:
tg_data=pd.read_csv("data/adamagebeya_telegram_data.csv")

### Handle missing data
Drop the row which does not have a messgae

In [5]:
processor.drop_missing_messsage(tg_data)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,አዳማ ገበያ - Adama gebeya,@gebeyaadama,4043,✅Hot Water Bag\n🎯 የትኩስ ውሃ መያዢያ ከረጢት\n👉 1.8 ሊትር...,2024-09-25 08:36:52+00:00,photos/@gebeyaadama_4043.jpg
1,አዳማ ገበያ - Adama gebeya,@gebeyaadama,4042,❇️Hair Scalp Massager,2024-09-25 08:33:14+00:00,
2,አዳማ ገበያ - Adama gebeya,@gebeyaadama,4041,❇️Hair Scalp Massager\n\n﻿﻿👉Stimulate blood fl...,2024-09-25 08:32:54+00:00,photos/@gebeyaadama_4041.jpg
3,አዳማ ገበያ - Adama gebeya,@gebeyaadama,4040,የፀጉር መፈረዣ,2024-09-25 07:19:39+00:00,
4,አዳማ ገበያ - Adama gebeya,@gebeyaadama,4039,✅ የፀጉር መፈረዣ ✅\n\n📌ለሁሉም አይነት ፀጉር የሚሆን እና ለ አጠቃቀ...,2024-09-25 07:19:27+00:00,photos/@gebeyaadama_4039.jpg
...,...,...,...,...,...,...
3298,አዳማ ገበያ - Adama gebeya,@gebeyaadama,17,☎️0911-76-22-01\n ዋጋ 1200 ብር \n❣️❣️🇪🇹🇪🇹 በ ...,2020-10-06 09:06:46+00:00,
3301,አዳማ ገበያ - Adama gebeya,@gebeyaadama,14,❤️❤️❤️ አዳማ ❤️❤️❤️\n 🎯የጀርባ ችግር አለቦት\n ...,2020-10-05 18:04:46+00:00,photos/@gebeyaadama_14.jpg
3304,አዳማ ገበያ - Adama gebeya,@gebeyaadama,10,❣ለውስን ጊዜ የሚቆይ ታላቅ ቅናሽ❣\n 💯አንድ Smart watch ሲገ...,2020-10-05 12:08:36+00:00,photos/@gebeyaadama_10.jpg
3305,አዳማ ገበያ - Adama gebeya,@gebeyaadama,9,❤ አዳማ /ናዝሬት ❤\n 0911-76-22-01\n❣️❣️🇪...,2020-10-05 04:58:56+00:00,photos/@gebeyaadama_9.jpg


## Remove emojis and extra spaces from the messages

In [6]:
processor.clean_message(tg_data)

0       Hot Water Bag\n የትኩስ ውሃ መያዢያ ከረጢት\n 1.8 ሊትር ውሃ...
1                                     Hair Scalp Massager
2       Hair Scalp Massager\nStimulate blood flow to t...
3                                               የፀጉር መፈረዣ
4        የፀጉር መፈረዣ \nለሁሉም አይነት ፀጉር የሚሆን እና ለ አጠቃቀም ምቹ\...
                              ...                        
3298    0911-76-22-01\n     ዋጋ 1200 ብር \n በ ሆድ የሰውነት ክ...
3301     አዳማ \n     የጀርባ ችግር አለቦት\n        0911-76-22-...
3304    ለውስን ጊዜ የሚቆይ ታላቅ ቅናሽ\n  አንድ  Smart watch ሲገዙ በ...
3305     አዳማ /ናዝሬት \n           0911-76-22-01\n በ ሆድ የ...
3309      0911-76-22-01\n         አዳማ / ናዝሬት \n     የጁ...
Name: Message, Length: 2488, dtype: object

## Filter the data to keep only amharic messages
Since the project is focused on *Amharic Named Entity Recognition* and the data have a mixed language (amharic and english) it is essential filtering out.in this case keep meaages that have 50% or more amharic characters.

In [16]:
tg_data=processor.filter_amharic(tg_data)

## Label messages
We are going to label a portion of the provided dataset in the CoNLL format. This format is commonly used for Named Entity Recognition (NER) tasks.
The goal is to identify and label entities such as products, price, and Location in Amharic text.
Entity Types:

**B-Product:** The beginning of a product entity (e.g., "Baby bottle"). 

**I-Product:** Inside a product entity (e.g., the word "bottle" in "Baby bottle").

**B-LOC:** The beginning of a location entity (e.g., "Addis abeba", "Bole").

**I-LOC:** Inside a location entity (e.g., the word "Abeba" in “Addis abeba”)

**B-PRICE:** The beginning of a price entity (e.g., "ዋጋ 1000 ብር", "በ 100 ብር").

**I -PRICE፡** Inside a price entity (e.g., the word "1000" in “ዋጋ 1000 ብር”)

**O:** Tokens that are outside any entities


In [10]:
# processor.label_dataset(tg_data)

In [17]:
import re

def label_message(message):
    # Define price and phone number patterns
    price_pattern = re.compile(r'\b\d{3,5}\s?ብር\b')
    phone_pattern = re.compile(r'(\+2519\d{8}\b|\b09\d{8}\b)')
    
    # Split tokens by newline
    tokens = message.split('\n')
    labels = []
    loc_flag = False  # To flag location entity start
    product_flag = False  # To flag that product labeling is in progress
    
    for i, token in enumerate(tokens):
        # Check if the token is a beginning token (first token) or contains specific keywords
        if i == 0 and all(city not in token for city in ['አዳማ', 'አዲስ አበባ']):
            labels.append('B-Product')  # Mark the first token or specified keywords as B-Product
            product_flag = True  # Set product flag
            continue
        
        # Check for phone number, 'አድራሻ', or 'አድራሻችን', and label as 'O'
        if phone_pattern.match(token) or token in ['አድራሻ', 'አድራሻችን']:
            labels.append('O')
            product_flag = False  # Reset product flag after encountering these
            continue
        
        # Check for price pattern and label as 'B-PRICE'
        if price_pattern.match(token):
            labels.append('B-PRICE')
            product_flag = False  # Reset product flag
            loc_flag = False  # Reset location flag
            continue
        
        # If token is part of a location entity
        if loc_flag:
            if token == 'ቴሌግራም' or re.match(r'https?://t\.me/\w+', token):
                loc_flag = False  # Stop marking as location if we hit Telegram or link
                labels.append('O')
            else:
                labels.append('B-LOC')  # Mark as B-LOC for location entities
                continue
        
        # Start a location entity if the previous token was a location keyword
        if labels and labels[-1] in ['O', 'B-LOC']:
            loc_flag = True  # Set location flag for subsequent tokens
        
        # If none of the above, label as 'O'
        labels.append('O')
    
    # Return tokens and corresponding labels
    return list(zip(tokens, labels))



In [18]:
message="Hot Water Bag የትኩስ ውሃ መያዢያ ከረጢት 1.8 ሊትር ውሃ ይይዛል ሙቀት ከሚቋቋም ወፍራም ጎማ የተሰራ አስተማማኝ ክዳን ያለው ወፍራም ጨርቅ ያለው ከወር አበባ ፣ ከወገብ ህመም፣ ከመደንዘዝ፣ ከደም ስር መዞርና ከውልቃት ጋር የተያያዙ ህመሞችን ለማስታገስ ይረዳል 750 ብር 0911762201 0972824252 0988404491 0922282582 በቴሌግራም ለማዘዝ @GebeyaAdama21 አድራሻችን አዳማ ፖስታ ቤት ሶሬቲ ሞል ምድር ላይ ሱ.ቁ 33 ይሄንን በመጫን የቤተሰባችን አባል ይሁኑ https://t.me/gebeyaadama የመረጡትን እቃ ይዘዙ፤ ያሉበት እናደርሳለን!! በኪስዎ ጥሬ ገንዘብ ካልያዙ በሞባይል ማስተላለፍ ይችላሉ።"

label_message(tg_data.Message[0])
# tg_data.Message[0]

[('Hot Water Bag', 'B-Product'),
 (' የትኩስ ውሃ መያዢያ ከረጢት', 'O'),
 (' 1.8 ሊትር ውሃ ይይዛል', 'O'),
 (' ሙቀት ከሚቋቋም ወፍራም ጎማ የተሰራ', 'B-LOC'),
 (' አስተማማኝ ክዳን ያለው', 'B-LOC'),
 (' ወፍራም ጨርቅ ያለው', 'B-LOC'),
 (' ከወር አበባ', 'B-LOC'),
 (' ፣ ከወገብ ህመም፣ ከመደንዘዝ፣ ከደም ስር መዞርና ከውልቃት ጋር የተያያዙ ህመሞችን ለማስታገስ ይረዳል',
  'B-LOC'),
 (' 750 ብር', 'B-LOC'),
 (' 0911762201', 'B-LOC'),
 (' 0972824252', 'B-LOC'),
 (' 0988404491', 'B-LOC'),
 (' 0922282582', 'B-LOC'),
 ('በቴሌግራም ለማዘዝ @GebeyaAdama21 ', 'B-LOC'),
 (' ', 'B-LOC'),
 (' አድራሻችን', 'B-LOC'),
 (' ', 'B-LOC'),
 ('አዳማ ፖስታ ቤት', 'B-LOC'),
 (' ሶሬቲ ', 'B-LOC'),
 ('ሞል ምድር ላይ ሱ.ቁ 33', 'B-LOC'),
 ('ይሄንን በመጫን የቤተሰባችን አባል ይሁኑ', 'B-LOC'),
 (' ', 'B-LOC'),
 ('https://t.me/gebeyaadama', 'O'),
 (' የመረጡትን እቃ ይዘዙ፤ ያሉበት እናደርሳለን!!', 'O'),
 (' በኪስዎ ጥሬ ገንዘብ ካልያዙ በሞባይል ማስተላለፍ ይችላሉ።', 'B-LOC')]