In [1]:
import os, sys
notebook_dir = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(notebook_dir,'..')))

In [6]:
from scripts.data_load import load_data
import re

In [5]:
df = load_data('../Data/telegram_data.csv')

In [7]:
print("Checking for NaN values in the 'Message' column:")
nan_count = df['Message'].isnull().sum()
print(f"Number of NaN values in 'Message' column: {nan_count}")

Checking for NaN values in the 'Message' column:
Number of NaN values in 'Message' column: 2565


In [8]:
df = df.dropna(subset=['Message'])

# Print the shape of the dataset after dropping NaN values in the "Message" column
print(f"Dataset shape after dropping NaN values in 'Message' column: {df.shape}")

Dataset shape after dropping NaN values in 'Message' column: (3258, 6)


In [9]:
message_df=df['Message']
message_df

0       ❇️100 Pieces Disposable Food Cover, Plastic Wr...
1       ❇️ 100 Pieces Disposable Food Cover, Plastic W...
2            😠😠😠😠😠😠👁👁👁👁👁👁\n❇️ Rechargeable Flawless Brows
4       😠😠😠😠😠😠👁👁👁👁👁👁\n❇️ Rechargeable Flawless Brows\n...
9       ❇️Bottle and nipple brush\n\n    ዋጋ💵፦  💰 350 ብ...
                              ...                        
5818    ❇️LED Crystal Table Lamp \n\n🔰Very suitable fo...
5819    ❇️100 Pieces Disposable Food Cover, Plastic Wr...
5820    ❇️ 100 Pieces Disposable Food Cover, Plastic W...
5821                              ❇️Waist Training Corset
5822                              ❇️Waist Training Corset
Name: Message, Length: 3258, dtype: object

In [10]:
def remove_emojis(text):
    emoji_pattern = re.compile(
        "[" 
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251" 
        "]+", 
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

# Apply the function to the 'Message' column
df['Message'] = df['Message'].apply(remove_emojis)

# Display the updated DataFrame
print(df.head())

      Channel Title Channel Username     ID  \
0  AwasMart-አዋስማርት🎁        @AwasMart  10204   
1  AwasMart-አዋስማርት🎁        @AwasMart  10203   
2  AwasMart-አዋስማርት🎁        @AwasMart  10202   
4  AwasMart-አዋስማርት🎁        @AwasMart  10200   
9  AwasMart-አዋስማርት🎁        @AwasMart  10195   

                                             Message  \
0  100 Pieces Disposable Food Cover, Plastic Wrap...   
1   100 Pieces Disposable Food Cover, Plastic Wra...   
2                     \n Rechargeable Flawless Brows   
4  \n Rechargeable Flawless Brows\n      High Qua...   
9  Bottle and nipple brush\n\n    ዋጋ፦   350 ብር\n\...   

                        Date                  Media Path  
0  2025-01-20 18:38:37+00:00                         NaN  
1  2025-01-20 18:38:37+00:00  photos\@AwasMart_10203.jpg  
2  2025-01-20 12:15:15+00:00                         NaN  
4  2025-01-20 12:15:15+00:00  photos\@AwasMart_10200.jpg  
9  2025-01-20 08:09:44+00:00  photos\@AwasMart_10195.jpg  


In [11]:
df.to_csv('../Data/clean_data_telegram.csv')

In [12]:
def label_message_utf8_with_birr(message):
    # Split the message at the first occurrence of '\n'
    if '\n' in message:
        first_line, remaining_message = message.split('\n', 1)
    else:
        first_line, remaining_message = message, ""
    
    labeled_tokens = []
    
    # Tokenize the first line
    first_line_tokens = re.findall(r'\S+', first_line)
    
    # Label the first token as B-PRODUCT and the rest as I-PRODUCT
    if first_line_tokens:
        labeled_tokens.append(f"{first_line_tokens[0]} B-PRODUCT")  # First token as B-PRODUCT
        for token in first_line_tokens[1:]:
            labeled_tokens.append(f"{token} I-PRODUCT")  # Remaining tokens as I-PRODUCT
    
    # Process the remaining message normally
    if remaining_message:
        lines = remaining_message.split('\n')
        for line in lines:
            tokens = re.findall(r'\S+', line)  # Tokenize each line while considering non-ASCII characters
            
            for token in tokens:
                # Check if token is a price (e.g., 500 ETB, $100, or ብር)
                if re.match(r'^\d{10,}$', token):
                    labeled_tokens.append(f"{token} O")  # Label as O for "other" or outside of any entity
                elif re.match(r'^\d+(\.\d{1,2})?$', token) or 'ETB' in token or 'ዋጋ' in token or '$' in token or 'ብር' in token:
                    labeled_tokens.append(f"{token} I-PRICE")
                # Check if token could be a location (e.g., cities or general location names)
                elif any(loc in token for loc in ['Addis Ababa', 'ለቡ', 'ለቡ መዳህኒዓለም', 'መገናኛ', 'ቦሌ', 'ሜክሲኮ']):
                    labeled_tokens.append(f"{token} I-LOC")
                # Assume other tokens are part of a product name or general text
                else:
                    labeled_tokens.append(f"{token} O")
    
    return "\n".join(labeled_tokens)

# Apply the updated function to the non-null messages
df['Labeled_Message'] = df['Message'].apply(label_message_utf8_with_birr)

# Display the updated DataFrame
df.head()


Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,Labeled_Message
0,AwasMart-አዋስማርት🎁,@AwasMart,10204,"100 Pieces Disposable Food Cover, Plastic Wrap...",2025-01-20 18:38:37+00:00,,100 B-PRODUCT\nPieces I-PRODUCT\nDisposable I-...
1,AwasMart-አዋስማርት🎁,@AwasMart,10203,"100 Pieces Disposable Food Cover, Plastic Wra...",2025-01-20 18:38:37+00:00,photos\@AwasMart_10203.jpg,100 B-PRODUCT\nPieces I-PRODUCT\nDisposable I-...
2,AwasMart-አዋስማርት🎁,@AwasMart,10202,\n Rechargeable Flawless Brows,2025-01-20 12:15:15+00:00,,Rechargeable O\nFlawless O\nBrows O
4,AwasMart-አዋስማርት🎁,@AwasMart,10200,\n Rechargeable Flawless Brows\n High Qua...,2025-01-20 12:15:15+00:00,photos\@AwasMart_10200.jpg,Rechargeable O\nFlawless O\nBrows O\nHigh O\nQ...
9,AwasMart-አዋስማርት🎁,@AwasMart,10195,Bottle and nipple brush\n\n ዋጋ፦ 350 ብር\n\...,2025-01-20 08:09:44+00:00,photos\@AwasMart_10195.jpg,Bottle B-PRODUCT\nand I-PRODUCT\nnipple I-PROD...


In [13]:
labeled_data_birr_path = '../Data/@AwasMart_labeled_telegram_product_price_location.txt'
with open(labeled_data_birr_path, 'w', encoding='utf-8') as f:
    for index, row in df.iterrows():
        f.write(f"{row['Labeled_Message']}\n\n")