# Import the necessary libraries

In [1]:
import pandas as pd
import sys
import os
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import re

# Configure logging
logging.basicConfig(filename='../logs/data_processing.log',
                    level=logging.INFO,
                    format='%(asctime)s:%(levelname)s:%(message)s')
  

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../scripts')))
from load_data import Load_Data


In [3]:
df = pd.read_excel('../data/telegram_data.xlsx')



In [4]:
print("Checking for NaN values in the 'Message' column:")
nan_count = df['Message'].isnull().sum()
print(f"Number of NaN values in 'Message' column: {nan_count}")

Checking for NaN values in the 'Message' column:
Number of NaN values in 'Message' column: 1849


In [5]:
df = df.dropna(subset=['Message'])

# Print the shape of the dataset after dropping NaN values in the "Message" column
print(f"Dataset shape after dropping NaN values in 'Message' column: {df.shape}")

Dataset shape after dropping NaN values in 'Message' column: (3166, 6)


In [6]:
df.head()

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
5,Sheger online-store,@Shageronlinestore,5328.0,💥3pcs silicon brush spatulas\n\n\n \n ...,2024-09-20 11:50:02+00:00,photos/@Shageronlinestore_5328.jpg
6,Sheger online-store,@Shageronlinestore,5327.0,💥Mandoline Slicer\n\n👉 ጊዜ ቆጣቢ ስላይስ ማድረጊያ \n👉 ...,2024-09-20 08:11:40+00:00,
7,Sheger online-store,@Shageronlinestore,5326.0,💥Table Desk Edge Guard Strip\n 💯 High Qu...,2024-09-20 05:23:18+00:00,
8,Sheger online-store,@Shageronlinestore,5325.0,💥Table Desk Edge Guard Strip\n 💯 High Qu...,2024-09-20 05:21:14+00:00,photos/@Shageronlinestore_5325.jpg
10,Sheger online-store,@Shageronlinestore,5323.0,"💥Only baby 3in1 double bottle milk warmer,ster...",2024-09-19 13:54:46+00:00,photos/@Shageronlinestore_5323.jpg


In [7]:
message_df=df['Message']
message_df

5       💥3pcs silicon brush spatulas\n\n\n      \n    ...
6       💥Mandoline Slicer\n\n👉 ጊዜ ቆጣቢ ስላይስ ማድረጊያ \n👉  ...
7       💥Table Desk Edge Guard Strip\n       💯 High Qu...
8       💥Table Desk Edge Guard Strip\n       💯 High Qu...
10      💥Only baby 3in1 double bottle milk warmer,ster...
                              ...                        
5009    🎯 Kitchen Sticker\n\nለኪችንዎ ውበት እጅግ ተመራጭ \n🔰ውሀ ...
5010    🎯 3in1 One Step Hair Dryer & Styler \n\n👉 ከርል ...
5011    ✅ Home GYM - X5 slimming vibrator \n\n📢📢📢 ታላቅ ...
5012    ለጤናችን-Health & Personal Care\n\n📍FingerTip Pul...
5013    #Finger_tip_pulse_oximeter\n       #በተመጣጣኝ_ዋጋ\...
Name: Message, Length: 3166, dtype: object

In [8]:

# Example of your DataFrame
# df = pd.DataFrame({'Message': ['💥3pcs silicon brush spatulas...', '💥Mandoline Slicer...', ...]})

# Define a function to remove emojis
def remove_emojis(text):
    emoji_pattern = re.compile(
        "[" 
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251" 
        "]+", 
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

# Apply the function to the 'Message' column
df['Message'] = df['Message'].apply(remove_emojis)

# Display the updated DataFrame
print(df.head())


          Channel Title    Channel Username      ID  \
5   Sheger online-store  @Shageronlinestore  5328.0   
6   Sheger online-store  @Shageronlinestore  5327.0   
7   Sheger online-store  @Shageronlinestore  5326.0   
8   Sheger online-store  @Shageronlinestore  5325.0   
10  Sheger online-store  @Shageronlinestore  5323.0   

                                              Message  \
5   3pcs silicon brush spatulas\n\n\n      \n     ...   
6   Mandoline Slicer\n\n ጊዜ ቆጣቢ ስላይስ ማድረጊያ \n  ለእጅ...   
7   Table Desk Edge Guard Strip\n        High Qual...   
8   Table Desk Edge Guard Strip\n        High Qual...   
10  Only baby 3in1 double bottle milk warmer,steri...   

                         Date                          Media Path  
5   2024-09-20 11:50:02+00:00  photos/@Shageronlinestore_5328.jpg  
6   2024-09-20 08:11:40+00:00                                 NaN  
7   2024-09-20 05:23:18+00:00                                 NaN  
8   2024-09-20 05:21:14+00:00  photos/@Shageronlinesto

In [21]:
df = df.to_csv('../data/clean_data.csv')

# Token Labeling for Product, Price, and Location Recognition in UTF-8 Messages

In [9]:
import re

def label_message_utf8_with_birr(message):
    # Check if the message is None
    if not message:
        return ""  # Return an empty string for None or empty messages
    
    # Split the message at the first occurrence of '\n'
    if '\n' in message:
        first_line, remaining_message = message.split('\n', 1)
    else:
        first_line, remaining_message = message, ""
    
    labeled_tokens = []
    
    # Tokenize the first line
    first_line_tokens = re.findall(r'\S+', first_line)
    
    # Label the first token as B-PRODUCT and the rest as I-PRODUCT
    if first_line_tokens:
        labeled_tokens.append(f"{first_line_tokens[0]} B-PRODUCT")  # First token as B-PRODUCT
        for token in first_line_tokens[1:]:
            labeled_tokens.append(f"{token} I-PRODUCT")  # Remaining tokens as I-PRODUCT
    
    # Process the remaining message normally
    if remaining_message:
        lines = remaining_message.split('\n')
        for line in lines:
            tokens = re.findall(r'\S+', line)  # Tokenize each line
            
            for token in tokens:
                # Check if token is a price (e.g., 500 ETB, $100, or ብር)
                if re.match(r'^\d{10,}$', token):
                    labeled_tokens.append(f"{token} O")  # Label as O for "other" or outside of any entity
                elif re.match(r'^\d+(\.\d{1,2})?$', token) or 'ETB' in token or 'ዋጋ' in token or '$' in token or 'ብር' in token:
                    labeled_tokens.append(f"{token} I-PRICE")
                # Check if token could be a location (e.g., cities or general location names)
                elif any(loc in token for loc in ['Addis Ababa', 'ለቡ', 'ለቡ መዳህኒዓለም', 'መገናኛ', 'ቦሌ', 'ሜክሲኮ']):
                    labeled_tokens.append(f"{token} I-LOC")
                # Assume other tokens are part of a product name or general text
                else:
                    labeled_tokens.append(f"{token} O")
    
    return "\n".join(labeled_tokens)

# Ensure that None or NaN values in the 'Message' column are handled
df['Message'] = df['Message'].fillna("")  # Replace None/NaN with an empty string

# Apply the updated function to the non-null messages
df['Labeled_Message'] = df['Message'].apply(label_message_utf8_with_birr)

# Display the updated DataFrame
df.head()


Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,Labeled_Message
5,Sheger online-store,@Shageronlinestore,5328.0,3pcs silicon brush spatulas\n\n\n \n ...,2024-09-20 11:50:02+00:00,photos/@Shageronlinestore_5328.jpg,3pcs B-PRODUCT\nsilicon I-PRODUCT\nbrush I-PRO...
6,Sheger online-store,@Shageronlinestore,5327.0,Mandoline Slicer\n\n ጊዜ ቆጣቢ ስላይስ ማድረጊያ \n ለእጅ...,2024-09-20 08:11:40+00:00,,Mandoline B-PRODUCT\nSlicer I-PRODUCT\nጊዜ O\nቆ...
7,Sheger online-store,@Shageronlinestore,5326.0,Table Desk Edge Guard Strip\n High Qual...,2024-09-20 05:23:18+00:00,,Table B-PRODUCT\nDesk I-PRODUCT\nEdge I-PRODUC...
8,Sheger online-store,@Shageronlinestore,5325.0,Table Desk Edge Guard Strip\n High Qual...,2024-09-20 05:21:14+00:00,photos/@Shageronlinestore_5325.jpg,Table B-PRODUCT\nDesk I-PRODUCT\nEdge I-PRODUC...
10,Sheger online-store,@Shageronlinestore,5323.0,"Only baby 3in1 double bottle milk warmer,steri...",2024-09-19 13:54:46+00:00,photos/@Shageronlinestore_5323.jpg,Only B-PRODUCT\nbaby I-PRODUCT\n3in1 I-PRODUCT...


In [10]:
# Save the updated labeled dataset to a file in CoNLL format
labeled_data_birr_path = 'labeled_telegram_product_price_location.txt-'
with open(labeled_data_birr_path, 'w', encoding='utf-8') as f:
    for index, row in df.iterrows():
        f.write(f"{row['Labeled_Message']}\n\n")

In [11]:
def label_message_utf8_with_birr(message):
    tokens = re.findall(r'\S+', message)  # Tokenize while considering non-ASCII characters
    labeled_tokens = []
    
    for token in tokens:
        # Check if token is a price (e.g., 500 ETB, $100, or ብር)
        
        if re.match(r'^\d{10,}$', token):
            labeled_tokens.append(f"{token} O")  # Label as O for "other" or outside of any entity
        elif re.match(r'^\d+(\.\d{1,2})?$', token) or 'ETB' in token or '$' in token or 'ብር' in token:
            labeled_tokens.append(f"{token} I-PRICE")
        
        # Check if token could be a location (e.g., cities or general location names)
        elif any(loc in token for loc in ['Addis Ababa', 'ለቡ', 'ለቡ  መዳህኒዓለም', 'መገናኛ','ቦሌ','ሜክሲኮ']):
            labeled_tokens.append(f"{token} I-LOC")
        
        elif any(loc in token for loc in ['💥']):
            labeled_tokens.append(f"{token} B-Product")
        
        # Assume other tokens are part of a product name (this can be refined)
        else:
            labeled_tokens.append(f"{token} O")
    
    return "\n".join(labeled_tokens)

# Apply the updated function to the non-null messages
df['Labeled_Message'] = df['Message'].apply(label_message_utf8_with_birr)



In [12]:
# Save the updated labeled dataset to a file in CoNLL format
labeled_data_birr_path = 'labeled_telegram_data_price_product_location_birr.txt'
with open(labeled_data_birr_path, 'w', encoding='utf-8') as f:
    for index, row in df.iterrows():
        f.write(f"{row['Labeled_Message']}\n\n")

In [13]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0")
model = AutoModelForTokenClassification.from_pretrained("masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Example text from your dataframe
example = df['Message'][10]
ner_results = nlp(example)
print(ner_results)


  from .autonotebook import tqdm as notebook_tqdm


[{'entity': 'B-LOC', 'score': np.float32(0.92732847), 'index': 35, 'word': '▁ስ', 'start': 112, 'end': 113}, {'entity': 'B-ORG', 'score': np.float32(0.49229348), 'index': 36, 'word': 'ሪ', 'start': 113, 'end': 114}, {'entity': 'I-LOC', 'score': np.float32(0.9909759), 'index': 37, 'word': '▁', 'start': 115, 'end': 116}, {'entity': 'I-LOC', 'score': np.float32(0.98072445), 'index': 38, 'word': 'ኤም', 'start': 115, 'end': 117}, {'entity': 'I-LOC', 'score': np.float32(0.9790091), 'index': 39, 'word': '▁ሲ', 'start': 118, 'end': 119}, {'entity': 'I-LOC', 'score': np.float32(0.9285544), 'index': 40, 'word': 'ቲ', 'start': 119, 'end': 120}, {'entity': 'I-LOC', 'score': np.float32(0.9664343), 'index': 41, 'word': '▁ሞ', 'start': 121, 'end': 122}, {'entity': 'I-LOC', 'score': np.float32(0.98376197), 'index': 42, 'word': 'ል', 'start': 122, 'end': 123}, {'entity': 'I-LOC', 'score': np.float32(0.8450248), 'index': 43, 'word': '▁', 'start': 125, 'end': 126}, {'entity': 'B-LOC', 'score': np.float32(0.6465

In [14]:
df['Message'][10]


'Only baby 3in1 double bottle milk warmer,sterilizer,food steamer\n\n\xa0\xa0\xa0\xa0\xa0\xa0\xa0 ዋጋ:-3000ብር\n\nውስን ፍሬ ነው ያለው\n\n አድራሻ\xa0 ቁ.1 ስሪ ኤም ሲቲ ሞል\xa0 ሁለተኛ ፎቅ ቢሮ ቁ. SL-05A(ከ ሊፍቱ ፊት ለ ፊት)\n\nቁ.2 ለቡ\xa0 መዳህኒዓለም ቤተ/ክርስቲያን ፊት ለፊት\xa0 #ዛም_ሞል 2ኛ ፎቅ ቢሮ ቁጥር.214\n\nለቡ\xa0ቅርንጫፍ0973611819\n\n\n\n\xa0\xa0\xa0\xa0 \n\n\n\xa0\xa0\xa0  0909522840\n\xa0\xa0\xa0  0923350054\n\n\n\xa0 በTelegram ለማዘዝ  ይጠቀሙ\n@shager_onlinestore\n\xa0 \nለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን\nhttps://t.me/Shageronlinestore'

In [15]:
df.to_csv('../data/clean_data_set.csv')

In [18]:
# Function to check if a string contains Amharic characters
def is_amharic(message):
    return bool(re.search(r'[\u1200-\u137F]', message))

# Example: applying the function to a DataFrame
# Assuming 'df' is your DataFrame with a 'Message' column

# Apply the function to the 'Message' column and create a new column 'Contains_Amharic'
df['Contains_Amharic'] = df['Message'].apply(is_amharic)

# Print the first few rows to check the output
print(df[['Message', 'Contains_Amharic']].head())

                                              Message  Contains_Amharic
5   3pcs silicon brush spatulas\n\n\n      \n     ...              True
6   Mandoline Slicer\n\n ጊዜ ቆጣቢ ስላይስ ማድረጊያ \n  ለእጅ...              True
7   Table Desk Edge Guard Strip\n        High Qual...              True
8   Table Desk Edge Guard Strip\n        High Qual...              True
10  Only baby 3in1 double bottle milk warmer,steri...              True


In [16]:
# Function to classify messages
def classify_message(message):
    if pd.isna(message):  # Check for NaN or None
        return 'uncategorized'
    
    if is_amharic(message):
        for category, keywords in categories.items():
            if any(keyword in message for keyword in keywords):
                return category
    else:
        for category, keywords in categories.items():
            if any(keyword in message.lower() for keyword in keywords):
                return category
    return 'uncategorized'



In [17]:
import re

# Example of a classify_message function
def classify_message(message):
    # Define categories and classification logic
    if re.search(r'\bአማርኛ\b', message):  # Example condition for Amharic
        return 'Amharic'
    elif re.search(r'\bprice\b|\bETB\b|\bብር\b', message, re.IGNORECASE):
        return 'Price'
    elif re.search(r'\bAddis Ababa\b|ቦሌ|ሜክሲኮ', message):
        return 'Location'
    else:
        return 'Other'

# Apply the classify_message function to the Message column
df['Category'] = df['Message'].apply(classify_message)

# Display the updated DataFrame with categories
print(df[['Message', 'Category']].head())


                                              Message Category
5   3pcs silicon brush spatulas\n\n\n      \n     ...    Other
6   Mandoline Slicer\n\n ጊዜ ቆጣቢ ስላይስ ማድረጊያ \n  ለእጅ...    Price
7   Table Desk Edge Guard Strip\n        High Qual...    Price
8   Table Desk Edge Guard Strip\n        High Qual...    Price
10  Only baby 3in1 double bottle milk warmer,steri...    Other


In [18]:
# Display counts of unique values in the Category column
category_counts = df['Category'].value_counts()
print(category_counts)

Category
Price    1759
Other    1407
Name: count, dtype: int64


In [19]:
df.head(100)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,Labeled_Message,Category
5,Sheger online-store,@Shageronlinestore,5328.0,3pcs silicon brush spatulas\n\n\n \n ...,2024-09-20 11:50:02+00:00,photos/@Shageronlinestore_5328.jpg,3pcs O\nsilicon O\nbrush O\nspatulas O\nዋጋ-550...,Other
6,Sheger online-store,@Shageronlinestore,5327.0,Mandoline Slicer\n\n ጊዜ ቆጣቢ ስላይስ ማድረጊያ \n ለእጅ...,2024-09-20 08:11:40+00:00,,Mandoline O\nSlicer O\nጊዜ O\nቆጣቢ O\nስላይስ O\nማድ...,Price
7,Sheger online-store,@Shageronlinestore,5326.0,Table Desk Edge Guard Strip\n High Qual...,2024-09-20 05:23:18+00:00,,Table O\nDesk O\nEdge O\nGuard O\nStrip O\nHig...,Price
8,Sheger online-store,@Shageronlinestore,5325.0,Table Desk Edge Guard Strip\n High Qual...,2024-09-20 05:21:14+00:00,photos/@Shageronlinestore_5325.jpg,Table O\nDesk O\nEdge O\nGuard O\nStrip O\nHig...,Price
10,Sheger online-store,@Shageronlinestore,5323.0,"Only baby 3in1 double bottle milk warmer,steri...",2024-09-19 13:54:46+00:00,photos/@Shageronlinestore_5323.jpg,Only O\nbaby O\n3in1 O\ndouble O\nbottle O\nmi...,Other
...,...,...,...,...,...,...,...,...
181,Sheger online-store,@Shageronlinestore,5142.0,WINNING STAR® 2in1 MULTIFUNCTIONAL BLENDER\n\n...,2024-08-29 09:12:06+00:00,photos/@Shageronlinestore_5142.jpg,WINNING O\nSTAR® O\n2in1 O\nMULTIFUNCTIONAL O\...,Other
182,Sheger online-store,@Shageronlinestore,5141.0,3.6L Glass dispenser jar with Bamboo stand\n\n...,2024-08-29 06:02:13+00:00,,3.6L O\nGlass O\ndispenser O\njar O\nwith O\nB...,Other
183,Sheger online-store,@Shageronlinestore,5140.0,3.6L Glass dispenser jar with Bamboo stand\n\n...,2024-08-29 06:01:02+00:00,photos/@Shageronlinestore_5140.jpg,3.6L O\nGlass O\ndispenser O\njar O\nwith O\nB...,Other
184,Sheger online-store,@Shageronlinestore,5139.0,44CM HAOCHU® CERAMIC PIZZA PAN\n\nለቤትና ለሬስቶራንት...,2024-08-28 18:38:44+00:00,photos/@Shageronlinestore_5139.jpg,44CM O\nHAOCHU® O\nCERAMIC O\nPIZZA O\nPAN O\n...,Other


In [20]:
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt
# # Filter for uncategorized items
# uncategorized_items = df[df['Category'] == 'uncategorized']

# # Combine all messages into a single string
# text = ' '.join(uncategorized_items['Message'])

# # Generate the word cloud
# wordcloud = WordCloud(width=800, height=400, background_color='white', 
#                       colormap='viridis', max_words=200).generate(text)

# # Display the word cloud
# plt.figure(figsize=(10, 5))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')  # Turn off the axis
# plt.title('Word Cloud of Messages for Uncategorized Items')
# plt.show()

In [28]:
uncategorized_items = df[df['Category'] == 'uncategorized']
uncategorized_items.head(100)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,Labeled_Message,Contains_Amharic,Category


In [None]:
df = df.to_csv('../data/labeled_data.conll')