In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

import time
import warnings

warnings.filterwarnings('ignore')

In [3]:
sys.path.append(os.path.abspath('../scripts/'))

In [4]:
from data_loader import load_data, label_content

In [5]:
df = load_data('../data/telegram_data.csv')

In [6]:
df.shape

(37378, 6)

In [7]:
df.head()

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,Zemen Express®,@ZemenExpress,6982,💥💥...................................💥💥\n\n📌Im...,2025-06-18 06:01:10+00:00,
1,Zemen Express®,@ZemenExpress,6981,💥💥...................................💥💥\n\n📌 B...,2025-06-16 12:21:00+00:00,
2,Zemen Express®,@ZemenExpress,6980,,2025-06-16 05:11:57+00:00,data\photos\@ZemenExpress_6980.jpg
3,Zemen Express®,@ZemenExpress,6979,,2025-06-16 05:11:57+00:00,data\photos\@ZemenExpress_6979.jpg
4,Zemen Express®,@ZemenExpress,6978,,2025-06-16 05:11:57+00:00,data\photos\@ZemenExpress_6978.jpg


In [8]:
df.isnull().sum()

Channel Title           0
Channel Username        0
ID                      0
Message             13772
Date                    0
Media Path           8044
dtype: int64

In [9]:
df['content_type'] = df.apply(label_content, axis=1)

In [10]:
df = df[df['content_type'] != 'None']

In [11]:
df.isnull().sum()

Channel Title           0
Channel Username        0
ID                      0
Message             12724
Date                    0
Media Path           6996
content_type            0
dtype: int64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36330 entries, 0 to 37376
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Channel Title     36330 non-null  object
 1   Channel Username  36330 non-null  object
 2   ID                36330 non-null  int64 
 3   Message           23606 non-null  object
 4   Date              36330 non-null  object
 5   Media Path        29334 non-null  object
 6   content_type      36330 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.2+ MB


In [13]:
from preprocess import clean_text, remove_english_words

In [14]:
df['Message'] = df['Message'].apply(clean_text)

In [15]:
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

In [16]:
df.isnull().sum()

Channel Title           0
Channel Username        0
ID                      0
Message             12724
Date                    0
Media Path           6996
content_type            0
dtype: int64

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36330 entries, 0 to 37376
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   Channel Title     36330 non-null  object             
 1   Channel Username  36330 non-null  object             
 2   ID                36330 non-null  int64              
 3   Message           23606 non-null  object             
 4   Date              36330 non-null  datetime64[ns, UTC]
 5   Media Path        29334 non-null  object             
 6   content_type      36330 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(1), object(5)
memory usage: 2.2+ MB


In [18]:
df['Message_cleaned_amh'] = df['Message'].apply(remove_english_words)

In [19]:
df['Message_cleaned_amh']

0        💥💥...................................💥💥 📌 👍በኤሌ...
1        💥💥...................................💥💥 📌 👍በፈለ...
2                                                      NaN
3                                                      NaN
4                                                      NaN
                               ...                        
37372    🎯 ለኪችንዎ ውበት እጅግ ተመራጭ 🔰ውሀ የማያስገባ 🔰ቅባት ዘይት ነገሮች ...
37373    🎯 3in1 & 👉 ከርል ለመስራት 👉 ለማለስለስ 👉 እንዲሁም ለማድረቅ የሚ...
37374    ✅ - X5 📢📢📢 ታላቅ ቅናሽ 🎉🎉🎉 ⚡️ ⚡️ 🚩የሰዉነትዎ ውፍረት አሳስቧ...
37375    ለጤናችን- & 📍 SPO2 () 🎯 & 🎯 . 🎯 , 🎯 - 🎯 🎯 💦 ዋጋ፦ 💰...
37376    📲 0909522840 🎯 ለአጠቃቀም ምቹ 🎯 በሰዉነታችን ያለውን የኦክስጅን...
Name: Message_cleaned_amh, Length: 36330, dtype: object

In [20]:
from preprocess import remove_amharic_words

In [21]:
df['Message_cleaned_eng'] = df['Message'].apply(remove_amharic_words)

In [22]:
df['Message_cleaned_eng']

0        💥💥...................................💥💥 📌Imita...
1        💥💥...................................💥💥 📌 Baby...
2                                                      nan
3                                                      nan
4                                                      nan
                               ...                        
37372    🎯 Kitchen Sticker 🔰 🔰 🔰 ✅ ✅ 👉 Size: 60ሴ.×3ሜ ፦ ...
37373    🎯 3in1 One Step Hair Dryer & Styler 👉 👉 👉 🔰 ፦ ...
37374    ✅ Home GYM - X5 slimming vibrator 📢📢📢 🎉🎉🎉 ⚡️Sl...
37375    -Health & Personal Care 📍FingerTip Pulse Oxime...
37376                                 📲 0909522840 🎯 🎯 🚚 🚚
Name: Message_cleaned_eng, Length: 36330, dtype: object

In [23]:
df.isnull().sum()

Channel Title              0
Channel Username           0
ID                         0
Message                12724
Date                       0
Media Path              6996
content_type               0
Message_cleaned_amh    12724
Message_cleaned_eng        0
dtype: int64

In [24]:
df['Message'] = df['Message'].fillna('[መልዕክት የለም]')
df['Media Path'] = df['Media Path'].fillna('[ምስል የለም]')

In [25]:
df.isnull().sum()

Channel Title              0
Channel Username           0
ID                         0
Message                    0
Date                       0
Media Path                 0
content_type               0
Message_cleaned_amh    12724
Message_cleaned_eng        0
dtype: int64

In [26]:
df['Message_cleaned_amh'] = df['Message_cleaned_amh'].fillna('[መልዕክት የለም]')

##### Text cleaning

In [27]:
from preprocess import clean_message, extract_sender_and_msg

In [28]:
df['Message_cleaned_amh'] = df['Message_cleaned_amh'].apply(clean_message)

In [29]:

df[['sender', 'cleaned_message']] = df['Message_cleaned_amh'].apply(extract_sender_and_msg)

In [30]:
df.head()

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,content_type,Message_cleaned_amh,Message_cleaned_eng,sender,cleaned_message
0,Zemen Express®,@ZemenExpress,6982,💥💥...................................💥💥\n\n📌Im...,2025-06-18 06:01:10+00:00,[ምስል የለም],Text Only,በኤሌክትሪክ:የሚሰራ ለቤት መልካም መዓዛን የሚሰጥ ዋጋ፦ 1400 ብር ውስ...,💥💥...................................💥💥 📌Imita...,,በኤሌክትሪክ:የሚሰራ ለቤት መልካም መዓዛን የሚሰጥ ዋጋ፦ 1400 ብር ውስ...
1,Zemen Express®,@ZemenExpress,6981,💥💥...................................💥💥\n\n📌 B...,2025-06-16 12:21:00+00:00,[ምስል የለም],Text Only,በፈለጉት አቅጣጫ ልጅዎን በምቾት ማዘል ያስችልዎታል ዋጋ፦ 2400 ብር ው...,💥💥...................................💥💥 📌 Baby...,,በፈለጉት አቅጣጫ ልጅዎን በምቾት ማዘል ያስችልዎታል ዋጋ፦ 2400 ብር ው...
2,Zemen Express®,@ZemenExpress,6980,[መልዕክት የለም],2025-06-16 05:11:57+00:00,data\photos\@ZemenExpress_6980.jpg,Media Only,መልዕክት የለም,,,መልዕክት የለም
3,Zemen Express®,@ZemenExpress,6979,[መልዕክት የለም],2025-06-16 05:11:57+00:00,data\photos\@ZemenExpress_6979.jpg,Media Only,መልዕክት የለም,,,መልዕክት የለም
4,Zemen Express®,@ZemenExpress,6978,[መልዕክት የለም],2025-06-16 05:11:57+00:00,data\photos\@ZemenExpress_6978.jpg,Media Only,መልዕክት የለም,,,መልዕክት የለም


In [31]:
df.to_csv('../data/all_telegram_data.csv')

In [32]:
df_structured = df[["Channel Title", "Channel Username", "ID", "sender", "Date", "cleaned_message", "Media Path"]].copy()
df_structured.rename(columns={'cleaned_message': 'Message'}, inplace=True)

In [33]:
df_structured.rename(columns={'ID':'Message ID', 'sender':'Sender','Media Path': 'Media_Path'}, inplace=True)

In [39]:
df_structured[df_structured['Channel Username']=='@Shageronlinestore']

Unnamed: 0,Channel Title,Channel Username,Message ID,Sender,Date,Message,Media_Path
30346,Sheger online-store,@Shageronlinestore,7389,,2025-06-19 12:31:30+00:00,መልዕክት የለም,data\photos\@Shageronlinestore_7389.jpg
30347,Sheger online-store,@Shageronlinestore,7388,,2025-06-19 12:31:30+00:00,መልዕክት የለም,data\photos\@Shageronlinestore_7388.jpg
30348,Sheger online-store,@Shageronlinestore,7387,,2025-06-19 12:31:30+00:00,መልዕክት የለም,data\photos\@Shageronlinestore_7387.jpg
30349,Sheger online-store,@Shageronlinestore,7386,,2025-06-19 12:31:30+00:00,መልዕክት የለም,data\photos\@Shageronlinestore_7386.jpg
30350,Sheger online-store,@Shageronlinestore,7385,,2025-06-19 12:31:30+00:00,መልዕክት የለም,data\photos\@Shageronlinestore_7385.jpg
...,...,...,...,...,...,...,...
37372,Sheger online-store,@Shageronlinestore,12,,2021-04-27 05:58:59+00:00,ለኪችንዎ ውበት እጅግ ተመራጭ ውሀ የማያስገባ ቅባት ዘይት ነገሮች የማያበ...,data\photos\@Shageronlinestore_12.jpg
37373,Sheger online-store,@Shageronlinestore,10,,2021-04-27 05:57:12+00:00,3in1 ከርል ለመስራት ለማለስለስ እንዲሁም ለማድረቅ የሚያገለግል ለኢትዮ...,data\photos\@Shageronlinestore_10.jpg
37374,Sheger online-store,@Shageronlinestore,9,,2021-04-27 05:45:57+00:00,X5 ታላቅ ቅናሽ የሰዉነትዎ ውፍረት አሳስቧታል ሙሉ በሙሉ ቦርጭን በአጭር...,data\photos\@Shageronlinestore_9.jpg
37375,Sheger online-store,@Shageronlinestore,4,,2021-04-12 08:36:40+00:00,ለጤናችን SPO2 ዋጋ፦ 900 ብር ያሉበት ድረስ በነፃ እናደርሳለን 090...,data\photos\@Shageronlinestore_4.jpg


In [35]:
df_structured.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36330 entries, 0 to 37376
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   Channel Title     36330 non-null  object             
 1   Channel Username  36330 non-null  object             
 2   Message ID        36330 non-null  int64              
 3   Sender            0 non-null      object             
 4   Date              36330 non-null  datetime64[ns, UTC]
 5   Message           36330 non-null  object             
 6   Media_Path        36330 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(1), object(5)
memory usage: 2.2+ MB


In [40]:
df_structured.to_csv('../data/cleaned_message.csv')