# Task 1: Data Ingestion and Preprocessing
Ethiopian E-commerce Telegram Channel Data Collection and Processing

In [1]:
# Setup and imports
import asyncio
import os
import sys

import pandas as pd

sys.path.append('../src')

%reload_ext autoreload
%autoreload 2
from data_ingestion.telegram_scraper import TelegramScraper
from preprocessing.text_preprocessor import AmharicTextPreprocessor

In [None]:
# Initialize Telegram Scraper
try:
    scraper = TelegramScraper('../config/config.yaml')
    await scraper.initialize_client()
    print("Telegram client initialized")
except Exception as e:
    print(f"Error initializing scraper: {e}")
    print("Using demo data instead...")
    scraper = None

In [12]:
# Scrape data from all channels (2000 messages per channel)
if scraper:
    df = await scraper.scrape_all_channels(limit_per_channel=2000)
    print(f"Total messages scraped: {len(df)}")
    print(f"Channels: {df['channel'].unique()}")
else:
    # Create demo data
    import pandas as pd
    df = pd.DataFrame({
        'message_id': [1, 2, 3],
        'text': ['ሸሚዝ 500 ብር አዲስ አበባ', 'ጫማ 800 ብር ባህር ዳር', 'ቦርሳ 300 ብር ሐዋሳ'],
        'channel': ['demo_channel'] * 3,
        'date': pd.to_datetime(['2024-01-01', '2024-01-02', '2024-01-03'])
    })
    print(f"Using demo data: {len(df)} messages")

[32m2025-08-21 18:11:02.058[0m | [1mINFO    [0m | [36mdata_ingestion.telegram_scraper[0m:[36mscrape_channel[0m:[36m51[0m - [1mScraping channel: @ZemenExpress[0m
[32m2025-08-21 18:11:13.869[0m | [1mINFO    [0m | [36mdata_ingestion.telegram_scraper[0m:[36mscrape_channel[0m:[36m70[0m - [1mScraped 985 messages from @ZemenExpress[0m
[32m2025-08-21 18:11:14.002[0m | [1mINFO    [0m | [36mdata_ingestion.telegram_scraper[0m:[36mscrape_channel[0m:[36m51[0m - [1mScraping channel: @sinayelj[0m
[32m2025-08-21 18:11:21.689[0m | [1mINFO    [0m | [36mdata_ingestion.telegram_scraper[0m:[36mscrape_channel[0m:[36m70[0m - [1mScraped 522 messages from @sinayelj[0m
[32m2025-08-21 18:11:21.809[0m | [1mINFO    [0m | [36mdata_ingestion.telegram_scraper[0m:[36mscrape_channel[0m:[36m51[0m - [1mScraping channel: @Shewabrand[0m
[32m2025-08-21 18:11:35.871[0m | [1mINFO    [0m | [36mdata_ingestion.telegram_scraper[0m:[36mscrape_channel[0m:[36m70[0

Total messages scraped: 5800
Channels: ['@ZemenExpress' '@sinayelj' '@Shewabrand' '@lobelia4cosmetics'
 '@yetenaweg' '@EthiopianAirlines']


In [13]:
# Save raw data
if scraper:
    raw_data_path = await scraper.save_raw_data(df)
    await scraper.close()
    print(f"Raw data saved to: {raw_data_path}")
else:
    # Save demo data
    import os
    os.makedirs('../data/raw', exist_ok=True)
    raw_data_path = '../data/raw/demo_telegram_data.csv'
    df.to_csv(raw_data_path, index=False, encoding='utf-8')
    print(f"Demo data saved to: {raw_data_path}")

[32m2025-08-21 18:11:53.285[0m | [1mINFO    [0m | [36mdata_ingestion.telegram_scraper[0m:[36msave_raw_data[0m:[36m107[0m - [1mData saved to ../data/raw\telegram_data_20250821_181153.csv[0m


Raw data saved to: ../data/raw\telegram_data_20250821_181153.csv


In [5]:
# Initialize text preprocessor
# One AmharicTextPreprocessor instance is created here and reused by the
# cleaning/tokenization cell further down.
preprocessor = AmharicTextPreprocessor()
print("Text preprocessor initialized")

Text preprocessor initialized


In [9]:
# Load the raw data written by the save step above.
# Reading from `raw_data_path` (instead of a hard-coded, timestamped file name)
# keeps this cell pointing at the file produced by the current run, including
# the demo-data fallback, so Restart & Run All never picks up a stale CSV.
df = pd.read_csv(raw_data_path, encoding='utf-8')

In [3]:
# Display the frame as loaded from the raw CSV (bare last expression -> rich display)
df

Unnamed: 0,id,channel,text,date,views,forwards,replies,sender_id,has_media,media_type,message_link
0,7164,@ZemenExpress,💥💥...................................💥💥\n\n📌 S...,2025-08-18T11:26:35+00:00,2557.0,4.0,0,-1001307493052,True,photo,https://t.me/ZemenExpress/7164
1,7162,@ZemenExpress,💥💥...................................💥💥\n\n📌Cr...,2025-08-14T09:23:20+00:00,5049.0,9.0,0,-1001307493052,True,photo,https://t.me/ZemenExpress/7162
2,7161,@ZemenExpress,💥💥...................................💥💥\n\n📌Cr...,2025-08-14T09:23:11+00:00,3738.0,2.0,0,-1001307493052,True,document,https://t.me/ZemenExpress/7161
3,7158,@ZemenExpress,💥💥...................................💥💥\n\n📌Ba...,2025-08-11T08:07:28+00:00,4733.0,9.0,0,-1001307493052,True,photo,https://t.me/ZemenExpress/7158
4,7157,@ZemenExpress,💥💥...................................💥💥\n\n📍 N...,2025-08-07T08:15:17+00:00,7026.0,4.0,0,-1001307493052,True,photo,https://t.me/ZemenExpress/7157
...,...,...,...,...,...,...,...,...,...,...,...
5795,8,@yetenaweg,አዘጋጆች \n\n ዶ/ር ኤርምያስ ካቻ \n በውስጥ ደዌ ህክምና ስፔሺያለስ...,2020-02-17T21:45:20+00:00,439.0,1.0,0,-1001447066276,False,,https://t.me/yetenaweg/8
5796,6,@yetenaweg,በየሁለት ሳምንቱ እየተዘጋጀ የሚቅርብላችሁ በጤና ላይ የሚያተኩር ፖድካስት...,2020-02-17T21:42:39+00:00,418.0,0.0,0,-1001447066276,False,,https://t.me/yetenaweg/6
5797,5,@yetenaweg,ይህ አዲሱ የኮሮና ቫይረስ በማይክሮስኮፕ ስር ሲታይ ያለው ምስል ነው። ኮ...,2020-02-17T20:58:59+00:00,448.0,2.0,0,-1001447066276,True,photo,https://t.me/yetenaweg/5
5798,4,@yetenaweg,አዲሱ የኮሮና ቫይረስ (በአዲስ የሳይንስ ስሙ COVID-19) ፣\nከየት ...,2020-02-17T20:55:46+00:00,1335.0,2.0,0,-1001447066276,True,,https://t.me/yetenaweg/4


In [10]:
def _clean_and_tokenize(raw_text):
    """Run the preprocessor on one message and wrap its result in a Series.

    Returning a Series makes ``Series.apply`` expand the result into one
    column per element instead of a single column of objects.
    """
    return pd.Series(preprocessor.custom_preprocess_amharic_text(raw_text))


# The first expanded column replaces `text`, the second becomes `tokens`
# (presumably a (cleaned text, tokens) pair -- confirm against
# AmharicTextPreprocessor.custom_preprocess_amharic_text).
# NOTE: `text` is overwritten in place, so re-running this cell feeds the
# already-processed text back through the preprocessor.
processed_columns = df['text'].apply(_clean_and_tokenize)
df[['text', 'tokens']] = processed_columns

In [13]:
# Display the frame again to inspect the rewritten `text` and the new `tokens` column
df

Unnamed: 0,id,channel,text,date,views,forwards,replies,sender_id,has_media,media_type,message_link,tokens
0,7164,@ZemenExpress,4 1 304 500 ዋጋ፦ ብር ውስን ፍሬ ነው ያለው መገናኛ መሰረት ደፋር...,2025-08-18T11:26:35+00:00,2557.0,4.0,0,-1001307493052,True,photo,https://t.me/ZemenExpress/7164,4 1 304 500 ዋጋ፦ ብር ውስን ፍሬ ነው ያለው መገናኛ መሰረት ደፋር...
1,7162,@ZemenExpress,6 የጫማ ማስቀመጫ ባለ ስድስት ደረጃ ቦታ ቆጣቢ ሲዘረጋ 27 27 86 ስ...,2025-08-14T09:23:20+00:00,5049.0,9.0,0,-1001307493052,True,photo,https://t.me/ZemenExpress/7162,6 የጫማ ማስቀመጫ ባለ ስድስት ደረጃ ቦታ ቆጣቢ ሲዘረጋ 27 27 86 ስ...
2,7161,@ZemenExpress,6 የጫማ ማስቀመጫ ባለ ስድስት ደረጃ ቦታ ቆጣቢ ሲዘረጋ 27 27 86 ስ...,2025-08-14T09:23:11+00:00,3738.0,2.0,0,-1001307493052,True,document,https://t.me/ZemenExpress/7161,6 የጫማ ማስቀመጫ ባለ ስድስት ደረጃ ቦታ ቆጣቢ ሲዘረጋ 27 27 86 ስ...
3,7158,@ZemenExpress,ሲሊከን የልጆች ጫማ ማለማመጃ ልጆች በቀላሉ የማያወልቁት ዋጋ፦ 600 ብር...,2025-08-11T08:07:28+00:00,4733.0,9.0,0,-1001307493052,True,photo,https://t.me/ZemenExpress/7158,ሲሊከን የልጆች ጫማ ማለማመጃ ልጆች በቀላሉ የማያወልቁት ዋጋ፦ 600 ብር...
4,7157,@ZemenExpress,ዋጋ፦ እንድ ጥቅል 200 ብር ውስን ፍሬ ነው ያለው አድራሻ መገናኛ መሰረ...,2025-08-07T08:15:17+00:00,7026.0,4.0,0,-1001307493052,True,photo,https://t.me/ZemenExpress/7157,ዋጋ፦ እንድ ጥቅል 200 ብር ውስን ፍሬ ነው ያለው አድራሻ መገናኛ መሰረ...
...,...,...,...,...,...,...,...,...,...,...,...,...
5795,8,@yetenaweg,አዘጋጆች ዶ ር ኤርምያስ ካቻ በውስጥ ደዌ ህክምና ስፔሺያለስት ፣ በአሁን...,2020-02-17T21:45:20+00:00,439.0,1.0,0,-1001447066276,False,,https://t.me/yetenaweg/8,አዘጋጆች ዶ ር ኤርምያስ ካቻ በውስጥ ደዌ ህክምና ስፔሺያለስት ፣ በአሁን...
5796,6,@yetenaweg,በየሁለት ሳምንቱ እየተዘጋጀ የሚቅርብላችሁ በጤና ላይ የሚያተኩር ፖድካስት...,2020-02-17T21:42:39+00:00,418.0,0.0,0,-1001447066276,False,,https://t.me/yetenaweg/6,በየሁለት ሳምንቱ እየተዘጋጀ የሚቅርብላችሁ በጤና ላይ የሚያተኩር ፖድካስት...
5797,5,@yetenaweg,ይህ አዲሱ የኮሮና ቫይረስ በማይክሮስኮፕ ስር ሲታይ ያለው ምስል ነው። ኮ...,2020-02-17T20:58:59+00:00,448.0,2.0,0,-1001447066276,True,photo,https://t.me/yetenaweg/5,ይህ አዲሱ የኮሮና ቫይረስ በማይክሮስኮፕ ስር ሲታይ ያለው ምስል ነው። ኮ...
5798,4,@yetenaweg,አዲሱ የኮሮና ቫይረስ በአዲስ የሳይንስ ስሙ 19 ፣ ከየት መጣ ምን ምልክ...,2020-02-17T20:55:46+00:00,1335.0,2.0,0,-1001447066276,True,,https://t.me/yetenaweg/4,አዲሱ የኮሮና ቫይረስ በአዲስ የሳይንስ ስሙ 19 ፣ ከየት መጣ ምን ምልክ...


In [16]:
# Preprocess the scraped data
# processed_df = preprocessor.preprocess_dataframe(df)
# print(f"Processed messages: {len(processed_df)}")
# print(f"Messages with Amharic: {processed_df['has_amharic'].sum()}")
# print(f"Messages with price hints: {(processed_df['price_hints'].apply(len) > 0).sum()}")

In [18]:
# Display sample processed data
# print("Sample processed data:")
# sample = df[['text', 'token_count', 'price_hints', 'location_hints']].head(3)
# for idx, row in sample.iterrows():
#     print(f"Text: {row['cleaned_text'][:80]}...")
#     print(f"Tokens: {row['token_count']}")
#     print(f"Price hints: {row['price_hints']}")
#     print(f"Location hints: {row['location_hints']}")
#     print("-" * 50)

In [19]:
# Save processed data
processed_path = "../data/processed/processed_telegram_data.csv"
# Create the output directory first: to_csv does not create missing parent
# directories, and the raw-data save step does the same for ../data/raw.
os.makedirs(os.path.dirname(processed_path), exist_ok=True)
df.to_csv(processed_path, index=False, encoding='utf-8')
print(f"Processed data saved to: {processed_path}")

Processed data saved to: ../data/processed/processed_telegram_data.csv


In [21]:
# Data quality summary
# No rows are dropped in the cells above, so comparing len(df) with itself
# would always report 100% retention. Instead, a message counts as "processed"
# when it still has non-empty text after cleaning.
total_raw = len(df)
has_text = df['text'].fillna('').astype(str).str.strip().ne('')
total_processed = int(has_text.sum())
# Guard the ratio so an empty frame reports 0.0% instead of raising ZeroDivisionError.
retention_rate = total_processed / total_raw * 100 if total_raw else 0.0

print("Data Quality Summary:")
print(f"Total raw messages: {total_raw}")
print(f"Total processed messages: {total_processed}")
print(f"Retention rate: {retention_rate:.1f}%")
# print(f"Average tokens per message: {df['token_count'].mean():.1f}")
print(f"Channels covered: {df['channel'].nunique()}")

Data Quality Summary:
Total raw messages: 5800
Total processed messages: 5800
Retention rate: 100.0%
Channels covered: 6
