# Building an Amharic E-commerce Data Extractor

# Task 1: Data Ingestion and Data Preprocessing

### Install Dependencies

In [1]:
!pip install telethon

Collecting telethon
  Downloading Telethon-1.40.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pyaes (from telethon)
  Downloading pyaes-1.6.1.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading Telethon-1.40.0-py3-none-any.whl (722 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m722.0/722.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyaes
  Building wheel for pyaes (setup.py) ... [?25l[?25hdone
  Created wheel for pyaes: filename=pyaes-1.6.1-py3-none-any.whl size=26347 sha256=8b9214c7ff6ca0d659616f902f1b00d7ba78dd125f31a3c435047e2489c04d9e
  Stored in directory: /root/.cache/pip/wheels/4e/52/33/010d0843550bffb6a591b11629070ae140c0ad4f53e68a3bd3
Successfully built pyaes
Installing collected packages: pyaes, telethon
Successfully installed pyaes-1.6.1 telethon-1.40.0


#### Authenticate with Telegram API

In [None]:
import asyncio
import json
import os
import re
from datetime import datetime
from getpass import getpass

from telethon import functions, types
from telethon.sync import TelegramClient
from telethon.tl.functions.messages import GetDialogsRequest
from telethon.tl.types import InputPeerEmpty

# Telegram API credentials. They are read from the environment variables
# TG_API_ID, TG_API_HASH and TG_PHONE so that no secret is stored in the
# notebook; any value that is missing is asked for interactively instead.
api_id = int(os.environ.get('TG_API_ID') or input('Telegram API ID: '))
# The hash is a secret: prompt without echoing it into the cell output
api_hash = os.environ.get('TG_API_HASH') or getpass('Telegram API hash: ')
phone = os.environ.get('TG_PHONE') or input('Phone number: ')  # phone number


#### Preprocess Text

In [None]:
def clean_amharic_text(text):
    """Normalize a message down to Ethiopic script, digits and a few symbols.

    Every character that is not in the Ethiopic block (U+1200-U+137F), an
    ASCII digit, whitespace, ``@`` or ``.`` is deleted (this includes Latin
    letters). Runs of whitespace are then collapsed to a single space and
    the result is trimmed.

    Args:
        text: Raw message text; may be ``None`` or empty.

    Returns:
        The cleaned string, or ``""`` when ``text`` is falsy.
    """
    if text:
        # Delete everything outside the allowed character set
        kept = re.sub(r'[^\u1200-\u137F0-9\s@.]', '', text)
        # Squeeze whitespace runs into one space and trim both ends
        return re.sub(r'\s+', ' ', kept).strip()
    return ""

#### Message Ingestion Script


In [None]:
async def scrape_channel(client, channel_username, output_dir, limit=1000):
    """Scrape messages of one Telegram channel into a JSONL file.

    For each message a record is built with the channel title and username,
    message id, sender id, ISO-formatted timestamp, raw text, cleaned text
    (via ``clean_amharic_text``), the whitespace-split tokens of the cleaned
    text and the path of the downloaded photo (``None`` when there is none).
    Records are written one JSON object per line to
    ``<output_dir>/<username without '@'>_data.jsonl``; photos are stored in
    ``<output_dir>/media/<username without '@'>/``.

    Args:
        client: Telegram client offering awaitable ``get_entity`` and
            ``download_media`` plus an async-iterable ``iter_messages``.
        channel_username: Channel handle, e.g. ``'@qnashcom'``.
        output_dir: Directory receiving the JSONL file and the media folder.
        limit: Maximum number of messages to request (default 1000).

    Returns:
        None. The results are written to disk and a summary line is printed.
    """
    channel_slug = channel_username.replace('@', '')
    entity = await client.get_entity(channel_username)
    channel_title = entity.title
    media_dir = os.path.join(output_dir, 'media', channel_slug)
    os.makedirs(media_dir, exist_ok=True)

    messages = []
    async for message in client.iter_messages(entity, limit=limit):
        media_path = None
        # Only attempt a download when the media really carries a photo
        if getattr(message.media, 'photo', None) is not None:
            # The file name keeps the handle exactly as given (including '@')
            filename = f"{channel_username}_{message.id}.jpg"
            # download_media reports the path it actually wrote (or None if
            # nothing was saved); record that rather than the requested path
            media_path = await client.download_media(
                message.media, os.path.join(media_dir, filename)
            )

        text_raw = message.message or ""
        text_clean = clean_amharic_text(text_raw)

        messages.append({
            "channel_title": channel_title,
            "channel_username": channel_username,
            "message_id": message.id,
            "sender_id": message.sender_id,
            "timestamp": message.date.isoformat(),
            "text_raw": text_raw,
            "text_clean": text_clean,
            "tokens": text_clean.split(),
            "media_path": media_path
        })

    # Save each message as a JSON line (ensure_ascii=False keeps Amharic readable)
    output_file = os.path.join(output_dir, f"{channel_slug}_data.jsonl")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(m, ensure_ascii=False) + '\n' for m in messages)
    print(f"✅ Scraped {len(messages)} messages from {channel_username}")


#### Output structure

In [None]:
# Telegram channels whose messages will be collected
channels = [
    '@qnashcom',
    '@MerttEka',
    '@ethio_brand_collection',
    '@Leyueqa',
    '@marakibrand',
]

# Root folder for the scraped output, placed in Colab's working directory
output_dir = '/content/telegram_output'
os.makedirs(output_dir, exist_ok=True)

async def run_scraper():
    """Log in to Telegram and scrape every channel listed in ``channels``.

    Creates a client for the ``'colab_session'`` session, signs in with the
    configured ``phone`` and calls ``scrape_channel`` for each channel in
    turn, writing into ``output_dir``. The client is always disconnected
    afterwards, including when a channel raises.
    """
    client = TelegramClient('colab_session', api_id, api_hash)
    try:
        # Start explicitly with the configured phone number. Entering the
        # client as an async context manager would call start() without
        # arguments first, prompting for the phone and leaving `phone` unused.
        await client.start(phone=phone)
        for channel in channels:
            await scrape_channel(client, channel, output_dir)
    finally:
        await client.disconnect()

# Top-level await: valid in an IPython/Jupyter cell, not in a plain Python script.
await run_scraper()


Signed in successfully as Yitbarek; remember to not break the ToS or you will risk an account ban!
✅ Scraped 1000 messages from @qnashcom
✅ Scraped 1000 messages from @MerttEka
✅ Scraped 1000 messages from @ethio_brand_collection
✅ Scraped 1000 messages from @Leyueqa
✅ Scraped 1000 messages from @marakibrand
