In [None]:
from telethon.sync import TelegramClient
import pandas as pd
import os
from dotenv import load_dotenv

# --- Load credentials from .env ---
# load_dotenv() makes the values from a .env file available to the
# os.getenv() lookups below.
load_dotenv()

# Fail fast with a readable message: int(None) would otherwise raise an opaque
# TypeError when API_ID is unset, and an unset API_HASH would be handed to the
# client as None.
missing_vars = [name for name in ("API_ID", "API_HASH") if not os.getenv(name)]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_vars)
        + ". Define them in .env or in the environment before running this cell."
    )

api_id = int(os.getenv("API_ID"))  # still raises ValueError if the value is not an integer
api_hash = os.getenv("API_HASH")
phone = os.getenv("PHONE")  # may be None; not referenced by the cells visible here

# --- Telegram Channels ---
# Channel usernames to scrape, in the order they are visited.
channels = [
    "ZemenExpress",
    "nevacomputer",
    "ethio_brand_collection",
    "AwasMart",
    "Shewabrand",
]

# --- Initialize client ---
# 'session_name' is the session identifier passed to Telethon
# (presumably the basename of a local session file — confirm in the Telethon docs).
client = TelegramClient('session_name', api_id, api_hash)

# Make sure the output folder exists *before* scraping, so a missing
# directory cannot throw away the fetched messages when the CSV is written.
output_csv = "../data/dataset_telegram.csv"
os.makedirs(os.path.dirname(output_csv), exist_ok=True)

# --- Connect and Fetch ---
all_data = []
with client:
    for channel in channels:
        # Request at most 200 messages per channel (limit=200).
        for message in client.iter_messages(channel, limit=200):
            # Skip messages without text (message.text is falsy for them).
            if message.text:
                all_data.append({
                    'channelName': channel,
                    'text': message.text,
                    'date': message.date,
                    'viewsCount': message.views,
                    'id': message.id
                })

# --- Save to CSV ---
# Naming the columns explicitly keeps the header row even when nothing was
# fetched, so the CSV stays readable by the cleaning cell.
df = pd.DataFrame(
    all_data,
    columns=['channelName', 'text', 'date', 'viewsCount', 'id'],
)
df.to_csv(output_csv, index=False)
# Report the path that was actually written (the old message named a different file).
print(f"✅ Data saved to {output_csv}")


In [None]:
import re
import string
import pandas as pd

df = pd.read_csv("../data/dataset_telegram.csv")  # data written by the scraping step
print("✅ Data loaded.")

# Coerce columns to the types used downstream; values that cannot be parsed
# become NaT / NaN instead of raising (errors='coerce').
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['viewsCount'] = pd.to_numeric(df['viewsCount'], errors='coerce')

# Fill missing values with '' before casting: astype(str) on a missing value
# would otherwise produce the literal string "nan" in the text columns.
df['text'] = df['text'].fillna('').astype(str)
df['channelName'] = df['channelName'].fillna('').astype(str)

def clean_text(text):
    """Reduce a raw message to its non-English word content.

    Steps, applied in order:
      1. drop every character that is not a word character, whitespace,
         '።' or '፡' (digits and underscores count as word characters);
      2. drop ASCII letters a-z / A-Z;
      3. collapse each run of whitespace into a single space;
      4. trim leading and trailing whitespace.

    Args:
        text: The raw message string.

    Returns:
        The cleaned string; may be empty.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'[^\w\s።፡]', ''),  # symbols and punctuation other than ። and ፡
        (r'[a-zA-Z]', ''),    # English (ASCII) letters
        (r'\s+', ' '),        # whitespace runs -> one space
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Run clean_text over every raw message and keep the result in a new column.
df['cleaned_text'] = df['text'].map(clean_text)

# Persist the frame, now including cleaned_text, without the index column.
df.to_csv(
    "../data/telegram_messages_cleaned.csv",
    index=False,
)
print("✅ Cleaned data saved.")

from etnltk.tokenize.am import word_tokenize
from etnltk.lang.am import normalize


# Pass each cleaned message through etnltk's normalize().
# NOTE(review): despite the column name and the log messages, only normalize()
# is applied here; word_tokenize is imported but not called in this section —
# confirm whether real tokenization was intended.
df['tokenized_text'] = df['cleaned_text'].apply(normalize)

print("✅ Tokenization completed.")

# Collapse whitespace runs to one space, trim both ends, then cast to str.
df['tokenized_text'] = (
    df['tokenized_text']
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    .astype(str)
)

# Persist the frame with the additional tokenized_text column.
df.to_csv(
    "../data/telegram_messages_tokenized.csv",
    index=False,
)
print("✅ Tokenized data saved.")



# Export one JSON object per line (lines=True) holding the channel, date,
# view count and cleaned text; cleaned_text is exposed under the key 'message'.
# force_ascii=False is passed so non-ASCII text is not ASCII-escaped.
(
    df.rename(columns={'cleaned_text': 'message'})
    .loc[:, ['channelName', 'date', 'viewsCount', 'message']]
    .to_json(
        '../data/telegram_data.json',
        orient='records',
        force_ascii=False,
        lines=True,
    )
)


✅ Data loaded.
✅ Cleaned data saved.
✅ Tokenization completed.
✅ Tokenized data saved.
