In [7]:
!pip install telethon nltk python-dotenv




In [15]:
import os
from getpass import getpass
from telethon import TelegramClient
import os
import csv
import re
import nltk
from nltk.tokenize import word_tokenize


In [17]:
import os
import csv
import re
import nltk
from telethon import TelegramClient, events
from telethon.tl.types import MessageMediaPhoto, MessageMediaDocument
import asyncio
import nest_asyncio

# Patch asyncio so that asyncio.run() can be called while an event loop is
# already running; without this the asyncio.run() call at the end of this cell
# would be rejected when executed from inside a running loop.
nest_asyncio.apply()

# --- NLTK Data Downloads ---
# nltk.download() reports a failed download by returning False rather than
# raising, and quiet=True suppresses its error text, so the return values must
# be checked explicitly before claiming success.
try:
    # 'punkt' is the tokenizer model used by older NLTK releases; newer releases
    # look up 'punkt_tab' instead. Fetch both so word_tokenize works either way.
    punkt_ok = nltk.download('punkt', quiet=True)
    punkt_tab_ok = nltk.download('punkt_tab', quiet=True)
    if punkt_ok or punkt_tab_ok:
        print("NLTK 'punkt' data downloaded successfully.")
    else:
        print("Error downloading NLTK data: 'punkt' / 'punkt_tab' could not be downloaded.")
except Exception as e:
    print(f"Error downloading NLTK data: {e}")

# --- Text Preprocessing Functions ---
# Matches every character that is NOT a Unicode word character, whitespace,
# an ASCII comma, or one of the listed Ethiopic punctuation marks.
_NON_TEXT_CHARS = re.compile(r'[^\w\s,፡።፣፤፥፦፧]')


def remove_emojis(text):
    """Strip emojis and other symbol characters from ``text``.

    Despite the name, this removes more than emojis: any character that is not
    a word character, whitespace, a comma, or one of the Ethiopic punctuation
    marks in the pattern is deleted (so ASCII punctuation such as ``!`` or
    ``.`` is dropped too). Whitespace is left untouched.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with the unwanted characters removed.
    """
    return _NON_TEXT_CHARS.sub('', text)

def preprocess_text(text):
    """Normalize and tokenize a message so it is ready for further analysis.

    Steps, in order:
      1. Coerce to ``str`` and trim leading/trailing whitespace.
      2. Replace each run of newlines / carriage returns with one space.
      3. Remove emojis and symbols via ``remove_emojis``.
      4. Drop anything that is not an Ethiopic-block character, a word
         character, whitespace, or one of the listed Ethiopic punctuation
         marks (this pass also removes the commas that step 3 kept).
      5. Collapse consecutive whitespace into a single space.
      6. Tokenize with ``nltk.word_tokenize`` and re-join with single spaces.

    Args:
        text: The raw message text. Any falsy value (``None``, ``""``, ...)
            short-circuits to an empty string.

    Returns:
        The cleaned text as a single space-separated string of tokens.
    """
    if not text:
        return ""

    # Steps 1-2: string coercion, trimming, and newline flattening.
    cleaned = re.sub(r'[\n\r]+', ' ', str(text).strip())

    # Step 3: emoji / symbol removal.
    cleaned = remove_emojis(cleaned)

    # Step 4: whitelist of Ethiopic ranges, word characters, whitespace and
    # Ethiopic punctuation.
    cleaned = re.sub(r'[^\u1200-\u137F\u1380-\u1399\u2D80-\u2DDF\uAB00-\uAB2F\w\s፡።፣፤፥፦፧]', '', cleaned)

    # Step 5: whitespace normalization.
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # Step 6: tokenize, then hand back a flat string rather than a list.
    return ' '.join(nltk.word_tokenize(cleaned))

# --- Sample Preprocessing Test ---
# Quick smoke test of preprocess_text on an Amharic sentence that ends with
# ASCII punctuation and an emoji; both the "!" and the emoji are expected to be
# stripped from the printed output.
sample_input = "ምርት የተመዘገበ በላይ ይህ ነው! 😊"
sample_output = preprocess_text(sample_input)
print("Sample Input:", sample_input)
print("Sample Output:", sample_output)

# --- Telegram Scraper Functions ---
def _describe_media(message):
    """Return a short text description of the media attached to ``message``.

    Returns:
        ``None`` when the message carries no media; otherwise a label for
        photos and videos, the document's file name when one is available,
        or a generic fallback label that includes the message ID.
    """
    media = message.media
    if not media:
        return None

    if isinstance(media, MessageMediaPhoto):
        # Photos are only noted, not downloaded.
        return f"Photo attached (ID: {message.id})"

    if isinstance(media, MessageMediaDocument):
        # A MessageMediaDocument can wrap a video, audio, or any other file.
        # The document itself may be missing, so read its fields defensively.
        document = media.document
        mime_type = getattr(document, 'mime_type', None)
        if mime_type and mime_type.startswith('video/'):
            return f"Video attached (ID: {message.id})"

        # The file name is not a field of the document object; it is carried
        # by a filename attribute inside document.attributes.
        for attribute in getattr(document, 'attributes', None) or []:
            file_name = getattr(attribute, 'file_name', None)
            if file_name:
                return file_name
        return f"Document attached (ID: {message.id})"

    # Any other media type (polls, contacts, web pages, ...).
    return f"Unsupported Media Type: {type(media).__name__} (ID: {message.id})"


async def scrape_channel(client, channel_username, writer, media_dir, limit=300):
    """Scrape recent messages from one Telegram channel into a CSV writer.

    For every message one row is written with: channel title, channel
    username, message ID, preprocessed text (empty string when the message has
    no text), message date, and a media description (``None`` when there is no
    media).

    Args:
        client: A started Telegram client used to resolve the channel and
            iterate its messages.
        channel_username: Username of the channel to scrape (e.g. ``'@name'``).
        writer: A ``csv.writer``-like object; rows are emitted via ``writerow``.
        media_dir: Accepted for interface compatibility; not used by this
            function (no media is downloaded).
        limit: Maximum number of messages to fetch from the channel.

    Any exception is caught and printed, so a failure stops scraping of this
    channel only and never propagates to the caller.
    """
    try:
        entity = await client.get_entity(channel_username)
        print(f"Accessing channel: {entity.title} ({channel_username})")

        async for message in client.iter_messages(entity, limit=limit):
            has_text = bool(message.message)
            clean_text = preprocess_text(message.message) if has_text else ""
            media_path = _describe_media(message)

            writer.writerow([
                entity.title,
                channel_username,
                message.id,
                clean_text,
                message.date,
                media_path
            ])

            if has_text:
                print(f"Processed message ID: {message.id}")
            else:
                # Media-only (or empty) messages are still logged and recorded.
                print(f"No text content in message ID: {message.id}, media_path: {media_path}")

    except Exception as e:
        print(f"Error scraping {channel_username}: {e}")

async def main():
    """Scrape every selected channel into ``preprocessed_telegram_data.csv``.

    Ensures the media directory exists, (re)creates the output CSV with a
    header row, then scrapes the channels one after another via
    ``scrape_channel``.

    Relies on the module-level ``client`` that ``run_scraper`` assigns before
    awaiting this coroutine; it does not create a client itself.
    """
    # Telegram channel usernames to scrape, in processing order.
    channel_usernames = [
        '@ZemenExpress',
        '@modernshoppingcenter',
        '@Shewabrand',
        '@Fashiontera',
        '@marakibrand'
    ]

    # Make sure the media directory is present (no-op if it already exists).
    media_dir = 'media_selected'
    os.makedirs(media_dir, exist_ok=True)

    header = ['Channel Title', 'Channel Username', 'Message ID', 'Message', 'Date', 'Media Path']

    # newline='' lets the csv module control line endings; UTF-8 keeps the
    # non-ASCII message text intact. Mode 'w' overwrites any previous run.
    with open('preprocessed_telegram_data.csv', 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(header)

        for username in channel_usernames:
            print(f"Scraping from {username}...")
            await scrape_channel(client, username, csv_writer, media_dir)
            print(f"Finished scraping {username}")

# --- Telegram Credentials ---
# Read the credentials from the environment. Each value is None when its
# variable is unset; run_scraper() checks for that before creating a client.
api_id = os.getenv('TELEGRAM_API_ID')
api_hash = os.getenv('TELEGRAM_API_HASH')
phone_number = os.getenv('phone')

# --- Start the Client and Run Main Function ---
async def run_scraper():
    """Validate credentials, start the Telegram client, and run ``main``.

    Reads the module-level ``api_id``, ``api_hash`` and ``phone_number``.
    If any is missing, or ``api_id`` is not an integer, an error is printed
    and the coroutine returns without creating a client.

    Side effects:
        Assigns the module-level ``client`` (``main`` reads it) and uses the
        ``'scraper_session'`` session name.

    Errors raised while starting the client or scraping are caught and
    printed; the client is always disconnected afterwards if it is connected.
    """
    global client  # main() reads the client from module scope

    # Check that all necessary environment variables are set.
    if not api_id or not api_hash or not phone_number:
        print("Error: TELEGRAM_API_ID, TELEGRAM_API_HASH, or phone environment variables are not set.")
        print("Please set them in your Google Colab notebook before running the script.")
        return  # Nothing to do without credentials

    # Validate the ID up front so a malformed value produces a clear message
    # instead of an unhandled ValueError. The value itself is not echoed.
    try:
        numeric_api_id = int(api_id)
    except ValueError:
        print("Error: TELEGRAM_API_ID must be an integer.")
        return

    client = TelegramClient('scraper_session', numeric_api_id, api_hash)

    print("Starting Telegram client...")
    try:
        await client.start(phone=phone_number)
        print("Telegram client started. Running main scraping function...")
        await main()
        print("Scraping completed.")
    except Exception as e:
        print(f"An error occurred during scraping: {e}")
    finally:
        # Ensure the client disconnects even if an error occurs.
        if client.is_connected():
            print("Disconnecting Telegram client...")
            await client.disconnect()
            print("Telegram client disconnected.")
        # NOTE: a "database is locked" error usually means another client or
        # kernel still has the 'scraper_session' session file open; make sure
        # the earlier client was disconnected before re-running this cell.

# Entry point: drive the run_scraper() coroutine to completion.
# asyncio.run() is normally rejected when an event loop is already running;
# the nest_asyncio.apply() call earlier in this cell is what permits it here.
if __name__ == '__main__':
    asyncio.run(run_scraper())


NLTK 'punkt' data downloaded successfully.
Sample Input: ምርት የተመዘገበ በላይ ይህ ነው! 😊
Sample Output: ምርት የተመዘገበ በላይ ይህ ነው
Starting Telegram client...
Telegram client started. Running main scraping function...
Scraping from @ZemenExpress...
Accessing channel: Zemen Express® (@ZemenExpress)
Processed message ID: 7004
No text content in message ID: 7003, media_path: Photo attached (ID: 7003)
No text content in message ID: 7002, media_path: Photo attached (ID: 7002)
No text content in message ID: 7001, media_path: Photo attached (ID: 7001)
Processed message ID: 7000
Processed message ID: 6999
No text content in message ID: 6998, media_path: Photo attached (ID: 6998)
No text content in message ID: 6997, media_path: Photo attached (ID: 6997)
No text content in message ID: 6996, media_path: Photo attached (ID: 6996)
Processed message ID: 6995
No text content in message ID: 6994, media_path: Photo attached (ID: 6994)
No text content in message ID: 6993, media_path: Photo attached (ID: 6993)
No text