In [1]:
import os
import json
import asyncio
import nest_asyncio
import warnings
from telethon.sync import TelegramClient
from telethon.tl.types import MessageMediaPhoto, MessageMediaDocument
from datetime import datetime
import logging
from dotenv import load_dotenv  # Make sure to include this import
import time

# Silence every Python warning for the rest of the process. This is a blanket
# filter, so deprecation notices from dependencies are hidden as well.
warnings.filterwarnings("ignore")

# Patch asyncio so that asyncio.run() can be called while an event loop is
# already running (presumably needed because this runs inside a notebook).
nest_asyncio.apply()

# Load variables from a .env file into the process environment. This must
# happen before the os.getenv() calls below.
load_dotenv()

# Root logger: INFO and above, formatted as "timestamp - LEVEL - message".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Telegram API credentials, read as strings. Either is None when the variable
# is unset; initialize_telegram_client() validates them and converts the ID to int.
TELEGRAM_API_ID = os.getenv("TELEGRAM_API_ID")
TELEGRAM_API_HASH = os.getenv("TELEGRAM_API_HASH")

# Telegram channels to scrape, processed one after another in this order.
TELEGRAM_CHANNELS = [
    'https://t.me/CheMed123',
    'https://t.me/lobelia4cosmetics',
    'https://t.me/tikvahpharma',
]

# Output roots, resolved against the current working directory at import time:
# one JSON file per message goes under RAW_DATA_PATH, downloaded images under
# IMAGE_DATA_PATH.
RAW_DATA_PATH = os.path.join(os.getcwd(), 'data', 'raw', 'telegram_messages')
IMAGE_DATA_PATH = os.path.join(os.getcwd(), 'data', 'raw', 'telegram_images')

# Create the output roots up front; exist_ok=True makes re-runs harmless.
os.makedirs(RAW_DATA_PATH, exist_ok=True)
os.makedirs(IMAGE_DATA_PATH, exist_ok=True)

# Maximum number of messages fetched per channel (passed as `limit` to
# client.iter_messages in scrape_channel).
MESSAGE_LIMIT = 600  # Adjust this limit as needed

async def initialize_telegram_client():
    """Create, connect and authorize the Telegram client.

    Reads the module-level ``TELEGRAM_API_ID`` and ``TELEGRAM_API_HASH``.
    If the stored session is not yet authorized, ``client.start()`` is
    called, which prompts for the phone number and login code.

    Returns:
        The connected, authorized client, or ``None`` when the credentials
        are missing or invalid, or when connecting/authorizing fails. All
        failures are logged rather than raised.
    """
    if not TELEGRAM_API_ID or not TELEGRAM_API_HASH:
        logging.error("Telegram API ID or Hash not found in .env. Please set them.")
        return None

    # Validate the ID separately so a malformed value gets a clear message
    # instead of a generic int() conversion error.
    try:
        api_id = int(TELEGRAM_API_ID)
    except ValueError:
        logging.error("Error initializing Telegram client: TELEGRAM_API_ID must be an integer.")
        return None

    client = None
    try:
        client = TelegramClient('telegram_scraper_session', api_id, TELEGRAM_API_HASH)
        await client.connect()

        if not await client.is_user_authorized():
            logging.info("Authorizing Telegram client...")
            await client.start()  # Prompt for phone number and code if not authorized

        logging.info("Telegram client initialized and authorized.")
        return client
    except Exception as e:
        logging.error(f"Error initializing Telegram client: {e}")
        # The caller only ever receives None on failure, so this is the last
        # chance to release the connection and the session file.
        if client is not None:
            try:
                await client.disconnect()
            except Exception as disconnect_error:
                logging.warning(f"Error disconnecting Telegram client after failed initialization: {disconnect_error}")
        return None

async def _download_image(client, message, target_path, channel_name, label):
    """Download the media of ``message`` to ``target_path``.

    Returns the path the file was actually saved to, or ``None`` when the
    download raised or produced no file. Failures are logged as warnings so
    that one bad image never stops the scrape.
    """
    try:
        saved_path = await client.download_media(message, file=target_path)
    except Exception as e:
        logging.warning(f"Could not download {label} for message {message.id} in {channel_name}: {e}")
        return None

    if saved_path is None:
        # download_media returns None when there was nothing to download.
        logging.warning(f"No file was saved for {label} of message {message.id} in {channel_name}")
        return None
    return saved_path


async def scrape_channel(client, channel_url):
    """Scrape up to ``MESSAGE_LIMIT`` messages (and their images) from one channel.

    Each message is written as ``<message id>.json`` under
    ``RAW_DATA_PATH/<YYYY-MM-DD>/<channel name>/``. Photos and image
    documents are downloaded to
    ``IMAGE_DATA_PATH/<YYYY-MM-DD>/<channel name>/``.

    Args:
        client: Connected, authorized Telegram client.
        channel_url: URL (or other identifier accepted by
            ``client.get_entity``) of the channel to scrape.

    Returns:
        None. Any error that aborts the channel is logged, not raised, so
        the caller can continue with the next channel.
    """
    logging.info(f"Starting scraping for channel: {channel_url}")
    try:
        entity = await client.get_entity(channel_url)
        channel_name = entity.username or entity.title.replace(" ", "_").replace("/", "_")

        # Compute the date folder once so messages and images of a single run
        # cannot end up in different folders when the run crosses midnight.
        run_date = datetime.now().strftime('%Y-%m-%d')

        channel_message_path = os.path.join(RAW_DATA_PATH, run_date, channel_name)
        os.makedirs(channel_message_path, exist_ok=True)

        channel_image_path = os.path.join(IMAGE_DATA_PATH, run_date, channel_name)
        os.makedirs(channel_image_path, exist_ok=True)

        message_count = 0
        image_count = 0

        async for message in client.iter_messages(entity, limit=MESSAGE_LIMIT):
            media = message.media
            message_dict = {
                'id': message.id,
                'date': message.date.isoformat() if message.date else None,
                'message': message.message,
                'views': message.views,
                'channel_id': entity.id,
                'channel_name': channel_name,
                'has_media': media is not None,
                'media_type': None,
                'file_name': None,
                'file_path': None
            }

            if media:
                # A document media object may lack its document or MIME type;
                # treat that as "not an image" instead of crashing the channel.
                document = getattr(media, 'document', None)
                mime_type = getattr(document, 'mime_type', None) or ''

                requested_path = None
                label = None
                if isinstance(media, MessageMediaPhoto):
                    message_dict['media_type'] = 'photo'
                    label = 'photo'
                    requested_path = os.path.join(channel_image_path, f"message_{message.id}_photo.jpg")
                elif isinstance(media, MessageMediaDocument) and mime_type.startswith('image/'):
                    message_dict['media_type'] = 'document_image'
                    label = 'document image'
                    file_ext = mime_type.split('/')[-1]
                    requested_path = os.path.join(
                        channel_image_path, f"message_{message.id}_doc_image.{file_ext}"
                    )
                else:
                    message_dict['media_type'] = 'other_media'

                if requested_path is not None:
                    saved_path = await _download_image(
                        client, message, requested_path, channel_name, label
                    )
                    if saved_path is not None:
                        # Record where the file really is; the library may
                        # save to a path that differs from the requested one.
                        message_dict['file_name'] = os.path.basename(saved_path)
                        message_dict['file_path'] = saved_path
                        image_count += 1

            message_file_path = os.path.join(channel_message_path, f"{message.id}.json")
            with open(message_file_path, 'w', encoding='utf-8') as f:
                json.dump(message_dict, f, ensure_ascii=False, indent=4)
            message_count += 1

            # Rate limiting: wait between messages to avoid hitting Telegram's limits
            await asyncio.sleep(0.5)  # Adjust the sleep time as necessary

        logging.info(f"Finished scraping {message_count} messages and {image_count} images from {channel_name}")

    except Exception as e:
        logging.error(f"Error scraping channel {channel_url}: {e}")

async def main():
    """Initialize the client, scrape every configured channel, then disconnect.

    Returns early, doing nothing, when the client cannot be initialized.
    Channels in ``TELEGRAM_CHANNELS`` are scraped sequentially.
    """
    client = await initialize_telegram_client()
    if not client:
        return

    try:
        for channel_url in TELEGRAM_CHANNELS:
            await scrape_channel(client, channel_url)
    finally:
        # Disconnect even if the run is cancelled or interrupted part-way,
        # so the connection and session file are always released.
        await client.disconnect()

    logging.info("Scraping process completed.")

# Run the whole scrape to completion. asyncio.run() normally refuses to start
# inside an already-running event loop; the nest_asyncio.apply() call near the
# top of this file is what makes this call work in that situation.
asyncio.run(main())

2025-07-14 09:01:52,962 - INFO - Connecting to 149.154.167.92:443/TcpFull...
2025-07-14 09:01:53,069 - INFO - Connection to 149.154.167.92:443/TcpFull complete!
2025-07-14 09:01:53,739 - INFO - Telegram client initialized and authorized.
2025-07-14 09:01:53,740 - INFO - Starting scraping for channel: https://t.me/CheMed123
2025-07-14 09:01:54,287 - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2025-07-14 09:01:55,267 - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2025-07-14 09:01:56,708 - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2025-07-14 09:01:57,932 - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2025-07-14 09:01:59,291 - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2025-07-14 09:02:00,835 - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2025-07-14 09:02:02,481 - INFO - Starting direct file dow