 ## Data Scraping and Collection (Extract & Load)

In [1]:
!pip install telethon python-dotenv pandas




[notice] A new release of pip available: 22.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Apply nest_asyncio
import nest_asyncio
nest_asyncio.apply() 

In [3]:
#  Import required libraries
import os
import logging
import pandas as pd
import re
from dotenv import load_dotenv
from telethon.sync import TelegramClient

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [5]:
#  Load environment variables
load_dotenv()

api_id = os.getenv("TELEGRAM_API_ID")
api_hash = os.getenv("TELEGRAM_API_HASH")
phone_number = os.getenv("TELEGRAM_PHONE_NUMBER")

if api_id and api_hash and phone_number:
    print("✅ Environment variables loaded successfully!")
else:
    print("⚠️ Warning: Missing environment variables. Check your .env file.")

✅ Environment variables loaded successfully!


In [6]:


# Set up logging and storage paths
CURRENT_DIR = os.path.abspath(os.getcwd())
PROJECT_ROOT = os.path.dirname(CURRENT_DIR)

log_dir = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(log_dir, exist_ok=True)
log_storage= os.path.join("logs")

log_file = os.path.join(log_dir, "scraping.log")
logging.basicConfig(filename=log_file, level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

TEXT_DATA_DIR = os.path.join(PROJECT_ROOT, "data", "messages")
os.makedirs(TEXT_DATA_DIR, exist_ok=True)
text_data_path = os.path.join(TEXT_DATA_DIR, "scraped_messages.csv")
text_storage= os.path.join("scraped_messages.csv")

if not os.path.exists(text_data_path):
    pd.DataFrame(columns=["channel_name","channel_title", "date", "text"]).to_csv(text_data_path, index=False)

print(f"📂 Log directory: {log_storage}")
print(f"📂 Messages storage: {text_storage}")



📂 Log directory: logs
📂 Messages storage: scraped_messages.csv


In [7]:
# Initialize Telegram client
client = TelegramClient("session_name", 29168686, 'e359744dfcfc810d71dd68af37cee602')
# client = TelegramClient("session_name", api_id, api_hash)
print("🚀 Telegram client initialized!")

🚀 Telegram client initialized!


In [8]:
# Define function for scraping messages
async def scrape_messages():
    """Scrape messages from Telegram channels and save to CSV"""
    await client.start(phone_number)

    channels = [
        "https://t.me/DoctorsET",
        "https://t.me/CheMed123",
        "https://t.me/lobelia4cosmetics",
        "https://t.me/yetenaweg",
        "https://t.me/EAHCI"
    ]

    all_messages = []
    for channel in channels:
        try:
            entity = await client.get_entity(channel)
            channel_name = entity.username if entity.username else "N/A"
            channel_title = entity.title  
            messages = await client.get_messages(entity, limit=100)

            for msg in messages:
                all_messages.append({
                    "channel_name": channel_name,
                    "channel_title": channel_title,
                    "date": msg.date.strftime("%Y-%m-%d %H:%M:%S"),
                    "text": msg.text if msg.text else "[No Text]"
                })

            logging.info(f"✅ Scraped {len(messages)} messages from {channel_title} ({channel_name})")
            print(f"✅ Scraped {len(messages)} messages from {channel_title} ({channel_name})")

        except Exception as e:
            logging.error(f"Error scraping {channel}: {e}")
            print(f"❌ Error scraping {channel}: {e}")

    df = pd.DataFrame(all_messages)
    if not df.empty:
        df.to_csv(text_data_path, mode="a", header=False, index=False)
        logging.info("Messages saved successfully!")
        print("📌 Messages saved successfully!")

## Image and Scraping

In [9]:
# Define function for scraping images
async def scrape_images():
    """Scrape images from Telegram channels and save locally"""
    await client.start(phone_number)

    IMAGE_FOLDER = os.path.join(PROJECT_ROOT, "data", "images")
    os.makedirs(IMAGE_FOLDER, exist_ok=True)

    image_channels = ["https://t.me/CheMed123", "https://t.me/lobelia4cosmetics"]

    for channel in image_channels:
        try:
            entity = await client.get_entity(channel)
            messages = await client.get_messages(entity, limit=50)

            for msg in messages:
                if msg.photo:
                    sanitized_channel = re.sub(
                        r"https://t.me/|[^a-zA-Z0-9_]", "_", channel
                    )
                    formatted_date = msg.date.strftime("%Y-%m-%d_%H-%M-%S")
                    filename = os.path.join(
                        IMAGE_FOLDER, f"{sanitized_channel}_{formatted_date}.jpg"
                    )
                    storage = os.path.join(f"{sanitized_channel}_{formatted_date}.jpg")

                    await client.download_media(msg.photo, filename)
                    logging.info(f"Downloaded image from {channel} - {filename}")
                    print(f"✅ Image saved: {storage}")

        except Exception as e:
            logging.error(f"Error scraping images from {channel}: {e}")
            print(f"❌ Error scraping images from {channel}: {e}")

In [10]:
# Run the scrapers
async def main():
    """Runs both message and image scraping."""
    async with client:
        print("🚀 Starting message scraping...")
        await scrape_messages()

        print("🚀 Starting image scraping...")
        await scrape_images()

    print("🎉 Scraping process completed!")

# Run the async function properly
await main()  

🚀 Starting message scraping...
✅ Scraped 100 messages from Doctors Ethiopia (DoctorsET)
✅ Scraped 76 messages from CheMed (CheMed123)
✅ Scraped 100 messages from Lobelia pharmacy and cosmetics (lobelia4cosmetics)
✅ Scraped 100 messages from የጤና ወግ - የጤና መረጃ (yetenaweg)
✅ Scraped 100 messages from ETHIO-AMERICAN MEDICAL TRAININGS( CPD ) & HEALTH CONSULTANCY CENTER (EAHCI)
📌 Messages saved successfully!
🚀 Starting image scraping...
✅ Image saved: _CheMed123_2023-02-10_12-23-06.jpg
✅ Image saved: _CheMed123_2023-02-02_08-58-52.jpg
✅ Image saved: _CheMed123_2023-02-01_08-59-37.jpg
✅ Image saved: _CheMed123_2023-01-31_09-19-53.jpg
✅ Image saved: _CheMed123_2023-01-30_09-45-25.jpg
✅ Image saved: _CheMed123_2023-01-27_07-18-40.jpg
✅ Image saved: _CheMed123_2023-01-26_18-27-53.jpg
✅ Image saved: _CheMed123_2023-01-23_10-39-20.jpg
✅ Image saved: _CheMed123_2023-01-17_08-43-12.jpg
✅ Image saved: _CheMed123_2023-01-16_13-41-35.jpg
✅ Image saved: _CheMed123_2023-01-16_10-13-42.jpg
✅ Image saved: _