# ⏰ Automated Telegram Data Collection

## Overview
Automated scheduling system for continuous data collection:
- **Scheduled scraping** (hourly, daily, weekly)
- **Incremental updates** (only new messages)
- **Error handling** and retry logic
- **Data deduplication** and validation
- **Background execution** with logging

---

### 📚 Import Libraries

In [None]:
import os
import sys
import asyncio
import schedule
import time
import pandas as pd
from datetime import datetime, timedelta
import logging
from threading import Thread

# Add scripts to path
sys.path.append(os.path.abspath('../src/data_collection'))
from telegram_scraper import TelegramScraper

### ⚙️ Configuration

In [None]:
# Scheduling configuration
SCHEDULE_INTERVAL = "daily"  # Options: "hourly", "daily", "weekly"
SCHEDULE_TIME = "09:00"      # Time for daily/weekly runs

# Data collection settings
CHANNELS = [
    '@classybrands',
    '@Shageronlinestore',
    '@ZemenExpress',
    '@sinayelj',
    '@modernshoppingcenter',
]

OUTPUT_FILE = '../data/telegram_data.csv'
LOG_FILE = '../logs/scraper.log'
INCREMENTAL_LIMIT = 100  # New messages per run

# Setup logging
# The directory is derived from LOG_FILE (rather than repeated as a literal)
# so that editing LOG_FILE cannot leave the FileHandler pointing at a
# directory that was never created. `or '.'` covers a bare file name.
os.makedirs(os.path.dirname(LOG_FILE) or '.', exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit encoding: without it the log is written in the platform
        # default, which may not be able to represent non-ASCII text that
        # ends up in a log message.
        logging.FileHandler(LOG_FILE, encoding='utf-8'),
        logging.StreamHandler(),
    ],
)

print(f"⏰ Schedule: {SCHEDULE_INTERVAL} at {SCHEDULE_TIME}")
print(f"📡 Channels: {len(CHANNELS)}")
print(f"📝 Log file: {LOG_FILE}")

### 🔧 Scraping Functions

In [None]:
class AutomatedScraper:
    """Collect new Telegram messages and merge them into ``OUTPUT_FILE``.

    Credentials are read from the ``TG_API_ID`` and ``TG_API_HASH``
    environment variables. The ``TelegramScraper`` instance is created on the
    first scrape and reused by later scrapes.
    """

    def __init__(self):
        # os.getenv returns None for an unset variable; the values are handed
        # to TelegramScraper unchanged when the first scrape runs.
        self.api_id = os.getenv('TG_API_ID')
        self.api_hash = os.getenv('TG_API_HASH')
        self.scraper = None

    async def incremental_scrape(self):
        """Scrape messages newer than the latest stored one and save them.

        Exceptions are logged (with traceback) instead of being propagated,
        so a failed run does not break the caller that scheduled it.
        """
        try:
            logging.info("Starting incremental scrape...")

            # Create the scraper lazily and keep it for subsequent runs.
            if self.scraper is None:
                self.scraper = TelegramScraper(self.api_id, self.api_hash)

            # NOTE(review): this is a pandas Timestamp when the CSV has data
            # and a naive local datetime otherwise - confirm TelegramScraper
            # accepts both for `since_date`.
            last_timestamp = self.get_last_timestamp()

            new_data = await self.scraper.scrape_channels(
                channels=CHANNELS,
                limit=INCREMENTAL_LIMIT,
                since_date=last_timestamp
            )

            # len() rather than truthiness, so a DataFrame result does not
            # raise on the emptiness check.
            if new_data is not None and len(new_data) > 0:
                self.append_data(new_data)
                logging.info(f"Collected {len(new_data)} new messages")
            else:
                logging.info("No new messages found")

        except Exception as e:
            # logging.exception keeps the original message and adds the
            # traceback, which the plain error() call discarded.
            logging.exception(f"Scraping failed: {str(e)}")

    def get_last_timestamp(self):
        """Return the newest ``Date`` stored in ``OUTPUT_FILE``.

        Falls back to "24 hours ago" when the file is missing, empty,
        unreadable, lacks a ``Date`` column, or holds no parseable date.
        """
        fallback = datetime.now() - timedelta(days=1)
        if not os.path.exists(OUTPUT_FILE):
            return fallback

        try:
            # Only the Date column is needed, so skip parsing the rest.
            dates = pd.read_csv(OUTPUT_FILE, usecols=['Date'])['Date']
            latest = pd.to_datetime(dates).max()
        except (OSError, KeyError, ValueError, TypeError) as e:
            # pandas' EmptyDataError / ParserError are ValueError subclasses.
            logging.warning(
                f"Could not read last timestamp from {OUTPUT_FILE}: {e}"
            )
            return fallback

        # max() of an empty or all-missing column is NaT, which is not a
        # usable lower bound for the next scrape.
        return fallback if pd.isna(latest) else latest

    def append_data(self, new_data):
        """Merge ``new_data`` into ``OUTPUT_FILE``, dropping duplicates.

        ``new_data`` is anything ``pandas.DataFrame`` accepts (for example a
        list of dicts). When a message appears more than once, the most
        recently collected row is kept.
        """
        new_df = pd.DataFrame(new_data)

        # A zero-byte file cannot be parsed by read_csv; treat it as absent.
        has_existing = (
            os.path.exists(OUTPUT_FILE) and os.path.getsize(OUTPUT_FILE) > 0
        )
        if has_existing:
            existing_df = pd.read_csv(OUTPUT_FILE)
            combined_df = pd.concat([existing_df, new_df], ignore_index=True)
        else:
            combined_df = new_df

        if 'ID' in combined_df.columns:
            # Deduplicate per channel when the channel column is available:
            # keying on ID alone would discard rows from different channels
            # that happen to share the same ID.
            key_columns = [
                col for col in ('Channel Username', 'ID')
                if col in combined_df.columns
            ]
            combined_df = combined_df.drop_duplicates(
                subset=key_columns, keep='last'
            )
        else:
            logging.warning("No 'ID' column in data; skipping deduplication")

        os.makedirs(os.path.dirname(OUTPUT_FILE) or '.', exist_ok=True)
        # Write to a sibling file and swap it in, so a reader (or a crash
        # mid-write) never sees a half-written CSV.
        tmp_path = f"{OUTPUT_FILE}.tmp"
        combined_df.to_csv(tmp_path, index=False, encoding='utf-8')
        os.replace(tmp_path, OUTPUT_FILE)
        logging.info(f"Data saved to {OUTPUT_FILE}")


# Initialize scraper (shared by the scheduled job and the manual test run)
auto_scraper = AutomatedScraper()

### ⏰ Schedule Setup

In [None]:
def run_scraper():
    """Run one incremental scrape to completion from synchronous code.

    A fresh event loop is created for every call, installed as the calling
    thread's current loop, and closed again once the scrape has finished.
    Errors are logged rather than raised so the scheduler keeps running.
    """
    # NOTE(review): auto_scraper caches its TelegramScraper between calls
    # while every call here uses a new loop - confirm the scraper does not
    # bind itself to the loop it was first used on.
    try:
        loop = asyncio.new_event_loop()
        try:
            asyncio.set_event_loop(loop)
            loop.run_until_complete(auto_scraper.incremental_scrape())
        finally:
            # Close even when the scrape raises; otherwise each failed run
            # leaks an open event loop.
            loop.close()
    except Exception as e:
        logging.exception(f"Scheduler error: {str(e)}")

# Setup schedule based on configuration
# Jobs registered by this cell carry a tag so that re-running the cell
# replaces them instead of stacking a second copy (which would make every
# scheduled scrape run twice).
SCRAPER_JOB_TAG = 'telegram-scraper'
schedule.clear(SCRAPER_JOB_TAG)

if SCHEDULE_INTERVAL == "hourly":
    scraper_job = schedule.every().hour.do(run_scraper)
    print("📅 Scheduled: Every hour")
elif SCHEDULE_INTERVAL == "daily":
    scraper_job = schedule.every().day.at(SCHEDULE_TIME).do(run_scraper)
    print(f"📅 Scheduled: Daily at {SCHEDULE_TIME}")
elif SCHEDULE_INTERVAL == "weekly":
    scraper_job = schedule.every().monday.at(SCHEDULE_TIME).do(run_scraper)
    print(f"📅 Scheduled: Weekly on Monday at {SCHEDULE_TIME}")
else:
    # Fail loudly: silently registering nothing would leave the scheduler
    # running with no job and "Next run: None" as the only hint.
    raise ValueError(
        f"Unknown SCHEDULE_INTERVAL {SCHEDULE_INTERVAL!r}; "
        "expected 'hourly', 'daily' or 'weekly'"
    )

scraper_job.tag(SCRAPER_JOB_TAG)

print(f"⏰ Next run: {schedule.next_run()}")

### 🚀 Start Scheduler

In [None]:
def run_scheduler(stop_event=None, poll_seconds=60):
    """Poll the schedule and run due jobs until asked to stop.

    Args:
        stop_event: Optional object with ``is_set()`` and ``wait(timeout)``
            (for example ``threading.Event``). When given, the loop exits
            soon after the event is set. When omitted the loop never ends,
            which is the original behaviour.
        poll_seconds: Pause between checks for due jobs; defaults to one
            minute.
    """
    logging.info("Scheduler started")
    while stop_event is None or not stop_event.is_set():
        try:
            schedule.run_pending()
        except Exception:
            # One failing job must not end the polling loop (and with it
            # every future run) without a trace.
            logging.exception("Scheduled job raised; scheduler keeps running")

        if stop_event is None:
            time.sleep(poll_seconds)  # Check every minute by default
        else:
            # wait() returns as soon as the event is set, so stopping does
            # not have to sit out the full polling interval.
            stop_event.wait(poll_seconds)
    logging.info("Scheduler stopped")

# Start scheduler in background thread
# Re-running this cell must not start a second polling loop: two threads
# calling schedule.run_pending() would both fire the same due job.
_existing_thread = globals().get('scheduler_thread')
if _existing_thread is not None and _existing_thread.is_alive():
    print("ℹ️ Scheduler thread already running - not starting another")
else:
    # daemon=True so the thread does not keep the process alive on exit.
    scheduler_thread = Thread(target=run_scheduler, daemon=True)
    scheduler_thread.start()
    print("✅ Automated scraper started!")

print("📊 Monitor logs for scraping activity")
# The cell that stops the scheduler is the "Stop Scheduler" cell, not the
# one directly after this cell.
print("⏹️ Run the 'Stop Scheduler' cell to stop the scheduler")

### 🧪 Manual Test Run

In [None]:
# Test the scraper manually
# Runs a single incremental scrape immediately instead of waiting for the
# scheduled time. The top-level `await` works in a notebook cell only; it is
# a syntax error in a plain .py script.
# NOTE(review): this call runs on the notebook's own event loop, whereas
# run_scraper() creates a new loop per run, and auto_scraper reuses one
# TelegramScraper across both - confirm the scraper tolerates that.
print("🧪 Running test scrape...")
# incremental_scrape() logs its own failures instead of raising, so the
# success message below is printed even when the scrape failed.
await auto_scraper.incremental_scrape()
print("✅ Test completed - check logs for results")

### 📊 Monitor Status

In [None]:
# Check current status
def show_status():
    """Print the next scheduled run, a data summary and the log tail.

    The data summary covers the row count, the largest ``Date`` value and the
    number of distinct ``Channel Username`` values in ``OUTPUT_FILE``. A
    column that is absent is reported as ``n/a`` and an unreadable file is
    reported as a warning, so the status display itself never fails on a
    partial or malformed data file.
    """
    from collections import deque

    print("📊 Scraper Status:")
    print(f"⏰ Next scheduled run: {schedule.next_run()}")

    if os.path.exists(OUTPUT_FILE):
        try:
            df = pd.read_csv(OUTPUT_FILE)
        except (OSError, ValueError) as e:
            # pandas' EmptyDataError / ParserError are ValueError subclasses.
            print(f"⚠️ Could not read data file: {e}")
        else:
            latest = df['Date'].max() if 'Date' in df.columns else 'n/a'
            if 'Channel Username' in df.columns:
                channels = df['Channel Username'].nunique()
            else:
                channels = 'n/a'
            print(f"📈 Total messages: {len(df):,}")
            print(f"📅 Latest message: {latest}")
            print(f"📡 Active channels: {channels}")
    else:
        print("📂 No data file found yet")

    # Show recent log entries
    if os.path.exists(LOG_FILE):
        print("\n📝 Recent log entries:")
        # errors='replace' keeps the status readable even if the log was
        # written in a different encoding.
        with open(LOG_FILE, 'r', encoding='utf-8', errors='replace') as f:
            # A bounded deque keeps only the last 5 lines while streaming,
            # instead of loading the whole log into memory.
            for line in deque(f, maxlen=5):
                print(f"  {line.strip()}")

# Print the current status once when this cell is executed.
show_status()

### ⏹️ Stop Scheduler

In [None]:
# Stop the scheduler (run this cell to stop)
# schedule.clear() with no arguments removes every registered job. It does
# not end the background polling thread started earlier in this notebook:
# that thread keeps looping but finds nothing to run. Re-run the schedule
# setup cell to register the job again.
schedule.clear()
print("⏹️ Scheduler stopped")
print("📊 Final status:")
# With no jobs left, the "Next scheduled run" line shows None.
show_status()

### 🔧 Production Deployment

For production deployment, create a standalone script:

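```python
# save as: automated_scraper.py
# Copy the configuration constants and the AutomatedScraper class from this
# notebook into this file, above the code below (a script cannot import
# AutomatedScraper from itself).
import asyncio
import schedule
import time

scraper = AutomatedScraper()

def run_scraper():
    # incremental_scrape() is a coroutine, so it has to be driven by an
    # event loop rather than passed to schedule directly.
    asyncio.run(scraper.incremental_scrape())

schedule.every().day.at("09:00").do(run_scraper)

while True:
    schedule.run_pending()
    time.sleep(60)
```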

**Run with:**
```bash
# Background process
nohup python automated_scraper.py &

# Or with systemd service
sudo systemctl enable telegram-scraper
sudo systemctl start telegram-scraper
```

**Features:**
- ✅ Incremental data collection
- ✅ Automatic deduplication
- ✅ Error handling and logging
- ✅ Configurable scheduling
- ✅ Background execution