In [1]:
# Import necessary libraries
import sys
import os

sys.path.append(os.path.abspath('../scripts'))


In [None]:
# Import the necessary modules
import os
from dotenv import load_dotenv
from telethon import TelegramClient, types  # Import types for media handling
import json

# Step 1: Load environment variables from the .env file
load_dotenv()

# Step 2: Fetch API ID and API Hash from environment variables
api_id = os.getenv('TELEGRAM_API_ID')
api_hash = os.getenv('TELEGRAM_API_HASH')

# Step 3: Initialize the Telegram client (with extended timeout)
client = TelegramClient('scraper_session_alt', api_id, api_hash, timeout=60)

# Step 4: Define a function to fetch messages from a channel
async def fetch_messages(channel_link, limit=100):
    messages_data = []

    async for message in client.iter_messages(channel_link, limit=limit):
        message_dict = {
            "sender": message.sender_id,
            "timestamp": message.date.isoformat(),
            "content": message.message,
            "media": None  # Initialize with None, updated later if media exists
        }
        
        # Check if the message contains media
        if message.media:
            # Handle different media types
            if isinstance(message.media, types.MessageMediaPhoto):
                media_file = await client.download_media(message.media, file=f"./media/photo_{message.id}")
                message_dict["media"] = media_file  # Save the path to the downloaded photo
            elif isinstance(message.media, types.MessageMediaDocument):
                media_file = await client.download_media(message.media, file=f"./media/doc_{message.id}")
                message_dict["media"] = media_file  # Save the path to the downloaded document
            elif isinstance(message.media, types.MessageMediaWebPage):
                # Handle web pages or other media types differently if needed
                message_dict["media"] = f"Web page URL: {message.media.webpage.url}" if message.media.webpage else "Unknown webpage"

        messages_data.append(message_dict)

    return messages_data

# Step 5: Define a function to fetch data from multiple channels
async def fetch_all_channels(channels, limit=100):
    for channel in channels:
        print(f"Fetching data from {channel}...")
        data = await fetch_messages(channel, limit)
        file_name = f'{channel.split("/")[-1]}_messages.json'
        with open(file_name, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"Saved {len(data)} messages from {channel} to {file_name}")

# Step 6: List the Telegram channels you want to scrape
channels = [
    'https://t.me/forfreemarket'
]

# Step 7: Use await directly in Jupyter notebook
async def main():
    async with client:
        await fetch_all_channels(channels, limit=4000)

# Instead of client.loop.run_until_complete(main()), just use await main()
await main()


In [2]:
import json
import re
import pandas as pd

# Function to check if a word contains Amharic characters
def contains_amharic(text):
    return bool(re.search(r'[\u1200-\u137F]', text))

# Function to preprocess the text data, keeping only Amharic words
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions and hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    
    # Tokenize the text (split into words)
    tokens = text.split()
    
    # Keep only tokens that contain Amharic characters
    amharic_tokens = [word for word in tokens if contains_amharic(word)]
    
    return amharic_tokens

# Load the JSON file containing the messages
def load_data(file_name):
    with open(file_name, 'r', encoding='utf-8') as f:
        return json.load(f)

# Process the loaded data, keeping only messages with Amharic content
def process_data(data):
    processed_data = []
    for message in data:
        if 'content' in message and message['content'] and contains_amharic(message['content']):
            # Preprocess the content, keeping only Amharic words
            amharic_content = preprocess_text(message['content'])
            if amharic_content:  # If there are still Amharic words after filtering
                processed_data.append({
                    'sender': message['sender'],
                    'timestamp': message['timestamp'],
                    'content': ' '.join(amharic_content),  # Join tokens back into a string
                    'media': message.get('media', None)  # Handle case where 'media' might not exist
                })
    return processed_data

# Example usage
file_name = 'chuchushoes_messages.json'  # Replace with your actual file name
raw_data = load_data(file_name)
processed_data = process_data(raw_data)

# Convert to DataFrame for easy analysis and manipulation
df = pd.DataFrame(processed_data)
print(df.head())  # Display the first few rows of the DataFrame


          sender                  timestamp  \
0 -1001866685679  2024-08-06T10:13:04+00:00   
1 -1001866685679  2024-08-06T10:12:52+00:00   
2 -1001866685679  2024-08-06T10:12:39+00:00   
3 -1001866685679  2024-08-06T10:12:25+00:00   
4 -1001866685679  2024-08-06T10:12:19+00:00   

                                             content                  media  
0  [tacketa, size, 3940414243, 1600, free, delive...  ./media\photo_768.jpg  
1  [tacketa, size, 3940414243, 1600, free, delive...  ./media\photo_766.jpg  
2  [tacketa, size, 3940414243, 1600, free, delive...  ./media\photo_764.jpg  
3  [tacketa, size, 3940414243, 1600, free, delive...  ./media\photo_762.jpg  
4  [tacketa, size, 3940414243, 1600, free, delive...  ./media\photo_760.jpg  


In [3]:
# Save the processed data to a CSV file
df.to_csv('processed_chuchushoes_messages.csv', index=False, encoding='utf-8')


In [None]:
# Function to preprocess the text data
def preprocess_text(text):
    # Normalization: Convert to lowercase
    text = text.lower()
    
    # Remove special characters, links, and emojis
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions and hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    
    # Tokenization: Split the text into words (you can also use nltk for more advanced tokenization)
    tokens = text.split()
    
    # Further processing for Amharic (e.g., stemming or specific handling can be added here)
    # For example: handle special Amharic characters, etc.

    return tokens


In [None]:
# Install necessary libraries if needed


import os
import re
import json
import unicodedata
import pandas as pd
from dotenv import load_dotenv
from telethon import TelegramClient, types
import nltk

# Download NLTK punkt tokenizer if not already done
nltk.download('punkt')


In [2]:
# Load environment variables from the .env file
load_dotenv()

# Fetch API ID and API Hash from environment variables
api_id = os.getenv('TELEGRAM_API_ID')
api_hash = os.getenv('TELEGRAM_API_HASH')

# Initialize the Telegram client (with extended timeout)
client = TelegramClient('scraper_session_alt', api_id, api_hash, timeout=60)


In [None]:
import nltk

# Download the punkt resource if not already available
nltk.download('punkt')

def tokenize_amharic(text):
    # Tokenize text using NLTK's word_tokenize function
    tokens = nltk.word_tokenize(text)
    return token

# Tokenization function for Amharic (or any text)
def tokenize_amharic(text):
    # Basic tokenization using whitespace and punctuation
    tokens = nltk.word_tokenize(text)
    return tokens

# Normalization function for Amharic text
def normalize_amharic(text):
    # Normalize to NFC form (for consistent Unicode encoding)
    normalized_text = unicodedata.normalize('NFC', text)
    return normalized_text

In [None]:
import re
import nltk
import unicodedata

# Download the punkt resource if not already available
nltk.download('punkt')

# Clean and structure data by separating metadata and content
def clean_and_structure(data):
    structured_data = []

    for message in data:
        # Clean message content by removing URLs, special characters, and extra whitespaces
        cleaned_content = re.sub(r'http\S+', '', message['content'])  # Remove URLs
        cleaned_content = re.sub(r'\W+', ' ', cleaned_content)  # Remove special characters
        cleaned_content = cleaned_content.strip()

        # Tokenize and normalize
        tokens = tokenize_amharic(cleaned_content)
        normalized_content = normalize_amharic(' '.join(tokens))
        
        # Structure the data into a dictionary
        structured_message = {
            "sender": message["sender"],
            "timestamp": message["timestamp"],
            "tokens": tokens,
            "normalized_content": normalized_content,
            "media": message["media"]
        }
        structured_data.append(structured_message)
    
    return structured_data

# Tokenization function for Amharic (or any text)
def tokenize_amharic(text):
    # Tokenize text using NLTK's word_tokenize function
    tokens = nltk.word_tokenize(text)
    return tokens  # Fixed return value (previously was returning 'token' by mistake)

# Normalization function for Amharic text
def normalize_amharic(text):
    # Normalize to NFC form (for consistent Unicode encoding)
    normalized_text = unicodedata.normalize('NFC', text)
    return normalized_text


In [4]:
# Function to save structured data to a CSV file
def save_to_csv(data, output_file):
    # Convert the list of dictionaries to a pandas DataFrame
    df = pd.DataFrame(data)
    
    # Save the DataFrame to a CSV file
    df.to_csv(output_file, index=False, encoding='utf-8-sig')  # Use utf-8-sig to handle Amharic characters


In [5]:
# Function to fetch messages from a Telegram channel
async def fetch_messages(channel_link, limit=100):
    messages_data = []

    async for message in client.iter_messages(channel_link, limit=limit):
        message_dict = {
            "sender": message.sender_id,
            "timestamp": message.date.isoformat(),
            "content": message.message,
            "media": None  # Initialize with None, updated later if media exists
        }
        
        # Check if the message contains media
        if message.media:
            # Handle different media types
            if isinstance(message.media, types.MessageMediaPhoto):
                media_file = await client.download_media(message.media, file=f"./media/photo_{message.id}")
                message_dict["media"] = media_file  # Save the path to the downloaded photo
            elif isinstance(message.media, types.MessageMediaDocument):
                media_file = await client.download_media(message.media, file=f"./media/doc_{message.id}")
                message_dict["media"] = media_file  # Save the path to the downloaded document
            elif isinstance(message.media, types.MessageMediaWebPage):
                # Handle web pages or other media types differently if needed
                message_dict["media"] = f"Web page URL: {message.media.webpage.url}" if message.media.webpage else "Unknown webpage"

        messages_data.append(message_dict)

    return messages_data


In [6]:
# Function to fetch messages from multiple channels, preprocess them, and save to CSV
async def fetch_all_channels(channels, limit=100):
    for channel in channels:
        print(f"Fetching data from {channel}...")
        raw_data = await fetch_messages(channel, limit)
        structured_data = clean_and_structure(raw_data)
        
        # Save the processed data to CSV
        output_file = f'{channel.split("/")[-1]}_messages.csv'
        save_to_csv(structured_data, output_file)
        
        print(f"Saved {len(structured_data)} messages from {channel} to {output_file}")
        


In [None]:
# List the Telegram channels you want to scrape
channels = [
    'https://t.me/chuchushoes'
]

# Main function to run the Telegram scraper and save output as CSV
async def main():
    async with client:
        await fetch_all_channels(channels, limit=1000)

# Run the main function
await main()
