In [1]:
import pandas as pd
import os
import time
import configparser
import calendar
from telethon.sync import TelegramClient
from telethon.tl.functions.messages import GetHistoryRequest
import json
import time
from datetime import datetime, timedelta
import asyncio

In [34]:
# Sanity check: derive the date window encoded in a snapshot filename.
# NOTE(review): get_date_interval_from_filename is defined in a later cell, so
# this cell fails on a fresh kernel when cells are run top to bottom.
get_date_interval_from_filename("telegram_channels_june_2022.csv")

(datetime.date(2022, 1, 1), datetime.date(2022, 6, 30))

In [15]:
def get_date_interval_from_filename(filename):
    """Return the half-year date window encoded in a snapshot filename.

    The filename is expected to end in ``_<month name>_<year>.<ext>``, e.g.
    ``telegram_channels_june_2022.csv``. Two snapshot months are supported:

    * ``june_<year>``    -> 1 January <year>  .. 30 June <year>
    * ``january_<year>`` -> 1 July <year - 1> .. 31 December <year - 1>

    Args:
        filename: Name of the snapshot file.

    Returns:
        A ``(start_date, end_date)`` tuple of ``datetime.date`` objects with
        ``start_date <= end_date``.

    Raises:
        ValueError: If the filename does not have the expected shape, the month
            name cannot be parsed, or the month is neither January nor June.
    """
    # The last two underscore-separated tokens are "<month>" and "<year>.<ext>".
    month_name, year_token = filename.split('_')[-2:]
    year = int(year_token.split('.')[0])

    # Convert month from name to number
    month = datetime.strptime(month_name, '%B').month

    if month == 6:
        # June snapshot: first half of the same year.
        start_date = datetime(year, 1, 1)
        end_date = datetime(year, 6, 30)
    elif month == 1:
        # January snapshot: second half of the *previous* year. Both bounds
        # must use year - 1, otherwise the start lands after the end.
        start_date = datetime(year - 1, 7, 1)
        end_date = datetime(year - 1, 12, 31)
    else:
        # Any other month previously produced an inverted interval
        # (1 July .. 30 June of the same year); fail loudly instead.
        raise ValueError(
            f"Unsupported snapshot month {month_name!r} in {filename!r}: "
            "expected 'january' or 'june'"
        )

    return start_date.date(), end_date.date()


def extract_top_channels(filename, top_n=20):
    """Return the ``channel_id`` values of the best-ranked channels in a CSV.

    Args:
        filename: CSV to read; it must provide ``rank`` and ``channel_id`` columns.
        top_n: How many rows with the smallest ``rank`` to keep.

    Returns:
        The ``channel_id`` column of the selected rows, as a pandas Series.
    """
    channel_table = pd.read_csv(filename)
    best_ranked = channel_table.nsmallest(n=top_n, columns='rank')
    return best_ranked['channel_id']


def extract_channel_info(filename):
    """Read the CSV at ``filename`` and return its contents as a DataFrame."""
    return pd.read_csv(filename)

def get_date_interval_from_csv(row):
    """Build a whole-years date window from a row's ``start_date``/``end_date``.

    Both fields are converted with ``int`` and treated as years: the window
    opens on 1 January of ``start_date`` and closes on 31 December of
    ``end_date``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with ``start_date``
            and ``end_date`` entries.

    Returns:
        A ``(start, end)`` tuple of ``datetime.date`` objects.
    """
    first_year, last_year = (int(row[key]) for key in ('start_date', 'end_date'))
    window_open = datetime(first_year, 1, 1).date()
    window_close = datetime(last_year, 12, 31).date()
    return window_open, window_close


async def get_channel_messages(client, channel_id, start_date, end_date, total_count_limit=0):
    """Collect a channel's messages whose date falls within [start_date, end_date].

    History is requested page by page, using the id of the last message of
    each page as the offset for the next request. The loop assumes pages are
    ordered from newest to oldest: messages newer than ``end_date`` are
    skipped, and the first message older than ``start_date`` ends the scan.

    Args:
        client: Connected Telegram client used to resolve the channel and to
            send the history requests.
        channel_id: Anything ``client.get_entity`` accepts for the channel.
        start_date: Earliest ``datetime.date`` to keep (inclusive).
        end_date: Latest ``datetime.date`` to keep (inclusive).
        total_count_limit: Stop paging once at least this many messages have
            been collected. The check runs after each full page, so the result
            may exceed the limit. ``0`` (the default) means no limit.

    Returns:
        A list of message dicts (``message.to_dict()``), in the order received.
    """
    channel = await client.get_entity(channel_id)

    limit = 100  # number of messages requested per page
    all_messages = []

    print(f"Fetching messages from {channel.title}")
    offset_id = 0
    reached_start = False
    while True:
        history = await client(GetHistoryRequest(
            peer=channel,
            limit=limit,
            offset_date=None,
            offset_id=offset_id,
            add_offset=0,
            max_id=0,
            min_id=0,
            hash=0
        ))
        messages = history.messages
        if not messages:
            break
        for message in messages:
            message_day = message.date.date()
            if message_day < start_date:  # older than the window: stop scanning
                reached_start = True
                break
            if message_day > end_date:  # newer than the window: skip it
                continue
            all_messages.append(message.to_dict())
        if reached_start:
            break
        offset_id = messages[-1].id  # continue from the last message of this page
        # Pause between requests to prevent hitting the rate limit. This must be
        # an awaited sleep: time.sleep() would block the whole event loop.
        await asyncio.sleep(1)
        total_messages = len(all_messages)
        print(f"\rFetched {total_messages} messages...", end='')
        if total_count_limit != 0 and total_messages >= total_count_limit:
            break
    print()  # Print a newline when done
    return all_messages


class DateTimeEncoder(json.JSONEncoder):
    """JSON encoder that also serialises ``datetime`` and ``bytes`` values."""

    def default(self, o):
        """Convert objects the base encoder cannot handle.

        ``bytes`` become a list of integer byte values and ``datetime``
        objects become their ISO 8601 string. Anything else is passed to the
        base class, which raises ``TypeError`` for unsupported types.
        """
        if isinstance(o, bytes):
            return [byte for byte in o]
        return o.isoformat() if isinstance(o, datetime) else super().default(o)

def save_messages_to_json(channel_name, start_date, end_date, messages):
    """Write ``messages`` to ``<channel_name>_<start_date>_<end_date>.json``.

    The data is first written to a temporary ``.tmp`` file next to the target
    and then renamed into place. The existence of the final file is used as
    the "already scraped" marker, so a dump that fails or is interrupted must
    not leave a truncated file under the final name.

    Args:
        channel_name: Prefix of the output filename.
        start_date: Start of the scraped window; used in the filename.
        end_date: End of the scraped window; used in the filename.
        messages: JSON-serialisable data (with ``DateTimeEncoder`` handling
            ``datetime`` and ``bytes`` values).

    Raises:
        Whatever ``open``/``json.dump`` raise; the temporary file is removed
        before the exception propagates.
    """
    filename = f"{channel_name}_{start_date}_{end_date}.json"
    tmp_filename = f"{filename}.tmp"
    try:
        with open(tmp_filename, 'w') as f:
            json.dump(messages, f, cls=DateTimeEncoder)
        # Rename only after the dump succeeded, so the final name never
        # refers to a partially written file.
        os.replace(tmp_filename, filename)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt); always re-raised.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
        raise
 
    
async def main(client, filename):
    """Scrape every channel listed in a CSV and save each one to a JSON file.

    For each row the date window is derived from the row itself. A channel is
    skipped when its output file ``<channel_id>_<start>_<end>.json`` already
    exists; otherwise its messages are fetched via ``channel_link`` and saved.
    The total elapsed time is printed at the end.

    Args:
        client: Telegram client passed through to ``get_channel_messages``.
        filename: CSV with ``channel_id``, ``channel_link``, ``start_date``
            and ``end_date`` columns.
    """
    started_at = time.time()
    for _, channel_row in extract_channel_info(filename).iterrows():
        channel_id, channel_link = channel_row['channel_id'], channel_row['channel_link']
        start_date, end_date = get_date_interval_from_csv(channel_row)

        # The output file doubles as the "already done" marker.
        already_saved = os.path.exists(f"{channel_id}_{start_date}_{end_date}.json")
        if already_saved:
            print(f"Messages for {channel_id} have already been scraped. Skipping...")
            continue

        fetched = await get_channel_messages(client, channel_link, start_date, end_date)
        save_messages_to_json(channel_id, start_date, end_date, fetched)
    print(f"\nTotal time elapsed: {time.time() - started_at:.2f} seconds")


In [3]:
# Reading Configs
config = configparser.ConfigParser()
config.read("config.ini")

# Setting configuration values
api_id = config['Telegram']['api_id']
api_hash = config['Telegram']['api_hash']

api_hash = str(api_hash)

phone = config['Telegram']['phone']
username = config['Telegram']['username']


client = TelegramClient(username, api_id, api_hash)
await client.start()

Please enter your phone (or bot token):  +447494238046
Please enter the code you received:  57370


Signed in successfully as Yana


<telethon.client.telegramclient.TelegramClient at 0x132a82230>

In [17]:
# Run the scraper over every channel listed in the CSV below, using the
# `client` created in an earlier cell.
# NOTE(review): top-level `await` presumably relies on the notebook/IPython
# kernel — confirm before running this as a plain script.
if __name__ == "__main__":
    filename = "telegram_channels_with_dates.csv"
    await main(client, filename)

Messages for aavst55 have already been scraped. Skipping...
Messages for Akashevarova have already been scraped. Skipping...
Messages for Aksenov82 have already been scraped. Skipping...
Messages for atc_atc have already been scraped. Skipping...
Messages for babchenko77 have already been scraped. Skipping...
Messages for bloodysx have already been scraped. Skipping...
Messages for boilerroomchannel have already been scraped. Skipping...
Messages for ctrs2018 have already been scraped. Skipping...
Messages for Davydovln have already been scraped. Skipping...
Messages for deputatdumy have already been scraped. Skipping...
Messages for dmitrynikotin have already been scraped. Skipping...
Messages for eschlmann have already been scraped. Skipping...
Messages for gayasylum have already been scraped. Skipping...
Messages for gayasylum have already been scraped. Skipping...
Messages for generalsvr have already been scraped. Skipping...
Messages for go338 have already been scraped. Skipping..

Server closed the connection: [Errno 54] Connection reset by peer
Server closed the connection: [Errno 54] Connection reset by peer
Server closed the connection: [Errno 54] Connection reset by peer
Server sent a very old message with ID 7259446485191733249, ignoring (see FAQ for details)
Server sent a very old message with ID 7259446544505940993, ignoring (see FAQ for details)
Server sent a very old message with ID 7259446545983426561, ignoring (see FAQ for details)
Server closed the connection: [Errno 54] Connection reset by peer
Server closed the connection: [Errno 54] Connection reset by peer
Server closed the connection: [Errno 54] Connection reset by peer
Server closed the connection: [Errno 54] Connection reset by peer
Server closed the connection: [Errno 54] Connection reset by peer
Server closed the connection: [Errno 54] Connection reset by peer
Server closed the connection: [Errno 54] Connection reset by peer
Server closed the connection: [Errno 54] Connection reset by peer
S

In [18]:
# Check whether the client still reports itself as connected.
client.is_connected()

True

In [19]:
client.disconnect()

<Future pending cb=[shield.<locals>._outer_done_callback() at /opt/homebrew/Cellar/python@3.10/3.10.6_2/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/tasks.py:857]>