# In [1]:
import datetime
import re
from collections import defaultdict
import random

# Extract mentioned users from tweet text
def extract_mentioned_users(tweet_text):
    """Return the user names mentioned with ``@name`` in *tweet_text*.

    Names are returned without the leading ``@``, in order of appearance,
    duplicates included. An ``@`` that directly follows a word character is
    ignored so that e-mail addresses (``bob@example.com``) do not produce
    bogus mentions; the one exception is the glued retweet form ``RT@name``,
    which is still treated as a mention.
    """
    # First alternative: '@' is not preceded by a letter, digit or underscore.
    # Second alternative: '@' directly follows a standalone "RT" (any case).
    return re.findall(r'(?:(?<!\w)|(?<=\b[Rr][Tt]))@(\w+)', tweet_text)

# Extract hashtags from tweet text
def extract_hashtags(tweet_text):
    """Return the hashtags (``#topic``) found in *tweet_text*.

    Tags are returned without the leading ``#``, in order of appearance,
    duplicates included. A ``#`` that directly follows a word character or an
    ampersand is ignored, so HTML numeric entities (``&#39;``) and mid-word or
    URL-fragment hashes (``page#frag``) are not counted as topics.
    """
    return re.findall(r'(?<![\w&])#(\w+)', tweet_text)

# Read the tweet dump and aggregate per-day statistics for 1-5 July 2009.
#
# The loop consumes the file three lines at a time: a timestamp line, a user
# line and a tweet-text line, each with its first two characters dropped
# (presumably a one-letter tag plus a tab, e.g. "T\t..." -- TODO confirm
# against the data file). Blank lines between records are skipped.
#
# Results (module level):
#   mentioned_counts[date][(mentioned_user, author)] -> number of mentions
#   topic_counts[date][author][hashtag]              -> number of uses
#   unique_users[date]                               -> set of authors that day
mentioned_counts = defaultdict(lambda: defaultdict(int))
topic_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
unique_users = defaultdict(set)  # Keep track of unique users for each day

# Inclusive bounds of the analysed window, built once instead of per record.
window_start = datetime.datetime(2009, 7, 1, 0, 0, 0)
window_end = datetime.datetime(2009, 7, 5, 23, 59, 59)

with open('tweets2009-07.txt', 'r', encoding='utf-8') as file:
    file.readline()  # Skip the header line
    for line in file:
        line = line.strip()
        if not line:
            continue

        timestamp = datetime.datetime.strptime(line[2:], '%Y-%m-%d %H:%M:%S')

        # Always consume the two companion lines, even for records that are
        # filtered out below, so the reader stays aligned on record boundaries.
        user_line = file.readline().strip()
        text_line = file.readline().strip()

        # Drop out-of-window tweets before doing any parsing or regex work.
        if not window_start <= timestamp <= window_end:
            continue

        # The user name is whatever follows the last '/' on the user line
        # (presumably a profile URL -- verify against the data file).
        username = user_line[2:].split('/')[-1]
        tweet_text = text_line[2:]

        date = timestamp.date()
        unique_users[date].add(username)  # Add user to the set for the specific day

        mentioned_users = extract_mentioned_users(tweet_text)
        for mentioned_user in mentioned_users:
            mentioned_counts[date][(mentioned_user, username)] += 1

        hashtags = extract_hashtags(tweet_text)
        for hashtag in hashtags:
            topic_counts[date][username][hashtag] += 1

# Emit one "<YYYY.MM.DD>_mentioned.csv" edge list per day (author -> mentioned user).
for day, edge_weights in mentioned_counts.items():
    out_path = f'{day:%Y.%m.%d}_mentioned.csv'
    with open(out_path, 'w', encoding='utf-8') as out:
        out.write('from,to,weight\n')
        # Keys are (mentioned user, author); the CSV row puts the author first.
        out.writelines(
            f'{author},{target},{count}\n'
            for (target, author), count in edge_weights.items()
        )

# Write one "<YYYY.MM.DD>_topic_of_interest.csv" per day, listing every user who
# tweeted that day together with their most-used hashtag (blank if they used none).
#
# The loop is driven by unique_users rather than topic_counts so that a day on
# which nobody used a hashtag still gets a file with all its users listed.
for date, users in unique_users.items():
    # Use .get() instead of indexing so that reporting does not insert empty
    # entries into the topic_counts defaultdicts as a side effect.
    user_topics = topic_counts.get(date, {})
    topic_csv_file = f'{date:%Y.%m.%d}_topic_of_interest.csv'
    with open(topic_csv_file, 'w', encoding='utf-8') as file:
        file.write('user,topic_of_interest\n')
        # Sorted so the row order is the same on every run (set order is not).
        for username in sorted(users):
            topics = user_topics.get(username)
            if topics:
                max_count = max(topics.values())
                top_topics = [topic for topic, count in topics.items() if count == max_count]
                # Ties between equally frequent hashtags are broken at random.
                selected_topic = random.choice(top_topics)
                file.write(f'{username},#{selected_topic}\n')
            else:
                file.write(f'{username},\n')

