# Environment setup

In [None]:
!pip install irc

Collecting irc
  Downloading irc-20.5.0-py3-none-any.whl.metadata (10 kB)
Collecting jaraco.collections (from irc)
  Downloading jaraco.collections-5.1.0-py3-none-any.whl.metadata (3.9 kB)
Collecting jaraco.text>=3.14 (from irc)
  Downloading jaraco.text-4.0.0-py3-none-any.whl.metadata (3.7 kB)
Collecting jaraco.logging (from irc)
  Downloading jaraco.logging-3.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting jaraco.functools>=1.20 (from irc)
  Downloading jaraco.functools-4.1.0-py3-none-any.whl.metadata (2.9 kB)
Collecting jaraco.stream (from irc)
  Downloading jaraco.stream-3.0.4-py3-none-any.whl.metadata (2.2 kB)
Collecting tempora>=1.6 (from irc)
  Downloading tempora-5.8.0-py3-none-any.whl.metadata (3.3 kB)
Collecting jaraco.context>=4.1 (from jaraco.text>=3.14->irc)
  Downloading jaraco.context-6.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting autocommand (from jaraco.text>=3.14->irc)
  Downloading autocommand-2.2.2-py3-none-any.whl.metadata (15 kB)
Collecting backports.tarfil

In [1]:
import socket
import re
import requests
import time
import os
from datetime import datetime
from threading import Thread, Lock
import pandas as pd
from google.colab import drive

In [2]:
# mount to Google drive
drive.mount('/content/drive')

# output dir on google drive
OUTPUT_DIR = "/content/drive/MyDrive/twitch_chat_logs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

today = datetime.utcnow().strftime("%Y%m%d")
OUTPUT_FILE = os.path.join(OUTPUT_DIR, f"twitch_chat_log_{today}.csv")

# Twitch API settings
CLIENT_ID = "gp762nuuoqcoxypju8c569th9wz7q5"
OAUTH_TOKEN = "oauth:mb053vdgfqc0u2m7folpig78vgxdke"
USERNAME = "danieljia36"
API_URL = "https://api.twitch.tv/helix/streams"
GAMES_API_URL = "https://api.twitch.tv/helix/games"

# Twitch IRC settings
IRC_SERVER = "irc.chat.twitch.tv"
IRC_PORT = 6667

# game list
GAME_LIST = [
    "League of Legends", "Counter-Strike", "Grand Theft Auto V",
    "Assassin's Creed Shadows", "Fortnite", "VALORANT",
    "Marvel Rivals", "Apex Legends", "Rainbow Six Siege"
]

# save mappings between channels and games
joined_channels = {}
sock = None
sock_lock = Lock()

Mounted at /content/drive


# Store data into Google drive

In [6]:
def connect():
    global sock
    while True:
        try:
            s = socket.socket()
            s.settimeout(60)
            s.connect((IRC_SERVER, IRC_PORT))
            s.send(f"PASS {OAUTH_TOKEN}\n".encode("utf-8"))
            s.send(f"NICK {USERNAME}\n".encode("utf-8"))
            print("✅ connected to Twitch IRC")
            with sock_lock:
                sock = s
            return s
        except Exception as e:
            print(f"⚠ connection failed，try again in 10s...: {e}")
            time.sleep(10)

OUTPUT_FILE = "/content/drive/MyDrive/twitch_chat_logs/twitch_chat_log20250509.csv"
def insert_chat_data_batch(rows):
    """ 追加聊天数据到同一个 CSV 文件 """
    if not rows:
        return
    df = pd.DataFrame(rows)
    # 检查文件是否存在
    file_exists = os.path.isfile(OUTPUT_FILE)
    # 追加写入，第一次写入时带 header，之后不带 header
    df.to_csv(OUTPUT_FILE, mode='a', index=False, encoding='utf-8', header=not file_exists)
    print(f"✅ inserted {len(rows)} of data into {OUTPUT_FILE}")

def get_game_ids(game_names):
    headers = {
        "Client-ID": CLIENT_ID,
        "Authorization": f"Bearer {OAUTH_TOKEN.split(':')[1]}"
    }
    game_ids = {}
    for name in game_names:
        response = requests.get(GAMES_API_URL, headers=headers, params={"name": name})
        if response.status_code == 200:
            data = response.json()
            for game in data.get("data", []):
                game_ids[game["name"]] = game["id"]
        else:
            print(f"failed to get game id ({name}): {response.text}")
    return game_ids

def get_live_channels(game_ids):
    headers = {
        "Client-ID": CLIENT_ID,
        "Authorization": f"Bearer {OAUTH_TOKEN.split(':')[1]}"
    }
    live_channels = {}
    for game_name, game_id in game_ids.items():
        params = {"game_id": game_id, "first": 20}
        response = requests.get(API_URL, headers=headers, params=params)
        if response.status_code == 200:
            data = response.json()
            live_channels[game_name] = [stream["user_login"] for stream in data.get("data", [])]
        else:
            print(f"get {game_name} stream list failed: {response.text}")
    return live_channels

def connect_to_channel(s, channel, game_name):
    try:
        s.send(f"JOIN #{channel}\n".encode("utf-8"))
        joined_channels[channel] = game_name
        print(f"joined channel: {channel} (game: {game_name})")
    except Exception as e:
        print(f"join channel {channel} fail: {e}")
        new_sock = connect()
        connect_to_channel(new_sock, channel, game_name)

# def listen_chat():
#     global sock
#     buffer = []
#     last_flush = time.time()
#     while True:
#         try:
#             with sock_lock:
#                 s = sock
#             resp = s.recv(2048).decode("utf-8").strip()
#             if not resp:
#                 print("⚠ received empty messages，reconnection...")
#                 s = connect()
#                 continue
#             if resp.startswith("PING"):
#                 s.send("PONG :tmi.twitch.tv\n".encode("utf-8"))
#                 continue
#             if "JOIN" in resp:
#                 print(f"✅ joined channel confirmation: {resp}")
#                 continue
#             if "PRIVMSG" in resp:
#                 try:
#                     user_name = re.search(r":(\w+)!", resp).group(1)
#                     message_match = re.search(r"PRIVMSG #([\w]+) :(.*)", resp)
#                     channel = message_match.group(1)
#                     chat_message = message_match.group(2)
#                     timestamp = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
#                     game_name = joined_channels.get(channel, "Unknown Game")
#                     row = {
#                         "timestamp": timestamp,
#                         "game_name": game_name,
#                         "channel": channel,
#                         "user_name": user_name,
#                         "message": chat_message
#                     }
#                     buffer.append(row)
#                     if len(buffer) >= 20 or (time.time() - last_flush) >= 5:
#                         insert_chat_data_batch(buffer)
#                         buffer.clear()
#                         last_flush = time.time()
#                 except Exception as parse_e:
#                     print(f"parse messages failed: {parse_e}")
#                     continue
#         except socket.timeout:
#             print("⚠ 60 s no new messages，reconnection...")
#             s = connect()
#             continue
#         except (socket.error, BrokenPipeError) as e:
#             print(f"⚠ connection failure: {e}，reconnection...")
#             s = connect()
#             continue

# def update_channels():
#     global sock
#     game_ids = get_game_ids(GAME_LIST)
#     while True:
#         live_channels = get_live_channels(game_ids)
#         for game_name, channels in live_channels.items():
#             for channel in channels:
#                 with sock_lock:
#                     s = sock
#                 connect_to_channel(s, channel, game_name)
#                 time.sleep(2)
#         time.sleep(60)


# 定义最大频道数
MAX_CHANNELS = 100

# 用于记录最后收到消息的时间
last_message_time = time.time()

def listen_chat():
    global sock, last_message_time
    buffer = []
    last_flush = time.time()
    while True:
        print("💬 Listening to chat...")
        try:
            with sock_lock:
                s = sock
            try:
                resp = s.recv(2048).decode("utf-8").strip()
            except socket.timeout:
                print("⚠️ socket timeout，reconnecting...")
                s = connect()
                continue

            if not resp:
                print("⚠️ empty message，reconnecting...")
                s = connect()
                continue

            last_message_time = time.time()

            if resp.startswith("PING"):
                s.send("PONG :tmi.twitch.tv\n".encode("utf-8"))
                continue

            if "JOIN" in resp:
                print(f"✅ Joined channel: {resp}")
                continue

            if "PRIVMSG" in resp:
                try:
                    user_name = re.search(r":(\w+)!", resp).group(1)
                    message_match = re.search(r"PRIVMSG #([\w]+) :(.*)", resp)
                    channel = message_match.group(1)
                    chat_message = message_match.group(2)
                    timestamp = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
                    game_name = joined_channels.get(channel, "Unknown Game")
                    row = {
                        "timestamp": timestamp,
                        "game_name": game_name,
                        "channel": channel,
                        "user_name": user_name,
                        "message": chat_message
                    }
                    buffer.append(row)
                    if len(buffer) >= 20 or (time.time() - last_flush) >= 5:
                        insert_chat_data_batch(buffer)
                        buffer.clear()
                        last_flush = time.time()
                except Exception as parse_e:
                    print(f"❌ Parse message filed: {parse_e}")
                    continue

        except (socket.error, BrokenPipeError) as e:
            print(f"⚠️ Connection error: {e}，reconnecting...")
            s = connect()
            continue
        except Exception as e:
            print(f"⚠️ Unknown error in listening process: {e}")
            time.sleep(5)
            continue

def update_channels():
    global sock
    game_ids = get_game_ids(GAME_LIST)
    while True:
        live_channels = get_live_channels(game_ids)
        for game_name, channels in live_channels.items():
            for channel in channels:
                if len(joined_channels) >= MAX_CHANNELS:
                    print(f"⚠️ Reached maximum channels {MAX_CHANNELS}，stop joining new channels")
                    break
                with sock_lock:
                    s = sock
                if channel not in joined_channels:
                    connect_to_channel(s, channel, game_name)
                    time.sleep(2)
        time.sleep(60)

def watchdog():
    global sock, last_message_time
    while True:
        if time.time() - last_message_time > 120:
            print("⚠️ 120 s no new messages，force to reconnect...")
            with sock_lock:
                try:
                    sock.close()
                except Exception:
                    pass
                sock = connect()
            last_message_time = time.time()
        time.sleep(30)  # check every 30s

In [7]:
# if __name__ == "__main__":
#     sock = connect()
#     t_listen = Thread(target=listen_chat, daemon=True)
#     t_update = Thread(target=update_channels, daemon=True)
#     t_listen.start()
#     t_update.start()
#     t_listen.join()
#     t_update.join()

if __name__ == "__main__":
    sock = connect()
    t_listen = Thread(target=listen_chat, daemon=True)
    t_update = Thread(target=update_channels, daemon=True)
    t_watchdog = Thread(target=watchdog, daemon=True)
    t_listen.start()
    t_update.start()
    t_watchdog.start()
    t_listen.join()
    t_update.join()
    t_watchdog.join()

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
✅ inserted 20 of data into /content/drive/MyDrive/twitch_chat_logs/twitch_chat_log20250509.csv
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
joined channel: camloso (game: Marvel Rivals)
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
joined channel: mrstiara (game: Marvel Rivals)
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
✅ inserted 20 of data into /content/drive/MyDrive/twitch_chat_logs/twitch_chat_log20250509.csv
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
💬 正在监听聊天...
✅ inserted 20 of data into /content/drive/MyDrive/twitch_chat_lo

KeyboardInterrupt: 