# Environment setup

In [1]:
!pip install irc

Collecting irc
  Downloading irc-20.5.0-py3-none-any.whl.metadata (10 kB)
Collecting jaraco.collections (from irc)
  Downloading jaraco.collections-5.1.0-py3-none-any.whl.metadata (3.9 kB)
Collecting jaraco.text>=3.14 (from irc)
  Downloading jaraco.text-4.0.0-py3-none-any.whl.metadata (3.7 kB)
Collecting jaraco.logging (from irc)
  Downloading jaraco.logging-3.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting jaraco.stream (from irc)
  Downloading jaraco.stream-3.0.4-py3-none-any.whl.metadata (2.2 kB)
Collecting tempora>=1.6 (from irc)
  Downloading tempora-5.8.0-py3-none-any.whl.metadata (3.3 kB)
Collecting autocommand (from jaraco.text>=3.14->irc)
  Downloading autocommand-2.2.2-py3-none-any.whl.metadata (15 kB)
Downloading irc-20.5.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.5/75.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jaraco.text-4.0.0-py3-none-any.whl (11 kB)
Downloading tempora-5.8.0-py3-none-any.whl (14 k

In [1]:
import os
import re
import time
import socket
import requests
import pandas as pd
from datetime import datetime
from threading import Thread, Lock
import pytz
from google.colab import drive


In [2]:
# ================== Google Drive Mount ==================
drive.mount('/content/drive')

# ================== Configuration ==================
CLIENT_ID = "gp762nuuoqcoxypju8c569th9wz7q5"
OAUTH_TOKEN = "oauth:mb053vdgfqc0u2m7folpig78vgxdke"
USERNAME = "danieljia36"
IRC_SERVER = "irc.chat.twitch.tv"
IRC_PORT = 6667

GAME_LIST = [
    "League of Legends", "Counter-Strike", "Grand Theft Auto V",
    "Assassin's Creed Shadows", "Fortnite", "VALORANT",
    "Marvel Rivals", "Apex Legends", "Rainbow Six Siege"
]

MAX_CHANNELS = 100
OUTPUT_DIR = "/content/drive/MyDrive/twitch_chat_logs"
os.makedirs(OUTPUT_DIR, exist_ok=True)
local_tz = pytz.timezone("Europe/London")

sock = None
sock_lock = Lock()
joined_channels = {}
last_message_time = time.time()

Mounted at /content/drive


# Store data into Google drive

## New version of data aquisition

In [3]:
def get_output_file():
    today = datetime.now(local_tz).strftime("%Y%m%d")
    return os.path.join(OUTPUT_DIR, f"twitch_chat_log_{today}.csv")

def insert_chat_data_batch(rows):
    if not rows:
        return
    output_file = get_output_file()
    if os.path.isdir(output_file):
        print(f"❌ Error: {output_file} is a directory.")
        return
    df = pd.DataFrame(rows)
    file_exists = os.path.isfile(output_file)
    df.to_csv(output_file, mode='a', index=False, encoding='utf-8', header=not file_exists)
    print(f"✅ Appended {len(rows)} rows to {output_file}")

# ================== Twitch IRC Connection ==================
def connect():
    global sock
    while True:
        try:
            s = socket.socket()
            s.settimeout(60)
            s.connect((IRC_SERVER, IRC_PORT))
            s.send(f"PASS {OAUTH_TOKEN}\n".encode("utf-8"))
            s.send(f"NICK {USERNAME}\n".encode("utf-8"))
            print("✅ Connected to Twitch IRC")
            with sock_lock:
                sock = s
            return s
        except Exception as e:
            print(f"⚠️ connect() error: {e}, retrying in 10s")
            time.sleep(10)

# ================== Twitch API ==================
def get_game_ids(game_names):
    headers = {
        "Client-ID": CLIENT_ID,
        "Authorization": f"Bearer {OAUTH_TOKEN.split(':')[1]}"
    }
    game_ids = {}
    for name in game_names:
        response = requests.get("https://api.twitch.tv/helix/games", headers=headers, params={"name": name})
        if response.status_code == 200:
            data = response.json()
            for game in data.get("data", []):
                game_ids[game["name"]] = game["id"]
        else:
            print(f"❌ Failed to get game ID: {name} → {response.text}")
    return game_ids

def get_live_channels(game_ids):
    headers = {
        "Client-ID": CLIENT_ID,
        "Authorization": f"Bearer {OAUTH_TOKEN.split(':')[1]}"
    }
    live_channels = {}
    for game_name, game_id in game_ids.items():
        params = {"game_id": game_id, "first": 20}
        response = requests.get("https://api.twitch.tv/helix/streams", headers=headers, params=params)
        if response.status_code == 200:
            data = response.json()
            live_channels[game_name] = [stream["user_login"] for stream in data.get("data", [])]
        else:
            print(f"❌ Failed to get streams for {game_name}: {response.text}")
    return live_channels

# ================== Channel Connection ==================
def connect_to_channel(s, channel, game_name):
    try:
        s.send(f"JOIN #{channel}\n".encode("utf-8"))
        joined_channels[channel] = game_name
        print(f"✅ Joined channel: {channel} (game: {game_name})")
    except Exception as e:
        print(f"❌ Join error: {channel} → {e}")
        new_sock = connect()
        connect_to_channel(new_sock, channel, game_name)

# ================== Chat Listening ==================
def listen_chat():
    global sock, last_message_time
    buffer = []
    last_flush = time.time()
    while True:
        print("💬 Listening to chat...")
        try:
            with sock_lock:
                s = sock
            try:
                resp = s.recv(2048).decode("utf-8").strip()
            except socket.timeout:
                print("⚠️ 60s no messages, reconnecting...")
                s = connect()
                continue

            if not resp:
                print("⚠️ Empty message, reconnecting...")
                s = connect()
                continue

            last_message_time = time.time()

            if resp.startswith("PING"):
                s.send("PONG :tmi.twitch.tv\n".encode("utf-8"))
                continue

            if "JOIN" in resp:
                print(f"✅ Join confirmation: {resp}")
                continue

            if "PRIVMSG" in resp:
                try:
                    user_name = re.search(r":(\w+)!", resp).group(1)
                    message_match = re.search(r"PRIVMSG #([\w]+) :(.*)", resp)
                    channel = message_match.group(1)
                    chat_message = message_match.group(2)
                    timestamp = datetime.now(local_tz).strftime("%Y-%m-%d %H:%M:%S")
                    game_name = joined_channels.get(channel, "Unknown Game")
                    row = {
                        "timestamp": timestamp,
                        "game_name": game_name,
                        "channel": channel,
                        "user_name": user_name,
                        "message": chat_message
                    }
                    buffer.append(row)
                    if len(buffer) >= 20 or (time.time() - last_flush) >= 5:
                        insert_chat_data_batch(buffer)
                        buffer.clear()
                        last_flush = time.time()
                except Exception as parse_e:
                    print(f"❌ Parse error: {parse_e}")
                    continue

        except (socket.error, BrokenPipeError) as e:
            print(f"⚠️ Connection error: {e}, reconnecting...")
            s = connect()
            continue
        except Exception as e:
            print(f"⚠️ Unexpected error: {e}")
            time.sleep(5)
            continue

# ================== Channel Updater ==================
def update_channels():
    global sock
    game_ids = get_game_ids(GAME_LIST)
    while True:
        if len(joined_channels) >= MAX_CHANNELS:
            print(f"⚠️ Reached max channels {MAX_CHANNELS}, sleeping 5 mins...")
            time.sleep(300)
            continue

        live_channels = get_live_channels(game_ids)
        for game_name, channels in live_channels.items():
            for channel in channels:
                if len(joined_channels) >= MAX_CHANNELS:
                    break
                with sock_lock:
                    s = sock
                if channel not in joined_channels:
                    connect_to_channel(s, channel, game_name)
                    time.sleep(2)
        time.sleep(60)

# ================== Watchdog Thread ==================
def watchdog():
    global sock, last_message_time
    while True:
        if time.time() - last_message_time > 120:
            print("⚠️ 120s no messages, watchdog triggering reconnect...")
            with sock_lock:
                try:
                    sock.close()
                except:
                    pass
                sock = connect()
            last_message_time = time.time()
        time.sleep(30)


In [4]:
# ================== Main Entry ==================
if __name__ == "__main__":
    sock = connect()
    Thread(target=listen_chat, daemon=True).start()
    Thread(target=update_channels, daemon=True).start()
    Thread(target=watchdog, daemon=True).start()
    while True:
        time.sleep(60)


✅ Connected to Twitch IRC
💬 Listening to chat...
💬 Listening to chat...
✅ Joined channel: thebausffs (game: League of Legends)
✅ Join confirmation: :danieljia36!danieljia36@danieljia36.tmi.twitch.tv JOIN #thebausffs
💬 Listening to chat...
💬 Listening to chat...
💬 Listening to chat...
💬 Listening to chat...
💬 Listening to chat...
💬 Listening to chat...
✅ Joined channel: caedrel (game: League of Legends)
✅ Join confirmation: :danieljia36!danieljia36@danieljia36.tmi.twitch.tv JOIN #caedrel
💬 Listening to chat...
💬 Listening to chat...
✅ Joined channel: lol_nemesis (game: League of Legends)
✅ Join confirmation: :danieljia36!danieljia36@danieljia36.tmi.twitch.tv JOIN #lol_nemesis
:danieljia36.tmi.twitch.tv 353 danieljia36 = #lol_nemesis :danieljia36
:danieljia36.tmi.twitch.tv 366 danieljia36 #lol_nemesis :End of /NAMES list
💬 Listening to chat...
✅ Appended 6 rows to /content/drive/MyDrive/twitch_chat_logs/twitch_chat_log_20250512.csv
💬 Listening to chat...
💬 Listening to chat...
💬 Listenin

KeyboardInterrupt: 

## Old version

In [4]:
def connect():
    global sock
    while True:
        try:
            s = socket.socket()
            s.settimeout(60)
            s.connect((IRC_SERVER, IRC_PORT))
            s.send(f"PASS {OAUTH_TOKEN}\n".encode("utf-8"))
            s.send(f"NICK {USERNAME}\n".encode("utf-8"))
            print("✅ connected to Twitch IRC")
            with sock_lock:
                sock = s
            return s
        except Exception as e:
            print(f"⚠ connection failed，try again in 10s...: {e}")
            time.sleep(10)

OUTPUT_FILE = "/content/drive/MyDrive/twitch_chat_logs/twitch_chat_log20250509.csv"
def insert_chat_data_batch(rows):
    """ 追加聊天数据到同一个 CSV 文件 """
    if not rows:
        return
    df = pd.DataFrame(rows)
    # 检查文件是否存在
    file_exists = os.path.isfile(OUTPUT_FILE)
    # 追加写入，第一次写入时带 header，之后不带 header
    df.to_csv(OUTPUT_FILE, mode='a', index=False, encoding='utf-8', header=not file_exists)
    print(f"✅ inserted {len(rows)} of data into {OUTPUT_FILE}")

def get_game_ids(game_names):
    headers = {
        "Client-ID": CLIENT_ID,
        "Authorization": f"Bearer {OAUTH_TOKEN.split(':')[1]}"
    }
    game_ids = {}
    for name in game_names:
        response = requests.get(GAMES_API_URL, headers=headers, params={"name": name})
        if response.status_code == 200:
            data = response.json()
            for game in data.get("data", []):
                game_ids[game["name"]] = game["id"]
        else:
            print(f"failed to get game id ({name}): {response.text}")
    return game_ids

def get_live_channels(game_ids):
    headers = {
        "Client-ID": CLIENT_ID,
        "Authorization": f"Bearer {OAUTH_TOKEN.split(':')[1]}"
    }
    live_channels = {}
    for game_name, game_id in game_ids.items():
        params = {"game_id": game_id, "first": 20}
        response = requests.get(API_URL, headers=headers, params=params)
        if response.status_code == 200:
            data = response.json()
            live_channels[game_name] = [stream["user_login"] for stream in data.get("data", [])]
        else:
            print(f"get {game_name} stream list failed: {response.text}")
    return live_channels

def connect_to_channel(s, channel, game_name):
    try:
        s.send(f"JOIN #{channel}\n".encode("utf-8"))
        joined_channels[channel] = game_name
        print(f"joined channel: {channel} (game: {game_name})")
    except Exception as e:
        print(f"join channel {channel} fail: {e}")
        new_sock = connect()
        connect_to_channel(new_sock, channel, game_name)


# Max channel number
MAX_CHANNELS = 100

# the last timestamp received
last_message_time = time.time()

def listen_chat():
    global sock, last_message_time
    buffer = []
    last_flush = time.time()
    while True:
        print("💬 Listening to chat...")
        try:
            with sock_lock:
                s = sock
            try:
                resp = s.recv(2048).decode("utf-8").strip()
            except socket.timeout:
                print("⚠️ socket timeout，reconnecting...")
                s = connect()
                continue

            if not resp:
                print("⚠️ empty message，reconnecting...")
                s = connect()
                continue

            last_message_time = time.time()

            if resp.startswith("PING"):
                s.send("PONG :tmi.twitch.tv\n".encode("utf-8"))
                continue

            if "JOIN" in resp:
                print(f"✅ Joined channel: {resp}")
                continue

            if "PRIVMSG" in resp:
                try:
                    user_name = re.search(r":(\w+)!", resp).group(1)
                    message_match = re.search(r"PRIVMSG #([\w]+) :(.*)", resp)
                    channel = message_match.group(1)
                    chat_message = message_match.group(2)
                    timestamp = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
                    game_name = joined_channels.get(channel, "Unknown Game")
                    row = {
                        "timestamp": timestamp,
                        "game_name": game_name,
                        "channel": channel,
                        "user_name": user_name,
                        "message": chat_message
                    }
                    buffer.append(row)
                    if len(buffer) >= 20 or (time.time() - last_flush) >= 5:
                        insert_chat_data_batch(buffer)
                        buffer.clear()
                        last_flush = time.time()
                except Exception as parse_e:
                    print(f"❌ Parse message filed: {parse_e}")
                    continue

        except (socket.error, BrokenPipeError) as e:
            print(f"⚠️ Connection error: {e}，reconnecting...")
            s = connect()
            continue
        except Exception as e:
            print(f"⚠️ Unknown error in listening process: {e}")
            time.sleep(5)
            continue

def update_channels():
    global sock
    game_ids = get_game_ids(GAME_LIST)
    while True:
        live_channels = get_live_channels(game_ids)
        for game_name, channels in live_channels.items():
            for channel in channels:
                if len(joined_channels) >= MAX_CHANNELS:
                    print(f"⚠️ Reached maximum channels {MAX_CHANNELS}，stop joining new channels")
                    break
                with sock_lock:
                    s = sock
                if channel not in joined_channels:
                    connect_to_channel(s, channel, game_name)
                    time.sleep(2)
        time.sleep(60)

def watchdog():
    global sock, last_message_time
    while True:
        if time.time() - last_message_time > 120:
            print("⚠️ 120 s no new messages，force to reconnect...")
            with sock_lock:
                try:
                    sock.close()
                except Exception:
                    pass
                sock = connect()
            last_message_time = time.time()
        time.sleep(30)  # check every 30s

In [None]:

if __name__ == "__main__":
    sock = connect()
    t_listen = Thread(target=listen_chat, daemon=True)
    t_update = Thread(target=update_channels, daemon=True)
    t_watchdog = Thread(target=watchdog, daemon=True)
    t_listen.start()
    t_update.start()
    t_watchdog.start()
    t_listen.join()
    t_update.join()
    t_watchdog.join()

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
⚠️ Reached maximum channels 100，stop joining new channels
⚠️ Reached maximum channels 100，stop joining new channels
⚠️ Reached maximum channels 100，stop joining new channels
⚠️ Reached maximum channels 100，stop joining new channels
⚠️ Reached maximum channels 100，stop joining new channels
⚠️ Reached maximum channels 100，stop joining new channels
⚠️ Reached maximum channels 100，stop joining new channels
⚠️ Reached maximum channels 100，stop joining new channels
⚠️ Reached maximum channels 100，stop joining new channels
⚠️ socket timeout，reconnecting...
✅ connected to Twitch IRC
💬 Listening to chat...
💬 Listening to chat...
⚠️ Reached maximum channels 100，stop joining new channels
⚠️ Reached maximum channels 100，stop joining new channels
⚠️ Reached maximum channels 100，stop joining new channels
⚠️ Reached maximum channels 100，stop joining new channels
⚠️ Reached maximum channels 100，stop joining new channels
⚠️ Reached maximum channels 100，stop join