In [1]:
# Stream identifier; later cells interpolate it into the log and CSV file names.
stream_name = "test"

In [2]:
import pandas as pd

# Start from an empty frame that only fixes the column layout for chat messages.
chat_columns = ['timestamp', 'platform', 'username', 'message']
df = pd.DataFrame(columns=chat_columns)

In [3]:
import re
import pandas as pd

# Column order of the resulting chat DataFrame.
CHAT_COLUMNS = ['timestamp', 'platform', 'username', 'message']

# Regex patterns for parsing; each captures (timestamp, username, message) from one line.
# The Twitch prefix between "!" and " PRIVMSG" is matched lazily so that a chat message
# which itself contains " PRIVMSG #chan :" is not split at the wrong place.
TWITCH_LOG_PATTERN = re.compile(r'^(\d{4}-\d{2}-\d{2}_\d{2}:\d{2}:\d{2}) — :([^!]+)!.*? PRIVMSG #\w+ :(.*)$')
YOUTUBE_LOG_PATTERN = re.compile(r'^(\d{4}-\d{2}-\d{2}_\d{2}:\d{2}:\d{2}) — \[(.*?)\]- (.*)$')

# Platform name -> pattern; the key order is also the order in which log files are read.
LOG_PATTERNS = {
    'twitch': TWITCH_LOG_PATTERN,
    'youtube': YOUTUBE_LOG_PATTERN,
}


def parse_chat_line(platform, line):
    """Parse one stripped log line into a chat record.

    Args:
        platform: Key of ``LOG_PATTERNS`` ('twitch' or 'youtube').
        line: A single log line with surrounding whitespace already removed.

    Returns:
        A dict with the keys in ``CHAT_COLUMNS``, or ``None`` when the platform is
        unknown, the line does not match the platform's pattern, or any of
        timestamp / username / message is empty.
    """
    pattern = LOG_PATTERNS.get(platform)
    match = pattern.match(line) if pattern is not None else None
    if match is None:
        return None

    timestamp, username, message = match.groups()

    # YouTube names may carry a leading "@"; drop it.
    if platform == 'youtube' and username.startswith('@'):
        username = username[1:]

    if not (timestamp and username and message):
        return None

    return {
        'timestamp': timestamp,
        'platform': platform,
        'username': username,
        'message': message,
    }


data = []

for platform in LOG_PATTERNS:
    log_file = f'logs/{stream_name}_{platform}_live_chat.log'
    try:
        with open(log_file, 'r', encoding='utf-8') as f:
            for line in f:
                record = parse_chat_line(platform, line.strip())
                if record is not None:
                    data.append(record)
    except FileNotFoundError:
        # A missing log for one platform is reported and skipped, not fatal.
        print(f"Log file not found: {log_file}")

# Build the frame from scratch instead of concatenating onto the existing `df`:
# re-running this cell then yields the same result rather than appending duplicates,
# and it avoids concatenating with an empty frame (deprecated dtype behaviour in pandas).
df = pd.DataFrame(data, columns=CHAT_COLUMNS)

# Convert timestamp to datetime object for better manipulation
if not df.empty:
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d_%H:%M:%S')

print(f"Parsed {len(df)} messages.")

Parsed 3410 messages.


In [4]:
# Bare last expression: the notebook renders the parsed chat frame as the cell output.
df

Unnamed: 0,timestamp,platform,username,message
0,2026-02-19 17:37:48,twitch,itsliterallyaidyn,yo gurt
1,2026-02-19 17:37:54,twitch,hibadgerlolf,DO WORDLE ROBEY
2,2026-02-19 17:37:55,twitch,hibadgerlolf,PLEASE
3,2026-02-19 17:37:58,twitch,path_b_pathington,eeffoc:loudly_crying_face::loudly_crying_face:...
4,2026-02-19 17:38:10,twitch,infernoxof,Tung?
...,...,...,...,...
3405,2026-02-19 20:36:04,youtube,RaZoey,pls link it mods
3406,2026-02-19 20:36:05,youtube,toasterpuppies,cya
3407,2026-02-19 20:36:24,youtube,Caterblock,yep
3408,2026-02-19 20:36:24,youtube,RaZoey,"mods, drop the link"


In [5]:
# Write the frame to "<stream_name>_chat_data.csv" in the working directory, without the index.
csv_path = f'{stream_name}_chat_data.csv'
df.to_csv(csv_path, index=False)

In [6]:
# Rank the most frequent whitespace-separated tokens across all messages.
from collections import Counter

# Join every non-null message into one string, lower-case it, and split on whitespace.
# Punctuation is left attached to tokens.
message_text = ' '.join(df['message'].dropna())
all_messages = message_text.lower().split()
word_counts = Counter(all_messages)

# Print up to the 100 most common tokens, numbered from 1.
top_words = word_counts.most_common(100)
for i, (word, count) in enumerate(top_words, start=1):
    print(f"{i}. {word}: {count}")

1. the: 351
2. beat: 250
3. you: 240
4. do: 197
5. a: 190
6. play: 178
7. it: 177
8. i: 166
9. is: 151
10. game: 131
11. of: 119
12. to: 117
13. its: 114
14. robey: 111
15. lol: 101
16. gg: 95
17. that: 83
18. in: 80
19. this: 76
20. u: 72
21. tower: 66
22. first: 66
23. for: 65
24. what: 62
25. yes: 60
26. dingus: 57
27. and: 56
28. no: 55
29. fun: 55
30. on: 54
31. w: 54
32. my: 53
33. me: 53
34. combat: 53
35. initation.: 52
36. stage.: 52
37. not: 51
38. one: 51
39. can: 50
40. have: 49
41. get: 46
42. are: 41
43. stream: 41
44. sniper: 41
45. he: 40
46. at: 40
47. bro: 37
48. we: 36
49. like: 35
50. find: 34
51. games: 33
52. your: 32
53. be: 32
54. buy: 32
55. now: 31
56. did: 31
57. just: 31
58. ftc: 31
59. win: 30
60. so: 29
61. or: 29
62. an: 28
63. vs: 28
64. btw: 28
65. all: 27
66. etoh: 26
67. time: 25
68. roblox: 25
69. was: 25
70. gonna: 24
71. homer: 24
72. dont: 23
73. make: 23
74. why: 22
75. im: 22
76. ur: 22
77. but: 22
78. if: 21
79. should: 21
80. 2: 21
81. cant: 2