# Data Scraper for Twitch Chat
This scraper collects Twitch chat text data for the latest 10 streams of the top 100 streamers. VOD IDs and message data were collected from 3/12/2023 - 3/20/2023.

The top streamers were determined according to avg. views on twitchtracker.com on 3/12/2023

In [1]:
import requests
import pandas as pd
import numpy as np
import re
import twitch
import itertools

In [17]:
# Twitch Helix API client used below to look up users and their videos.
# NOTE(review): the two string arguments look like an API client ID and
# secret committed in plain text -- confirm, rotate them, and load them from
# the environment or an untracked config file instead.
helix = twitch.Helix('bjcif9gensw0y7c8lt7sjetum9c7tt', 'sug8hgcsamkxkg04k17p8e1ls201tv')


# Channel logins to scrape, split into two lists of 50. Only `streamers` is
# referenced by the code in this notebook; `streamers2` is presumably swapped
# in by hand for a second run -- TODO confirm.
streamers = ['avalanchesoftware', 'kaicenat', 'rainbow6', 'xqc', 'austinshow', 'taric', 
             'lec', 'lck', 'sapnap', 'valorant', 'georgenotfound', 'callofduty', 
             'lcs', 'halo', 'hasanabi', 'scump', 'dream', 'rocketleague', 
             'brucedropemoff', 'warframe', 'y4splz', 'mizkif', 'tenz', 'lirik',
             'yourragegaming', 'nmplol', 'unboxholics', 'tsm_imperialhal', 'warcraft', 'sodapoppin', 
             'loltyler1', 'forsen', 'zackrawrr', 'nickmercs', 'moonmoon', 'capcomfighters',
             'gorgc', 'paymoneywubby', 'fextralife', 'summit1g', 'fortnite', 'gmhikaru', 
             'emiru', 'thebausffs', 'clix', 'kyedae', 'magic', 'esl_csgo', 
             'brawlstars', 'doublelift']

streamers2 = ['jerma985', 'shroud', 'cdawgva', 'easportsfifa', 'nikolarn', 'moistcr1tikal', 'epulzegaming',
              'lpl', 'k3soju', 'brawlhalla', 'pokimane', 'eamaddennfl', 'trashtastepodcast', 'quinn69', 
              'chess', 'caedrel', 'criticalrole', 'maskenissen', 'lvndmark', 'psychoghost', 'cohhcarnage',
              'maximilian_dood', 'nasa', 'monaa122', 'topsonous', 'trainwreckstv', 'twitchrivals', 'betboom_eng', 
              'fctvlive__', 'officialboaster', 'ninja', 'karlnetwork', 'rekkles', 'zerkaa', 'shotzzy', 
              'iitztimmy', 'tinakitten', 'northernlion', 'hiswattson', 'rainbow6bravo', 'ranboolive', 'aceu', 
              'elajjaz', 'sonysandiegostudio', 'castro_1021', 'penta', 'amouranth', 'buddha', 'cxmmunity', 
              'vei']

# Collect the first five 'archive' videos each streamer's listing yields
# (presumably newest first -- confirm against the API ordering) and persist
# their IDs so the comment scrape can be re-run without re-querying Helix.
vod_rows = []

for user, videos in helix.users(streamers).videos():
    # Only past broadcasts are wanted; other video types are skipped.
    archived = (clip for clip in videos if clip.type == 'archive')

    # Five per streamer for this run. Change back to 10
    for video in itertools.islice(archived, 5):
        vod_rows.append({
            'user': user.login,
            'video_date': video.published_at,
            'video_id': video.id,
        })

vods = pd.DataFrame(vod_rows)
vods.to_csv(path_or_buf='data/vod_ids_extra.csv', index=False)

print(len(vods))
vods.head(5)

230


Unnamed: 0,user,video_date,video_id
0,emiru,2023-03-16T20:33:49Z,1767056615
1,emiru,2023-03-14T19:17:14Z,1765144047
2,emiru,2023-03-10T03:12:43Z,1760685348
3,emiru,2023-03-09T23:35:25Z,1760487320
4,emiru,2023-03-07T22:58:24Z,1758618370


In [18]:
# Shared HTTP session for every GQL request made by get_comments().
# The header mapping is assigned wholesale, so it replaces (rather than
# extends) whatever default headers the new Session started with.
session = requests.Session()
session.headers = {
    'Client-ID': 'kimne78kx3ncx6brgo4mv6wki5h1ko',
    'content-type': 'application/json',
}


def get_comments(video_id, cursor=None):
    """Request one page of chat comments for a video from the Twitch GQL API.

    Args:
        video_id: ID of the video whose comments are requested.
        cursor: Pagination cursor taken from a previously returned comment
            edge. The scraping loop passes ``None`` for its first request.

    Returns:
        The decoded JSON body of the response. The operation is posted as a
        one-element JSON list.

    Raises:
        requests.HTTPError: If the server replies with an error status
            (raised by ``raise_for_status``).
    """
    persisted_query = {
        'version': 1,
        'sha256Hash': 'b70a3591ff0f4e0313d126c6a1502d79a1c02baebb288227c582044aa76adf6a',
    }
    operation = {
        'operationName': 'VideoCommentsByOffsetOrCursor',
        'variables': {'videoID': video_id, 'cursor': cursor},
        'extensions': {'persistedQuery': persisted_query},
    }

    # Uses the module-level `session`; give up after 10 seconds.
    reply = session.post('https://gql.twitch.tv/gql', json=[operation], timeout=10)
    reply.raise_for_status()
    return reply.json()

In [22]:
# streamers = ['rainbow6']
# streamers2 = ['brawlhalla', 'cohhcarnage', 'amouranth', 'vei']

In [19]:
def get_message(message_components):
    """Join the text of every message fragment into one chat message string.

    Args:
        message_components: Iterable of fragment mappings, each with a
            ``'text'`` entry. ``None`` is treated as an empty message.

    Returns:
        The fragment texts concatenated in order; ``''`` when there are no
        fragments.
    """
    if message_components is None:
        return ''
    # str.join builds the result in one pass instead of re-copying the
    # accumulated string for every fragment.
    return ''.join(fragment['text'] for fragment in message_components)

In [23]:
# Get all comments for all videos
#
# For every streamer, page through the comments of each of their VODs and
# write one CSV per VOD to data/raw/<streamer>_<video_id>.csv.
for streamer in streamers:
    print('Starting streamer:', streamer)
    user_vods = vods[vods['user'] == streamer]

    for user, video_date, video_id in user_vods.values:
        # Pages are gathered in a list and concatenated once afterwards;
        # growing a DataFrame with pd.concat on every page re-copies all
        # previously collected rows each time.
        pages = []
        cursor = None
        counter = 0
        while True:
            response = None
            try:
                response = get_comments(video_id, cursor)
                comment_data = response[0]['data']['video']['comments']
                comments = comment_data['edges']
                has_next_page = comment_data['pageInfo']['hasNextPage']
                if has_next_page:
                    # The next request resumes after the last edge of this page.
                    cursor = comments[-1]['cursor']
            except Exception as err:
                # Best effort: keep the pages fetched so far and move on.
                # Unlike a bare `except:`, this lets Ctrl-C stop the run.
                print('Request failed for video', video_id, '-', repr(err))
                print(response)
                break

            # Store the page BEFORE checking for the end of pagination so
            # the final page of comments is not dropped.
            if comments:
                comments_df = pd.json_normalize(comments)
                comments_df['streamer'] = user
                comments_df['video_date'] = video_date
                comments_df['video_id'] = video_id
                pages.append(comments_df)

                counter += 1
                if counter % 100 == 0:
                    print('Cursor step:', counter)

            if not has_next_page:
                break

        # Nothing fetched (no chat, or the first request failed): skip the
        # VOD instead of crashing on the missing fragment column below.
        if not pages:
            print('No comments collected, skipping video:', streamer, video_id)
            continue

        vod_df = pd.concat(pages)

        # Craft the chat message
        vod_df['message'] = vod_df['node.message.fragments'].apply(get_message)

        # Clean up columns
        vod_df = vod_df.drop(columns=['cursor', 'node.commenter.login', '__typename', 'node.id',
                                      'node.contentOffsetSeconds', 'node.message.fragments',
                                      'node.message.userBadges', 'node.message.userColor',
                                      'node.__typename'])

        # Only present on some VODs, so it is dropped conditionally.
        if 'node.commenter' in vod_df.columns:
            vod_df = vod_df.drop(columns=['node.commenter'])

        # NOTE(review): positional rename -- relies on exactly nine columns
        # remaining, in this order, after the drops above.
        vod_df.columns = ['commenter_id', 'commenter_name', 'user_type', 'message_time',
                          'message_type', "streamer", "video_date", "video_id", 'message']

        # Write to CSV
        print('Writing to csv:', streamer)
        vod_df.to_csv(path_or_buf='data/raw/{0}_{1}.csv'.format(streamer, video_id), index=False)
            


Starting streamer: clix
Cursor step: 100
Cursor step: 200
Cursor step: 300
Cursor step: 400
Cursor step: 500
Cursor step: 600
Cursor step: 700
Cursor step: 800
Writing to csv: clix
Cursor step: 100
Cursor step: 200
Cursor step: 300
Cursor step: 400
Cursor step: 500
Cursor step: 600
Cursor step: 700
Cursor step: 800
Cursor step: 900
Cursor step: 1000
Cursor step: 1100
Cursor step: 1200
Cursor step: 1300
Cursor step: 1400
Cursor step: 1500
Cursor step: 1600
Cursor step: 1700
Cursor step: 1800
Cursor step: 1900
Cursor step: 2000
Cursor step: 2100
Cursor step: 2200
Cursor step: 2300
Cursor step: 2400
Writing to csv: clix
Cursor step: 100
Cursor step: 200
Cursor step: 300
Cursor step: 400
Cursor step: 500
Cursor step: 600
Cursor step: 700
Cursor step: 800
Cursor step: 900
Cursor step: 1000
Cursor step: 1100
Writing to csv: clix
Cursor step: 100
Cursor step: 200
Cursor step: 300
Cursor step: 400
Cursor step: 500
Cursor step: 600
Cursor step: 700
Cursor step: 800
Cursor step: 900
Cursor step: