In [2]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import numpy as np
import re
import datetime

In [3]:
# get bears hashtags and the hashtags of other teams
bears = []
others = []
with open("teams.nfl.hashtags") as hashtags:
    for team in hashtags:
        tags = team.strip("\n")
        tags = re.split("\t| ", tags)
        if tags[0] == "CHI":
            bears = tags[1:]
        else:
            others.extend(tags[1:])

In [4]:
# count unigrams
def count_unigrams(tweet):
    global corpus
    words = tweet.split(" ")
    for word in words:
        if word in corpus:
            corpus[word] += 1
        else:
            corpus[word] = 1

In [5]:
# load in Chicago Bears 2020 season schedule
# information retrieved from pro-football-reference.com
schedule = pd.read_csv("bears_2020.txt",sep="\t")
schedule["Start_time"] = pd.to_datetime(schedule["Start_time"], utc=True)

In [8]:
# scrape twitter for unigrams
bears_tags = " OR ".join(bears)
corpus = {}

for i in range(16):
    print("processing week",str(schedule.iloc[i]["Week"]))
    # For each week, decide the date range to scrape tweets.
    # For week 1 and the week after the bye, say the range is the 7 days prior to the game.
    if (i==0) or (schedule.iloc[i]["Week"] - schedule.iloc[i-1]["Week"]) == 2:
        start = schedule.iloc[i]["Start_time"] - datetime.timedelta(days=7)
    else:
        start = schedule.iloc[i-1]["Start_time"] + datetime.timedelta(hours=4)
    end = schedule.iloc[i]["Start_time"] - datetime.timedelta(hours=1)
    date_range = " since:" + start.strftime('%Y-%m-%d') + " until:" + end.strftime('%Y-%m-%d')
    query = bears_tags + date_range
    
    #print(start,end)
    # Scrape the tweets for the date_range. Also have to filter based on the 
    # time stamp so as not to capture tweets during and after games.
    for i,tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        if i > 10000:
            break
        #print(tweet.date)
        if not (start <= tweet.date <= end):
            #print("found a bad one!", tweet.date)
            continue
        tw = tweet.content.lower()
        if not any(o in tw for o in others):
            count_unigrams(tw)

processing week 1
processing week 2
processing week 3
processing week 4
processing week 5
processing week 6
processing week 7
processing week 8
processing week 9
processing week 10
processing week 12
processing week 13
processing week 14
processing week 15
processing week 16
processing week 17


In [9]:
sorted(corpus.items(), key=lambda x: x[1], reverse=True)[:100]

[('the', 88980),
 ('to', 48264),
 ('#bears', 48022),
 ('a', 40608),
 ('and', 32428),
 ('in', 26293),
 ('#dabears', 25626),
 ('of', 23977),
 ('', 22559),
 ('for', 22450),
 ('is', 21532),
 ('#beardown', 20560),
 ('on', 19704),
 ('this', 17600),
 ('i', 16675),
 ('that', 12940),
 ('with', 12397),
 ('you', 12171),
 ('bears', 11973),
 ('be', 11843),
 ('at', 10229),
 ('have', 9920),
 ('it', 9721),
 ('are', 9679),
 ('we', 9627),
 ('#gobears', 8430),
 ('but', 8296),
 ('he', 8145),
 ('was', 7663),
 ('as', 6981),
 ('#chicagobears', 6942),
 ('not', 6816),
 ('they', 6717),
 ('chicago', 6420),
 ('all', 6387),
 ('from', 6340),
 ('my', 6281),
 ('&amp;', 6159),
 ('will', 6106),
 ('his', 6055),
 ('game', 6040),
 ('if', 6020),
 ('get', 5930),
 ('out', 5712),
 ('has', 5712),
 ('up', 5696),
 ('#nfl', 5668),
 ('just', 5498),
 ('nagy', 5418),
 ('-', 5361),
 ('so', 5322),
 ('@chicagobears', 5208),
 ('what', 5128),
 ('our', 4839),
 ('like', 4794),
 ('about', 4694),
 ('can', 4631),
 ('good', 4526),
 ('their', 4