In [1]:
import pandas as pd
import random
import numpy as np
from tqdm import tqdm
import ipdb
import re
from glob import glob
import json
import csv
import datetime as dt
from copy import deepcopy
from collections import Counter
from IPython.core.debugger import set_trace

import matplotlib.pyplot as plt
# import mplcursors
import seaborn as sns
%matplotlib inline
sns.set(style='darkgrid', context='notebook', rc={'figure.figsize':(14,10)}, font_scale=2)

from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('chained_assignment',None)

tqdm.pandas()

from nltk.corpus import words
word_list = words.words()

# Set random seeds for reproducibility on a specific machine
random.seed(1)
np.random.seed(1)
np.random.RandomState(1)

RandomState(MT19937) at 0x318FBCF40

In [2]:
annotations = []
files = glob('thresh-internal/me/game*annotations.json')
for file in files:
    with open(file, 'r', encoding='utf-8') as f:
        annotations += json.load(f)
        
# files = glob('thresh-internal/random_samples/old/game*annotations.json')
# for file in files:
#     with open(file, 'r') as f:
#         annotations += json.load(f)

In [3]:
postinfo = pd.read_csv('../data/postInfo.tsv', sep='\t')
gameinfo = pd.read_csv('../data/gameInfo.tsv', sep='\t')

teaminfo = pd.read_csv('../data/nfl_teams.csv')
teaminfo['team_name_short'] = teaminfo['team_name_short'].apply(lambda x: x.lower())

teams = teaminfo['team_name_short'].values.tolist()
subreddits = teaminfo['subreddit'].values.tolist()

teams_to_subreddit = {teams[i]: subreddits[i] for i in range(32)}
subreddit_to_teams = {subreddits[i]: teams[i] for i in range(32)}
team_names_dict = {x: [x] for x in teams}

In [4]:
annotations[2]

{'post_id': 'quuhr8',
 'comment_id': 'hksqs65',
 'source': '[SENT] Lol all the 🤡 saying this team was only going to lose 1 game after the Titans game . ',
 'context': "*source subreddit*: **rams** || *final score*: **49ers 31-10 rams ** || *Date*: **11/15/2021** || *user flair*: Deacon Jones || *parent comment*: No parent comment. || *Prev 2 comments*: Yup. The called the raiders for a penalty yesterday doing what the niners did but no call for us. || why would raheem morris throw a pick 6, lets fire him /s ||  *Next 2 comments*: You guys know that's on Higbee right? || What’s the lions’ record?  Lmao || ",
 'edits': [{'category': 'in',
   'id': 1,
   'annotation': None,
   'input_idx': [[29, 38]]},
  {'category': 'other', 'id': 1, 'annotation': None, 'input_idx': [[75, 85]]}],
 '_thresh_id': 3}

## Concatenate into one big true test dataframe

In [5]:
ann_df = {'post_id': [], 'comment_id': [], 'tokenized_comment': [], 'tagged_comment': [], 'ref_expressions': [], 'ref_pos': [], 'ref_tags': [], 'confs': []}
for ann in tqdm(annotations):
    edits = ann['edits']
    # Sort it
    edits.sort(key=(lambda x: x['input_idx'][0][0]))
        
    ann_df['post_id'].append(ann['post_id'])
    ann_df['comment_id'].append(ann['comment_id'])
    tokenized_comment = ann['source']
    ann_df['tokenized_comment'].append(tokenized_comment)
    
    tagged_comment = ''
    ref_expressions = []
    ref_pos = []
    ref_tags = []
    confs = []
    
    for ind, curr_edit in enumerate(edits):
        
        # Figure out the referring expression for that edit
        ref_expr = tokenized_comment[curr_edit['input_idx'][0][0]:curr_edit['input_idx'][0][1]]
        
        start_pos = curr_edit['input_idx'][0][0]
        end_pos = curr_edit['input_idx'][0][1]
        
        ref_expressions.append(ref_expr)
        ref_pos.append((curr_edit['input_idx'][0][0], end_pos))
        
        ref_tags.append('<' + curr_edit['category'] + '>')
        if curr_edit['annotation'] is not None:
            # Need to check for conf, since I annotated with 'ref' without conf sometimes
            if 'conf' in curr_edit['annotation'].keys():
                # set_trace()
                confs.append(int(curr_edit['annotation']['conf']['val'].split('_')[-1]))
            else:
                confs.append(5)
        else:
            confs.append(5)
        # Get the new tagged comment based on edit indices
        if ind==0:
            tagged_comment = tokenized_comment[:curr_edit['input_idx'][0][0]] + '<' + curr_edit['category'] + '>'
            if len(edits)!=1:
                next_edit = edits[ind+1]
                tagged_comment += tokenized_comment[curr_edit['input_idx'][0][1]:next_edit['input_idx'][0][0]]
            else:
                tagged_comment += tokenized_comment[curr_edit['input_idx'][0][1]:]
        elif ind!=len(edits)-1:
            next_edit = edits[ind+1]
            tagged_comment += '<' + curr_edit['category'] + '>' + tokenized_comment[curr_edit['input_idx'][0][1]:next_edit['input_idx'][0][0]]
        else:
            tagged_comment += '<' + curr_edit['category'] + '>' + tokenized_comment[curr_edit['input_idx'][0][1]:]
    
    ann_df['tagged_comment'].append(tagged_comment)
    ann_df['ref_expressions'].append(ref_expressions)
    ann_df['ref_tags'].append(ref_tags)
    ann_df['ref_pos'].append(ref_pos)
    if confs==[]:
        confs = [5]
    ann_df['confs'].append(confs)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 238937.22it/s]


In [6]:
ann_df = pd.DataFrame(ann_df)
ann_df.shape

(1500, 8)

In [7]:
ann_df.drop_duplicates(subset=['post_id', 'comment_id'], keep='first', inplace=True, ignore_index=True)
ann_df.shape

(1500, 8)

### Exclude the january buffalo game

In [8]:
ann_df = ann_df[~(ann_df.post_id=='101c27x')]

In [9]:
ann_df.shape

(1499, 8)

In [10]:
df = pd.read_csv('../data/game_comments.tsv', sep='\t', index_col=None, header=0, quoting=csv.QUOTE_NONE, escapechar="\\", engine='python')

# Some cleanup
df['clean_comment'] = df['clean_comment'].astype(str)
# df['tokenized_comment'] = df['tokenized_comment'].astype(str)

In [11]:
df['team'] = df['subreddit'].apply(lambda x: subreddit_to_teams[x])

In [12]:
df=df.set_index(['post_id', 'comment_id'], drop=False)

In [13]:
df.gametime.max()

1.0

### Add remaining columns needed to this

In [14]:
def find_col_values(row):
    relevant_comment = df.loc[(row.post_id, row.comment_id)]
    return {
        'timestamp': relevant_comment['timestamp'],
        'parent_id': relevant_comment['parent_id'],
        'team': relevant_comment['team'],
        'opp': relevant_comment['opp'],
        'username': relevant_comment['username'],
        'flair': relevant_comment['flair'],
        'votes': relevant_comment['votes'], 
        'gametime': relevant_comment['gametime'],
        'win_prob':relevant_comment['win_prob']
    }
    

ann_df[['timestamp', 'parent_id', 'team', 'opp', 'username', 'flair', 'votes', 'gametime', 'win_prob']] = ann_df.apply(lambda row: find_col_values(row), axis='columns', result_type='expand')

## Get win prob

In [15]:
pbp = pd.read_csv('../data/play_by_play.tsv', sep='\t', low_memory=False)

def get_utc_timestamp(row):
    tod = row['time_of_day']
    date = row['game_date']
    
    day = int(date.split('-')[2])
    month = int(date.split('-')[1])
    year = int(date.split('-')[0])
    
    hour = int(tod.split(':')[0])
    minute = int(tod.split(':')[1])
    second = int(tod.split(':')[2])
    
    # UTC time for NFL games is almost always afternoon? The earliest local time is around 11am, which would be at least 3pm in UTC
    if int(hour)<12: 
        day_obj = dt.date(year, month, day)
        day = (day_obj + dt.timedelta(days=1)).day
    
    timestamp = dt.datetime(year, month, day, hour, minute, second, tzinfo=dt.timezone.utc)
    return int(timestamp.timestamp())

In [16]:
tag_counts = {'<in>': 0, '<out>':0, '<other>': 0}
for tag in tag_counts.keys():
    for _, row in ann_df.iterrows():
        tag_counts[tag] += str(row['tagged_comment']).count(tag)
tag_counts, ann_df.shape[0]

({'<in>': 1392, '<out>': 267, '<other>': 166}, 1499)

In [17]:
ann_df.head()

Unnamed: 0,post_id,comment_id,tokenized_comment,tagged_comment,ref_expressions,ref_pos,ref_tags,confs,timestamp,parent_id,team,opp,username,flair,votes,gametime,win_prob
0,pmvxko,hcl8hgm,[SENT] I ’m so glad that these things can be determined in week 1 ; there ’s no need for me to watch the rest of the season now !,,[],[],[],[5],1631470000.0,t1_hcl8cj8,panthers,panthers,all1good,Purrbaca,1.0,0.286,0.422
1,pu79mu,he18r6i,[SENT] Is it me or is this game fiery and fast paced tonight ?,,[],[],[],[5],1632445000.0,t3_pu79mu,panthers,panthers,prokreat,,9.0,0.178,0.1
2,quuhr8,hksqs65,[SENT] Lol all the 🤡 saying this team was only going to lose 1 game after the Titans game .,[SENT] Lol all the 🤡 saying t<in>was only going to lose 1 game after t<other>game .,"[his team , he Titans ]","[(29, 38), (75, 85)]","[<in>, <other>]","[5, 5]",1637027000.0,t3_quuhr8,rams,rams,Dodger_Dawg,Deacon Jones,0.0,0.159,0.744
3,xbm6xz,io0kw4p,[SENT] lol people writing off the season before halftime of the first game .... give yer fuckin balls a tug,<in> lol people writing off the season before halftime of the first game .... give yer fuckin balls a tug,[[SENT]],"[(0, 6)]",[<in>],[5],1662919000.0,t3_xbm6xz,bengals,bengals,StarchyAndDelicious,:3,12.0,0.231,0.599
4,qtu5tz,hkmv1b0,[SENT] I cant stop smiling !,,[],[],[],[5],1636923000.0,t3_qtu5tz,patriots,patriots,lqqk009,,2.0,0.966,0.0


In [18]:
def get_win_prob(row, game_pbp):
    wp = 0
    game_start = game_pbp.timestamp.min()
    game_end = game_pbp.timestamp.max()

    if row.timestamp <= game_start:
        wp = game_pbp.vegas_home_wp.values[0]
    elif row.timestamp >= game_end:
        wp = game_pbp.vegas_home_wp.values[-1]
    else:
        last_pbp = game_pbp[game_pbp.timestamp<row.timestamp].tail(1).to_dict(orient='records')[0]
        wp = last_pbp['vegas_home_wp']
        home_team_name = teaminfo.loc[teaminfo['team_id']==game_pbp['home_team'].values[0], 'team_name_short'].values[0]
        if row['team'] != home_team_name: 
            wp = 1 - wp
    return wp

def get_gametime(row, game_pbp):
    game_start = game_pbp.timestamp.min()
    game_end = game_pbp.timestamp.max()
    if row.timestamp<=game_start:
        return 0
    elif row.timestamp > game_end:
        return 1
    else:
        return (row.timestamp-game_start)/(game_end-game_start)

In [19]:
for pid in tqdm(ann_df.post_id.unique()):
    gameid = postinfo.loc[postinfo.post_id==pid, 'game_id'].values[0]
    # Get PBP and get timestamp (maybe useful later for scaling the score?)
    game_pbp = pbp.loc[pbp['new_game_id']==gameid, ['game_date','time_of_day','posteam', 'home_team', 'desc', 'vegas_home_wp']].reset_index(drop=True)
    game_pbp = game_pbp.bfill().ffill()
    game_pbp['timestamp'] = game_pbp.apply(lambda x: get_utc_timestamp(x), axis=1)
    
    new_df = ann_df[ann_df.post_id==pid]
    ann_df.loc[ann_df.post_id==pid, 'gametime']  = new_df.apply(lambda x: get_gametime(x, game_pbp), axis=1)
    ann_df.loc[ann_df.post_id==pid, 'win_prob'] = new_df.apply(lambda x: get_win_prob(x, game_pbp), axis=1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 746/746 [00:09<00:00, 78.90it/s]


In [20]:
ann_df.tagged_comment = ann_df.tagged_comment.apply(lambda x: x.replace('<in>', '[IN]').replace('<out>', '[OUT]').replace('<other>', '[OTHER]'))
ann_df.ref_expressions = ann_df.ref_expressions.apply(lambda x: [y.replace('<in>', '[IN]').replace('<out>', '[OUT]').replace('<other>', '[OTHER]') for y in x])
ann_df.ref_tags = ann_df.ref_tags.apply(lambda x: [y.replace('<in>', '[IN]').replace('<out>', '[OUT]').replace('<other>', '[OTHER]') for y in x])

In [21]:
ann_df.shape

(1499, 17)

In [22]:
ann_df.head(2)

Unnamed: 0,post_id,comment_id,tokenized_comment,tagged_comment,ref_expressions,ref_pos,ref_tags,confs,timestamp,parent_id,team,opp,username,flair,votes,gametime,win_prob
0,pmvxko,hcl8hgm,[SENT] I ’m so glad that these things can be determined in week 1 ; there ’s no need for me to watch the rest of the season now !,,[],[],[],[5],1631470000.0,t1_hcl8cj8,panthers,panthers,all1good,Purrbaca,1.0,0.286012,0.577765
1,pu79mu,he18r6i,[SENT] Is it me or is this game fiery and fast paced tonight ?,,[],[],[],[5],1632445000.0,t3_pu79mu,panthers,panthers,prokreat,,9.0,0.177857,0.899641


In [23]:
def add_explanation(row, cname, edf):
    # ipdb.set_trace()
    if row.comment_id in edf.comment_id.values:
        row[cname] = edf.loc[edf.comment_id==row.comment_id, 'explanation'].values[0]
    else:
        row[cname] = "No explicit or implicit references to tag."
    return row
            
exp_df = pd.read_csv('../modeling/model-outputs/gpt-4o/explanations.txt', names=['comment_id', 'explanation'], sep='^([a-z0-9]{7})\,', engine='python')
exp_wp_df = pd.read_csv('../modeling/model-outputs/gpt-4o/explanations+wp.txt', names=['comment_id', 'explanation'], sep='^([a-z0-9]{7})\,', engine='python')

ann_df['explanation'] = ''
ann_df['explanation+wp'] = ''

ann_df = ann_df.apply(lambda x: add_explanation(x, 'explanation', exp_df), axis=1)
ann_df = ann_df.apply(lambda x: add_explanation(x, 'explanation+wp', exp_wp_df), axis=1)

## Split and save

In [24]:
random.seed(1)
ann_df['split'] = ann_df.apply(lambda x: 'train' if random.random()<0.8 else 'test', axis=1)

In [25]:
Counter(ann_df.split)

Counter({'train': 1181, 'test': 318})

In [26]:
ann_df[((ann_df.split=='test') & (ann_df.explanation=="No explicit or implicit references to tag."))].shape

(78, 20)

In [27]:
# ann_df.loc[ann_df.tagged_comment.isna(), 'ref_expressions']
def handle_empties(row):
    if row.tagged_comment=='':
        row.tagged_comment = row.tokenized_comment
    return row

ann_df = ann_df.apply(lambda x: handle_empties(x), axis=1)

In [28]:
def parent_comment(row):
    parent_id = row['parent_id']
    if parent_id[:2] == 't1':
        if parent_id in df.comment_id.values:
            parent_comment = df[df.comment_id==parent_id[3:], 'clean_comment'].values[0]
        else:
            parent_comment="None"
    else:
        parent_comment = "None"

    return parent_comment

ann_df['parent_comment'] = ann_df.progress_apply(lambda x: parent_comment(x), axis=1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1499/1499 [01:39<00:00, 15.05it/s]


In [29]:
ann_df.to_csv('../data/gold_data.tsv', sep='\t', quoting=csv.QUOTE_NONE, escapechar="\\", index=False, columns=['split', 'post_id', 'comment_id', 'parent_id', 'tokenized_comment', 'tagged_comment', 'parent_comment', 'ref_expressions', 'ref_pos', 'ref_tags', 'confs', 'explanation+wp', 'explanation', 'timestamp', 'team', 'opp', 'username', 'flair', 'votes', 'win_prob', 'gametime'])

In [34]:
usernames = ann_df.username.unique()
username_to_anon = dict(zip(usernames, range(len(usernames))))

In [35]:
ann_df['username_anon'] = ann_df['username'].apply(lambda x: username_to_anon[x])

In [36]:
ann_df.to_csv('../../intergroup-nfl/data/gold_data.tsv', sep='\t', quoting=csv.QUOTE_NONE, escapechar="\\", index=False, columns=['split', 'post_id', 'comment_id', 'parent_id', 'tagged_comment', 'ref_expressions', 'ref_pos', 'ref_tags', 'confs', 'explanation', 'explanation+wp', 'timestamp', 'team', 'opp', 'username_anon', 'flair', 'votes', 'win_prob', 'gametime'])