In [14]:
import pandas as pd
import random
import numpy as np
from tqdm import tqdm
import ipdb
import re
from glob import glob
import json
import csv
import datetime as dt
from collections import Counter

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('chained_assignment',None)

# Set random seeds for reproducibility on a specific machine
random.seed(1)
np.random.seed(1)
np.random.RandomState(1)

tqdm.pandas()

# nlp = stanza.Pipeline(lang='en', processors='tokenize')

In [2]:
sample = pd.read_csv('../data/wp/test_data.tsv', sep='\t', index_col=None, header=0, quoting=csv.QUOTE_NONE, escapechar="\\", engine='python')
sample.shape

(1499, 17)

In [3]:
postinfo = pd.read_csv('../data/postInfo.tsv', sep='\t')
gameinfo = pd.read_csv('../data/gameInfo.tsv', sep='\t')
postinfo.shape, gameinfo.shape

((2407, 4), (569, 8))

In [4]:
teaminfo = pd.read_csv('../data/nfl_teams.csv')
teaminfo['team_name_short'] = teaminfo['team_name_short'].apply(lambda x: x.lower())

teams = teaminfo['team_name_short'].values.tolist()
subreddits = teaminfo['subreddit'].values.tolist()

teams_to_subreddit = {teams[i]: subreddits[i] for i in range(32)}
subreddit_to_teams = {subreddits[i]: teams[i] for i in range(32)}

In [5]:
df = pd.read_csv('../data/game_comments.tsv', sep='\t', index_col=None, header=0, quoting=csv.QUOTE_NONE, escapechar="\\", engine='python')

# Some cleanup
df['clean_comment'] = df['clean_comment'].astype(str)
df.loc[df['opp']=='commanders', 'opp'] = 'washington'

In [6]:
pbp = pd.read_csv('../data/play_by_play.tsv', sep='\t', low_memory=False)

def get_utc_timestamp(row):
    tod = row['time_of_day']
    date = row['game_date']
    
    day = int(date.split('-')[2])
    month = int(date.split('-')[1])
    year = int(date.split('-')[0])
    
    hour = int(tod.split(':')[0])
    minute = int(tod.split(':')[1])
    second = int(tod.split(':')[2])
    
    # UTC time for NFL games is almost always afternoon? The earliest local time is around 11am, which would be at least 3pm in UTC
    if int(hour)<12: 
        day_obj = dt.date(year, month, day)
        day = (day_obj + dt.timedelta(days=1)).day
    
    timestamp = dt.datetime(year, month, day, hour, minute, second, tzinfo=dt.timezone.utc)
    return int(timestamp.timestamp())

In [7]:
def find_parent(parent_id, post_id):
    if parent_id[:2]=='t3':
        return "No parent comment."
    else:
        parent_comment = df[((df['post_id']==post_id) & (df['comment_id']==parent_id[3:]))]['clean_comment']
        if parent_comment.shape[0]>0:
            parent_comment = parent_comment.values[0]
        else:
            "No parent comment."
        return parent_comment

In [8]:
def make_context(row):
    team = row['team'].title()
    opp = row['opp'].title()
    
    gameid = postinfo[postinfo['post_id'] == row['post_id']]['game_id'].values[0]
    game_pbp = pbp.loc[pbp['new_game_id']==gameid, ['game_date','time_of_day','posteam', 'home_team', 'away_team', 'total_home_score', 'total_away_score']].reset_index(drop=True)
    game_pbp = game_pbp.bfill().ffill()
    game_pbp['timestamp'] = game_pbp.apply(lambda x: get_utc_timestamp(x), axis=1)
    
    home_team, away_team = game_pbp['home_team'].values[0], game_pbp['away_team'].values[0]
    home_team = teaminfo.loc[teaminfo.team_id==home_team, 'team_name_short'].values[0].title()
    away_team = teaminfo.loc[teaminfo.team_id==away_team, 'team_name_short'].values[0].title()

    date = game_pbp.game_date.values[0]
    parent_comment = find_parent(row.parent_id, row.post_id)
    
    if row.timestamp > game_pbp.timestamp.min():
        home_score, away_score = game_pbp.loc[game_pbp.timestamp<row.timestamp, 'total_home_score'].values[-1], game_pbp.loc[game_pbp.timestamp<row.timestamp, 'total_away_score'].values[-1]
    else:
        home_score, away_score = 0, 0
        
    result =  home_team + ' ' + str(home_score) + '-' + str(away_score) + ' ' + away_team
    
    context = "**IN-GROUP**: " + team  + " || **OUT-GROUP**: " + opp + " || **LIVE-SCORE**: " + result + " || **GAME-DATE**: " + str(date) + " || **PARENT COMMENT**: " + parent_comment
    
    return context

sample['context'] = sample.progress_apply(lambda x: make_context(x), axis=1)

100%|████████████████████████████████████████████████████████████████████████████████████| 1499/1499 [06:37<00:00,  3.77it/s]


In [9]:
final = sample.loc[:, ['split', 'post_id', 'comment_id', 'tokenized_comment', 'context']]
final = final.rename(columns={"tokenized_comment": "source"}).loc[final.split=='test']

In [10]:
final

Unnamed: 0,split,post_id,comment_id,source,context
1,test,pu79mu,he18r6i,[SENT] Is it me or is this game fiery and fast paced tonight ?,**IN-GROUP**: Panthers || **OUT-GROUP**: Texans || **LIVE-SCORE**: Texans 0-7 Panthers || **GAME-DATE**: 2021-09-23 || **PARENT COMMENT**: No parent comment.
10,test,z0937l,ix4sj2p,[SENT] This team implodes in the red zone,**IN-GROUP**: Patriots || **OUT-GROUP**: Jets || **LIVE-SCORE**: Patriots 3-3 Jets || **GAME-DATE**: 2022-11-20 || **PARENT COMMENT**: No parent comment.
17,test,xbm6x0,io146xc,[SENT] The Jets really are the Jetsiest .,**IN-GROUP**: Jets || **OUT-GROUP**: Ravens || **LIVE-SCORE**: Jets 3-24 Ravens || **GAME-DATE**: 2022-09-11 || **PARENT COMMENT**: No parent comment.
18,test,pnqrqd,hcrtr5l,[SENT] Ruggs has the potential to be an all time bust especially if Lamb and Juedy stay on their rise Such a bad pick . [SENT] Ferell and Ruggs are potentially franchise altering mistakes,**IN-GROUP**: Raiders || **OUT-GROUP**: Ravens || **LIVE-SCORE**: Raiders 10-17 Ravens || **GAME-DATE**: 2021-09-13 || **PARENT COMMENT**: No parent comment.
22,test,x9gwgf,inoabnj,[SENT] Ankle . [SENT] He ’s walking around in pads . [SENT] He was almost better than Miller in the first half .,**IN-GROUP**: Bills || **OUT-GROUP**: Rams || **LIVE-SCORE**: Rams 10-17 Bills || **GAME-DATE**: 2022-09-08 || **PARENT COMMENT**: Coming in VERY late - what happened to Oliver?
...,...,...,...,...,...
1475,test,ybqjnm,iti6k03,"[SENT] Ah , they finally called the MASSIVE holding going on . [SENT] Fourth times a charm I guess .",**IN-GROUP**: Chargers || **OUT-GROUP**: Seahawks || **LIVE-SCORE**: Chargers 0-0 Seahawks || **GAME-DATE**: 2022-10-23 || **PARENT COMMENT**: No parent comment.
1476,test,zizm2a,iztpjpb,[SENT] Anyone have a clip of the hit on Huntley ? [SENT] I wo n't shut up about it to my wife and have yet to locate it,**IN-GROUP**: Steelers || **OUT-GROUP**: Ravens || **LIVE-SCORE**: Steelers 7-13 Ravens || **GAME-DATE**: 2022-12-11 || **PARENT COMMENT**: No parent comment.
1481,test,z3yb3a,ixp28r0,[SENT] Protect D here we come !!,**IN-GROUP**: Vikings || **OUT-GROUP**: Patriots || **LIVE-SCORE**: Vikings 33-26 Patriots || **GAME-DATE**: 2022-11-24 || **PARENT COMMENT**: No parent comment.
1483,test,sah8rz,htu3x0j,[SENT] Yes the 4 yard pass to Adam ’s will save us,**IN-GROUP**: Packers || **OUT-GROUP**: 49Ers || **LIVE-SCORE**: Packers 10-10 49Ers || **GAME-DATE**: 2022-01-22 || **PARENT COMMENT**: No parent comment.


In [17]:
annotations = []
files = glob('thresh-internal/crowd/annotations*.json')
for file in files:
    with open(file, 'r', encoding='utf-8') as f:
        new_anns = json.load(f)
        for f in new_anns:
            f['user'] = file.split('-')[1][1]
        annotations += new_anns

done_ids = []
for ann in tqdm(annotations):
    # ann_df['post_id'].append(ann['post_id'])
    done_ids.append(ann['comment_id'])

done_ids = list(set(done_ids))

100%|█████████████████████████████████████████████████████████████████████████████████| 250/250 [00:00<00:00, 2716518.13it/s]


In [25]:
final = final.loc[~final.comment_id.isin(done_ids)].reset_index(drop=True)

In [27]:
final.iloc[50:100]

Unnamed: 0,split,post_id,comment_id,source,context
50,test,100nsti,j2j3tv3,[SENT] wow who could 've guessed the result of that kick ? [SENT] did n't even glance over from my meal,**IN-GROUP**: Buccaneers || **OUT-GROUP**: Panthers || **LIVE-SCORE**: Buccaneers 0-7 Panthers || **GAME-DATE**: 2023-01-01 || **PARENT COMMENT**: No parent comment.
51,test,xbnazb,io0ve0e,"[SENT] Okay , that 's it fire everyone in this organization",**IN-GROUP**: Bears || **OUT-GROUP**: 49Ers || **LIVE-SCORE**: Bears 0-10 49Ers || **GAME-DATE**: 2022-09-11 || **PARENT COMMENT**: No parent comment.
52,test,qosg7q,hjpy1f7,[SENT] This team is weak and it all stems from the head coach . [SENT] Fire Zimmer,**IN-GROUP**: Vikings || **OUT-GROUP**: Ravens || **LIVE-SCORE**: Ravens 31-24 Vikings || **GAME-DATE**: 2021-11-07 || **PARENT COMMENT**: No parent comment.
53,test,106t8ta,j3jcunk,[SENT] winning cures all lmao,"**IN-GROUP**: Chargers || **OUT-GROUP**: Broncos || **LIVE-SCORE**: Broncos 24-20 Chargers || **GAME-DATE**: 2023-01-08 || **PARENT COMMENT**: I guess staley has a point lol. If we bench players and can't find our rhythm against the Jags, we'll question Staley's decision. Staley gets blamed either way lmao"
54,test,rp0uym,hq26rqh,[SENT] The streak is in jeopardy because McVay is forcing the pass when his QB is having a terrible game .,**IN-GROUP**: Rams || **OUT-GROUP**: Vikings || **LIVE-SCORE**: Vikings 10-13 Rams || **GAME-DATE**: 2021-12-26 || **PARENT COMMENT**: The streak is in jeopardy because of Stafford's brain
55,test,xog9do,iq22vz7,[SENT] You should,**IN-GROUP**: Cowboys || **OUT-GROUP**: Giants || **LIVE-SCORE**: Giants 13-20 Cowboys || **GAME-DATE**: 2022-09-26 || **PARENT COMMENT**: I take back everything
56,test,xo2afg,ipx7elr,[SENT] Was that on Jimmy ?,"Series([], Name: clean_comment, dtype: object)"
57,test,qjt4ao,hisjqsn,"[SENT] You do n't "" come back healthy "" from this injury in the same season . [SENT] If he 's out , he 's done for the year because it requires surgery .",**IN-GROUP**: Browns || **OUT-GROUP**: Steelers || **LIVE-SCORE**: Browns 3-3 Steelers || **GAME-DATE**: 2021-10-31 || **PARENT COMMENT**: Mayfield looks horrible. It’s obvious he is hurt. Keenum gives us a better chance to win. Let baker heal so he can come back and win a SB healthy
58,test,q5ex2d,hg5eorj,[SENT] He blocks well and he draws double teams to open up others ( like that Higgins td ) but it ’s definitely not the impact you need from a WR1 .,**IN-GROUP**: Browns || **OUT-GROUP**: Chargers || **LIVE-SCORE**: Chargers 13-10 Browns || **GAME-DATE**: 2021-10-10 || **PARENT COMMENT**: Someone explain to me what obj has done to benefit this team.
59,test,xhkdm7,ioypmp7,[SENT] Pats so lucky haha,**IN-GROUP**: Patriots || **OUT-GROUP**: Steelers || **LIVE-SCORE**: Steelers 6-10 Patriots || **GAME-DATE**: 2022-09-18 || **PARENT COMMENT**: No parent comment.


In [34]:
final.shape

(293, 5)

In [33]:
with open('thresh-internal/crowd/test/sample-6.json', 'w') as f:
    final.iloc[250:300].to_json(f, orient='records', indent=4)