In [1]:
import ast
import csv
import datetime as dt
import json
import random
import re
from collections import Counter
from copy import deepcopy
from difflib import SequenceMatcher
from glob import glob

import ipdb
import numpy as np
import pandas as pd
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters
from tqdm import tqdm

from sklearn.preprocessing import MaxAbsScaler
# NOTE(review): `scaler` is not referenced by any later cell visible in this notebook.
scaler = MaxAbsScaler()

# Display settings: show up to 100 rows/columns and never truncate cell text,
# so long comment strings are fully visible when a DataFrame is rendered.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
# Silence pandas' chained-assignment (SettingWithCopy) warning.
pd.set_option('chained_assignment',None)

# Register tqdm's progress-bar variants of apply on pandas objects.
tqdm.pandas()

# NOTE(review): `word_list` is not referenced by any later cell visible in this notebook.
from nltk.corpus import words
word_list = words.words()

# Set random seeds for reproducibility on a specific machine
random.seed(1)
np.random.seed(1)
# NOTE(review): this constructs a separate RandomState object and discards it
# (its repr is what the cell prints); the global NumPy generator is seeded by
# np.random.seed(1) above, not by this line.
np.random.RandomState(1)

RandomState(MT19937) at 0x178E73140

In [2]:
# Load the gold-standard annotations. The TSV is unquoted and uses a backslash
# as the escape character.
gold = pd.read_csv('../../../data/gold_data.tsv', sep='\t', quoting=csv.QUOTE_NONE, escapechar="\\")

# These three columns are stored as the text form of Python lists (of strings /
# position tuples). Parse them with ast.literal_eval rather than eval so that
# only literals are accepted and nothing in the data file can execute code.
for list_column in ['ref_expressions', 'ref_pos', 'ref_tags']:
    gold[list_column] = gold[list_column].apply(ast.literal_eval)

In [3]:
# Collect the raw annotation records from every export in the working directory.
annotations = []
files = glob('*annotation*.json')
for file in files:
    # Read the whole export first; the file handle is only needed for parsing.
    with open(file, 'r', encoding='utf-8') as f:
        new_anns = json.load(f)
    # Tag each record with its annotator: the first character of the third
    # '-'-separated field of the file name. A distinct loop variable is used so
    # the file handle `f` is no longer shadowed by the records.
    for ann in new_anns:
        ann['user'] = file.split('-')[2][0]
    annotations += new_anns

In [4]:
# Post- and game-level metadata tables.
postinfo = pd.read_csv('../../../data/postInfo.tsv', sep='\t')
gameinfo = pd.read_csv('../../../data/gameInfo.tsv', sep='\t')

# Team table; short team names are lower-cased before being used as keys.
teaminfo = pd.read_csv('../../../data/nfl_teams.csv')
teaminfo['team_name_short'] = teaminfo['team_name_short'].str.lower()

teams = teaminfo['team_name_short'].values.tolist()
subreddits = teaminfo['subreddit'].values.tolist()

# Pair teams and subreddits row by row. zip() follows the actual number of rows
# in the team table instead of assuming there are exactly 32.
teams_to_subreddit = dict(zip(teams, subreddits))
subreddit_to_teams = dict(zip(subreddits, teams))
# Each team initially maps to a one-element list holding its own short name.
team_names_dict = {x: [x] for x in teams}

In [5]:
# Confidence recorded when an edit carries no explicit confidence rating, and
# for comments that have no edits at all.
DEFAULT_CONF = 5

# Column order of the resulting frame.
ANN_COLUMNS = ['post_id', 'comment_id', 'tokenized_comment', 'tagged_comment',
               'ref_expressions', 'ref_pos', 'ref_tags', 'user', 'confs']


def _edit_confidence(edit):
    """Return the confidence of one edit as an int.

    The value is the text after the last '_' in ``annotation['conf']['val']``.
    Edits whose ``annotation`` is None, or has no 'conf' key (some 'ref'
    annotations were made without one), get ``DEFAULT_CONF``.
    """
    annotation = edit['annotation']
    if annotation is not None and 'conf' in annotation:
        return int(annotation['conf']['val'].split('_')[-1])
    return DEFAULT_CONF


def _build_tagged_comment(text, spans, tags):
    """Return ``text`` with every ``(start, end)`` span replaced by its tag.

    ``spans`` must be ordered by start offset and parallel to ``tags``.
    With no spans the result is the empty string (not the untouched text).
    """
    if not spans:
        return ''
    pieces = [text[:spans[0][0]]]
    for k, (span, tag) in enumerate(zip(spans, tags)):
        # Text between this span's end and the next span's start; for the last
        # span, next_start is None so the slice runs to the end of the comment.
        next_start = spans[k + 1][0] if k + 1 < len(spans) else None
        pieces.append(tag + text[span[1]:next_start])
    return ''.join(pieces)


def _annotation_to_record(ann):
    """Flatten one raw annotation dict into a row of ``ann_df``.

    Reads 'post_id', 'comment_id', 'source', 'user' and 'edits' from ``ann``.
    Each edit contributes its character span (``input_idx[0]``), the covered
    text, a '<category>' tag and a confidence value.
    """
    text = ann['source']

    # Order edits by start offset so the tagged comment can be stitched left to
    # right. The sort is done in place, so ``annotations`` ends up sorted too.
    edits = ann['edits']
    edits.sort(key=lambda e: e['input_idx'][0][0])

    spans = [(e['input_idx'][0][0], e['input_idx'][0][1]) for e in edits]
    tags = ['<' + e['category'] + '>' for e in edits]
    confs = [_edit_confidence(e) for e in edits] or [DEFAULT_CONF]

    return {
        'post_id': ann['post_id'],
        'comment_id': ann['comment_id'],
        'tokenized_comment': text,
        'tagged_comment': _build_tagged_comment(text, spans, tags),
        'ref_expressions': [text[start:end] for start, end in spans],
        'ref_pos': spans,
        'ref_tags': tags,
        'user': ann['user'],
        'confs': confs,
    }


# One row per (comment, annotator); build all records first, then the frame once.
ann_df = pd.DataFrame([_annotation_to_record(ann) for ann in tqdm(annotations)], columns=ANN_COLUMNS)
ann_df.shape

100%|███████████████████████████████████████████████████████████████████████████████████████████| 1179/1179 [00:00<00:00, 29018.06it/s]


(1179, 9)

In [6]:
def _to_bracket_tags(text):
    """Rewrite the angle-bracket tags '<in>', '<out>', '<other>' as '[IN]', '[OUT]', '[OTHER]'."""
    for angle_tag, bracket_tag in (('<in>', '[IN]'), ('<out>', '[OUT]'), ('<other>', '[OTHER]')):
        text = text.replace(angle_tag, bracket_tag)
    return text


# Apply the same rewrite to the tagged comment string and to every element of
# the per-comment expression and tag lists.
ann_df.tagged_comment = ann_df.tagged_comment.apply(_to_bracket_tags)
ann_df.ref_expressions = ann_df.ref_expressions.apply(lambda exprs: [_to_bracket_tags(expr) for expr in exprs])
ann_df.ref_tags = ann_df.ref_tags.apply(lambda tags: [_to_bracket_tags(tag) for tag in tags])

In [7]:
# Spot-check three randomly drawn annotation rows.
ann_df.sample(3)

Unnamed: 0,post_id,comment_id,tokenized_comment,tagged_comment,ref_expressions,ref_pos,ref_tags,user,confs
660,s00mf8,hrzdn5h,[SENT] Watching this team is a damn roller - coaster,[SENT] Watching [IN] is a damn roller - coaster,[this team],"[(16, 25)]",[[IN]],m,[3]
548,qi08vc,higjg47,[SENT] Maybe our Defense is legit,[SENT] Maybe [IN] [IN] is legit,"[our, Defense]","[(13, 16), (17, 24)]","[[IN], [IN]]",a,"[5, 5]"
1062,z6b1l1,iy0vudc,"[SENT] The football gods giveth , the football gods taketh",,[],[],[],a,[5]


In [8]:
# (rows, columns) of the full gold set.
gold.shape

(1499, 21)

In [9]:
# Keep only the gold rows whose `split` column is 'test'.
is_test_row = gold['split'] == 'test'
test = gold[is_test_row]
test.shape

(318, 21)

In [10]:
# Distinct annotator ids present in the loaded annotations.
ann_df['user'].unique()

array(['m', 'a', 'k'], dtype=object)

## Inter-annotator agreement (IAA)

In [11]:
# Restrict the test split to comments that appear in the annotation data.
annotated_ids = ann_df['comment_id'].unique()
test = test[test['comment_id'].isin(annotated_ids)]
test.shape

(318, 21)

In [12]:
# Preview the first annotation rows.
ann_df.head()

Unnamed: 0,post_id,comment_id,tokenized_comment,tagged_comment,ref_expressions,ref_pos,ref_tags,user,confs
0,yhbqwu,iueu4ch,[SENT] Dante has played well other then that offsides when were n't allowed to see !,[SENT] [IN] has played well other then that offsides when were n't allowed to see !,[Dante],"[(7, 12)]",[[IN]],m,[5]
1,q0ta6h,hfb1e8j,[SENT] I mean I like the idea of a top 10 pick but .... we 're about to be tied with the Jets here ..... it's bad,[SENT] I mean I like the idea of a top 10 pick but .... [IN] about to be tied with [OTHER] here ..... it's bad,"[we 're, the Jets]","[(56, 62), (85, 93)]","[[IN], [OTHER]]",m,"[5, 5]"
2,zudthn,j1iyyse,[SENT] I sometimes feel like we ’re just tanking these last games,[SENT] I sometimes feel like [IN] just tanking these last games,[we ’re],"[(29, 35)]",[[IN]],m,[5]
3,rfujw5,hogvith,"[SENT] And ESPN wonders why people tune into the Manningcast over their main broadcast , they suck","[SENT] And [OTHER] wonders why [OTHER] tune into [OTHER] over [OTHER] , [OTHER] suck","[ESPN, people, the Manningcast, their main broadcast, they]","[(11, 15), (28, 34), (45, 60), (66, 86), (89, 93)]","[[OTHER], [OTHER], [OTHER], [OTHER], [OTHER]]",m,"[5, 5, 5, 5, 5]"
4,q5bp35,hg4pmv1,"[SENT] Hurts is n't playing well , but this legitimately might be the worst game plan we 've seen in a long long time .","[SENT] [IN] is n't playing well , but this legitimately might be the worst game plan [IN] seen in a long long time .","[Hurts, we 've]","[(7, 12), (86, 92)]","[[IN], [IN]]",m,"[5, 5]"


In [13]:
# Minimum difflib similarity ratio for two expressions to count as a match.
MATCH_RATIO = 0.5

# Per-annotator agreement with the gold annotation: one score in [0, 1] per test comment.
user_agrees = {x: [] for x in ['k', 'a', 'm']}
# Confidence lists of the compared annotations, bucketed by outcome. Each entry
# is itself a list (one per annotation row); later cells flatten them.
conf_agrees = []
conf_disagrees = []

for i, row in test.iterrows():
    comment_id = row.comment_id
    ref_exps = row.ref_expressions
    ref_tags = row.ref_tags
    for user in user_agrees.keys():
        new_row = ann_df.loc[((ann_df.comment_id==comment_id) & (ann_df.user==user))]
        new_exps = new_row.ref_expressions.values[0]
        new_tags = new_row.ref_tags.values[0]
        row_confs = new_row.confs.values.tolist()

        # Exact match on both expressions and tags: full agreement.
        if new_exps == ref_exps and new_tags == ref_tags:
            user_agrees[user].append(1)
            conf_agrees += row_confs
            continue

        # One side has no expressions while the other does: full disagreement.
        min_overlap = min(len(ref_exps), len(new_exps))
        if min_overlap == 0:
            user_agrees[user].append(0)
            conf_disagrees += row_confs
            continue

        # Partial credit: compare expressions position by position (the k-th gold
        # expression against the k-th annotator expression) and count those that
        # are similar enough and carry the same tag. The position variable no
        # longer reuses the name of the outer row index `i`.
        count = 0
        for pos in range(min_overlap):
            similar = SequenceMatcher(None, ref_exps[pos], new_exps[pos]).ratio() > MATCH_RATIO
            if similar and ref_tags[pos] == new_tags[pos]:
                count += 1
        user_agrees[user].append(count / min_overlap)
        # NOTE(review): confidences land in the "agree" bucket for every partial
        # comparison, including count == 0 - confirm this is intended.
        conf_agrees += row_confs

# Mean agreement per annotator (rounded to 2 dp), then mean and std across annotators.
total = test.shape[0]
scores = [np.round(sum(user_agrees[user]) / total, 2) for user in ['k', 'a', 'm']]
np.mean(scores), np.std(scores)

(0.6466666666666666, 0.004714045207910321)

In [14]:
# Mean confidence for agreeing vs. disagreeing annotations. The buckets are
# lists of lists; flatten them with a linear comprehension rather than the
# quadratic sum(list_of_lists, []).
agree_confs = [conf for confs in conf_agrees for conf in confs]
disagree_confs = [conf for confs in conf_disagrees for conf in confs]
np.round(np.mean(agree_confs), 2), np.round(np.mean(disagree_confs), 2)

(4.77, 4.46)

In [15]:
# Standard deviation of confidence for agreeing vs. disagreeing annotations.
# The buckets are lists of lists; flatten them with a linear comprehension
# rather than the quadratic sum(list_of_lists, []).
agree_confs = [conf for confs in conf_agrees for conf in confs]
disagree_confs = [conf for confs in conf_disagrees for conf in confs]
np.round(np.std(agree_confs), 2), np.round(np.std(disagree_confs), 2)

(0.81, 1.22)

## Fleiss' kappa

In [16]:
users = ['k', 'a', 'm']
tag_map = {'[IN]': 1, '[OUT]': 2, '[OTHER]': 3}
# Category recorded when an annotator has no tag at a given position.
NO_TAG = 0

# One row per rating subject, one column per annotator (in `users` order).
ratings_table = []

for i, row in test.iterrows():
    comment_id = row.comment_id
    # Subjects are the positions of the gold expressions; a comment with no gold
    # expression still contributes one subject. Annotator tags beyond the gold
    # count are not rated.
    num_subjects = max(1, len(row.ref_expressions))

    # Fetch each annotator's expressions and tags once per comment; the lookup
    # does not depend on the position index, so it is hoisted out of that loop.
    user_annotations = []
    for user in users:
        new_row = ann_df.loc[((ann_df.comment_id==comment_id) & (ann_df.user==user))]
        user_annotations.append((new_row.ref_expressions.values[0], new_row.ref_tags.values[0]))

    for ind in range(num_subjects):
        rating_subject = []
        for new_exps, new_tags in user_annotations:
            # Tags are aligned purely by position, not by character span.
            if new_exps != [] and ind < len(new_tags):
                rating_subject.append(tag_map[new_tags[ind]])
            else:
                rating_subject.append(NO_TAG)
        ratings_table.append(rating_subject)

# Convert per-rater labels into per-category counts, then compute Fleiss' kappa.
table, categories = aggregate_raters(ratings_table)
np.round(fleiss_kappa(table),2)

0.69

In [17]:
# Annotation rows whose comment belongs to the test split.
in_test_split = ann_df['comment_id'].isin(test.comment_id.values)
ann_df_test = ann_df[in_test_split]
ann_df_test.shape

(954, 9)

In [18]:
# Preview the annotation rows again before they are written to disk.
ann_df.head()

Unnamed: 0,post_id,comment_id,tokenized_comment,tagged_comment,ref_expressions,ref_pos,ref_tags,user,confs
0,yhbqwu,iueu4ch,[SENT] Dante has played well other then that offsides when were n't allowed to see !,[SENT] [IN] has played well other then that offsides when were n't allowed to see !,[Dante],"[(7, 12)]",[[IN]],m,[5]
1,q0ta6h,hfb1e8j,[SENT] I mean I like the idea of a top 10 pick but .... we 're about to be tied with the Jets here ..... it's bad,[SENT] I mean I like the idea of a top 10 pick but .... [IN] about to be tied with [OTHER] here ..... it's bad,"[we 're, the Jets]","[(56, 62), (85, 93)]","[[IN], [OTHER]]",m,"[5, 5]"
2,zudthn,j1iyyse,[SENT] I sometimes feel like we ’re just tanking these last games,[SENT] I sometimes feel like [IN] just tanking these last games,[we ’re],"[(29, 35)]",[[IN]],m,[5]
3,rfujw5,hogvith,"[SENT] And ESPN wonders why people tune into the Manningcast over their main broadcast , they suck","[SENT] And [OTHER] wonders why [OTHER] tune into [OTHER] over [OTHER] , [OTHER] suck","[ESPN, people, the Manningcast, their main broadcast, they]","[(11, 15), (28, 34), (45, 60), (66, 86), (89, 93)]","[[OTHER], [OTHER], [OTHER], [OTHER], [OTHER]]",m,"[5, 5, 5, 5, 5]"
4,q5bp35,hg4pmv1,"[SENT] Hurts is n't playing well , but this legitimately might be the worst game plan we 've seen in a long long time .","[SENT] [IN] is n't playing well , but this legitimately might be the worst game plan [IN] seen in a long long time .","[Hurts, we 've]","[(7, 12), (86, 92)]","[[IN], [IN]]",m,"[5, 5]"


In [19]:
# Both exports share one column order and one TSV dialect (tab-separated,
# unquoted, backslash-escaped, no index column).
export_columns = ['post_id', 'comment_id', 'user', 'tokenized_comment', 'tagged_comment', 'ref_expressions', 'ref_pos', 'ref_tags', 'confs']
tsv_options = dict(sep='\t', quoting=csv.QUOTE_NONE, escapechar="\\", index=False, columns=export_columns)

# All annotations, then only those for test-split comments.
ann_df.to_csv('../../../data/ann_data_full.tsv', **tsv_options)
ann_df_test.to_csv('../../../data/ann_data_test.tsv', **tsv_options)

In [22]:
# Release copy for the intergroup-nfl repository: `user` is exposed as
# `annotator_id` and `tokenized_comment` is left out of the written columns.
# The rename is applied to a copy so `ann_df` keeps its `user` column and the
# earlier export cell can still be re-run after this one.
ann_df_release = ann_df.rename(columns={'user': 'annotator_id'})
ann_df_release.to_csv('../../../../intergroup-nfl/data/ann_data.tsv', sep='\t', quoting=csv.QUOTE_NONE, escapechar="\\", index=False, columns=['post_id', 'comment_id', 'annotator_id', 'tagged_comment', 'ref_expressions', 'ref_pos', 'ref_tags', 'confs'])