In [1]:
import json
import polars as pl
import re
from thefuzz import fuzz
from datasets import load_dataset
from tqdm.auto import tqdm

##### Load sample political and random (non-sports) data

In [2]:
# Locations of the pre-sampled corpora (the sports sample load is currently disabled).
#sports_df = pl.read_csv('~/sports-language-in-politics/data/processed/sports_sample.csv')
politics_csv = '~/sports-language-in-politics/data/processed/politics_sample.csv'
random_csv = '~/sports-language-in-politics/data/processed/random_sample_no_sports.csv'

# Read each CSV into its own polars DataFrame.
politics_df = pl.read_csv(politics_csv)
random_df = pl.read_csv(random_csv)

##### Extract political and random comments and clean

In [3]:
def _normalize_text(text):
    """Lower-case ``text`` and replace each run of non-alphanumeric characters with one space.

    Args:
        text: Raw comment body, or ``None`` for a missing value.

    Returns:
        The normalised string; a missing value yields ``''`` so that a single
        null row cannot abort the whole cleaning pass with a ``TypeError``.
    """
    return re.sub(r"[^a-zA-Z0-9]+", ' ', text or '').lower()


#sports_comments = [_normalize_text(comment) for comment in sports_df['body'].to_list()]
political_comments = [_normalize_text(comment) for comment in politics_df['body'].to_list()]
random_comments = [_normalize_text(comment) for comment in random_df['body'].to_list()]

##### Filter short comments

In [4]:
# Minimum number of whitespace-separated tokens a comment must have to be kept.
thresh = 20

# Token counts, index-aligned with the cleaned comment lists.
r_lens = [len(text.split()) for text in random_comments]
p_lens = [len(text.split()) for text in political_comments]

# Keep, in original order, only the comments that reach the threshold.
random_comments_long = [
    text for text, n_tokens in zip(random_comments, r_lens) if n_tokens >= thresh
]
political_comments_long = [
    text for text, n_tokens in zip(political_comments, p_lens) if n_tokens >= thresh
]

##### Metaphors

In [5]:
# Load the metaphor dictionary. The encoding is named explicitly so the read does not
# depend on the platform's locale default.
# NOTE(review): this is an absolute, user-specific path, whereas the CSV loads use
# '~/sports-language-in-politics/...'; confirm before running on another machine.
with open('/users/Ujan/sports-language-in-politics/data/processed/meta_dict_2.json', 'r', encoding='utf-8') as fp:
    data = json.load(fp)

In [6]:
# Flatten every value collection of the loaded dictionary into one lower-cased list,
# keeping the dictionary's iteration order.
meta_list = [phrase.lower() for phrases in data.values() for phrase in phrases]
# manual edit
#meta_list[3] = 'fall short of the mark'

##### Samples from random and political comments

In [7]:
# Maximum number of comments kept from each corpus.
# NOTE(review): this reuses the name `thresh`, replacing the value 20 that the
# word-count filter assigned earlier.
thresh = 50000

#wikitext = load_dataset('wikitext', 'wikitext-103-v1')
# sample size
# NOTE(review): the slices are taken from the unfiltered comment lists, not from the
# *_long lists built by the short-comment filter - confirm this is intended.
random_comments, political_comments = random_comments[:thresh], political_comments[:thresh]

##### token_set_ratio tokenizes both input strings, removes duplicate tokens, and calculates the similarity score based on the intersection and union of the token sets

difflib.ratio -> edit distance

token_set_ratio attempts to rule out differences in the strings. Calls ratio on three particular substring sets and returns the max:

- intersection-only and the intersection with remainder of string one
- intersection-only and the intersection with remainder of string two
- intersection with remainder of one and intersection with remainder of two
  
By splitting up the intersection and remainders of the two strings, we're accounting for both how similar and how different the two strings are.

In [8]:
def _mean_token_set_ratio(phrase, comments):
    """Return the mean ``fuzz.token_set_ratio`` of ``phrase`` against every comment.

    Args:
        phrase: The metaphor text to compare.
        comments: List of comment strings to score against.

    Returns:
        The arithmetic mean of the scores, or 0.0 when ``comments`` is empty.
    """
    if not comments:
        return 0.0
    # Divide by the real number of comments: a corpus may hold fewer items than the
    # requested sample size, and dividing by that size would deflate the mean.
    return sum(fuzz.token_set_ratio(phrase, com) for com in comments) / len(comments)


# Per-metaphor mean similarity against each corpus, index-aligned with meta_list.
rand_dists = []
pol_dists = []

# Wrapping the iterable lets tqdm advance and close the bar by itself.
progress_bar = tqdm(meta_list)
for meta in progress_bar:
    rand_dists.append(_mean_token_set_ratio(meta, random_comments))
    pol_dists.append(_mean_token_set_ratio(meta, political_comments))

  0%|          | 0/486 [00:00<?, ?it/s]

##### Paired T test

##### look into fuzzy match code

In [10]:
# Overall mean of the per-metaphor scores for the random corpus.
rand_mean = sum(rand_dists) / len(rand_dists)
print('Average similarity of metaphors with random text: {}'.format(rand_mean))

# Overall mean of the per-metaphor scores for the political corpus.
pol_mean = sum(pol_dists) / len(pol_dists)
print('Average similarity of metaphors with political comments: {}'.format(pol_mean))

Average similarity of metaphors with random text: 22.9193774074074
Average similarity of metaphors with political comments: 22.581753497942376


In [41]:
# Normalise the metaphors with the same substitution used for the comments (runs of
# non-alphanumerics -> one space, lower-cased), then strip the result. A phrase with
# leading or trailing punctuation would otherwise keep an edge space, and a
# word-boundary pattern built from it cannot match at the start or end of a comment.
meta_list = [re.sub(r"[^a-zA-Z0-9]+", ' ', meta).lower().strip() for meta in meta_list]

In [59]:
# Disabled manual pruning of individual entries from meta_list. Every line below is
# commented out, so this cell currently has no effect.
# NOTE(review): 'flat out' appears twice in this list; if the lines are re-enabled,
# the second list.remove call raises ValueError unless the phrase occurs twice.
#meta_list.remove('bout')
#meta_list.remove('out')
#meta_list.remove('close')
#meta_list.remove('ace')
#meta_list.remove('up')
#meta_list.remove('pawn')
#meta_list.remove('win')
#meta_list.remove('check')
#meta_list.remove('check in')
#meta_list.remove('bush')
#meta_list.remove('card')
#meta_list.remove('pass')
#meta_list.remove('flat out')
#meta_list.remove('drawback')
#meta_list.remove('blank')
#meta_list.remove('boner')
#meta_list.remove('hotdog')
#meta_list.remove('iron man')
#meta_list.remove('set up')
#meta_list.remove('flat out')
#meta_list.remove('hit on')
#meta_list.remove('set to')
#meta_list.remove('discard')
#meta_list.remove('dummy')
#meta_list.remove('hold in')

In [60]:
# Count (comment, metaphor) pairs in which the metaphor occurs as a whole word/phrase
# in a political comment.
# Each pattern is compiled once up front instead of being rebuilt for every comment.
# re.escape keeps a phrase literal even if it contains regex metacharacters, and blank
# phrases are skipped because a bare r"\b\b" would match any comment containing a word.
meta_patterns = [
    re.compile(r"\b" + re.escape(meta) + r"\b")
    for meta in meta_list
    if meta.strip()
]

count = 0
for comment in political_comments:
    # search() stops at the first hit; only presence matters, not the number of hits.
    count += sum(1 for pattern in meta_patterns if pattern.search(comment))

print(count)

693


In [61]:
# Count (comment, metaphor) pairs in which the metaphor occurs as a whole word/phrase
# in a random (baseline) comment.
# Each pattern is compiled once up front instead of being rebuilt for every comment.
# re.escape keeps a phrase literal even if it contains regex metacharacters, and blank
# phrases are skipped because a bare r"\b\b" would match any comment containing a word.
meta_patterns = [
    re.compile(r"\b" + re.escape(meta) + r"\b")
    for meta in meta_list
    if meta.strip()
]

count = 0
for comment in random_comments:
    # search() stops at the first hit; only presence matters, not the number of hits.
    count += sum(1 for pattern in meta_patterns if pattern.search(comment))

print(count)

766
