In [40]:
import json
import os
import re

import polars as pl
from datasets import load_dataset
from thefuzz import fuzz
from tqdm.auto import tqdm

##### Load sample sports and political data

In [2]:
# Read the pre-sampled sports and politics comment tables (one CSV each)
sports_df, politics_df = (
    pl.read_csv(csv_path)
    for csv_path in (
        '~/sports-language-in-politics/data/processed/sports_sample.csv',
        '~/sports-language-in-politics/data/processed/politics_sample.csv',
    )
)

##### Extract sports and political comments and clean

In [3]:
# Collapse every run of characters other than ASCII letters/digits into a
# single space, then lowercase the text of each entry in the `body` column.
non_alnum_run = re.compile(r"[^a-zA-Z0-9]+")
sports_comments = [non_alnum_run.sub(' ', body).lower() for body in sports_df['body'].to_list()]
political_comments = [non_alnum_run.sub(' ', body).lower() for body in politics_df['body'].to_list()]

##### Filter short comments

In [4]:
# Minimum number of whitespace-separated words a comment must have to be kept
thresh = 20

# Word count of every cleaned comment
s_lens = [len(text.split()) for text in sports_comments]
p_lens = [len(text.split()) for text in political_comments]

# Keep only the comments that reach the word-count threshold
sports_comments_long = [
    text for text, n_words in zip(sports_comments, s_lens) if n_words >= thresh
]
political_comments_long = [
    text for text, n_words in zip(political_comments, p_lens) if n_words >= thresh
]

##### Metaphors

In [6]:
# Load the metaphor dictionary from JSON.
# The path is resolved against the current user's home directory (the CSV
# paths in the data-loading cell are already `~`-relative) instead of one
# specific user's absolute path, so the notebook runs on other machines.
meta_dict_path = os.path.expanduser(
    '~/sports-language-in-politics/data/processed/meta_dict_1.json'
)
# Explicit encoding so the result does not depend on the platform default.
with open(meta_dict_path, 'r', encoding='utf-8') as fp:
    data = json.load(fp)

In [7]:
# Flatten every list of phrases stored in the dictionary into one lowercase list
meta_list = [phrase.lower() for phrases in data.values() for phrase in phrases]
# manual edit: overwrite the entry at position 3
meta_list[3] = 'fall short of the mark'

##### Samples from wikitext and political comments

In [35]:
# Number of wikitext rows / political comments to compare the metaphors against.
# NOTE(review): this rebinds `thresh`, which the "Filter short comments" cell
# uses as a minimum word count (20); re-running that cell after this one would
# filter with 50000 instead.
thresh = 50000

wikitext = load_dataset('wikitext', 'wikitext-103-v1')
# sample size
# Slice the rows first and select the column afterwards, so only the first
# `thresh` rows are turned into Python strings rather than the whole column.
# NOTE(review): wikitext rows presumably include blank lines and section
# headings -- confirm whether those should be filtered before scoring.
wiki = wikitext['train'][:thresh]['text']
political_comments = political_comments[:thresh]

##### token_set_ratio tokenizes both input strings, removes duplicate tokens, and returns a 0–100 similarity score obtained by comparing the tokens the two strings share against each string's remaining tokens (word order and repeated words are ignored)

In [41]:
def _mean_token_set_ratio(phrase, texts):
    """Return the mean ``fuzz.token_set_ratio`` of ``phrase`` against ``texts``.

    Args:
        phrase: The string compared against every element of ``texts``.
        texts: A sequence of strings to score ``phrase`` against.

    Returns:
        The arithmetic mean of the individual scores as a float, or ``0.0``
        when ``texts`` is empty.
    """
    scores = [fuzz.token_set_ratio(phrase, text) for text in texts]
    # Divide by the number of texts actually scored: the corpora are sliced to
    # at most the requested sample size and may hold fewer items, in which case
    # dividing by the requested size would understate the mean.
    return sum(scores) / len(scores) if scores else 0.0


# Per-metaphor mean similarity against each corpus, in `meta_list` order
wiki_dists = []
pol_dists = []

# Iterating the bar itself advances it once per metaphor and closes it at the end
progress_bar = tqdm(meta_list)
for meta in progress_bar:
    wiki_dists.append(_mean_token_set_ratio(meta, wiki))
    pol_dists.append(_mean_token_set_ratio(meta, political_comments))

  0%|          | 0/749 [00:00<?, ?it/s]

In [53]:
# Mean of the per-metaphor averages for each corpus
wiki_mean = sum(wiki_dists) / len(wiki_dists)
pol_mean = sum(pol_dists) / len(pol_dists)

# Report both means, a blank separator line, then which corpus scored higher
print('Average similarity of metaphors with wiki text: {}'.format(wiki_mean))
print('Average similarity of metaphors with political comments: {}'.format(pol_mean))
print('')
print('Metaphors more similar to political comments? {}'.format(pol_mean > wiki_mean))

Average similarity of metaphors with wiki text: 14.445885767690235
Average similarity of metaphors with political comments: 25.81442053404541

Metaphors more similar to political comments? True
