In [12]:
import json
import polars as pl
import re
from thefuzz import fuzz
from datasets import load_dataset
from tqdm.auto import tqdm

##### Load sample sports and political data

In [10]:
#sports_df = pl.read_csv('~/sports-language-in-politics/data/processed/sports_sample.csv')
politics_df = pl.read_csv('~/sports-language-in-politics/data/processed/politics_sample.csv')  # 3203899
random_df = pl.read_csv('~/sports-language-in-politics/data/processed/random_sample_no_sports.csv')  # 9280850

##### Extract sports and political comments and clean

In [3]:
#sports_comments = [re.sub(r"[^a-zA-Z0-9]+", ' ', comment).lower() for comment in sports_df['body'].to_list()]
political_comments = [re.sub(r"[^a-zA-Z0-9]+", ' ', comment).lower() for comment in politics_df['body'].to_list()]
random_comments = [re.sub(r"[^a-zA-Z0-9]+", ' ', comment).lower() for comment in random_df['body'].to_list()]

##### Filter short comments

In [4]:
thresh = 20
random_comments_long = []
political_comments_long = []

r_lens = [len(r.split()) for r in random_comments]
p_lens = [len(c.split()) for c in political_comments]

for i in range(len(random_comments)):
    if r_lens[i] >= thresh:
        random_comments_long.append(random_comments[i])
for i in range(len(political_comments)):
    if p_lens[i] >= thresh:
        political_comments_long.append(political_comments[i])

##### Samples from random and poitical comments

In [7]:
thresh = 50000

#wikitext = load_dataset('wikitext', 'wikitext-103-v1')
# sample size
random_comments = random_comments[:thresh]
political_comments = political_comments[:thresh]

##### token_set_ratio tokenizes both input strings, removes duplicate tokens, and calculates the similarity score based on the intersection and union of the token sets

difflib.ratio -> edit distance

token_set_ratio attempts to rule out differences in the strings. Calls ratio on three particular substring sets and returns the max:

- intersection-only and the intersection with remainder of string one
- intersection-only and the intersection with remainder of string two
- intersection with remainder of one and intersection with remainder of two
  
By splitting up the intersection and remainders of the two strings, we're accounting for both how similar and different the two strings are:

##### look into fuzzy match code

In [38]:
wiki = load_dataset('wikitext','wikitext-103-v1', split='train+test')
wiki_text = wiki['text']
wiki_text = [w for w in wiki_text if w != ''][:thresh]
wiki_text = [re.sub(r"[^a-zA-Z0-9]+", ' ', comment).lower() for comment in wiki_text]

##### Metaphors

In [2]:
with open('/users/Ujan/sports-language-in-politics/data/processed/meta_dict_2.json', 'r') as fp:
    data = json.load(fp)

meta_list = []
for key, values in data.items():
    meta_list.extend(values)

# improve regex
meta_list = [re.sub(r"[^a-zA-Z0-9]+", ' ', meta).lower() for meta in meta_list]

In [3]:
meta_list

['cat and mouse game',
 'play cat and mouse',
 'play possum',
 'roll over and play dead',
 'when the cat s away',
 'beside the point',
 'come unstrung',
 'drawback',
 'fall short of the mark',
 'fall short of the target',
 'high strung',
 'hit the bull s eye',
 'let slip the dogs of war',
 'off color',
 'room to swing a cat in',
 'shoot your bolt',
 'slings and arrows',
 'upshot',
 'athlete s foot',
 'badger',
 'keep it up',
 'bounce back',
 'rubber check',
 'that s the way the ball bounces',
 'your ball game',
 'ball game',
 'ballpark',
 'ballpark figure',
 'baseball size hail',
 'bat an idea around',
 'bat cleanup',
 'batting a thousand',
 'batting average',
 'bench',
 'benched',
 'big league',
 'blank',
 'bleachers',
 'boner',
 'bronx cheer',
 'bush',
 'bush league',
 'call em as you see em',
 'caught napping',
 'caught off base',
 'chalk something up to experience',
 'change of pace',
 'choke',
 'charley horse',
 'close call',
 'clutch hitter',
 'clutch',
 'come off the bench',
 'c

In [51]:
no_list = [
    'bout','out','close','ace','up','pawn','win','check','check in',
    'bush','card','pass','flat out','drawback','blank','boner',
    'hotdog','iron man','set up','flat out','prize','debut',
    'hit on','set to','discard','dummy','hold in','fan', 'gentlemen',
]

meta_list = [m for m in meta_list if m not in no_list]

political comments

In [53]:
count = 0
for comment in political_comments:
    for meta in meta_list:
        if len(re.findall("\\b"+meta+"\\b", comment))>0:
            count += 1

print(count)
# count for each metaphor

457


random comments

In [54]:
count = 0
for comment in random_comments:
    for meta in meta_list:
        if len(re.findall("\\b"+meta+"\\b", comment))>0:
            count += 1

print(count)
# count for each metaphor

430


wikitext

In [55]:
count = 0
for comment in wiki_text:
    for meta in meta_list:
        if len(re.findall("\\b"+meta+"\\b", comment))>0:
            count += 1

print(count)

942


In [14]:
random_subs = random_df['subreddit'].unique().to_list()

In [15]:
random_subs

['StarshipTheory',
 '3gun',
 'BeatTheStreak',
 'GuitarAmps',
 'GifRecipes',
 'MaxMSP',
 'lineapp',
 'fortniteloadouts',
 'bystandertales',
 'e3expo',
 'ABroadInJapan',
 'KalmarReunion',
 'Howwastoday',
 'SXSW',
 'wallpaperengine',
 'pittsburghpanthers',
 'whybrows',
 'freebsd',
 'firebrigade',
 'oldbritishtelly',
 'JuniorDoctorsUK',
 'Open_Science',
 'irishwolfhound',
 'MoviesAnywhere',
 'VWMK7',
 'lebowski',
 'trollbi',
 'leahgotti',
 'impregnation',
 'playarkmobile',
 'Socionics',
 'ShakyBeatsFest',
 'Onodera',
 'minimalcatart',
 'Oahu',
 'GoldenKamuy',
 'Gatlinburg',
 'irezumi',
 'kfeets',
 'emulation',
 'Solving_A858',
 'Kalterkrieg',
 'Sober',
 'usedpanties',
 'RedRiverGorge',
 'digimon',
 'twicemedia',
 'Anne',
 'FellowKids',
 'crustpunk',
 'Manga_Collection',
 'RequestABot',
 'cleganebowl',
 'circlejerkaustralia',
 'StrikeWitches',
 'ems',
 'C418',
 'hittableFaces',
 'starwarsnsfw',
 'Madlib',
 'javahelp',
 'ArtBell',
 'SexiestLeague',
 'motorcyclememes',
 'Old_Recipes',
 'LineR

In [5]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('sentence-transformers/msmarco-distilbert-cos-v5')

Downloading .gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/5.13k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/545 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [8]:
query_embedding = model.encode('bat an idea around')
passage_embedding = model.encode('bat this idea around')

print("Similarity:", util.cos_sim(query_embedding, passage_embedding))

Similarity: tensor([[0.8983]])
