In [26]:
import polars as pl
import json
import statistics
import re

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.cluster import AgglomerativeClustering

##### Load data

In [27]:
data_dir = '/Volumes/PortableSSD/CSS/data/processed/'

In [28]:
data_df = pl.read_csv(data_dir+'posts_2015-21_ps_min_2c_politics.csv')
data_df.head()

id,created_utc,subreddit,category,super_category,author,domain,url,title,selftext,num_comments,score,gilded,upvote_ratio
str,i64,str,str,str,str,str,str,str,str,i64,i64,i64,f64
"""589v04""",1476878103,"""The_Donald""","""politics_2019""","""politics""","""gmousasi""","""i.redd.it""","""https://i.redd…","""Just a rare ba…","""""",17,1242,0,100.0
"""589ygu""",1476879588,"""Enough_Sanders…","""politics_2019""","""politics""","""12-juin-3049""","""reddit.com""","""http://www.red…","""Bernout gets e…","""""",12,28,0,100.0
"""58a7z5""",1476883248,"""EnoughTrumpSpa…","""politics_2019""","""politics""","""TheIronTARDIS""","""np.reddit.com""","""http://np.redd…","""In case you ne…","""""",38,371,0,100.0
"""58aa2h""",1476883994,"""politics""","""politics_2019""","""politics""","""Naggers123""","""breitbart.com""","""http://www.bre…","""Poll: Hillary …","""""",170,305,0,100.0
"""58bnuv""",1476899246,"""politics""","""politics_2019""","""politics""","""Metaprinter""","""gop.com""","""https://gop.co…","""Trump asking t…","""""",38,24,0,100.0


##### Data exploration

In [6]:
# keep id, num_comments etc.
# metaphor match on clusters
    # framebert to remove non metaphors
# engagement in metaphor posts vs non metaphor posts within cluster
    # num comments

##### Filter image domains

In [7]:
no_domains = ["i.imgur.com","imgur.com","i.reddituploads.com","i.sli.mg","i.magaimg.net","gfycat.com","pbs.twimg.com","sli.mg"]

data_df = data_df.filter(~pl.col('domain').is_in(no_domains))

##### Filter images

In [8]:
no_imgs = ['jpeg', 'png', 'tiff', 'gif', 'jpg']

data_df = data_df.filter(~pl.col('url').str.contains_any(no_imgs))

##### Filter small posts

In [9]:
min_chars = 10

data_df = data_df.filter(pl.col('title').str.len_chars() >= min_chars)

##### Cast datetime

In [10]:
MILLISECONDS_IN_SECOND = 1000

datetimes = data_df.select((pl.col("created_utc") * MILLISECONDS_IN_SECOND).cast(
    pl.Datetime).dt.with_time_unit("ms").alias("datetime"))

data_df.replace("created_utc", datetimes['datetime'].dt.date())

  pl.Datetime).dt.with_time_unit("ms").alias("datetime"))
    df = df.with_columns(new_column.alias(column_name))
instead.
  data_df.replace("created_utc", datetimes['datetime'].dt.date())


id,created_utc,subreddit,category,super_category,author,domain,url,title,selftext,num_comments,score,gilded,upvote_ratio
str,date,str,str,str,str,str,str,str,str,i64,i64,i64,f64
"""589ygu""",2016-10-19,"""Enough_Sanders…","""politics_2019""","""politics""","""12-juin-3049""","""reddit.com""","""http://www.red…","""Bernout gets e…","""""",12,28,0,100.0
"""58a7z5""",2016-10-19,"""EnoughTrumpSpa…","""politics_2019""","""politics""","""TheIronTARDIS""","""np.reddit.com""","""http://np.redd…","""In case you ne…","""""",38,371,0,100.0
"""58aa2h""",2016-10-19,"""politics""","""politics_2019""","""politics""","""Naggers123""","""breitbart.com""","""http://www.bre…","""Poll: Hillary …","""""",170,305,0,100.0
"""58bnuv""",2016-10-19,"""politics""","""politics_2019""","""politics""","""Metaprinter""","""gop.com""","""https://gop.co…","""Trump asking t…","""""",38,24,0,100.0
"""58c5tw""",2016-10-19,"""ShitRConservat…","""politics_2019""","""politics""","""bluefootedpig""","""reddit.com""","""https://www.re…","""Chab can't und…","""""",2,1,0,100.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2up5gp""",2015-02-04,"""CanadaPolitics…","""politics_2019""","""politics""","""blueberryfickl…","""self.CanadaPol…","""http://www.red…","""Are you curren…","""""",45,10,0,100.0
"""2uvtx0""",2015-02-05,"""Shitstatistssa…","""politics_2019""","""politics""","""highdra""","""investorplace.…","""http://investo…","""The state save…","""""",2,14,0,100.0
"""2uwrz1""",2015-02-05,"""politics""","""politics_2019""","""politics""","""Katalyna""","""cnn.com""","""http://www.cnn…","""What will Puti…","""""",3,6,0,100.0
"""2vdrv2""",2015-02-10,"""politics""","""politics_2019""","""politics""","""TwoGee""","""thinkprogress.…","""http://thinkpr…","""A federal cour…","""""",21,89,0,100.0


In [16]:
len(data_df['created_utc'].unique().to_list())

2407

In [9]:
day_one = data_df['created_utc'].unique()[1000]

In [10]:
day1_df = data_df.filter(pl.col('created_utc')==day_one)

In [11]:
day1_df.head()

id,created_utc,subreddit,category,super_category,author,domain,url,title,selftext,num_comments,score,gilded,upvote_ratio
str,date,str,str,str,str,str,str,str,str,i64,i64,i64,f64
"""72q8ds""",2017-09-27,"""politics""","""politics_2019""","""politics""","""iamnewredditer…","""nypost.com""","""http://nypost.…","""Trump: Even Us…","""""",33,0,0,100.0
"""72r86e""",2017-09-27,"""The_Donald""","""politics_2019""","""politics""","""Matterak""","""youtu.be""","""https://youtu.…","""It looks like …","""""",5,4,0,100.0
"""72w4iy""",2017-09-27,"""The_Donald""","""politics_2019""","""politics""","""godemperorsecr…","""youtube.com""","""https://www.yo…","""Roy Moore expr…","""""",6,8,0,100.0
"""72qvd4""",2017-09-27,"""socialism""","""politics_2019""","""politics""","""AutoModerator""","""self.socialism…","""https://www.re…","""Daily Discussi…","""Feel free to d…",20,10,0,100.0
"""72s1kg""",2017-09-27,"""politics""","""politics_2019""","""politics""","""[deleted]""","""politico.com""","""http://www.pol…","""Russian-Funded…","""[deleted]""",43,50,0,100.0


In [12]:
len(day1_df)

3442

In [13]:
posts = day1_df['title'].to_list()
ids = day1_df['id'].to_list()

In [14]:
posts = [post.replace("'", '') for post in posts]
posts = [re.sub(r"[^a-zA-Z0-9.]+", ' ', post).lower() for post in posts]

In [15]:
posts = list(set(posts))

In [16]:
len(posts)

3041

In [17]:
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [18]:
embeddings = model.encode(posts)
print(embeddings)

[[-3.79345790e-02  6.82645142e-02  3.11806612e-02 ... -3.55560728e-03
  -3.34419161e-02 -1.96293201e-02]
 [-3.60639691e-02  4.61865291e-02 -4.88960110e-02 ... -1.66006349e-02
   2.93890350e-02  3.72065194e-02]
 [-7.61352330e-02  3.55548337e-02 -9.46362037e-03 ... -3.33440900e-02
  -3.90029475e-02 -3.47849391e-02]
 ...
 [-1.04269544e-04  1.34999529e-01 -1.12017207e-02 ...  1.81642827e-02
   1.04403812e-02 -3.50751840e-02]
 [-6.35885522e-02  2.33104061e-02 -2.58716028e-02 ... -3.30541842e-02
   9.30643175e-03  6.67301938e-02]
 [-2.03776173e-02  5.19978777e-02  2.39353776e-02 ...  4.33782674e-03
   1.88888367e-02 -2.41267420e-02]]


In [19]:
embeddings.shape

(3041, 768)

In [20]:
dists = cosine_distances(embeddings, embeddings)

In [21]:
# 0.4

threshold = 0.4

cluster = AgglomerativeClustering(metric='precomputed', linkage='average', n_clusters=None, distance_threshold=threshold)
clusters = cluster.fit(dists)

In [22]:
clusters.n_clusters_

1832

In [23]:
l2post = {}

for l in range(len(clusters.labels_)):
    if clusters.labels_[l] not in l2post:
        l2post[clusters.labels_[l]] = [posts[l]] 
    else:
        l2post[clusters.labels_[l]].append(posts[l])

In [24]:
count = 0

for key, val in l2post.items():
    if len(val) > 1:
        print(val)
        print('END')
        print(' ')
        count += 1

count

['donald trump cuts barack obama s refugee policy in half accepts more christians than muslims', 'president trump cuts obama s refugee target in half takes more christians than muslims ', 'trump caps refugee resettlement at 45 000 marking all time low', 'donald trump cuts barack obama s refugee policy in half takes more christians than muslims', 'trump administration to drop refugee cap to 45 000 lowest in years', 'trump plans to slash refugee admissions to lowest level since 2006', 'trump cuts obama s refugee target in half takes more christians than muslims', 'trump slashes refugee admissions to 45 000', 'president trump cuts obamas refugee target in half slashes syrian intake by over 80 and prioritizes christian applicants']
END
 
['the latest gop tax plan nearly doubles standard deduction', 'the doubled standard deduction in the gop tax plan is a lie', 'gop tax framework', 'gop tax overhaul aims for corporate cuts simpler code', 'gop tax plan overview', 'trump gop tax plan cuts rat

421

In [39]:
with open(data_dir+'post_clusters_5.json') as f:
    cluster_data = json.load(f)

In [40]:
for day, clusters in cluster_data.items():
    for c_id, p_ids in clusters.items():
        if len(p_ids) > 1:
            print(data_df.filter(pl.col('id').is_in(p_ids))['title'].to_list())
            print('END')
            print('')
    break

['‘I misspoke one word’: Kellyanne Conway calls her critics ‘haters’ after Bowling Green fiasco', 'Conway says she misspoke on Iraqi terrorists, calls some critics ‘haters’']
END

['Seattle federal judge grants temporary restraining order on immigration ban on nationwide basis', 'US judge temporarily blocks Trump’s travel ban nationwide', "Trump's immigration order temporarily halted nationwide", 'Trump immigration ban temporarily blocked nationwide by court', 'Judge Blocks Immigration Order; President Trump Responds', 'What is going on with the travel ban?', 'Seattle Judge is Wrong and Overreached With Nationwide Injunction on Travel Ban', 'Trump Blasts Ruling of "So-Called Judge" as DHS Forced to Halt Muslim Ban', 'TRUMP HITS BACK: White House Files Emergency Request To Stop Judge’s Halt Of The Ban!', 'Read the judges order to halt travel ban', 'Judge in Seattle halts Trump’s immigration order nationwide', "Us Judge Temporarily Blocks President Trump's Travel Ban Nationwide", 'Seattl

KeyboardInterrupt: 

In [38]:
len(data_df)

9127279