In [1]:
import polars as pl
import json
import statistics
import re

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.cluster import AgglomerativeClustering

##### Load data

In [2]:
data_dir = '/Volumes/PortableSSD/CSS/data/processed/'

In [3]:
data_df = pl.read_csv(data_dir+'posts_2015-21_ps_min_2c_politics.csv')
data_df.head()

id,created_utc,subreddit,category,super_category,author,domain,url,title,selftext,num_comments,score,gilded,upvote_ratio
str,i64,str,str,str,str,str,str,str,str,i64,i64,i64,f64
"""589v04""",1476878103,"""The_Donald""","""politics_2019""","""politics""","""gmousasi""","""i.redd.it""","""https://i.redd…","""Just a rare ba…","""""",17,1242,0,100.0
"""589ygu""",1476879588,"""Enough_Sanders…","""politics_2019""","""politics""","""12-juin-3049""","""reddit.com""","""http://www.red…","""Bernout gets e…","""""",12,28,0,100.0
"""58a7z5""",1476883248,"""EnoughTrumpSpa…","""politics_2019""","""politics""","""TheIronTARDIS""","""np.reddit.com""","""http://np.redd…","""In case you ne…","""""",38,371,0,100.0
"""58aa2h""",1476883994,"""politics""","""politics_2019""","""politics""","""Naggers123""","""breitbart.com""","""http://www.bre…","""Poll: Hillary …","""""",170,305,0,100.0
"""58bnuv""",1476899246,"""politics""","""politics_2019""","""politics""","""Metaprinter""","""gop.com""","""https://gop.co…","""Trump asking t…","""""",38,24,0,100.0


##### Data exploration

In [6]:
# keep id, num_comments etc.
# metaphor match on clusters
    # framebert to remove non metaphors
# engagement in metaphor posts vs non metaphor posts within cluster
    # num comments

##### Filter image domains

In [7]:
no_domains = ["i.imgur.com","imgur.com","i.reddituploads.com","i.sli.mg","i.magaimg.net","gfycat.com","pbs.twimg.com","sli.mg"]

data_df = data_df.filter(~pl.col('domain').is_in(no_domains))

##### Filter images

In [8]:
no_imgs = ['jpeg', 'png', 'tiff', 'gif', 'jpg']

data_df = data_df.filter(~pl.col('url').str.contains_any(no_imgs))

##### Filter small posts

In [9]:
min_chars = 10

data_df = data_df.filter(pl.col('title').str.len_chars() >= min_chars)

##### Cast datetime

In [10]:
MILLISECONDS_IN_SECOND = 1000

datetimes = data_df.select((pl.col("created_utc") * MILLISECONDS_IN_SECOND).cast(
    pl.Datetime).dt.with_time_unit("ms").alias("datetime"))

data_df.replace("created_utc", datetimes['datetime'].dt.date())

  pl.Datetime).dt.with_time_unit("ms").alias("datetime"))
    df = df.with_columns(new_column.alias(column_name))
instead.
  data_df.replace("created_utc", datetimes['datetime'].dt.date())


id,created_utc,subreddit,category,super_category,author,domain,url,title,selftext,num_comments,score,gilded,upvote_ratio
str,date,str,str,str,str,str,str,str,str,i64,i64,i64,f64
"""589ygu""",2016-10-19,"""Enough_Sanders…","""politics_2019""","""politics""","""12-juin-3049""","""reddit.com""","""http://www.red…","""Bernout gets e…","""""",12,28,0,100.0
"""58a7z5""",2016-10-19,"""EnoughTrumpSpa…","""politics_2019""","""politics""","""TheIronTARDIS""","""np.reddit.com""","""http://np.redd…","""In case you ne…","""""",38,371,0,100.0
"""58aa2h""",2016-10-19,"""politics""","""politics_2019""","""politics""","""Naggers123""","""breitbart.com""","""http://www.bre…","""Poll: Hillary …","""""",170,305,0,100.0
"""58bnuv""",2016-10-19,"""politics""","""politics_2019""","""politics""","""Metaprinter""","""gop.com""","""https://gop.co…","""Trump asking t…","""""",38,24,0,100.0
"""58c5tw""",2016-10-19,"""ShitRConservat…","""politics_2019""","""politics""","""bluefootedpig""","""reddit.com""","""https://www.re…","""Chab can't und…","""""",2,1,0,100.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2up5gp""",2015-02-04,"""CanadaPolitics…","""politics_2019""","""politics""","""blueberryfickl…","""self.CanadaPol…","""http://www.red…","""Are you curren…","""""",45,10,0,100.0
"""2uvtx0""",2015-02-05,"""Shitstatistssa…","""politics_2019""","""politics""","""highdra""","""investorplace.…","""http://investo…","""The state save…","""""",2,14,0,100.0
"""2uwrz1""",2015-02-05,"""politics""","""politics_2019""","""politics""","""Katalyna""","""cnn.com""","""http://www.cnn…","""What will Puti…","""""",3,6,0,100.0
"""2vdrv2""",2015-02-10,"""politics""","""politics_2019""","""politics""","""TwoGee""","""thinkprogress.…","""http://thinkpr…","""A federal cour…","""""",21,89,0,100.0


##### Cluster examples

In [18]:
with open(data_dir+'post_clusters_5.json') as f:
    data = json.load(f)

In [19]:
for day, clusters in data.items():
    for cluster_id, post_ids in clusters.items():
        if len(post_ids) > 1:
            print(data_df.filter(pl.col('id').is_in(post_ids))['title'].to_list())
            print('END')
            print(' ')

['Want to streamline government? Start with the Pentagon.', 'WH Petition: reduce the size of the federal government']
END
 
["I've heard people refer to T_D as a safe space. Would you agree or disagree?", 'T_D likes to ban dissenters. How is this different than a safe space?']
END
 
['Jeremy Corbyn uses IBTimes UK video without authorisation to attack Tim Farron', 'Jeremy Corbyn uses IBTimes UK video without authorisation to attack Tim Farron', "Corbyn just pinched our video to attack Tim Farron and didn't even give us credit"]
END
 
['Corbyn responds to labours low poll ratings: "only 3 days ago there was a big swing to Labour in a council by-election"', 'Corbyn responds to labours low poll ratings: "only 3 days ago there was a big swing to Labour in a council by-election"', 'Sophy Ridge: \'"We\'re getting it through": Jeremy Corbyn says voters are getting his message', 'Jeremy Corbyn dodges key question on use of Labour party whips in Brexit vote three times']
END
 
['The inauguratio

KeyboardInterrupt: 

In [20]:
non_triv_count = 0

for day, clusters in data.items():
    for cluster_id, post_ids in clusters.items():
        if len(post_ids) > 1:
            non_triv_count += 1

non_triv_count

3026

In [21]:
same_count = 0

for day, clusters in data.items():
    for cluster_id, post_ids in clusters.items():
        if len(post_ids) > 1:
            posts = data_df.filter(pl.col('id').is_in(post_ids))['title'].to_list()
            if len(set(posts)) == 1:
                same_count += 1

same_count

264

In [None]:
2143 -> 264
2588 -> 288
3026 -> 264

#### Analyze matched posts

In [5]:
with open('/users/ujan/Downloads/post_cluster_matches_5.json') as f:
    data = json.load(f)

##### total clusters in 5 days

In [6]:
tot = 0

for day, clusters in data.items():
    tot += len(clusters)

tot

2049

##### clusters with metaphors

In [7]:
meta_cluster_count = 0

for day, clusters in data.items():
    for c_id, matches in clusters.items():
        if len(matches['meta']) > 0:
            #print(data_df.filter(pl.col('id').is_in(matches['meta']))['title'].to_list())
            #print('END')
            #print(' ')
            meta_cluster_count += 1

meta_cluster_count

204

##### examples

In [33]:
for day, clusters in data.items():
    for c_id, matches in clusters.items():
        if len(matches['meta']) > 0:
            print(data_df.filter(pl.col('id').is_in(matches['meta']))['title'].to_list())
            print('END')
            print(' ')

['Seattle judge blocks Trump immigration order', "Trump to overturn 'so-called' judge's ban", 'Boston Judge Unblocks Trump Travel Ban, Asks "Where Does It Say Muslim Countries?"', 'Trump questions authority of judge who halted his immigration ban', 'Federal Judge Halts Trump Travel Ban Nationwide']
END
 
['FBI Vault quietly drops another set of Clinton Files - release 6', 'Midnight “Release” Again – 178 Page FBI Clinton Email Document Release With Interesting Content…']
END
 
['Der Spiegel: Trump beheading cover sparks criticism']
END
 
["Judge Who Blocked Trump Action Has Activist Past: Robart made a point of saying 'Black Lives Matter' in a dispute with Seattle police, does pro bono work for refugees.", "Same judge who halted the EO travel ban declared 'black lives matter' from the bench in Seattle @2:18"]
END
 
["Open Letter to President Trump: Constitutionally, the POTUS only has 3 enumerated powers. One charges him with the 'protection' of this sovereign nation. I know you don't h

KeyboardInterrupt: 

#### Num comments in meta posts vs non meta posts in same cluster

In [31]:
for day, clusters in data.items():
    for c_id, matches in clusters.items():
        if len(matches['meta']) > 0:
            meta_num_comments = data_df.filter(pl.col('id').is_in(matches['meta']))['num_comments'].to_list()
            non_meta_num_comments = data_df.filter(pl.col('id').is_in(matches['non_meta']))['num_comments'].to_list()
            print(statistics.median(meta_num_comments))
            print(statistics.median(non_meta_num_comments))
            print(' ')

5
5
 
61.0
20.0
 
17
7.0
 
58.0
7
 
5
14.5
 
4
2
 
41
6.0
 
23
6.5
 
11
21
 
1485.0
5
 
19.0
5
 
17
5
 
2
4.0
 
27
3
 
50
42
 
8
4
 
16


StatisticsError: no median for empty data

##### median num_comments

In [8]:
meta_num_comments = []
non_meta_num_comments = []

for day, clusters in data.items():
    for c_id, matches in clusters.items():
        if len(matches['meta']) > 0 and len(matches['non_meta']) > 0:
            meta_num_comments.extend(data_df.filter(pl.col('id').is_in(matches['meta']))['num_comments'].to_list())
            non_meta_num_comments.extend(data_df.filter(pl.col('id').is_in(matches['non_meta']))['num_comments'].to_list())
            
print(statistics.median(meta_num_comments))
print(statistics.median(non_meta_num_comments))

7
7.0


##### mean num_comments

In [10]:
print(statistics.mean(meta_num_comments))
print(statistics.mean(non_meta_num_comments))

39.327510917030565
52.67888888888889


##### median comparison across clusters

In [11]:
meta_median_c = 0
non_meta_median_c = 0

for day, clusters in data.items():
    for c_id, matches in clusters.items():
        if len(matches['meta']) > 0 and len(matches['non_meta']) > 0:
            meta_num_comments = data_df.filter(pl.col('id').is_in(matches['meta']))['num_comments'].to_list()
            non_meta_num_comments = data_df.filter(pl.col('id').is_in(matches['non_meta']))['num_comments'].to_list()
            meta_median = statistics.median(meta_num_comments)
            non_meta_median = statistics.median(non_meta_num_comments)

            if meta_median > non_meta_median :
                meta_median_c += 1
            elif non_meta_median > meta_median:
                non_meta_median_c += 1

print(meta_median_c)
print(non_meta_median_c)

83
83


##### only popular subs

In [12]:
sublist = ["politics", "The_Donald", "Conservative", "PoliticalDiscussion", "AskTrumpSupporters", "AskALiberal", "NeutralPolitics", "neutralnews", "moderatepolitics", "worldnews", "Republican", "Liberal", "progressive", "esist"]

In [15]:
pol_ids = data_df.filter(pl.col('subreddit').is_in(sublist))['id'].to_list()

##### median num_comments

In [26]:
meta_num_comments = []
non_meta_num_comments = []

for day, clusters in data.items():
    for c_id, matches in clusters.items():

        if all(item in pol_ids for item in matches['meta']) and all(item in pol_ids for item in matches['non_meta']):
        
            if len(matches['meta']) > 0 and len(matches['non_meta']) > 0:
                meta_num_comments.extend(data_df.filter(pl.col('id').is_in(matches['meta']))['num_comments'].to_list())
                non_meta_num_comments.extend(data_df.filter(pl.col('id').is_in(matches['non_meta']))['num_comments'].to_list())
            
print(statistics.median(meta_num_comments))
print(statistics.median(non_meta_num_comments))

8
8


In [28]:
print(statistics.mean(meta_num_comments))
print(statistics.mean(non_meta_num_comments))

33.16438356164384
29.63157894736842
