In [None]:
import os
import re
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import spacy
from textblob import TextBlob
from tqdm import tqdm
from langdetect import detect, LangDetectException
from collections import Counter

folder = "data"

rs_files = [f for f in os.listdir(folder) if f.startswith("RS") and f.endswith(".csv")]
rc_files = [f for f in os.listdir(folder) if f.startswith("RC") and f.endswith(".csv")]

def get_common_columns(files):
    common_cols = None
    for filename in files:
        path = os.path.join(folder, filename)
        df = pd.read_csv(path, nrows=0)
        cols = set(df.columns)
        if common_cols is None:
            common_cols = cols
        else:
            common_cols |= cols
    return common_cols or set()

rs_common = get_common_columns(rs_files)
rc_common = get_common_columns(rc_files)

print("Common columns in RS files:", rs_common)
print("Common columns in RC files:", rc_common)

print("Total number of columns in RS files:", len(rs_common))
print("Total number of columns in RC files:", len(rc_common))

Common columns in RS files: {'edited', 'stickied', 'link_flair_type', 'removed_by_category', 'link_flair_template_id', 'is_robot_indexable', 'visited', 'subreddit_id', 'author_is_blocked', 'crosspost_parent_list', 'view_count', 'gilded', 'updated_on', 'is_reddit_media_domain', 'media_embed', 'no_follow', 'thumbnail', 'subreddit_type', 'hidden', 'mod_reason_title', 'score', 'treatment_tags', 'can_gild', 'subreddit', 'awarders', 'banned_at_utc', 'retrieved_on', 'media', 'contest_mode', 'previous_selftext', 'url', 'pwls', 'num_comments', 'is_gallery', 'gallery_data', 'author_flair_css_class', '_meta', 'subreddit_name_prefixed', 'report_reasons', 'approved_by', 'author_flair_richtext', 'author_fullname', 'num_reports', 'link_flair_text', 'over_18', 'title', 'author_cakeday', 'archived', 'num_crossposts', 'pinned', 'author_flair_type', 'ups', 'parent_whitelist_status', 'author', 'approved_at_utc', 'is_created_from_ads_ui', 'crosspost_parent', 'hide_score', 'can_mod_post', 'mod_note', 'disti

First we need to load data and find out which columns will we use for analysis. Reddit submissions contains 120 columns and comments 74. Obviously we won't use them all, so we need to choose the most important.

In [2]:
rs_columns = ['id', 'author', 'created_utc', 'subreddit', 'title', 'selftext',
 'link_flair_text', 'domain', 'score', 'ups', 'downs', 'num_comments',
 'upvote_ratio', 'total_awards_received', 'retrieved_on', 'is_original_content']

rc_columns = ['id', 'author', 'created_utc', 'subreddit', 'body', 'score',
 'ups', 'downs', 'parent_id', 'link_id', 'is_submitter', 
 'controversiality', 'total_awards_received', 'retrieved_on']

We decided to leave those columns as they give us almost all information we need to conduct our analysis and get some meaningful results. We may alter these during our experiments and analysis.

In [3]:
def read_subset(file, columns, source_type):
    path = os.path.join(folder, file)
    try:
        df = pd.read_csv(path, usecols=lambda c: c in columns)
        df = df.reindex(columns=columns)
        df["source_type"] = source_type
        return df
    except Exception as e:
        return pd.DataFrame(columns=columns + ["source_type"])

if not os.path.exists("preprocessed/reddit_submissions_filtered.csv") or not os.path.exists("preprocessed/reddit_comments_filtered.csv"):
    rs_dfs = [read_subset(f, rs_columns, "submission") for f in rs_files]
    rc_dfs = [read_subset(f, rc_columns, "comment") for f in rc_files]

In [4]:
if not os.path.exists("preprocessed/reddit_submissions_filtered.csv"):
    submissions_df = pd.concat(rs_dfs, ignore_index=True)
    submissions_df.to_csv("preprocessed/reddit_submissions_filtered.csv", index=False)
else:
    submissions_df = pd.read_csv("preprocessed/reddit_submissions_filtered.csv")

if not os.path.exists("preprocessed/reddit_comments_filtered.csv"):
    comments_df = pd.concat(rc_dfs, ignore_index=True)
    comments_df.to_csv("preprocessed/reddit_comments_filtered.csv", index=False)
else:
    comments_df = pd.read_csv("preprocessed/reddit_comments_filtered.csv")
    comments_df = comments_df[comments_df['created_utc'] > 1704116554]

In [5]:
print(f"Submissions DataFrame: {len(submissions_df):,} rows, {len(submissions_df.columns)} columns")
print(f"Comments DataFrame: {len(comments_df):,} rows, {len(comments_df.columns)} columns")

Submissions DataFrame: 355,382 rows, 17 columns
Comments DataFrame: 11,613,821 rows, 15 columns


In [6]:
if not os.path.exists("preprocessed/reddit_combined.csv"):
    comments_df["link_id_clean"] = comments_df["link_id"].str.replace("t3_", "", regex=False)
    
    joined_df = comments_df.merge(
        submissions_df,
        left_on="link_id_clean",
        right_on="id",
        suffixes=("_comment", "_submission")
    )
    
    joined_df.to_csv("preprocessed/reddit_combined.csv", index=False)

In [7]:
joined_df = pd.read_csv("preprocessed/reddit_combined.csv")

Ok, so now that we have our data filtered we can start exploring our data more thoroughly.
We will mainly explore both datasets simultaneously, applying quite similar changes. 
To begin, we need to filter out data that has nothing to do with elections. We will use keyword search and filter out data that doesn't have none of next keywords.

In [8]:
submissions_df.columns

Index(['id', 'author', 'created_utc', 'subreddit', 'title', 'selftext',
       'link_flair_text', 'domain', 'score', 'ups', 'downs', 'num_comments',
       'upvote_ratio', 'total_awards_received', 'retrieved_on',
       'is_original_content', 'source_type'],
      dtype='object')

In [9]:
comments_df.columns

Index(['id', 'author', 'created_utc', 'subreddit', 'body', 'score', 'ups',
       'downs', 'parent_id', 'link_id', 'is_submitter', 'controversiality',
       'total_awards_received', 'retrieved_on', 'source_type'],
      dtype='object')

In [10]:
nlp = spacy.load("en_core_web_sm")

In [11]:
if not os.path.exists("preprocessed/election_submissions.csv") or not os.path.exists("preprocessed/election_comments.csv"):
    import re
    keywords = [
        'election', 'vote', 'voting', 'ballots', 'campaign', 'polls', 'polling', 'debate',
        'liberal', 'conservative', 'ndp', 'bloc', 'green party', 'ppc', "people's party",
        'carney', 'mark carney', 'trudeau', 'justin trudeau', 'prime minister',
        'leadership race', 'minority government', 'majority government',
        'coalition', 'confidence vote', 'cabinet', 'mp', 'riding', 'parliament',
        'candidate', 'seat', 'platform', 'policy', 'promises',
        'trump', 'usa', 'united states', 'america', 'american', 'tariffs', 'trade war',
        'annexation', 'border', 'nafta', 'usmca', 'foreign policy', 'sanctions',
        'biden', 'washington', 'white house', 'congress',
        'inflation', 'economy', 'housing', 'immigration', 'refugees', 'carbon tax',
        'climate change', 'healthcare', 'indigenous', 'reconciliation',
        'gun control', 'freedom convoy', 'protest', 'corruption', 'scandal',
        'china', 'russia', 'ukraine', 'nato', 'defense', 'war'
    ]
    
    pattern = re.compile('|'.join([re.escape(k) for k in keywords]), flags=re.IGNORECASE)
    
    election_submissions = submissions_df[
        submissions_df['title'].fillna('').str.contains(pattern) |
        submissions_df['selftext'].fillna('').str.contains(pattern)
    ]
    election_comments = comments_df[
        comments_df['body'].fillna('').str.contains(pattern)
    ]

    def get_sentiment_tb(text):
        if pd.isna(text):
            return 0
        return TextBlob(text).sentiment.polarity
    
    election_comments['sentiment_score'] = election_comments['body'].apply(get_sentiment_tb)
    election_submissions['sentiment_score'] = election_submissions['full_text'].apply(get_sentiment_tb)
    
    texts = election_submissions['full_text'].tolist()
    
    people_list = []
    places_list = []
    
    for doc in tqdm(nlp.pipe(texts, batch_size=500, n_process=4), total=len(texts)):
        people = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        places = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
        people_list.append(", ".join(people))
        places_list.append(", ".join(places))

    election_submissions['people'] = people_list
    election_submissions['places'] = places_list
    
    texts = election_comments['body'].tolist()
    
    people_list = []
    places_list = []
    
    for doc in tqdm(nlp.pipe(texts, batch_size=500, n_process=4), total=len(texts)):
        people = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        places = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
        people_list.append(", ".join(people))
        places_list.append(", ".join(places))

    election_comments['people'] = people_list
    election_comments['places'] = places_list
    
    election_submissions.to_csv("preprocessed/election_submissions.csv", index=False)
    election_comments.to_csv("preprocessed/election_comments.csv", index=False)

In [12]:
election_submissions = pd.read_csv("preprocessed/election_submissions.csv")
election_comments = pd.read_csv("preprocessed/election_comments.csv")
election_comments = election_comments[election_comments['created_utc'] > 1704116554]

First, let's see how the dynamic between election related and other topics changed over time

In [13]:
election_submissions

Unnamed: 0,id,author,created_utc,subreddit,title,selftext,link_flair_text,domain,score,ups,...,num_comments,upvote_ratio,total_awards_received,retrieved_on,is_original_content,source_type,full_text,sentiment_score,people,places
0,1h3q22t,Feedmepi314,1.733011e+09,CanadaPolitics,Trudeau promises border helicopters at ‚Äòvery p...,,,globalnews.ca,50,50,...,42,0.94,0,1733011238,False,submission,Trudeau promises border helicopters at ‚Äòvery p...,0.200000,"Trudeau, Trump",
1,1h3q6bi,No_Inspection2904,1.733012e+09,montreal,THERIAN PACK,I‚Äôm creating a Montreal based pack for any the...,:Dfl: Discussion,self.montreal,1,1,...,7,0.15,0,1733011542,False,submission,THERIAN PACK I‚Äôm creating a Montreal based pac...,0.280313,üêª,"Montreal, Montreal"
2,1h3qevy,Rude-Visit4347,1.733012e+09,QuebecLibre,EDI: les preuves scientifiques de son caract√®r...,>les preuves scientifiques sur le caract√®re no...,Opinion,journaldemontreal.com,39,39,...,54,0.68,0,1733012225,False,submission,EDI: les preuves scientifiques de son caract√®r...,0.000000,"d‚ÄôIbram X. Kendi, Joel Finkelstein, la t√™te",
3,1h3qh92,keiths31,1.733012e+09,AskACanadian,"What, if any, benefit would there be to make t...",,,self.AskACanadian,1,1,...,0,1.00,0,1733012420,False,submission,"What, if any, benefit would there be to make t...",0.000000,,"Washington, DC"
4,1h3ql13,Grand-Impact-4069,1.733013e+09,AmericaBad,Is this genuinely an ‚ÄúAmerican dream‚Äù?,,Question,v.redd.it,473,473,...,241,0.92,0,1733012725,False,submission,Is this genuinely an ‚ÄúAmerican dream‚Äù?,0.000000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123765,1f60t2q,ConsistentReality860,1.725148e+09,canada,Trudeau's visit to Sault Ste. Marie wraps-up w...,,Politics,northernontario.ctvnews.ca,534,534,...,722,0.85,0,1725147636,False,submission,Trudeau's visit to Sault Ste. Marie wraps-up w...,-0.333333,"Trudeau, Sault Ste, Marie",
123766,1f60t9s,ConsistentReality860,1.725148e+09,CanadaPolitics,Trudeau's visit to Sault Ste. Marie wraps-up w...,,,northernontario.ctvnews.ca,17,16,...,6,0.84,0,1725147651,False,submission,Trudeau's visit to Sault Ste. Marie wraps-up w...,-0.333333,"Trudeau, Sault Ste, Marie",
123767,1f60v3b,random_uzerr,1.725148e+09,montreal,Salaire Cybers√©curit√© Montr√©al,"Hello tout le monde,\n\nJ‚Äôai pour projet de m‚Äô...",Question MTL,self.montreal,1,1,...,6,0.13,0,1725147800,False,submission,Salaire Cybers√©curit√© Montr√©al Hello tout le m...,0.000000,"Cybers√©curit√© Montr√©al, Montr√©al, conna√Ætre le...",
123768,1f611vk,ash-dropem,1.725148e+09,ontario,License/Service Ontario info updating question,"I recently got my G1, but I forgot I didn‚Äôt up...",Question,self.ontario,1,1,...,4,0.31,0,1725148359,False,submission,License/Service Ontario info updating question...,-0.040000,,


In [None]:
for df in [election_submissions, election_comments, submissions_df, comments_df]:
    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s', errors='coerce')

submissions_daily = (
    submissions_df.groupby(pd.Grouper(key='created_utc', freq='D'))
    .size()
    .rename('total_submissions')
)

election_submissions_daily = (
    election_submissions.groupby(pd.Grouper(key='created_utc', freq='D'))
    .size()
    .rename('election_submissions')
)

comments_daily = (
    comments_df.groupby(pd.Grouper(key='created_utc', freq='D'))
    .size()
    .rename('total_comments')
)

election_comments_daily = (
    election_comments.groupby(pd.Grouper(key='created_utc', freq='D'))
    .size()
    .rename('election_comments')
)

submissions_trend = pd.concat([submissions_daily, election_submissions_daily], axis=1).fillna(0)
comments_trend = pd.concat([comments_daily, election_comments_daily], axis=1).fillna(0)

submissions_trend['ratio'] = submissions_trend['election_submissions'] / submissions_trend['total_submissions']
comments_trend['ratio'] = comments_trend['election_comments'] / comments_trend['total_comments']

submissions_trend['ratio_smooth'] = submissions_trend['ratio'].rolling(window=7, min_periods=1).mean()
comments_trend['ratio_smooth'] = comments_trend['ratio'].rolling(window=7, min_periods=1).mean()

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=submissions_trend.index,
    y=submissions_trend['ratio'],
    mode='lines',
    name='Daily ratio',
    line=dict(color='lightblue', width=1),
    opacity=0.5
))
fig.add_trace(go.Scatter(
    x=submissions_trend.index,
    y=submissions_trend['ratio_smooth'],
    mode='lines',
    name='7-day rolling mean',
    line=dict(color='steelblue', width=3)
))
fig.update_layout(
    title='Ratio of Election-related Submissions Over Time (Smoothed)',
    xaxis_title='Date',
    yaxis_title='Election-related / Total Submissions',
    hovermode='x unified',
    template='plotly_white',
    height=500
)
fig.show()

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=comments_trend.index,
    y=comments_trend['ratio'],
    mode='lines',
    name='Daily ratio',
    line=dict(color='navajowhite', width=1),
    opacity=0.5
))
fig.add_trace(go.Scatter(
    x=comments_trend.index,
    y=comments_trend['ratio_smooth'],
    mode='lines',
    name='7-day rolling mean',
    line=dict(color='darkorange', width=3)
))
fig.update_layout(
    title='Ratio of Election-related Comments Over Time (Smoothed)',
    xaxis_title='Date',
    yaxis_title='Election-related / Total Comments',
    hovermode='x unified',
    template='plotly_white',
    height=500
)
fig.show()

From this plot we can clearly see that topicality of elections increases as  time to elections reduces. We also see different spikes(e.g. Trump inauguration), which are expected as they increased political and elections debate.
We can also notice that comments have higher ratio, which could be explained by the fact that politics is more controverisial topic than cats dogs or canadian forests.

In [15]:
submissions_df['created_utc'] = pd.to_datetime(submissions_df['created_utc'], unit='s', errors='coerce')

submissions_df['is_election_related'] = submissions_df['id'].isin(election_submissions['id'])

monthly_total_avg = (
    submissions_df.groupby(pd.Grouper(key='created_utc', freq='M'))['num_comments']
    .mean()
    .rename('All Submissions')
)
monthly_election_avg = (
    submissions_df[submissions_df['is_election_related']]
    .groupby(pd.Grouper(key='created_utc', freq='M'))['num_comments']
    .mean()
    .rename('Election-related')
)

monthly_avg_comments = pd.concat([monthly_total_avg, monthly_election_avg], axis=1).fillna(0)

fig = go.Figure()
fig.add_trace(go.Bar(
    x=monthly_avg_comments.index.strftime('%b %Y'),
    y=monthly_avg_comments['All Submissions'],
    name='All Submissions',
    marker_color='#9CA3AF',
    opacity=0.6
))
fig.add_trace(go.Bar(
    x=monthly_avg_comments.index.strftime('%b %Y'),
    y=monthly_avg_comments['Election-related'],
    name='Election-related',
    marker_color='#2563EB',
    opacity=0.8
))
fig.update_layout(
    title='Average Discussion Intensity per Submission',
    xaxis_title='',
    yaxis_title='Avg. Comments per Submission',
    barmode='group',
    template='plotly_white',
    height=500,
    xaxis=dict(tickangle=30)
)
fig.show()


'M' is deprecated and will be removed in a future version, please use 'ME' instead.


'M' is deprecated and will be removed in a future version, please use 'ME' instead.



Looking at the barchart, we can clearly see that election-related submissions received greater number of comments on average throughout all research period.
Now, let's take a look at one more thing: let's look at dynamics of upvotes, downvotes and score of posts on average.

In [16]:
election_submissions['created_utc'] = pd.to_datetime(election_submissions['created_utc'], unit='s', errors='coerce')

monthly_metrics = (
    election_submissions
    .groupby(pd.Grouper(key='created_utc', freq='d'))[['ups', 'downs', 'score']]
    .mean()
)

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=monthly_metrics.index,
    y=monthly_metrics['ups'],
    mode='lines',
    name='Upvotes',
    line=dict(color='#2563EB', width=2)
))
fig.add_trace(go.Scatter(
    x=monthly_metrics.index,
    y=monthly_metrics['downs'],
    mode='lines',
    name='Downvotes',
    line=dict(color='#EF4444', width=2)
))
fig.update_layout(
    title='Average Upvotes vs Downvotes per Submission Over Time',
    xaxis_title='Month',
    yaxis_title='Average Count',
    template='plotly_white',
    hovermode='x unified',
    height=500
)
fig.show()

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=monthly_metrics.index,
    y=monthly_metrics['score'],
    mode='lines',
    line=dict(color='#059669', width=2)
))
fig.update_layout(
    title='Average Score per Submission Over Time',
    xaxis_title='Week',
    yaxis_title='Average Score',
    template='plotly_white',
    hovermode='x unified',
    height=500
)
fig.show()

To be honest, these graphs weren't as informative as I thought they would be. Anyway, we discovered that there is no stats on downs as reddit hid them several years ago. We can nevertheless try to extract info on both conservatives and liberals from election dataframe and look how they scored throughout campaign. 

In [20]:
election_submissions['created_utc'] = pd.to_datetime(election_submissions['created_utc'], unit='s', errors='coerce')

election_submissions['full_text'] = (
    (election_submissions['title'].fillna('') + ' ' + election_submissions['selftext'].fillna('')).str.lower()
)

liberal_keywords = [
    'liberal', 'trudeau', 'mark carney', 'freeland', 'ndp', 'green party', 
    'progressive', 'left-wing', 'liberals', 'canada liberals',
    'jagmeet singh', 'elizabeth may', 'chrystia freeland', 'justin trudeau',
    'liberal party', 'new democrat', 'new democratic party', 'grits',
    'centre-left', 'social democrat', 'carbon tax', 'climate action',
    'pharmacare', 'dental care', 'childcare', 'universal healthcare',
    'indigenous reconciliation', 'diversity', 'lgbtq rights'
]

conservative_keywords = [
    'conservative', 'poilievre', 'right-wing', 'tory', 'ppc', 
    'peoples party', 'reform party', 'harper',
    'pierre poilievre', 'conservative party', 'cpc', 'tories',
    'centre-right', 'fiscal conservative', 'tax cuts', 'small government',
    'axe the tax', 'carbon tax repeal', 'balanced budget', 'deficit reduction',
    'law and order', 'tough on crime', 'blue', 'alberta conservative',
    'maxime bernier', 'stephen harper', 'erin o\'toole', 'andrew scheer'
]

def categorize_text(text):
    text = text.lower()
    if any(re.search(rf'\b{kw}\b', text) for kw in liberal_keywords):
        return 'Liberal / Left'
    elif any(re.search(rf'\b{kw}\b', text) for kw in conservative_keywords):
        return 'Conservative / Right'

election_submissions['political_category'] = election_submissions['full_text'].apply(categorize_text)

score_trends = (
    election_submissions.groupby([pd.Grouper(key='created_utc', freq='d'), 'political_category'])['score']
    .mean()
    .unstack(fill_value=0)
)

colors = {
    'Liberal / Left': '#3B82F6',
    'Conservative / Right': '#EF4444',
}

fig = go.Figure()
for col in score_trends.columns:
    fig.add_trace(go.Scatter(
        x=score_trends.index,
        y=score_trends[col],
        mode='lines',
        name=col,
        line=dict(color=colors.get(col, '#000000'), width=2.5)
    ))

fig.update_layout(
    title='Average Submission Score Over Time by Political Leaning (Text-based)',
    xaxis_title='Month',
    yaxis_title='Average Score',
    template='plotly_white',
    hovermode='x unified',
    height=600
)
fig.show()

NameError: name 'pd' is not defined

One more thing we can visualize is controversality rates

In [18]:
election_comments

Unnamed: 0,id,author,created_utc,subreddit,body,score,ups,downs,parent_id,link_id,is_submitter,controversiality,total_awards_received,retrieved_on,source_type,sentiment_score,people,places
0,lus4irv,Krazee9,2024-11-01 00:00:00,canada,This article doesn't say any of that.\n\nWhat ...,7,7,0,t1_lury64m,t3_1ggrspf,False,0,0,1730419216,comment,-0.062500,Deepak Obrahi,India
1,lus4j73,doomscrolling_tiktok,2024-11-01 00:00:05,canada,I think people in the sub are a little extremi...,2,2,0,t1_luphswl,t3_1ggepop,False,0,0,1730419222,comment,-0.064062,,
2,lus4jfc,lizardelitecouncil,2024-11-01 00:00:07,vancouver,It‚Äôs like when the Thai protests that happened...,-3,-3,0,t1_lus1mxw,t3_1ggowib,False,1,0,1730419223,comment,0.800000,,
3,lus4kap,Adorable_Octopus,2024-11-01 00:00:16,CanadaPolitics,I have a hard time buying that Patton is using...,5,5,0,t1_luq0743,t3_1ggip72,False,0,0,1730419232,comment,-0.164583,,Patton
4,lus4lna,pm-me-racecars,2024-11-01 00:00:31,AskACanadian,Funny story:\n\nI'm half Mennonite. One of my ...,4,4,0,t1_lus2o48,t3_1ggn0no,False,0,0,1730419247,comment,0.031667,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4061273,mkrxwes,OilersHD,2025-03-31 23:59:34,canada,Thats not true. Permits and taxes account for ...,15,14,0,t1_mkrwaj1,t3_1jogwdb,False,0,0,1743465605,comment,-0.019318,,Ontario
4061274,mkrxx8a,VinlandRocks,2025-03-31 23:59:42,newfoundland,"The way they're ""protecting"" Greenland? I bet ...",2,2,0,t1_mkrw4ap,t3_1jodu32,True,0,0,1743465613,comment,0.077143,,"Greenland, Greenland"
4061275,mkrxxie,Twinsta,2025-03-31 23:59:45,NovaScotia,Just tuning in to say Alberta is great money/t...,13,12,0,t3_1joe0e2,t3_1joe0e2,False,0,0,1743465616,comment,0.390625,,"Alberta, America"
4061276,mkrxxql,seemefail,2025-03-31 23:59:47,CanadaPolitics,They‚Äôve started already with the accelerator f...,2,2,0,t1_mkqwh2f,t3_1joai6l,False,0,0,1743465618,comment,0.000000,,


In [19]:
controversiality_by_post = (
    election_comments.groupby("link_id")
      .agg(
          controversiality_rate=("controversiality", "sum"),
          created_utc=("created_utc", "min")
      )
      .reset_index()
      .sort_values("created_utc")
)

daily_controversiality = (
    controversiality_by_post
    .set_index("created_utc")
    .resample("W")["controversiality_rate"]
    .mean()
    .reset_index()
)
events = [
    ("2024-11-05", "US Election Day"),
    ("2024-11-25", "Trump tariff tweet"),
    ("2024-12-01", "Canadian debate controversy"),
    ("2025-01-06", "Trudeau resignation"),
    ("2025-03-01", "New PM sworn in"),
    ("2025-04-28", "Federal Elections")
]

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=daily_controversiality["created_utc"],
    y=daily_controversiality["controversiality_rate"],
    mode='lines+markers',
    name='7-Day Smoothed Rate',
    line=dict(color='blue', width=2),
    marker=dict(size=6)
))

for date_str, label in events:
    date = pd.to_datetime(date_str)
    nearest_idx = (daily_controversiality["created_utc"] - date).abs().idxmin()
    y = daily_controversiality.loc[nearest_idx, "controversiality_rate"]
    
    fig.add_trace(go.Scatter(
        x=[date],
        y=[y],
        mode='markers',
        marker=dict(color='red', size=10),
        showlegend=False,
        hovertext=label
    ))
    
    fig.add_annotation(
        x=date,
        y=y * 1.05,
        text=label,
        showarrow=False,
        font=dict(color='red', size=9),
        textangle=0
    )

fig.update_layout(
    title="Average Post Controversiality per Day (Smoothed) with Key Election Events",
    xaxis_title="Date",
    yaxis_title="Average Controversiality Rate",
    template='plotly_white',
    hovermode='x unified',
    height=600,
    yaxis=dict(rangemode='tozero')
)
fig.show()

In [20]:
score_trends = (
    election_submissions.groupby([pd.Grouper(key='created_utc', freq='3D'), 'political_category'])['sentiment_score']
    .mean()
    .unstack(fill_value=0)
)

colors = {
    'Liberal / Left': '#3B82F6',
    'Conservative / Right': '#EF4444',
}

events = [
    ("2024-11-05", "US Election Day"),
    ("2024-11-25", "Trump tariff tweet"),
    ("2024-12-01", "Canadian debate controversy"),
    ("2025-01-06", "Trudeau resignation"),
    ("2025-03-01", "New PM sworn in"),
    ("2025-04-28", "Federal Elections")
]

for category in ['Liberal / Left', 'Conservative / Right']:
    fig = go.Figure()
    
    fig.add_trace(go.Scatter(
        x=score_trends.index,
        y=score_trends[category],
        mode='lines',
        name=category,
        line=dict(color=colors.get(category, '#000000'), width=2.5)
    ))
    
    for date_str, label in events:
        date = pd.to_datetime(date_str)
        nearest_idx_array = score_trends.index.get_indexer([date], method='nearest')
        nearest_idx = nearest_idx_array[0]
        y = score_trends.iloc[nearest_idx][category]
        
        fig.add_trace(go.Scatter(
            x=[score_trends.index[nearest_idx]],
            y=[y],
            mode='markers',
            marker=dict(color='red', size=10),
            showlegend=False,
            hovertext=label
        ))
        
        fig.add_annotation(
            x=score_trends.index[nearest_idx],
            y=y * 1.05 if y > 0 else y * 0.95,
            text=label,
            showarrow=False,
            font=dict(color='black', size=11),
            textangle=0
        )
    
    fig.update_layout(
        title=f'Average Submission Score Over Time: {category}',
        xaxis_title='Date',
        yaxis_title='Average Sentiment Score',
        template='plotly_white',
        hovermode='x unified',
        height=600
    )
    fig.show()

In [None]:


all_people = election_submissions['people'].dropna().str.split(', ')
all_people_flat = [person for sublist in all_people for person in sublist if person]

all_places = election_submissions['places'].dropna().str.split(', ')
all_places_flat = [place for sublist in all_places for place in sublist if place]

people_counter = Counter(all_people_flat)
places_counter = Counter(all_places_flat)

most_common_people = people_counter.most_common(20)
most_common_places = places_counter.most_common(20)

print("Most Popular People:")
for person, count in most_common_people:
    print(f"{person}: {count}")

print("\nMost Popular Places:")
for place, count in most_common_places:
    print(f"{place}: {count}")


top_people_df = pd.DataFrame(most_common_people, columns=['Person', 'Count'])
top_places_df = pd.DataFrame(most_common_places, columns=['Place', 'Count'])

fig = px.bar(
    top_people_df,
    x='Count',
    y='Person',
    orientation='h',
    title='Top 20 Mentioned People',
    color='Count',
    color_continuous_scale='Blues'
)
fig.update_layout(
    yaxis={'categoryorder': 'total ascending'},
    template='plotly_white',
    height=600
)
fig.show()

fig = px.bar(
    top_places_df,
    x='Count',
    y='Place',
    orientation='h',
    title='Top 20 Mentioned Places',
    color='Count',
    color_continuous_scale='Reds'
)
fig.update_layout(
    yaxis={'categoryorder': 'total ascending'},
    template='plotly_white',
    height=600
)
fig.show()

Most Popular People:
Trudeau: 4349
Trump: 2821
Mark Carney: 2465
Je: 1858
Donald Trump: 1424
Pierre Poilievre: 1371
Justin Trudeau: 1307
Doug Ford: 1305
Montr√©al: 1133
Danielle Smith: 810
Mais: 692
Carney: 564
Est-ce: 422
Poilievre: 378
Donc: 374
Smith: 364
Jagmeet Singh: 337
\-: 320
FreshCo: 279
MLA: 243

Most Popular Places:
Canada: 21041
US: 4887
Alberta: 4628
Ontario: 4306
Toronto: 3894
U.S.: 3748
Ottawa: 3634
Calgary: 3192
Montreal: 2912
America: 2475
Carney: 1970
USA: 1638
Quebec: 1611
China: 1579
Vancouver: 1515
B.C.: 1447
Saskatchewan: 841
Ukraine: 804
the United States: 796
Manitoba: 701


In [22]:
df_people = election_submissions[election_submissions['people'] != ""]

df_people = df_people.assign(person=df_people['people'].str.split(', ')).explode('person')

df_people = df_people[df_people['person'] != ""]

df_people['created_utc'] = pd.to_datetime(df_people['created_utc'])

top_people = df_people['person'].value_counts().head(5).index.tolist()


df_top = df_people[df_people['person'].isin(top_people)]
sentiment_grouped = (
    df_top.groupby([pd.Grouper(key='created_utc', freq='w'), 'person'])
    .agg(avg_sentiment=('sentiment_score', 'mean'))
    .reset_index()
)

for person in top_people:
    df_person = sentiment_grouped[sentiment_grouped['person'] == person]
    
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=df_person['created_utc'],
        y=df_person['avg_sentiment'],
        mode='lines+markers',
        line=dict(width=2),
        marker=dict(size=6)
    ))
    fig.update_layout(
        title=f"Sentiment Over Time for {person}",
        xaxis_title='Date',
        yaxis_title='Average Sentiment Score',
        template='plotly_white',
        hovermode='x unified',
        height=500,
        yaxis=dict(range=[-1, 1])
    )
    fig.show()


'w' is deprecated and will be removed in a future version, please use 'W' instead.



In [23]:
df_places = election_submissions[election_submissions['places'] != ""]

df_places = df_places.assign(place=df_places['places'].str.split(', ')).explode('place')

df_places = df_places[df_places['place'] != ""]

df_places['created_utc'] = pd.to_datetime(df_places['created_utc'])

top_places = df_places['place'].value_counts().head(5).index.tolist()

df_top_places = df_places[df_places['place'].isin(top_places)]
sentiment_grouped_places = (
    df_top_places.groupby([pd.Grouper(key='created_utc', freq='W'), 'place'])
    .agg(avg_sentiment=('sentiment_score', 'mean'))
    .reset_index()
)

for place in top_places:
    df_place = sentiment_grouped_places[sentiment_grouped_places['place'] == place]
    
    # Plotly version
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=df_place['created_utc'],
        y=df_place['avg_sentiment'],
        mode='lines+markers',
        line=dict(color='green', width=2),
        marker=dict(size=6)
    ))
    fig.update_layout(
        title=f"Sentiment Over Time for {place}",
        xaxis_title='Date',
        yaxis_title='Average Sentiment Score',
        template='plotly_white',
        hovermode='x unified',
        height=500,
        yaxis=dict(range=[-1, 1])
    )
    fig.show()

In [None]:
import torch
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

Device set to use mps:0


In [19]:
text = "Donald Trump save us"

labels = ["left-wing", "right-wing", "centrist"]

result = classifier(text, labels)

print(f"Text: {text}")
print(f"Labels: {result['labels']}")
print(f"Scores: {result['scores']}")

Text: Donald Trump save us
Labels: ['right-wing', 'left-wing', 'centrist']
Scores: [0.862888753414154, 0.09426861256361008, 0.042842667549848557]


In [None]:
import numpy as np



'centrist'