In [1]:
# Reload edited modules automatically before each cell runs, so changes to the
# project-local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Switch to the parent directory (the recorded output shows the project root),
# presumably so that the project-local imports below resolve.
# NOTE(review): the path is relative, so running this cell a second time in
# the same kernel moves up one more level.
%cd '..'

/dss/dsshome1/03/di93fup/polarization_reddit


In [2]:
import logging

# Without an explicit configuration the root logger only emits WARNING and
# above, so every `logging.info(...)` call in this notebook would be dropped.
# `force=True` replaces handlers already attached to the root logger, which
# keeps this cell effective when it is re-run in a live kernel.
# To also persist the log, add a
# `logging.FileHandler("data/logs/mass_shootings_preprocess_comments_notebook.log")`
# to `handlers` (the target directory must exist).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()],
    force=True,
)

In [3]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

from load.utils import (
    load_df_from_parquet,
)
from preprocessing.utils import (
    save_event_comments,
    build_vocab,
    save_event_vocab,
)

from preprocessing.constants import (
    MASS_SHOOTINGS_EVENTS_INFO,
    MASS_SHOOTINGS_KEYWORDS,
    MIN_OCCURENCE_FOR_VOCAB,
    get_event_regex,
)

In [4]:
# Years whose yearly comment files are loaded below: 2015 through 2018.
YEARS = list(range(2015, 2019))

# Names of all events that have an entry in the events table.
EVENT_NAMES = list(MASS_SHOOTINGS_EVENTS_INFO)

# Flat list of the event-specific keywords, event by event, in table order.
MASS_SHOOTING_EVENTS_KEYWORDS = [
    keyword
    for event_info in MASS_SHOOTINGS_EVENTS_INFO.values()
    for keyword in event_info["keywords"]
]


In [5]:
# Build the regular expression used to select event-related comments from the
# general mass-shooting keywords and the per-event keywords.
# NOTE(review): `operator="and"` presumably requires a hit from both keyword
# lists — confirm against `get_event_regex` in preprocessing.constants.
MASS_SHOOTING_REGEX = get_event_regex(
    MASS_SHOOTINGS_KEYWORDS,
    MASS_SHOOTING_EVENTS_KEYWORDS,
    operator="and",
)

logging.info(MASS_SHOOTING_REGEX)


In [6]:
# Show the generated pattern.
# NOTE(review): as displayed below, the pattern has the form
# `(?:general stems).*(?:event stems)`: it only matches when a general stem
# occurs before an event stem, it has no word boundaries, and it includes the
# two-letter alternative `up`, so a text such as "gun ... support" matches.
# Confirm this breadth is intended.
MASS_SHOOTING_REGEX

'(?:shoot|gun|kil|attack|massacr|victim).*(?:charleston|chattanoog|umpqu|roseburg|colorado springs|plan par|san bernardino|kalamazoo|orlando|puls nightclub|dalla|baton roug|burlington|cascad mal|fort lauderd|fresno|up|san francisco|vega|harvest fest|mandalay bay|rout 91|thornton|walmart|denv|sutherland springs|parkland|mars stonem|nashvil|waffl hous|sant fe|annapol|capit gazet|pittsburgh|tre of lif|thousand oak)'

## Filter event comments

In [7]:
logging.info("Load partisan comments")

# One parquet file per year, read in the order given by YEARS and stacked into
# a single frame. The index labels of the yearly frames are kept unchanged.
partisan_comment_files = [f"user_partisan_comments_{year}.parquet" for year in YEARS]
partisan_comments = pd.concat(
    [load_df_from_parquet(file_name=file_name) for file_name in partisan_comment_files]
)


In [8]:
# Inspect the column dtypes of the combined frame.
partisan_comments.dtypes

author          string[python]
body_cleaned    string[python]
created_utc              int64
subreddit       string[python]
id              string[python]
parent_id       string[python]
party           string[python]
tokens          string[python]
dtype: object

In [9]:
logging.info("Filtering event data based on keywords...")
# `tokens` is a nullable string column, so `str.contains` yields <NA> for rows
# without tokens; `na=False` turns those into explicit non-matches and gives a
# plain boolean mask. `.copy()` detaches the result from `partisan_comments`,
# which is deleted a few cells further down.
event_comments = partisan_comments.loc[
    partisan_comments["tokens"].str.contains(
        MASS_SHOOTING_REGEX,
        regex=True,
        na=False,
    )
].copy()
logging.info("finished keyword filtering")


In [10]:
# Number of (rows, columns) left after the keyword filter.
event_comments.shape

(74274, 8)

In [11]:
# Record the (rows, columns) shape of the filtered frame at INFO level.
logging.info(event_comments.shape)

In [12]:
# Drop the reference to the full comment frame: `event_comments` was taken as
# an independent copy and the cells below only use that.
# Re-running the filter cell after this point requires loading the data again.
del partisan_comments

In [13]:
# Record the per-column dtypes of the filtered frame at INFO level.
logging.info(event_comments.dtypes)

In [14]:
# Record how many comments passed the keyword filter.
event_comment_count = len(event_comments)
logging.info(f"Nr of event comments: {event_comment_count}")

In [15]:
logging.info("saving event comments...")
# Persist the filtered comments under the "mass_shootings" name; where and in
# which format they are written is decided by `save_event_comments`
# (preprocessing.utils).
save_event_comments(event_comments, "mass_shootings")

In [16]:
# Preview the filtered comments; pandas abbreviates the display of a frame
# this large to its first and last rows.
event_comments

Unnamed: 0,author,body_cleaned,created_utc,subreddit,id,parent_id,party,tokens
17,O9LpP,> are you going to blame him for having bigger...,1420236694,Anarcho_Capitalism,cncjgou,t1_cncilv0,rep,ar you going to blam him for hav big bal than ...
27,O9LpP,> the victim of garner was the store owners an...,1420257458,Anarcho_Capitalism,cnctss1,t1_cncrge0,rep,the victim of garn was the stor own and it obv...
47,O9LpP,> molyneaux would say that blockbuster fell vi...,1420661783,Anarcho_Capitalism,cnhle8y,t1_cngxzyb,rep,molyneaux would say that blockbust fel victim ...
50,O9LpP,"> ... i guess the bourgeoisie ( white , christ...",1420741352,Anarcho_Capitalism,cnil3y5,t1_cnig83q,rep,i guess the bourgeoisy whit christian middl cl...
296,O9LpP,what 's the alternative to an individual getti...,1432669170,Anarcho_Capitalism,crlnpa0,t1_crl6xc5,rep,what the altern to an individ get to own stuff...
...,...,...,...,...,...,...,...,...
4377737,tPgqK,i 'm convinced nugent is a left wing double ag...,1451485623,progun,cyg4tv6,t1_cyg250e,rep,i convint nug is a left wing doubl ag try his ...
4377791,DFBdY,or people can stop blaming all for the few . t...,1451496880,politics,cygb9rk,t1_cygafu3,dem,or peopl can stop blam al for the few ther ar ...
4377895,MLTYk,i support more gradual changes to our tax-syst...,1451523699,PoliticalDiscussion,cygsayd,t3_3yrxfc,dem,i support mor grad chang to our than most prog...
4377969,zK18O,hmmm so you did the research on this case righ...,1451569808,PoliticalHumor,cyh9q7o,t1_cyh1gud,dem,hmmm so you did the research on thi cas right ...


## Build event vocabulary

In [17]:
# The vocabulary is built from the comments filtered above, which are still in
# memory, so the saved event file is not read back here.
# `min_comment_freq` is presumably the minimum number of comments a token has
# to occur in to be kept — confirm in preprocessing.utils.build_vocab.
event_tokens = event_comments["tokens"]
event_vocab = build_vocab(event_tokens, min_comment_freq=MIN_OCCURENCE_FOR_VOCAB)


In [None]:
# Two INFO records: a label, then the number of entries in the vocabulary.
logging.info("Vocabulary length")
vocab_size = len(event_vocab)
logging.info(vocab_size)

In [None]:
logging.info("Saving event vocab")
# Persist the vocabulary under the same "mass_shootings" name used for the
# saved comments; the target location is decided by `save_event_vocab`
# (preprocessing.utils).
save_event_vocab(event_vocab, "mass_shootings")