# Collecting data for our diachronics project from Reddit

In [None]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd

# Path of the CSV holding the scraped corpus; the cells below write it, reload
# it, and derive the '-localcontext' file name from it.
RAW_DATA_LOCATION = 'data/raw/corpus.csv'

## Reddit Bot

In [None]:
from webscrapers.reddit.reddit_bot import RedditBuddy

In [None]:
# Create the project's Reddit scraper with its default settings.
# NOTE(review): how RedditBuddy obtains API credentials is not visible here.
bot = RedditBuddy()

In [None]:
# Submission cap set on the bot before searching.
# NOTE(review): presumably applied per search call; confirm in RedditBuddy.
bot.submission_limit = 500

# One r/feminism search per entry, issued in exactly this order. Some terms
# appear in more than one entry; exact duplicate rows are dropped in a later
# cell once the results are concatenated.
search_batches = [
    ['abortion'],
    ['pro-life'],
    ['prolife'],
    ['pro life', 'forced birth', 'forced-birth'],
    ['forced birth'],
    ['forced-birth'],
    ['force birth'],
]
data = [
    bot.search(subreddit='feminism', searches=batch)
    for batch in search_batches
]

In [None]:
# Stack the per-search results into one frame, then report the shape before
# and after removing rows that are exact duplicates of an earlier row.
data = pd.concat(data, ignore_index=True)
print(data.shape)
data.drop_duplicates(inplace=True)
print(data.shape)

In [None]:
# Make sure the target folder exists before saving: to_csv does not create
# missing directories, and failing here would discard the whole scrape.
raw_dir = os.path.dirname(RAW_DATA_LOCATION)
if raw_dir:  # empty when the path has no directory component
    os.makedirs(raw_dir, exist_ok=True)
data.to_csv(RAW_DATA_LOCATION, index=False, encoding='utf-8')

## Data Preprocessing

In [None]:
# Reload the saved corpus so preprocessing starts from the file on disk.
data = pd.read_csv(RAW_DATA_LOCATION)

In [None]:
# Rows whose parent_id is 'ROOT' get their body replaced by the submission
# title. Assign through a single data.loc[...] call: the chained form
# data['body'].loc[mask] = ... writes to an intermediate Series, which raises
# SettingWithCopyWarning and leaves `data` unchanged under Copy-on-Write.
is_root = data['parent_id'].isin(['ROOT'])
data.loc[is_root, 'body'] = data.loc[is_root, 'submission_title']

In [None]:
# Turn every literal '<br>' into a space and trim surrounding whitespace.
# Values are passed through str() first, so non-string entries become their
# string form.
data['body'] = data['body'].map(
    lambda raw_body: str(raw_body).replace('<br>', ' ').strip()
)

In [None]:
# Surface variants of the terms of interest. `prolife\b` (instead of
# `prolife\s`) also catches the term before punctuation or at the very end of
# a body, while still rejecting longer words such as "proliferate".
TERM_PATTERN = re.compile(
    r'(pro-life|prolife\b|pro life|forced birth|forced-birth|forcedbirth|force birth)'
)


def find_tags(text):
    """Return the sorted, '|'-joined canonical tags mentioned in *text*.

    Every pro-life variant is reported as 'pro_life' and every forced-birth
    variant as 'forced_birth'; a text with no match yields ''. Emitting the
    canonical form here keeps the column valid regardless of how a later
    normalisation step treats the raw variants.
    """
    hits = TERM_PATTERN.findall(str(text).lower())
    # All pro-life variants start with 'pro'; the rest are forced-birth forms.
    canonical = {
        'pro_life' if hit.startswith('pro') else 'forced_birth'
        for hit in hits
    }
    return '|'.join(sorted(canonical))


data['tag'] = data['body'].apply(find_tags)

In [None]:
def replace_values(x):
    """Normalise a '|'-separated tag string to canonical tags.

    Every pro-life variant becomes 'pro_life' and every forced-birth variant
    becomes 'forced_birth'; duplicates are removed and the remaining tags are
    joined in sorted order, so the same set of tags always produces the same
    string (e.g. always 'forced_birth|pro_life').

    Args:
        x: tag string such as 'pro-life|forced birth'; '' is allowed.

    Returns:
        The normalised tag string ('' for empty input).
    """
    # `prolife\b\s*` handles both the bare word and the form with trailing
    # whitespace, without touching longer words such as "proliferate".
    x_ = re.sub(r'(pro-life|prolife\b\s*|pro life)', 'pro_life', x)
    x_ = re.sub(r'(forced birth|forced-birth|forcedbirth|force birth)', 'forced_birth', x_)
    # Sorting makes the output independent of set iteration order.
    return '|'.join(sorted(set(x_.split('|'))))

# Collapse the surface variants in every tag string to their canonical tags.
data['tag'] = data['tag'].map(lambda raw_tag: replace_values(str(raw_tag)))

In [None]:
# Show how many rows carry each distinct tag string.
data['tag'].value_counts()

In [None]:
# Mark untagged rows as missing before saving. Rows without a match hold the
# empty string (the join of zero matches), so '' is matched alongside the
# original 'nan' sentinel. The assignment goes through a single data.loc[...]
# call because the chained form data['tag'].loc[mask] = ... writes to an
# intermediate Series and may leave `data` untouched.
untagged = data['tag'].isin(['', 'nan'])
data.loc[untagged, 'tag'] = None
data.to_csv(RAW_DATA_LOCATION, index=False, encoding='utf-8')

## Smaller, targeted context corpus

In [None]:
# Reload the tagged corpus from disk before building the context subcorpus.
data = pd.read_csv(RAW_DATA_LOCATION)

In [None]:
# Collect the parent_id of every tagged row, then keep
#   * every row sharing one of those parent_ids (the tagged rows themselves
#     plus any other row with the same parent), and
#   * every row whose comment_id equals the part of such a parent_id that
#     follows its last underscore (the parent itself).
has_tag = data['tag'].notna()
parent_ids = data.loc[has_tag, 'parent_id'].unique().tolist()
parents = [pid.rsplit('_', 1)[-1] for pid in parent_ids]

shares_parent = data['parent_id'].isin(parent_ids)
is_parent = data['comment_id'].isin(parents)
subcorpus1 = data.loc[shares_parent | is_parent].copy()

In [None]:
# Preview the first rows of the parents-and-tagged selection.
subcorpus1.head()

In [None]:
# Children of tagged rows: strip everything up to the last underscore from
# parent_id and keep the rows whose stripped parent id is the comment_id of a
# tagged row. The helper column stays on `data`; only the copy handed on as
# subcorpus2 has it removed.
data['parent_id_'] = data['parent_id'].map(lambda pid: pid.split('_')[-1])
tagged_comment_ids = data.loc[data['tag'].notna(), 'comment_id'].unique()
subcorpus2 = data.loc[data['parent_id_'].isin(tagged_comment_ids)].copy()
subcorpus2 = subcorpus2.drop(columns='parent_id_')

In [None]:
# Preview the first rows of the children selection.
subcorpus2.head()

In [None]:
# Merge both selections and drop rows that were picked up by each of them.
combined = pd.concat([subcorpus1, subcorpus2], ignore_index=True)
subcorpus = combined.drop_duplicates()

In [None]:
# Tag distribution inside the context subcorpus (missing tags are not counted
# by value_counts by default).
subcorpus['tag'].value_counts()

In [None]:
# Store the subcorpus next to the raw corpus under a '-localcontext' suffix.
local_context_path = RAW_DATA_LOCATION.replace('.csv', '-localcontext.csv')
subcorpus.to_csv(local_context_path, index=False, encoding='utf-8')

In [None]:
# (rows, columns) of the context subcorpus.
subcorpus.shape

## Some quick post-hoc analyses

In [None]:
# Reload the tagged corpus and discard rows whose body is only a Reddit
# placeholder. The explicit copy makes the filtered frame independent, so the
# column assignments in the following cells do not raise
# SettingWithCopyWarning.
data = pd.read_csv(RAW_DATA_LOCATION)
data = data.loc[~data['body'].isin(['[deleted]', '[removed]'])].copy()

In [None]:
# List the distinct parent_id values that remain after filtering.
data['parent_id'].unique()

In [None]:
# Convert the creation time to pandas datetimes; unit='s' interprets the
# stored numbers as seconds since the Unix epoch.
data['comment_created_at'] = pd.to_datetime(data['comment_created_at'], unit='s')

In [None]:
# Earliest and latest creation time in the corpus.
data['comment_created_at'].min(), data['comment_created_at'].max()

In [None]:
# Tag distribution after removing deleted/removed bodies.
data['tag'].value_counts()

In [None]:
# Flag rows created after the cutoff; a bare date string parses to midnight at
# the start of 2022-06-24. Then count rows per (tag, after_dobbs) pair.
dobbs_cutoff = pd.to_datetime('2022-06-24')
data['after_dobbs'] = data['comment_created_at'].gt(dobbs_cutoff)
data[['tag', 'after_dobbs']].value_counts(sort=False)

In [None]:
# Number of rows on each side of the cutoff.
data['after_dobbs'].value_counts()

In [None]:
# Build one row per distinct tag value (including the missing-tag value, which
# is labelled via str()) with its row counts before and after the cutoff.
is_post = data['after_dobbs']
count_rows = []
for term in data['tag'].unique():
    has_term = data['tag'].isin([term])
    count_rows.append({
        'term': str(term),
        'pre-Dobbs': (has_term & ~is_post).sum(),
        'post-Dobbs': (has_term & is_post).sum(),
    })
xi_data = pd.DataFrame(count_rows)
xi_data.head(10)

In [None]:
from scipy.stats import chi2_contingency

# Chi-square test of independence on the term x period count table, leaving
# out rows whose tag combines both terms (in either order).
mixed_tags = ['pro_life|forced_birth', 'forced_birth|pro_life']
keep_rows = ~xi_data['term'].isin(mixed_tags)
contingency = xi_data.loc[keep_rows, ['pre-Dobbs', 'post-Dobbs']].values
res = chi2_contingency(contingency)
res.dof, res.statistic, res.pvalue

In [None]:
# Expected counts from the test, divided by their column totals so that each
# column sums to 1.
res.expected_freq / res.expected_freq.sum(axis=0).reshape(1,-1)

In [None]:
# Observed proportions: drop the rows whose tag combines both terms, divide
# each count column by its own total, and add the pre/post ratio per term.
mixed_tags = ['pro_life|forced_birth', 'forced_birth|pro_life']
count_cols = ['pre-Dobbs', 'post-Dobbs']
xi__ = xi_data.loc[~xi_data['term'].isin(mixed_tags)].copy()
column_totals = xi__[count_cols].values.sum(axis=0).reshape(1, -1)
xi__[count_cols] = xi__[count_cols] / column_totals
xi__['pre/post'] = xi__['pre-Dobbs'] / xi__['post-Dobbs']
xi__.head()