# Using Reddit Tools to collect hate speech loci

In [None]:
# import os 

# os.mkdir('data')
# os.mkdir('data/group_names')
# os.mkdir('data/group_data')

In [None]:
import pandas as pd
import numpy as np
from webscrapers.reddit.reddit_bot import RedditBuddy

bot = RedditBuddy()
bot.include_comments = False

## 1. Sourcing possible hate content/groups.

We start by first seeking out which groups are likely to support the dissemination of hate-based content and generating a list of such subreddits. To do this, we'll source posts `r/AgainstHateSubreddits` community, which is dedicated to tracking this kind of content.

In [None]:
bot.time_filter="all"
results = bot.recent_submissions(
    subreddit='againsthatesubreddits',
    limit=5000
)

In [None]:
results.shape

We'll then try to extract exact subreddit names from the list returned.

In [None]:
import regex as re

results['subreddit_mentioned'] = results['submission_title'].apply(lambda x: re.findall(r'(\br/\w+\b|\br-\w+\b)', str(x)))

# antisemitic_subreddits = np.unique(sum(results['subreddit_mentioned'].tolist(), []))

In [None]:
results[['submission_flair', 'subreddit_mentioned']].astype(str).value_counts(sort=False)

In [None]:
results.head(20)

In [None]:
from datetime import datetime as dt

results.to_csv('data/group_names/{}.csv'.format(dt.now().date().isoformat().replace('-', '')), index=False, encoding='utf-8')

## Compiling data collected across multiple weeks

In [None]:
import os
import pandas as pd
import numpy as np
import regex as re

data_location = '/Volumes/ROY/comp_ling/datasci/RedditHateStudy/processes/data/group_names'

files = [os.path.join(data_location, f) for f in os.listdir(data_location) if f.endswith('.csv') and ('all_dates' not in f) and ('._' not in f)]

df = [pd.read_csv(f) for f in files]
df = pd.concat(df, ignore_index=True)
df = df.drop_duplicates(subset=['submission_id'])

df['subreddit_mentioned'] = df['subreddit_mentioned'].apply(lambda x: x.lower() if isinstance(x,str) else "['']")
del_row = []

meta_data_cols = [col for col in list(df) if col not in [ 'subreddit_mentioned']]
df_ = []
for i in df.index:
    subs_mentioned = df['subreddit_mentioned'].loc[i].replace('[', "").replace("]", "").replace("'", "").split(', ')
    meta_data = df[meta_data_cols].loc[i].to_list()
    df_ += [meta_data+[re.sub(r'(r/|r-)', '', sub)] for sub in subs_mentioned]

df = pd.DataFrame(
    np.array(df_, dtype=object),
    columns=meta_data_cols+['subreddit_mentioned']
)

df.to_csv(os.path.join(data_location, 'all_dates.csv'), index=False, encoding='utf-8')
df.shape

In [None]:
df.head()

In [None]:
counts = df[['submission_flair', 'subreddit_mentioned']].value_counts(sort=False)
df_counts = [[k[0], k[1], v] for k,v in counts.items()]
df_counts = pd.DataFrame(
    np.array(df_counts, dtype=object),
    columns = ['submission_flair', 'subreddit_mentioned', 'count']
)
df_counts.sort_values(by=['submission_flair', 'count'],ascending=False).to_csv(os.path.join(data_location, 'all_counts.csv'), index=False, encoding='utf-8')

In [None]:
df_counts.sort_values(by=['submission_flair', 'count'],ascending=False).head(1000)

 ## Pulling data from problematic subreddits
 
Now that we have a list of problematic subreddits, we can actually go through and pull data from them. The following code is confusing, but it will automate the creation of search queries for grabbing hate-based content from Reddit and pulling those results. 

We'll use our counts for hate-producing subreddits to generate a list of the top 5 hate-producing groups per target of hate speech.

We'll then create a query for each target group, per each subreddit producing that kind of content, using the discriminatory lexicon curated by weaponizedword via their API.

We'll then search each subreddit using the query for that target group.

In [1]:
targets = {
    'Antisemitism': [
        ('malignant_meaning', 'jew'), 
        ('malignant_meaning', 'Jew'),
        ('malignant_meaning', 'Judaism'),
    ],
    'LGBTQ+ hatred': [
        ('malignant_meaning', 'homosexual'), 
        ('malignant_meaning', 'Homosexual'),
        ('malignant_meaning', 'lesbian'),
    ], 
    'Queerphobia': [
        ('malignant_meaning', 'homosexual'), 
        ('malignant_meaning', 'Homosexual'),
        ('malignant_meaning', 'lesbian'),
    ],
    'Islamophobia': [
        ('malignant_meaning', 'Muslim'), 
        ('malignant_meaning', 'Islam'), 
    ],
    'Gender Hatred': [
        ('malignant_meaning', 'female'),
        ('malignant_meaning', 'women'),
        ('malignant_meaning', 'woman'),
    ], 
    'Misogyny': [
        ('malignant_meaning', 'female'),
        ('malignant_meaning', 'women'),
        ('malignant_meaning', 'woman'),
    ],
    'Xenophobia': [
        ('malignant_meaning', 'immigrant'),
        ('malignant_meaning', 'immigrants'),
    ],
    'Transphobia': [
        ('malignant_meaning', 'trans'),
        ('malignant_meaning', 'Trans'),
    ],
    'Racism': [
        # Anti-Black
        ('malignant_meaning', 'black'),
        ('malignant_meaning', 'Black'),
        ('malignant_meaning', 'african'),
        ('malignant_meaning', 'African'),
        
        # Anti-Asian
        ('malignant_meaning', 'Asian'),
        ('malignant_meaning', 'asian'),
        
        # Anti Hispanic/Latine
        ('malignant_meaning', 'hispanic'),
        ('malignant_meaning', 'Hispanic'),
        ('malignant_meaning', 'Mexican'),
        ('malignant_meaning', 'mexican'),
        ('malignant_meaning', 'Latin'),
        ('malignant_meaning', 'latin'),
    ],
}

### Getting subreddit names

In [2]:
from datetime import datetime as dt
import pandas as pd
import numpy as np
import os
import json

path = '/Volumes/ROY/comp_ling/datasci/RedditHateStudy/processes/data'
dfg = pd.read_csv(os.path.join(path, 'group_names/all_counts.csv'))

target_query = dict()
for k in targets.keys():
    sub = dfg.loc[
        dfg['submission_flair'].isin([k]) 
        & ~dfg['subreddit_mentioned'].isna()
    ].sort_values(by=['count'],ascending=False)
    
    target_query[k] = {
        'subreddits': sub['subreddit_mentioned'].values[5:10].tolist(),
        'query': None
    }
    

### Creating queries

In [3]:
from webscrapers.weaponizedword.api import weaponizedword, query_data
LOAD = True

ww = weaponizedword()

if LOAD:
    ww.load_search()
else:
    ww.search(endpoint_name='get_discriminatory')
    ww.save_search()


In [4]:
for k,v in targets.items():
    query = []
    for field, search in v:
        query += [ww.create_query_from_results(field, search)]
    target_query[k]['query'] = ' OR '.join(query)

In [5]:
with open(os.path.join(path, 'queries.json'), 'w', encoding='utf-8') as f:
    txt = json.dumps(target_query, indent=4)
    f.write(txt)
f.close()

### Using queries to grab data from subreddits

In [6]:
from webscrapers.reddit.reddit_bot import RedditBuddy

bot = RedditBuddy()
# bot.submission_limit = 500

Version 7.7.0 of praw is outdated. Version 7.7.1 was released Tuesday July 11, 2023.


In [7]:
D = pd.DataFrame()

D['hate_target'] = [None]
D['subreddit'] = [None]

In [8]:
import json

with open(os.path.join(path, 'queries.json'), 'r', encoding='utf-8') as f:
    target_query = json.loads(f)
f.close()

TypeError: the JSON object must be str, bytes or bytearray, not TextIOWrapper

In [8]:
data = []

In [9]:
for k,v in target_query.items():
    
    # Query edits after experimentation
    if k == 'Antisemitism':
        # v['query'] += ' OR "soros" OR "rothschild" OR "globalist" OR "jew"'
        v['query'] = '"soros" OR "rothschild" OR "globalist" OR "jew"'
    if k == 'Gender Hatred':
        # v['query'] += ' OR "women" OR "woman"'
        v['query'] = '"women" OR "woman"'
    if k == 'Misogyny':
        # v['query'] += ' OR "women" OR "woman"'
        v['query'] = '"women" OR "woman"'
    if k == 'Racism':
        # v['query'] += ' OR "black" OR "latin" OR "african" OR "mexican"'
        v['query'] = '"black" OR "latin" OR "african" OR "mexican" OR "asian" OR "pacific islander"'
    if k == 'Transphobia':
        v['query'] = 'trans OR "hormone replacement therapy" OR "hormone replacement" OR "bottom surgery" OR "hormone"'
    if k == 'Xenophobia':
        v['query'] += ' OR "immigrant" OR "border"'
    
    
    for subreddit in v['subreddits']:
        print(subreddit, k)
        # if (subreddit == 'cringetopia') and (k=='Gender Hatred'):
        #     scrape = True
        if (D['subreddit'].isin([subreddit]) & D['hate_target'].isin([k])).sum() < 1:
            try:
                data += [
                    bot.search( 
                        subreddit=subreddit,
                        searches=[
                            v['query']
                        ]
                    )
                ]
                data[-1]['hate_target'] = k
            except Exception as e:
                print(e)

tucker_carlson Antisemitism


100%|██████████| 100/100 [00:37<00:00,  2.68it/s]


fascismreclaimed Antisemitism
received 404 HTTP response
greentext Antisemitism


100%|██████████| 100/100 [02:21<00:00,  1.41s/it]


timpool Antisemitism


100%|██████████| 100/100 [01:05<00:00,  1.52it/s]


politicalcompass Antisemitism


100%|██████████| 100/100 [01:32<00:00,  1.08it/s]


socialjusticeinaction LGBTQ+ hatred
received 404 HTTP response
worldnationalists LGBTQ+ hatred
received 404 HTTP response
louderwithcrowder LGBTQ+ hatred
received 404 HTTP response
europeansocialists LGBTQ+ hatred


100%|██████████| 13/13 [00:24<00:00,  1.88s/it]


tucker_carlson LGBTQ+ hatred


100%|██████████| 52/52 [00:43<00:00,  1.21it/s]


trueanon Queerphobia


100%|██████████| 100/100 [01:38<00:00,  1.02it/s]


fingmemes Islamophobia


100%|██████████| 100/100 [01:38<00:00,  1.01it/s]


funnymemes Islamophobia


 92%|█████████▏| 22/24 [00:22<00:02,  1.04s/it]


received 429 HTTP response
hindutvarises Islamophobia


100%|██████████| 51/51 [00:49<00:00,  1.02it/s]


indiandankmemes Islamophobia


100%|██████████| 100/100 [01:40<00:00,  1.00s/it]


indiarises Islamophobia


100%|██████████| 27/27 [00:25<00:00,  1.05it/s]


theredpill Gender Hatred
received 403 HTTP response
trueunpopularopinion Gender Hatred


100%|██████████| 100/100 [10:04<00:00,  6.04s/it]


traditionalmuslims Misogyny


100%|██████████| 100/100 [00:50<00:00,  1.97it/s]


southasianmasculinity Misogyny


100%|██████████| 100/100 [01:17<00:00,  1.30it/s]


purplepilldebate Misogyny


100%|██████████| 100/100 [16:08<00:00,  9.69s/it]


prolife Misogyny


100%|██████████| 100/100 [02:52<00:00,  1.72s/it]


politicalcompassmemes Misogyny


100%|██████████| 100/100 [06:34<00:00,  3.94s/it]


tuckercarlson Xenophobia
received 403 HTTP response
wallstreetsilver Xenophobia


100%|██████████| 100/100 [01:00<00:00,  1.65it/s]


wallstreetsilver Transphobia


100%|██████████| 100/100 [01:36<00:00,  1.04it/s]


antihatecommunities Transphobia
received 404 HTTP response
louderwithcrowder Transphobia
received 404 HTTP response
transmedical Transphobia


100%|██████████| 100/100 [01:40<00:00,  1.00s/it]


tumblrinaction Transphobia
received 404 HTTP response
conservative Racism


100%|██████████| 100/100 [03:52<00:00,  2.32s/it]


heckoffcommie Racism
received 404 HTTP response
timpool Racism


100%|██████████| 100/100 [01:56<00:00,  1.16s/it]


politicalcompass Racism


100%|██████████| 100/100 [01:29<00:00,  1.12it/s]

louderwithcrowder Racism
received 404 HTTP response





In [10]:
D = pd.concat(data, ignore_index=True)
D = D.drop_duplicates(subset=['comment_id', 'submission_id', 'hate_target'])

D.to_csv(
    os.path.join(
        path, 
        'group_data/all-groups-{}-2.tsv'.format(
            dt.now().date().isoformat().replace('-', '')
        )
    ), 
    sep='\t', 
    index=False, 
    encoding='utf-8'
)

D.shape

(185251, 13)

In [11]:
D['hate_target'].value_counts()

Misogyny         79958
Gender Hatred    35510
Racism           26199
Antisemitism     16380
Transphobia       8955
Islamophobia      8903
Xenophobia        3524
Queerphobia       3390
LGBTQ+ hatred     2432
Name: hate_target, dtype: int64

## Pulling Content from Antisemitic Subreddits [Single example]

In [None]:
import pandas as pd
import numpy as np
import regex as re
from webscrapers.reddit.reddit_bot import RedditBuddy
from datetime import datetime as dt

bot = RedditBuddy()

In [None]:
data, antisemitic_subreddits = [], ['conspiracy']
for subreddit in antisemitic_subreddits:
    data += [
        bot.search(
            subreddit=subreddit,
            searches=[
                '"soros" OR "rothschild" OR "globalist"'
            ]
            # time_filter='month'
        )
    ]

data = pd.concat(data, ignore_index=True)
data.shape

In [None]:
data['parent_level'] = data['parent_id'].apply(lambda x: x.split('_')[0])
data['parent_id'] = data['parent_id'].apply(lambda x: x.split('_')[-1])

In [None]:
print(list(data))

In [None]:
k = data['submission_title'].value_counts()
print(len(k))
k

In [None]:
import nltk.data
sent_det = nltk.data.load('tokenizers/punkt/english.pickle')

df, meta_data_cols = [], [col for col in list(data) if col != 'body']
for i in data.index:
    text = data['body'].loc[i]
    if text:
        text = re.sub(r'(?<=<QUOTE>)(.*?)(?=<QUOTE>)', '', text)
        text = re.sub('<QUOTE>', '', text)
        text = text.split('<br>')
        text = sum([sent_det.tokenize(t) for t in text], [])
        df += [data[meta_data_cols].loc[i].tolist()+[sent] for sent in text]

df = pd.DataFrame(
    np.array(df, dtype=object),
    columns = meta_data_cols + ['body']
)

In [None]:
df.shape

In [None]:
sel = df['body'].apply(lambda x: '[removed]' in x)
sel.sum()

In [None]:
df.loc[~sel].to_csv(
    'data/group_data/{}-{}.csv'.format(
        '-'.join(antisemitic_subreddits),
        dt.now().date().isoformat().replace('-', '')
    ), 
    index=False, encoding='utf-8'
)