# Load Reddit Data

In [1]:
import pickle
import pandas as pd

# Enriched subreddit table produced by an earlier step of the project.
# CAUTION: unpickling runs arbitrary code, so only load pickles you created yourself.
SUBREDDITS_PICKLE = '../data/allSubredditsEnriched.pickle'

with open(SUBREDDITS_PICKLE, mode='rb') as pickle_handle:
    all_subreddits = pickle.load(pickle_handle)

# Define "Core" Network

In [2]:
# Hand-picked seed subreddits that define the "core" network. Each entry keeps
# its 'r/' prefix because the list is also used to build the mention regex.
# Matching against these names is case-sensitive.
network_seed_list = [
    # Region- and country-named communities
    'r/latinoamerica',
    'r/argentina',
    'r/bolivia',
    'r/brasil',
    'r/chile',
    'r/colombia',
    'r/costa_rica',
    'r/cuba',
    'r/dominican',
    'r/ecuador',
    'r/estadosunidos',
    'r/elsalvador',
    'r/es',
    'r/guatemala',
    'r/honduras',
    'r/mexico',
    'r/nicaragua',
    'r/panama',
    'r/paraguay',
    'r/peru',
    'r/puertorico',
    'r/spain',
    'r/uruguay',
    'r/venezuela',
    'r/LatinAmerica',
    # General-interest and topical communities
    'r/redditores',
    'r/espanol',
    'r/catolicismo',
    'r/futbol',
    'r/videojuego',
    'r/musicaenespanol',
]

# Find Nodes

In [3]:
import re

# Surround every seed with \b word boundaries so that a short name such as
# 'r/es' cannot match inside a longer one such as 'r/espanol'.
with_word_breaks = [r'\b{}\b'.format(seed) for seed in network_seed_list]
print(with_word_breaks)

# A single alternation lets one scan of each text test all seeds at once.
alternation = '|'.join(with_word_breaks)
regex = re.compile(alternation)

def filter_network_reddits(dataframe, pattern=None):
    """Keep the rows whose description or wiki text mentions a seed subreddit.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide string-valued ``description_html`` and ``wiki_text``
        columns.
    pattern : str or compiled regular expression, optional
        What to search for. When omitted, the module-level ``regex`` is used,
        which preserves the original single-argument behaviour.

    Returns
    -------
    pandas.DataFrame
        The rows of ``dataframe`` where at least one of the two columns
        contains a match. Missing values are treated as "no match".
    """
    if pattern is None:
        # Fall back to the notebook-level pattern so existing calls keep working.
        pattern = regex
    in_description = dataframe['description_html'].str.contains(pattern, na=False)
    in_wiki = dataframe['wiki_text'].str.contains(pattern, na=False)
    return dataframe[in_description | in_wiki]

['\\br/latinoamerica\\b', '\\br/argentina\\b', '\\br/bolivia\\b', '\\br/brasil\\b', '\\br/chile\\b', '\\br/colombia\\b', '\\br/costa_rica\\b', '\\br/cuba\\b', '\\br/dominican\\b', '\\br/ecuador\\b', '\\br/estadosunidos\\b', '\\br/elsalvador\\b', '\\br/es\\b', '\\br/guatemala\\b', '\\br/honduras\\b', '\\br/mexico\\b', '\\br/nicaragua\\b', '\\br/panama\\b', '\\br/paraguay\\b', '\\br/peru\\b', '\\br/puertorico\\b', '\\br/spain\\b', '\\br/uruguay\\b', '\\br/venezuela\\b', '\\br/LatinAmerica\\b', '\\br/redditores\\b', '\\br/espanol\\b', '\\br/catolicismo\\b', '\\br/futbol\\b', '\\br/videojuego\\b', '\\br/musicaenespanol\\b']


In [4]:
# Subreddits whose description or wiki text mentions at least one seed
# community, i.e. the candidates for outgoing links into the core network.
out_degree = all_subreddits.pipe(filter_network_reddits)

In [6]:
# Sanity check of the filtered frame: row count, column dtypes and non-null counts.
out_degree.info()

<class 'pandas.core.frame.DataFrame'>
Index: 650 entries, niobio to Tarragona
Data columns (total 8 columns):
Unnamed: 0          650 non-null int64
Unnamed: 0.1        650 non-null int64
Unnamed: 0.1.1      650 non-null int64
desc                550 non-null object
created_date        650 non-null object
subs                650 non-null int64
description_html    649 non-null object
wiki_text           86 non-null object
dtypes: int64(4), object(4)
memory usage: 45.7+ KB


# Create Edges and Attribute List

In [12]:
# Move the subreddit-name index into an ordinary column and expose it as
# 'reddit_name'. The rename presumes the index is named 'real_name'; if it is
# not, the rename is a silent no-op (TODO confirm against the pickled frame).
formatted = (
    out_degree
    .reset_index()
    .rename(columns={'real_name': 'reddit_name'})
)

In [13]:
import sys
import importlib
# Make the project helpers under ../src importable. The membership check keeps
# sys.path from gaining a duplicate entry every time this cell is re-run.
if '../src/' not in sys.path:
    sys.path.append('../src/')

# Reload right after importing so that edits made to the modules on disk are
# picked up without restarting the kernel.
import reddit
reddit = importlib.reload(reddit)

import network
network = importlib.reload(network)

# Look up the edges of every candidate subreddit. Each call yields a collection
# of edge records (the following cell flattens them); the exact record layout is
# defined by network.find_edges.
reddit_list = list(formatted['reddit_name'])
edges = [
    network.find_edges(
        reddit_name=name,
        reddit_type='r',
        dataframe=all_subreddits,
        depth='out_degree',
    )
    for name in reddit_list
]

In [23]:
import pandas as pd

# Collapse the per-subreddit edge collections into one flat list of records,
# then build a single frame with one row per edge.
flattened_edges = []
for edges_of_one_reddit in edges:
    flattened_edges.extend(edges_of_one_reddit)

df = pd.DataFrame(flattened_edges)

In [28]:
# Keep only the edges that point at a seed subreddit. The seeds carry an 'r/'
# prefix, so the first two characters are dropped before comparing with the
# 'reddit' column. The comparison is an exact, case-sensitive match.
core_reddit_names = [seed[2:] for seed in network_seed_list]
points_at_core = df['reddit'].isin(core_reddit_names)
edges_to_core_network = df[points_at_core]

In [38]:
# Translate the edge-record column names into source/target vocabulary.
edge_column_names = {
    'parent': 'source',
    'type_of_parent': 'source_type',
    'reddit': 'target',
    'type_of_reddit': 'target_type',
    'type_of_edge': 'edge_type',
}
renamed = edges_to_core_network.rename(columns=edge_column_names)

# One row per (source, target, edge_type) triple; the first occurrence is kept.
deduped = renamed.drop_duplicates(subset=['source', 'target', 'edge_type'])

# Write the edge list. The frame's index is written as the first CSV column.
edge_list_columns = ['source', 'source_type', 'target', 'target_type', 'edge_type', 'context']
deduped[edge_list_columns].to_csv('../data/outDegreeEdgeList.csv')

In [41]:
# Attribute (node) list: the candidate subreddits that are the source of at
# least one edge into the core network.
#
# The 'source' column is read from `renamed`, not from `edges_to_core_network`:
# DataFrame.rename returns a new frame, so `edges_to_core_network` keeps its
# original column names (presumably 'parent', as the rename mapping implies) and
# `edges_to_core_network.source` fails on a fresh Restart & Run All.
source_names = renamed['source'].unique()
reddits_in_edge_list = formatted[formatted['reddit_name'].isin(source_names)]

# Write the node attributes. The frame's index is written as the first CSV column.
reddits_in_edge_list[['reddit_name', 'desc', 'created_date', 'subs']].to_csv('../data/outDegreeAttributeList.csv')

In [44]:
# Preview the first rows of the subreddits that appear as edge sources.
reddits_in_edge_list.head()

Unnamed: 0.2,reddit_name,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,desc,created_date,subs,description_html,wiki_text
0,niobio,20610,20610,1201335,,2017-10-13,4,"<!-- SC_OFF --><div class=""md""><h1><a href=""/r...",
1,brasilbackup,29262,29262,1209987,Fa&ccedil;a backup antes de jogar algo pouco t...,2017-10-30,2,"<!-- SC_OFF --><div class=""md""><h4><em>(banner...",
2,Tabarnia,47152,47152,1227877,Comunidad Aut&oacute;noma de Tabarnia - Acta e...,2017-12-26,305,"<!-- SC_OFF --><div class=""md""><p><strong>Bien...",
3,SpainAuxiliares,74545,74545,1255270,"This subreddit is a gathering place for past, ...",2018-03-13,437,"<!-- SC_OFF --><div class=""md""><p><em>This sub...",
4,capitalistmexico,80352,80352,1261077,"While /r/mexico is nice, it not sufficient sin...",2018-03-24,2,"<!-- SC_OFF --><div class=""md""><p>While <a hre...",
