### Import packages

In [1]:
import datetime as dt #Time
import os #Environment variables (Reddit credentials)
import re #regex
import time #Time

import networkx as nx #Network analysis
import pandas as pd #Data manipulation
import praw #Reddit Python wrapper


### Define functions

In [2]:
#Extract the /r/ subreddit mentions from a subreddit's description text
def getSubs(sub):
    """Return every '/r/<name>' mention in a subreddit's description as 'r/<name>'.

    Args:
        sub: subreddit name passed to ``reddit.subreddit`` (the module-level
            ``reddit`` client must already exist).

    Returns:
        List of strings of the form 'r/<name>', in the order matched,
        duplicates included.
    """
    description_text = reddit.subreddit(sub).description
    #Each match starts with '/'; slicing from index 1 turns '/r/name' into 'r/name'
    return [mention[1:] for mention in re.findall(r'/r/\w+', description_text)]

#Return sfw-only list of subreddits
def get_sfw_list(inp_list):
    """Filter a list of 'r/<name>' strings down to those not flagged over18.

    Results are remembered in the module-level lists so each subreddit is
    only looked up once:
      * ``checkedList`` - every name that has been looked up (or attempted)
      * ``sfwList``     - names whose ``over18`` flag was falsy
      * ``errorList``   - names whose lookup raised an exception

    Args:
        inp_list: iterable of subreddit names prefixed with 'r/'.

    Returns:
        List of the names from ``inp_list`` that are (or were previously
        found to be) not over18, in input order.
    """
    not_over18_list = []

    for x in inp_list:
        if x not in checkedList:
            checkedList.append(x)
            try:
                #x[2:] strips the leading 'r/' before the lookup
                sub = reddit.subreddit(x[2:])
                if not sub.over18:
                    not_over18_list.append(x)
                    sfwList.append(x)
            except Exception:
                #Best effort: record the failure and carry on, but let
                #KeyboardInterrupt/SystemExit propagate so the crawl can be stopped
                errorList.append(x)
        elif x in sfwList:
            #Already checked earlier and known to be sfw
            not_over18_list.append(x)
    return not_over18_list

#Lower-case every subreddit name and return them in sorted order
def lower_sort(inp_list):
    """Return a new sorted list holding the lower-cased items of ``inp_list``.

    The input list itself is left untouched.
    """
    return sorted(name.lower() for name in inp_list)

#Look up the prefixed display name of each subreddit
def get_display_name(inp_list):
    """Return ``display_name_prefixed`` for every 'r/<name>' entry in ``inp_list``.

    Each entry has its first two characters (the 'r/' prefix) stripped before
    being passed to the module-level ``reddit`` client.
    """
    return [reddit.subreddit(name[2:]).display_name_prefixed for name in inp_list]

#Tally subreddit occurrences into a dictionary
def addDict(dictionary, list_to_add):
    """Increment ``dictionary[item]`` once for every item in ``list_to_add``.

    Items not yet present start at 1. The dictionary is updated in place and
    nothing is returned.
    """
    for item in list_to_add:
        dictionary[item] = dictionary.get(item, 0) + 1

#Pair every linked subreddit with the subreddit that links to it
def createLinks(sub,subsList):
    """Return ``[(linked, sub), ...]`` with one tuple per entry of ``subsList``.

    The entry from ``subsList`` is the first element of each tuple and ``sub``
    is the second.
    """
    return [(linked, sub) for linked in subsList]

#Return list of tuples of linked subreddit pairs extending out from selected subreddit
def extendOut(tuplesList, sub, layer):
    """Collect the links within ``layer`` hops of 'r/<sub>'.

    Each pass adds the links that touch a subreddit reached in an earlier
    pass. The frontier is snapshotted at the start of every pass, so one pass
    extends exactly one hop regardless of the order of ``tuplesList``, and
    every link is returned at most once.

    Args:
        tuplesList: iterable of link pairs, indexable as ``x[0]`` / ``x[1]``.
        sub: subreddit name without the 'r/' prefix; the walk starts here.
        layer: number of hops to extend; 0 (or less) returns an empty list.

    Returns:
        List of links from ``tuplesList``, ordered by the hop at which they
        were found and then by their position in ``tuplesList``.
    """
    newList = []
    reached = {'r/'+sub}
    collected = set()
    depth = 0
    while depth < layer:
        #Only subreddits known before this pass may pull in new links
        frontier = set(reached)
        grew = False
        for x in tuplesList:
            key = tuple(x)
            if key in collected:
                continue
            if x[1] in frontier or x[0] in frontier:
                newList.append(x)
                collected.add(key)
                reached.add(x[0])
                reached.add(x[1])
                grew = True
        if not grew:
            #Nothing new is reachable; further passes would find nothing either
            break
        depth = depth+1
    return newList

#Crawl reddit from an initial sub, returning list of tuples of subreddit pairs and dictionary of subreddit counts
def crawlReddit(initial_sub, limit):
    """Crawl subreddit descriptions outward from ``initial_sub``.

    Starting from the subreddits mentioned in the description of
    ``initial_sub``, repeatedly fetches the description of each not yet
    searched subreddit and records the sfw subreddits it mentions. Stops once
    ``sub_dict`` holds at least ``limit`` subreddits, or when a full pass over
    the search list finds nothing left to search.

    Relies on the module-level ``errorList``, ``checkedList`` and ``sfwList``
    (appended to ``errorList`` here; all three are shown in the progress line).

    Args:
        initial_sub: subreddit name without the 'r/' prefix.
        limit: number of distinct subreddits to collect before stopping.

    Returns:
        Tuple ``(listoflinks, sub_dict)``: ``listoflinks`` is a list holding
        one list of ``(linked_sub, source_sub)`` tuples per searched
        subreddit, and ``sub_dict`` maps each found subreddit to the number
        of times it was mentioned.
    """
    root = 'r/'+initial_sub

    #Initial subs list
    subsList = get_sfw_list(lower_sort(getSubs(initial_sub)))
    sub_dict = {}
    listoflinks = [createLinks(root,subsList)]

    #Add initial subs list to dictionary
    addDict(sub_dict,subsList)

    #Create search list from dictionary
    searchList = list(sub_dict)
    searchedList = [root]

    start_time = time.time()

    i = 0

    while limit > len(sub_dict):
        attempted = False
        #The loop walks the list object bound at this point; searchList is
        #rebound (not mutated) inside the loop, so additions are picked up
        #on the next pass of the while loop
        for x in searchList:
            if len(sub_dict) >= limit:
                break
            if x in searchedList:
                continue

            attempted = True
            try:
                #get subs, lowercase sort, get sfw list
                subsList = get_sfw_list(lower_sort(getSubs(x[2:])))
                addDict(sub_dict,subsList)
                listoflinks.append(createLinks(x,subsList))

                searchList = list(set(searchList+subsList))
                searchedList.append(x)
                searchList = list(set(searchList)-set(searchedList))

            except Exception:
                #Best effort: remember the failing sub and drop it from the
                #search list; KeyboardInterrupt/SystemExit still propagate
                errorList.append(x)
                searchList = list(set(searchList)-set(errorList))

            end_time = time.time()
            time_past =  dt.timedelta(seconds=round(end_time-start_time))
            #searchedList always holds at least the initial sub, so this never divides by zero
            time_per_search = time_past/len(searchedList)
            time_to_go = len(searchList)*time_per_search
            est_total_time = time_past+time_to_go

            print('Search: '+str(len(searchList)),'Searched: '+str(len(searchedList)),
                  'Error: '+str(len(errorList)),'Checked: '+str(len(checkedList)),
                  'SFW: '+str(len(sfwList)),'Sub_dict: '+str(len(sub_dict)),'Itter: '+str(i),
                  'Time: '+str(time_past), 'Time-per-search: '+str(time_per_search),
                  'Time-to-go: '+str(time_to_go), 'Total Time: '+str(est_total_time), end="\r")
        i = i+1

        if not attempted:
            #Nothing left to search: the limit cannot be reached, so stop
            #instead of spinning forever
            break

    return listoflinks, sub_dict

#Flatten a list of link lists into one list, dropping self-referencing links
def flatten_tuples_list(inp_list):
    """Return every pair from the nested ``inp_list`` whose two ends differ.

    ``inp_list`` is a list of lists of pairs; pairs with ``pair[0] == pair[1]``
    (a subreddit linking to itself) are left out. Order is preserved.
    """
    return [pair for group in inp_list for pair in group if pair[0] != pair[1]]

                

### Crawl Reddit - return list of subreddit links (edges) and dictionary of subreddit counts (nodes)

In [3]:
#Access Reddit
#Credentials are read from the environment so secrets never have to be saved in the notebook.
#Both fall back to '' when the variable is unset, matching the previous blank placeholders.
client_secret = os.environ.get('REDDIT_CLIENT_SECRET', '')
client_id = os.environ.get('REDDIT_CLIENT_ID', '')

#Client used by the helper functions above through the module-level name `reddit`
reddit = praw.Reddit(client_id=client_id,
                     client_secret=client_secret,
                     user_agent='crawling subs')

In [4]:
#Define the module-level lists, each starting out empty
#(get_sfw_list and crawlReddit append to errorList / checkedList / sfwList)
subsList, searchList, searchedList = [], [], []
errorList, checkedList, sfwList = [], [], []
listoflinks = []

In [5]:
#Crawl Reddit outward from r/datascience until the 1000-subreddit limit is reached
#listoflinks: subreddit links, as lists of tuples
#sub_dict: dictionary of subreddit counts
listoflinks, sub_dict = crawlReddit(initial_sub='datascience', limit=1000)

Search: 788 Searched: 215 Error: 17 Checked: 1021 SFW: 1003 Sub_dict: 1003 Itter: 3 Time: 0:02:59 Time-per-search: 0:00:00.832558 Time-to-go: 0:10:56.055704 Total Time: 0:13:55.055704

### Create gexf (Graph Exchange XML Format) file for use in Gephi

In [6]:
#Merge the per-subreddit link lists into one flat list (self-referencing links dropped)
flattened_tuples_list = flatten_tuples_list(inp_list=listoflinks)

In [7]:
#Count how often each subreddit appears as the first element of a link tuple
list_for_dict = [link[0] for link in flattened_tuples_list]

tuple_dict = {}
addDict(tuple_dict,list_for_dict)

#Convert the counts into a list of (subreddit, count) pairs
tuple_dict = list(tuple_dict.items())


In [8]:
#Build an undirected networkx graph of the crawled subreddits
G = nx.Graph()

#One node per counted subreddit, with its count stored as the node weight
for node_name, node_count in tuple_dict:
    G.add_node(node_name,weight=node_count)

#One edge per subreddit link in the flattened list
G.add_edges_from(flattened_tuples_list)

#Export as gexf (Graph Exchange XML Format) for use in Gephi
nx.write_gexf(G, 'datascience.gexf')
