In [None]:
from pymongo import MongoClient
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import networkx as nx
from sympy import degree
from tqdm import tqdm
import math
import seaborn as sns
import powerlaw as plw
import datetime
import plotly.graph_objects as go
from reddit import plotting, network
from reddit import datahandling as dh
%load_ext autoreload
%autoreload 2

In [None]:
# Connection to the MongoDB server on localhost; `db` is its 'reddit' database.
client = MongoClient(host='localhost', port=27017)
db = client['reddit']
# Default time window as Unix timestamps in seconds.
start = 1577836800  # 2020-01-01 00:00:00 UTC
end = 1580515200    # 2020-02-01 00:00:00 UTC


def getUTC(date):
    """
    Convert a datetime to a Unix timestamp, reading its fields as UTC.

    Args:
        date: datetime.datetime; any tzinfo it carries is replaced by UTC
            (the wall-clock fields are kept as they are, not converted)
    Returns:
        float, seconds since 1970-01-01 00:00:00 UTC
    """
    as_utc = date.replace(tzinfo=datetime.timezone.utc)
    return as_utc.timestamp()

def getPipeline(subreddit, start, end, limit=100000):
    """
    Build the aggregation pipeline that selects one subreddit's posts
    inside a time window.

    Args:
        subreddit: value the 'subreddit' field has to equal
        start: exclusive lower bound for 'created_utc'
        end: exclusive upper bound for 'created_utc'
        limit: accepted but currently unused - no '$limit' stage is emitted
    Returns:
        list of four stages: drop '_id'/'link_id'/'score', match the
        subreddit, time window and non-deleted authors, drop 'subreddit',
        sort ascending by 'id'
    """
    drop_unused_fields = {'$project': {'_id': 0, 'link_id': 0, 'score': 0}}
    time_window = {'$gt': start, '$lt': end}
    select_posts = {
        '$match': {
            'subreddit': subreddit,
            'created_utc': time_window,
            'author': {'$ne': '[deleted]'},
        }
    }
    drop_subreddit = {'$project': {'subreddit': 0}}
    order_by_id = {'$sort': {'id': 1}}
    return [drop_unused_fields, select_posts, drop_subreddit, order_by_id]

def getDataframe(db, subreddit, start, end, which='both'):
    """
    Loads data from given subreddit from database and packs into dataframe

    Args:
        db: database handle; db[which] must provide an aggregate() method
        subreddit: subreddit name, handed to getPipeline as a string
        start: lower bound for 'created_utc', handed to getPipeline
        end: upper bound for 'created_utc', handed to getPipeline
        which: collection to read ('submissions' or 'comments'), or 'both'
            to read the two and concatenate them (submissions first)
    Returns:
        Dataframe for given parameters with 'id' and 'parent_id' as string
        columns and the 't3_' / 't1_' markers removed from 'parent_id'
    """
    if which == 'both':
        # Forward every argument: the recursive calls used to omit db, start
        # and end, which made this branch fail with a TypeError.
        df_sub = getDataframe(db, subreddit, start, end, which='submissions')
        df_com = getDataframe(db, subreddit, start, end, which='comments')
        # NOTE: the concatenated rows are not re-sorted and keep their
        # original (now repeating) index labels.
        df = pd.concat([df_sub, df_com])
    else:
        collection = db[which]
        cursor = collection.aggregate(getPipeline(str(subreddit), start, end), allowDiskUse=True)
        df = pd.DataFrame(list(cursor))
        df = df.drop(
            columns=['subreddit', 'score', 'link_id', '_id', 'created_utc', 'num_comments', 'domain'],
            errors='ignore',
        )
        if which == 'submissions':
            # An empty parent_id is inserted at position 2 - presumably so
            # the column layout lines up with the comments frame.
            df.insert(2, 'parent_id', '')

    df['id'] = df['id'].astype('string')
    parent_id = df['parent_id'].astype('string')
    # Strip the type markers as literal text, not as regular expressions.
    for prefix in ('t3_', 't1_'):
        parent_id = parent_id.str.replace(prefix, '', regex=False)
    df['parent_id'] = parent_id
    return df
    
def getMyParents(df):
    """
    Give every author a numeric id and look up the id of the author each
    post replies to.

    Args:
        df: DataFrame with the columns 'author', 'id' and 'parent_id'
            ('parent_id' must use the same form as 'id', i.e. without type
            prefix). The frame is left unmodified and does not have to be
            sorted.
    Returns:
        numpy array holding the columns of df followed by 'user_id' (group
        number of the author) and 'parent_user_id' (user_id of the author of
        the post named by 'parent_id', or -1 if that post is not in df)
    """
    user_id = df.groupby('author').ngroup()    ##Add user id

    # Map post id -> author id. setdefault keeps the first occurrence if an
    # id should appear more than once.
    author_of_post = {}
    for post_id, uid in zip(df['id'].tolist(), user_id.tolist()):
        author_of_post.setdefault(post_id, uid)

    parent_user_id = [author_of_post.get(parent, -1) for parent in df['parent_id'].tolist()]

    # assign() works on a copy, so the caller's frame keeps its columns.
    result = df.assign(user_id=user_id, parent_user_id=parent_user_id)
    return result.to_numpy()

def getData(db,subreddit,start,end,which,save=True):
    """
    Return the parent-resolved comment array of a subreddit, using an
    on-disk cache when one exists.

    Args:
        db: database handle passed on to getDataframe
        subreddit: subreddit name; also determines the cache file name
        start: start of the time window, passed on to getDataframe
        end: end of the time window, passed on to getDataframe
        which: currently unused - the 'comments' collection is always read
            (the cache file name does not encode the collection either)
        save: if True, write a freshly computed array to the cache
    Returns:
        numpy array produced by getMyParents, or None if the data could not
        be loaded from the database (a warning describes the reason)
    """
    import os
    import warnings

    filename = '{}_2020.npy'.format(subreddit)
    save_dir = './top100/first100000'
    save_path = '{}/{}'.format(save_dir, filename)
    # Arrays used to be read from this directory only while being written to
    # save_dir, so a saved result was never found again. Both are checked
    # now; the old read location keeps its precedence.
    legacy_path = './top100/first1000000/{}'.format(filename)

    for path in (legacy_path, save_path):
        try:
            # NOTE(security): allow_pickle=True unpickles arbitrary objects;
            # only point this at cache files this notebook wrote itself.
            return np.load(path, allow_pickle=True)
        except Exception:
            # missing or unreadable cache file: try the next location
            continue

    try:
        df = getDataframe(db,subreddit,start,end,'comments')
        data = getMyParents(df)
        del(df)
    except Exception as err:
        warnings.warn('Could not load data for {}: {!r}'.format(subreddit, err))
        return None

    if save:
        try:
            os.makedirs(save_dir, exist_ok=True)
            np.save(save_path, data)
        except Exception as err:
            # Caching is best effort: keep the computed data even if it
            # cannot be written.
            warnings.warn('Could not cache data for {}: {!r}'.format(subreddit, err))
    return data


In [None]:
## ToDo: export to library
def getTop(top, sortby = None):
    """
    Fetch the '_id' values of the first `top` documents of
    reddit_statistics.subreddit_submissions, ordered descending by `sortby`.

    Uses the notebook-level `client` connection.

    Args:
        top: number of documents to return
        sortby: name of the field to sort by (required)
    Returns:
        numpy array built from the resulting documents
    Raises:
        NameError: if sortby is None
    """
    if sortby is None:
        raise NameError('No argument to sort by')
    collection = client.reddit_statistics.subreddit_submissions
    documents = list(collection.aggregate([
        {'$sort': {sortby: -1}},
        {'$limit': top},
        {'$project': {'_id': 1}},
    ]))
    return pd.DataFrame(documents).to_numpy()

def getTopNum(top, sortby = None):
    """
    Like getTop, but the projection also requests the 'comments' field of
    the first `top` documents of reddit_statistics.subreddit_submissions.

    Uses the notebook-level `client` connection.

    Args:
        top: number of documents to return
        sortby: name of the field to sort by, descending (required)
    Returns:
        numpy array built from the resulting documents
    Raises:
        NameError: if sortby is None
    """
    if sortby is None:
        raise NameError('No argument to sort by')
    collection = client.reddit_statistics.subreddit_submissions
    documents = list(collection.aggregate([
        {'$sort': {sortby: -1}},
        {'$limit': top},
        # NOTE(review): '_id': -1 is non-zero, which presumably acts as an
        # inclusion flag just like 1 - confirm against the server version.
        {'$project': {'comments': 1, '_id': -1}},
    ]))
    return pd.DataFrame(documents).to_numpy()


In [None]:
num_subreddits = 50
subreddits = dh.getTop(client.reddit_statistics, num_subreddits, 'comments') #topsubreddits[:,0]
bots = np.load('bots.npy')
active_users = []      # per subreddit: authors within the most active 1%
subreddit_users = []   # per subreddit: number of distinct authors
start = getUTC(datetime.datetime(2020,1,1))
end = getUTC(datetime.datetime(2020,1,14))
# Wrapping the iterable (not the enumerate object) lets tqdm show the total.
for idx, subreddit in enumerate(tqdm(subreddits)):
    data = getData(db,subreddit.item(),start,end,'comments')
    if data is None:
        active_users.append([])
        subreddit_users.append(0)
    else:
        # Column 0 of the array holds the author name.
        user_list, count = np.unique(data[:,0], return_counts=True)
        # Order authors by DESCENDING comment count. The previous ascending
        # argsort put the least active authors first, so the slice below
        # picked the bottom 1% instead of the top 1%. The stable sort keeps
        # ties in np.unique's (sorted) order, making the cut deterministic.
        user_list = user_list[np.argsort(-count, kind='stable')]
        active_users.append(user_list[0:int(user_list.shape[0]*0.01)]) #Most active 1%
        subreddit_users.append(user_list.shape[0])


In [None]:
# Directed graph with one node per subreddit, labelled 0 .. num_subreddits-1.
subreddit_graph = nx.DiGraph()
subreddit_graph.add_nodes_from(np.arange(num_subreddits))

# Keep only those bot names that actually occur among the active users.
all_active = np.unique(np.concatenate(active_users).ravel())
bots = np.intersect1d(all_active, bots)

#Clear active users from bots
active_users[:] = [np.setdiff1d(au, bots) for au in active_users]

# Edge i -> j when the number of shared active users exceeds 1% of the
# active users of subreddit i; the edge weight is that number.
for i in range(num_subreddits):
    for j in range(num_subreddits):
        if i == j:
            continue
        weight = np.intersect1d(active_users[i], active_users[j]).shape[0]
        if weight > np.shape(active_users[i])[0]*0.01:
            subreddit_graph.add_edge(i,j, weight=weight)

# Edge weights scaled by 0.4, in the order the graph reports its edges.
weights = [w*0.4 for _, _, w in subreddit_graph.edges(data='weight')]