In [120]:
import json
import math
import os
import re
from functools import partial
from itertools import chain
from multiprocessing.pool import ThreadPool

import pandas
from neo4j import GraphDatabase

In [121]:
# subreddit whose graph is analysed; also selects the data/<subreddit>/ output directory
SUBREDDIT = 'investing'
# Read the Neo4j password from the NEO4J_PASSWORD environment variable so real credentials
# never have to be saved in the notebook; the literal is only a fallback for a local,
# throwaway database.
GRAPH_PASSWORD = os.environ.get('NEO4J_PASSWORD', 'test12345')

In [122]:
# number of worker threads used when querying the graph once per symbol
THREADS = 48

# per-vertex exports (one CSV per node label)
GRAPH_USERS_LOC = 'data/{}/graph_users.csv'.format(SUBREDDIT)
GRAPH_POSTS_LOC = 'data/{}/graph_posts.csv'.format(SUBREDDIT)

# per-community exports: aggregates inside a community and between communities
GRAPH_COMMUNITIES_INTERNAL_LOC = 'data/{}/graph_communities_internal.csv'.format(SUBREDDIT)
GRAPH_COMMUNITIES_EXTERNAL_LOC = 'data/{}/graph_communities_external.csv'.format(SUBREDDIT)

In [123]:
# one driver shared by every cell below; sessions are opened from it on demand
driver = GraphDatabase.driver(
    'bolt://localhost:7687',
    auth=('neo4j', GRAPH_PASSWORD),
)

In [132]:
# process the directed graph for communities and page rank, which should group nodes around single symbols
def directed_communities_and_page_rank(tx, weights=('depth_score', 'up_votes_with_depth', 'count')):
    """Project the whole graph and write Louvain communities and PageRank per weight.

    Any existing in-memory graph named 'analyze' is dropped first, then a new one is
    created from every node label and relationship type ('*', '*') carrying the given
    relationship properties. For each weight ``w`` the results are written back to the
    nodes as ``<w>_community_id`` (Louvain) and ``<w>_page_rank`` (PageRank).

    Parameters
    ----------
    tx
        Transaction-like object exposing ``run(query)``; this is what
        ``session.write_transaction`` passes in.
    weights : iterable of str, optional
        Relationship property names to use as weights. They are interpolated straight
        into the Cypher text, so they must be trusted property names, never user input.
        Defaults to the three weights this notebook analyses.

    Returns
    -------
    None
    """
    # materialise once: the names are serialised for the projection and looped over twice
    weights = list(weights)

    tx.run("""CALL gds.graph.drop('analyze', false) YIELD graphName;""")

    # create a new in memory graph with all vertices and edges to process for communities and page rank by a couple different weights
    # json.dumps renders the names as a double-quoted list literal inside the Cypher text
    tx.run(f"""
        CALL gds.graph.create(
            'analyze',
            '*',
            '*',
            {{relationshipProperties: {json.dumps(weights)}}}
        )
    """)

    # try it out with a couple of different weights
    for weight in weights:
        tx.run(f"""
            CALL gds.louvain.write('analyze', {{
                writeProperty: '{weight}_community_id', 
                tolerance: .00001,
                maxLevels: 40, 
                maxIterations: 200,
                relationshipWeightProperty: '{weight}' 
            }})
        """)

    # ditto for page rank
    for weight in weights:
        tx.run(f"""
            CALL gds.pageRank.write('analyze', {{
                writeProperty: '{weight}_page_rank', 
                maxIterations: 50,
                dampingFactor: 0.85,
                relationshipWeightProperty: '{weight}' 
            }})
        """)
        
# process the graph for communities but make symbol mentions undirected to group similar symbols
def undirected_communities(tx):
    """Write Louvain communities computed with 'Mentioned' relationships undirected.

    Drops any existing 'analyze' projection, re-creates it from Symbol/User/Post nodes
    with 'Mentioned' projected as UNDIRECTED (Post and Replied keep their default
    projection), then writes the depth_score-weighted Louvain result to the
    'undirected_community_id' node property.

    Parameters
    ----------
    tx
        Transaction-like object exposing ``run(query)``.
    """
    drop_projection = """CALL gds.graph.drop('analyze', false) YIELD graphName;"""

    create_projection = """
        CALL gds.graph.create(
            'analyze',
            ['Symbol', 'User', 'Post'],
            {
                Mentioned: {orientation: 'UNDIRECTED'},
                Post: {},
                Replied: {}
            },
            {relationshipProperties: ['depth_score']}
        )
    """

    write_communities = """
        CALL gds.louvain.write('analyze', { 
            writeProperty: 'undirected_community_id', 
            tolerance: .00001,
            maxLevels: 40, 
            maxIterations: 200,
            relationshipWeightProperty: 'depth_score'
        })
    """

    # order matters: the projection must exist before Louvain can run on it
    for statement in (drop_projection, create_projection, write_communities):
        tx.run(statement)
    
# run both analyses in order, each inside its own write transaction
with driver.session() as session:
    for _analysis in (directed_communities_and_page_rank, undirected_communities):
        session.write_transaction(_analysis)

In [133]:
# run page rank per community vs for the entire graph, didn't see major differences here
def inter_community_page_rank(community_id, tx):
    """Write PageRank computed only over the members of one community.

    Drops any existing 'inter' projection, builds a Cypher projection containing the
    nodes whose depth_score_community_id equals ``community_id`` and the relationships
    between them (weighted by depth_score), then writes the result to the
    'inter_community_page_rank' node property.

    Parameters
    ----------
    community_id
        Community identifier; it is formatted directly into the Cypher text.
    tx
        Transaction-like object exposing ``run(query)``.
    """
    # property map shared by every node pattern in the projection queries
    member = f'{{depth_score_community_id: {community_id}}}'

    drop_projection = """CALL gds.graph.drop('inter', false) YIELD graphName;"""

    create_projection = f"""
        CALL gds.graph.create.cypher(
            'inter',
            'MATCH (n {member}) RETURN id(n) AS id',
            'MATCH (a {member})-[r]->(b {member}) RETURN id(a) AS source, id(b) AS target, r.depth_score as depth_score'
        )
    """

    write_page_rank = """
        CALL gds.pageRank.write('inter', { 
            writeProperty: 'inter_community_page_rank', 
            maxIterations: 50,
            dampingFactor: 0.85,
            relationshipWeightProperty: 'depth_score'
        })
    """

    for statement in (drop_projection, create_projection, write_page_rank):
        tx.run(statement)

In [134]:
def extract_vertices(label, fields, tx):
    """Load the requested properties of every node with the given label into a DataFrame.

    Parameters
    ----------
    label : str
        Node label to match. It is formatted directly into the Cypher text, so it must
        be a trusted identifier.
    fields : sequence of str
        Property names to return; each becomes one column (formatted into the query the
        same way as ``label``).
    tx
        Transaction-like object whose ``run(query)`` yields records indexable by the
        returned column name (``'n.<field>'``).

    Returns
    -------
    pandas.DataFrame
        One row per matched node, one column per field. The columns are present even
        when no node matches, so callers can sort or select by column unconditionally.
    """
    projection = ', '.join(f'n.{field}' for field in fields)
    results = tx.run(f"""MATCH (n: {label}) RETURN {projection}""")

    rows = [{field: record[f'n.{field}'] for field in fields} for record in results]

    # Without explicit columns an empty result gives a column-less frame and the callers'
    # sort_values(by=...) raises KeyError. dict.fromkeys keeps the first occurrence of a
    # repeated field, matching the keys of the row dicts.
    return pandas.DataFrame(rows, columns=list(dict.fromkeys(fields)))

In [135]:
user_fields = [
    # properties stored on the User nodes
    'id', 'posts_up_votes', 'replied_to', 'symbols',

    # properties written by the community / page rank cell
    'depth_score_community_id', 'up_votes_with_depth_community_id',
    'depth_score_page_rank', 'up_votes_with_depth_page_rank',
    'undirected_community_id',
]

with driver.session() as session:
    users_overview = session.read_transaction(lambda tx: extract_vertices('User', user_fields, tx))

# highest ranked users first
users_overview \
    .sort_values(by='up_votes_with_depth_page_rank', ascending=False) \
    .to_csv(GRAPH_USERS_LOC, index=False, header=True)

In [137]:
post_fields = [
    # properties stored on the Post nodes
    'id', 'title', 'author_id', 'up_votes', 'sum_up_votes', 'replies', 'authors',

    # properties written by the community / page rank cell
    'depth_score_community_id', 'up_votes_with_depth_community_id',
    'depth_score_page_rank', 'up_votes_with_depth_page_rank',
    'undirected_community_id',
]

with driver.session() as session:
    posts_overview = session.read_transaction(lambda tx: extract_vertices('Post', post_fields, tx))

# highest ranked posts first
posts_overview \
    .sort_values(by='up_votes_with_depth_page_rank', ascending=False) \
    .to_csv(GRAPH_POSTS_LOC, index=False, header=True)

In [136]:
symbol_fields = [
    # properties stored on the Symbol nodes
    'id', 'authors', 'posts',

    # properties written by the community / page rank cell
    'depth_score_community_id', 'up_votes_with_depth_community_id',
    'depth_score_page_rank', 'up_votes_with_depth_page_rank',
    'undirected_community_id',
]

with driver.session() as session:
    symbols_overview = session.read_transaction(lambda tx: extract_vertices('Symbol', symbol_fields, tx))

# highest ranked symbols first; kept in memory because the community cell below iterates over it
symbols_overview = symbols_overview.sort_values(
    by='up_votes_with_depth_page_rank',
    ascending=False,
)
symbols_overview

Unnamed: 0,id,authors,posts,depth_score_community_id,up_votes_with_depth_community_id,depth_score_page_rank,up_votes_with_depth_page_rank,undirected_community_id
3679,TSLA,10462,1518,2007,2007,411.452977,437.618453,2007
3680,CEO,11561,1208,2008,2008,306.604773,411.560768,2008
3676,GME,7307,370,2004,2004,326.427324,398.522515,22792
3677,SPY,7499,1640,2005,2005,303.144145,326.200608,2002
3678,IPO,7663,804,2006,2006,249.940758,267.274954,2006
...,...,...,...,...,...,...,...,...
173,QFIN,1,1,57853,57853,0.150000,0.150000,115197
76,NGVC,1,1,57756,57756,0.150000,0.150000,115100
77,NUV,1,1,57757,57757,0.163821,0.150000,1994
367,MFIN,1,1,58047,58047,0.164646,0.150000,2002


In [138]:
def query_symbol_community(symbol_row):
    """Aggregate the edges inside, and leaving, the community a symbol belongs to.

    Opens its own session on the module-level ``driver`` and runs two read queries for
    the community identified by the row's ``depth_score_community_id``.

    Parameters
    ----------
    symbol_row : pandas.Series
        One row of the symbols overview; must contain ``depth_score_community_id``.
        The row itself is not modified.

    Returns
    -------
    tuple
        ``(symbol_row_with_internal, externalities)`` where

        * ``symbol_row_with_internal`` is a copy of ``symbol_row`` extended with
          ``sum_depth_score``, ``sum_up_votes_with_depth``, ``sum_relationships`` and
          ``sum_interactions`` aggregated over edges whose both endpoints are in the
          community;
        * ``externalities`` is a DataFrame with one row per neighbouring community whose
          ``sum_depth_score`` exceeds mean + one standard deviation of all neighbours,
          sorted by ``sum_depth_score`` descending. The two community ids of each row
          are stored in ascending order so the pair is direction-independent.
    """
    # Both queries match on depth_score_community_id, so the parameter must come from that
    # same column (it was previously taken from up_votes_with_depth_community_id, which
    # belongs to a different Louvain run).
    community = symbol_row['depth_score_community_id']

    external_columns = [
        'source_community_id',
        'target_community_id',
        'sum_depth_score',
        'sum_up_votes_with_depth',
        'sum_relationships',
        'sum_interactions',
    ]

    def _query(tx):
        # copy the initial row so we can add more data to it without worrying about threading issues
        symbol_row_with_internal = symbol_row.copy(deep=True)

        # aggregate internal edges
        # NOTE(review): the undirected pattern can match each relationship once per
        # orientation when both endpoints are in the community, so these sums may be
        # doubled - confirm whether that is intended.
        results_internal_edges = tx.run("""
            MATCH (ns {depth_score_community_id: $community})-[r]-(ne {depth_score_community_id: $community}) 
                RETURN DISTINCT sum(r.depth_score), sum(r.up_votes_with_depth), sum(r.count), count(*)
        """, community=community).single()

        # each output column reads its own aggregate (sum_depth_score used to copy up_votes_with_depth)
        symbol_row_with_internal['sum_depth_score'] = results_internal_edges['sum(r.depth_score)']
        symbol_row_with_internal['sum_up_votes_with_depth'] = results_internal_edges['sum(r.up_votes_with_depth)']
        symbol_row_with_internal['sum_relationships'] = results_internal_edges['sum(r.count)']
        symbol_row_with_internal['sum_interactions'] = results_internal_edges['count(*)']

        # aggregate edges to external communities
        results_external = tx.run("""
            MATCH (ns {depth_score_community_id: $community})-[r]-(ne) 
                WHERE ne.depth_score_community_id <> $community 
                RETURN DISTINCT ne.depth_score_community_id, sum(r.depth_score), sum(r.up_votes_with_depth), sum(r.count), count(*)
        """, community=community)

        rows = []
        for result in results_external:
            # we dont care about direction so we need a stable ordering
            community_ids = sorted([community, result['ne.depth_score_community_id']])
            rows.append([
                community_ids[0],
                community_ids[1],
                result['sum(r.depth_score)'],
                result['sum(r.up_votes_with_depth)'],
                result['sum(r.count)'],
                result['count(*)']
            ])

        externalities = pandas.DataFrame(rows, columns=external_columns)\
            .sort_values(by=['sum_depth_score'], ascending=False)

        # keep only the interesting stuff: neighbours more than one std above the mean.
        # An empty frame has nothing to filter and no meaningful statistics, so skip it.
        if not externalities.empty:
            scores = externalities['sum_depth_score']
            depth_score_filter = scores.std() + scores.mean()
            externalities = externalities[scores > depth_score_filter]

        return (
            symbol_row_with_internal,
            externalities
        )

    with driver.session() as session:
        return session.read_transaction(_query)
    
# fan the per-symbol queries out over the thread pool; every call opens its own session
with ThreadPool(THREADS) as pool:
    symbols_community = pool.map(
        query_symbol_community,
        [row for _, row in symbols_overview.iterrows()],
    )

# first element of each result: the symbol row extended with its internal aggregates
symbols_community_internal = pandas.DataFrame([internal for internal, _ in symbols_community])
symbols_community_internal.to_csv(GRAPH_COMMUNITIES_INTERNAL_LOC, index=False, header=True)

# second element of each result: the filtered edges to other communities
symbols_community_external = pandas.concat([external for _, external in symbols_community])
# remove dupes since we don't care about direction
symbols_community_external = symbols_community_external.drop_duplicates(
    subset=['source_community_id', 'target_community_id'],
)
symbols_community_external.to_csv(GRAPH_COMMUNITIES_EXTERNAL_LOC, index=False, header=True)

In [131]:
# close the driver created near the top of the notebook; run this last, since the
# cells above open their sessions on it
driver.close()