# The Echo Chamber Simulator - Graph Generator

### Setup

In [69]:
import sys
sys.path.append('../src')

import os
import csv
import time
import torch
import random

import pandas as pd
import networkx as nx

from data import DataPipeline
from generate.generate_synthetic_graph import GraphGenerator, parse_args
from graph import homophily

# Build the generator's argument namespace from an empty argument string and
# override the fields this notebook cares about.
args = parse_args("")
args.draw_user_graph = False
args.data_dir = 'dataset/processed/synth_polarization'

# Output directory is resolved relative to the parent of the current working
# directory.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_dir = os.path.join(parent_dir, args.data_dir)
# exist_ok=True removes the check-then-create race of the exists()/makedirs()
# pair and keeps the cell safe to re-run.
os.makedirs(data_dir, exist_ok=True)

args.min_retweets_per_tweet = 3
args.expected_retweets_per_user = 3

# Bare expression so the notebook displays the final namespace.
args

Namespace(add_argumentation_graph=False, center_skew=1.0, centralized=False, data_dir='dataset/processed/synth_polarization', draw_user_graph=False, edge_sample_ratio=1.0, expected_retweets_per_user=3, expected_tweets_per_user=5, expected_user_edges=30000, in_out_ratio=0.05, min_retweets_per_tweet=3, num_bins=1, num_partitions=3, num_users_per_partition=100, scenario=None, user_based=False)

In [70]:
# Single generator instance reused by the graph-building and edge-extraction
# cells that follow.
gg = GraphGenerator()

### Create User Graph

Number of members and gatekeepers per community

In [71]:
# Population of each community, keyed by community id. Both communities use
# the same sizes; every community gets its own inner dict so one can be edited
# without affecting the other.
community_dict = {
    community_id: {'members': 500, 'gatekeepers': 10}
    for community_id in (0, 1)
}

Average number of follows per user

In [72]:
# Average follow count per user role, keyed by community id. Both communities
# share the same values but hold separate inner dicts.
follow_dict = {
    community_id: {'members': 5, 'gatekeepers': 20}
    for community_id in (0, 1)
}

### Community Connection Probabilities

In [73]:
# Connection probabilities per community and user role. For every role, the
# entry whose inner key equals the outer community id is 0.8 and the other is
# 0.2 (presumably inner keys are the target community ids -- confirm against
# GraphGenerator.create_scenario_user_graph).
connection_dict = {
    community_id: {
        role: {
            other_id: 0.8 if other_id == community_id else 0.2
            for other_id in (0, 1)
        }
        for role in ('members', 'gatekeepers')
    }
    for community_id in (0, 1)
}

In [74]:
# Build the user graph from the three scenario dictionaries defined above.
user_graph = gg.create_scenario_user_graph(
    community_dict,
    follow_dict,
    connection_dict,
    args.center_skew,
    min_in_degree=args.min_retweets_per_tweet,
)

# Node view of the graph; its length is written to num_entities.csv later.
users = user_graph.nodes

# Save the user graph in GEXF format next to the other dataset files.
nx.write_gexf(
    user_graph,
    os.path.join(data_dir, "reduced_user_graph.gexf"),
)

create user graph: 100%|███████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  3.73it/s]
densify user graph: 100%|████████████████████████████████████████████████████████| 449/449 [00:00<00:00, 9324.46it/s]


### Create Twitter Graph

In [75]:
# Derive the twitter graph from the user graph using the tweet/retweet
# settings held in args.
twitter_graph = gg.create_twitter_graph(
    user_graph,
    args.expected_tweets_per_user,
    args.expected_retweets_per_user,
    args.min_retweets_per_tweet,
)

# Save the twitter graph in GEXF format.
nx.write_gexf(
    twitter_graph,
    os.path.join(data_dir, "reduced_twitter_graph.gexf"),
)

# Keep only nodes whose 'node_type' attribute is 'tweet', then randomise
# their order in place.
tweets = [
    node
    for node, attrs in twitter_graph.nodes(data=True)
    if attrs['node_type'] == 'tweet'
]
random.shuffle(tweets)

create twitter graph: 100%|███████████████████████████████████████████████████| 1020/1020 [00:00<00:00, 41964.85it/s]
densify twitter graph: 100%|█████████████████████████████████████████████████| 1020/1020 [00:00<00:00, 132353.36it/s]


### Create Dataset

In [76]:
# Accumulates every edge record that ends up in edges.csv.
edges = []

# Edges leaving tweet nodes become "tweet" records.
out_edges = twitter_graph.out_edges(tweets, data=True)
edges.extend(gg.get_tweet_edges(out_edges))

# Edges entering tweet nodes are shuffled and truncated to the configured
# fraction before being turned into "retweet" records.
in_edges = list(twitter_graph.in_edges(tweets, data=True))
random.shuffle(in_edges)
cutoff = int(len(in_edges) * args.edge_sample_ratio)
in_edges = in_edges[:cutoff]
edges.extend(gg.get_retweet_edges(in_edges))

# Follow chains come from the user graph rather than the twitter graph.
edges.extend(gg.get_follow_chains(user_graph))

fetching tweet edges: 100%|█████████████████████████████████████████████████| 4161/4161 [00:00<00:00, 1159403.37it/s]
fetching retweet edges: 100%|█████████████████████████████████████████████| 14773/14773 [00:00<00:00, 2259259.57it/s]


In [77]:
# Persist the edge list; with to_csv defaults the DataFrame index is written
# as the first CSV column.
df = pd.DataFrame(
    edges,
    columns=['head', 'tail', 'edge_type', 'query_type', 'head_type', 'tail_type', 'bin'],
)
df.to_csv(os.path.join(data_dir, "edges.csv"))

# Entity counts: a header row followed by a single row of totals.
with open(os.path.join(data_dir, "num_entities.csv"), 'w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',')
    csv_writer.writerows([
        ['user', 'tweet'],
        [len(users), len(tweets)],
    ])

# Edge-type table; the tuples are serialised through str(), e.g. "(0, 1)".
with open(os.path.join(data_dir, "edge_types.csv"), 'w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',')
    csv_writer.writerows([
        ['edge_type', 'edge_tuple'],
        ['retweet', (0,)],
        ['tweet', (1,)],
        ['follow', (0,1)],
    ])