# Load Edge List

In [2]:
import pandas as pd
import sys
sys.path.append('./src')

edge_list = pd.read_csv('../data/mergedEdgeList.csv')

In [36]:
edge_list.head()

Unnamed: 0.1,Unnamed: 0,source,source_type,target,target_type,edge_type,context
0,0,latinoamerica,r,latinoamerica,r,desc,
1,1,latinoamerica,r,argentina,r,desc,merica</li>\n<li>Images macros</li>\n<li>Posts...
2,2,latinoamerica,r,bolivia,r,desc,quot;Yo cuando...&quot;</li>\n</ul>\n\n<hr/>\n...
3,3,latinoamerica,r,brasil,r,desc,latinoamericanos</strong></p>\n\n<ul>\n<li><a ...
4,4,latinoamerica,r,chile,r,desc,"dit.com/r/argentina"">r/argentina</a></li>\n<li..."


In [9]:
edge_with_multi = edge_list[(edge_list.target_type == 'm') | (edge_list.source_type == 'm')]

# Validation

In [16]:
source_multis = edge_list[edge_list.source_type == 'm']['source'].unique()

In [15]:
target_multis = edge_list[edge_list.target_type == 'm']['target'].unique()

In [18]:
target_multis

array(['argentinahub', 'spanish_speaking', 'multi_br', 'chilehub',
       'musicamundo', 'musicalatina', 'aprendeotroidioma',
       'latinoamerica', 'nationalphotosubs', 'imaginaryexpanded',
       'breviario'], dtype=object)

In [19]:
source_multis

array(['argentinahub', 'multi_br', 'chilehub'], dtype=object)

# Subset to just the source multis

In [20]:
relevant = edge_list[edge_list.source.isin(source_multis) | edge_list.target.isin(source_multis)]

In [23]:
relevant.head()

Unnamed: 0.1,Unnamed: 0,source,source_type,target,target_type,edge_type,context
31,31,argentina,r,argentinahub,m,desc,""">ExplicaciÃ³n actualizada</a>. <a href=""http:..."
55,55,argentina,r,argentinahub,m,wiki,m/about/). [ExplicaciÃ³n actualizada](http://w...
140,140,brasil,r,multi_br,m,desc,"ubreddits na sidebar <a href=""https://www.redd..."
166,166,brasil,r,multi_br,m,wiki,ovos subreddits na sidebar [clicando aqui](htt...
237,237,chile,r,chilehub,m,desc,>: Registro pÃºblico de todas las acciones tom...


# Create Multi Hash

In [30]:
from collections import defaultdict

multi_to_reddits = dict()
for source in source_multis:
    multi_to_reddits[source] = relevant[(relevant.source == source) & (relevant.target_type == 'r')]['target'].unique()

In [31]:
multi_to_reddits

{'argentinahub': array(['bariloche', 'mendoza', 'fulbo', 'notargentina', 'linuxargentina',
        'argentos', 'argentinados', 'republicaargentina',
        'argentinacirclejerk', 'empleos_ar', 'fceyn', 'buenosairesbici',
        'river_plate', 'chachacha', 'argentinazi', 'corrientes',
        'argentinadrama', 'lgbt_de_argentina', 'argentinacirclejerk2',
        'argenbitcoin', 'expatriados', 'argentina', 'buenosaires', 'arggw',
        'rosario', '678', 'argenbeauty', 'mercadoreddit', 'arautos',
        'dankgentina', 'roleros_argentina', 'argaming', 'notacj',
        'argenpics', 'cordoba', 'musicaargentina', 'argentinacocina'],
       dtype=object),
 'multi_br': array(['brugal', 'youtubebrasil', 'parana', 'ajs_br',
        'leagueoflegendsbrazil', 'corinthians', 'palmeiras',
        'territoriolivre', 'jogatina', 'riodejaneiro', 'carreiras',
        'monarquia', 'foradecasa', 'libertarianismo', 'batepapo',
        'musicanova', 'gambiarra', 'brasilball', 'medoiosoio',
        'cine

# Create New Edges

In [32]:
edges_to_source_multis = relevant[relevant.target.isin(source_multis)]

In [43]:
import numpy as np

acc = []

def create_new_edges(source, edge_type, context, multi, multi_hash):
    targets = multi_hash[multi]
    for t in targets:
        acc.append([source, 'r', t, 'r', f'{edge_type}-multi', context, multi])

In [44]:
edges_to_source_multis.apply(lambda row: create_new_edges(row.source, row.edge_type, row.context, row.target, multi_to_reddits), axis=1)

31      None
55      None
140     None
166     None
237     None
266     None
1305    None
1445    None
1500    None
dtype: object

In [46]:
new_edges = pd.DataFrame(acc, columns=['source', 'source_type', 'target', 'target_type', 'edge_type', 'context', 'multi_name'])

# Prep Edge List

In [60]:
# edge_list.drop(columns='Unnamed: 0', inplace=True)
valid_edges = edge_list[(edge_list.source_type == 'r') & (edge_list.target_type == 'r')]

# Append New Edges

In [63]:
full_edge_list = valid_edges.append(new_edges, ignore_index=True, sort=False)

In [64]:
print(valid_edges.shape)
print(new_edges.shape)
print(full_edge_list.shape)

(3233, 6)
(434, 7)
(3667, 7)


# Saved Final Edge List

In [65]:
full_edge_list.to_csv('../data/finalEdgeList.csv')