In [10]:
import pyreadstat
import pandas as pd
import numpy as np
import networkx as nx
import warnings
import itertools
from tqdm import tqdm

In [21]:
# Read the SPSS .sav file; read_sav returns the data frame plus a metadata object.
df, meta = pyreadstat.read_sav("../data/raw/OD_2017.sav")
# Keep the first row of every (zona, zona_esc, zonatra1, id_pess) combination.
df = df.drop_duplicates(subset=['zona', 'zona_esc', 'zonatra1', 'id_pess'])

In [22]:
# Keep only rows whose `muni_dom` code equals 36.
# NOTE(review): presumably 36 is the municipality of interest -- confirm against the survey codebook.
df = df.loc[df['muni_dom'].eq(36)]
# Largest zone number present in the filtered rows; used later as an upper bound for valid zones.
max_zona = int(df['zona'].max())

In [30]:
# Select the zone/identifier columns and give them English names.
trab_esc = (
    df[['zona', 'zona_esc', 'zonatra1', 'id_pess', 'id_dom']]
    .copy()
    .rename(columns={
        'zona': 'home_zone',
        'zona_esc': 'school',
        'zonatra1': 'work',
        'id_pess': 'id',
        'id_dom': 'house_id',
    })
)

# Drop rows where any zone exceeds max_zona. The test is written as "not greater than"
# (rather than "<=") on purpose: a missing zone compares False and is therefore kept.
trab_esc = trab_esc[
    ~(
        (trab_esc['work'] > max_zona)
        | (trab_esc['school'] > max_zona)
        | (trab_esc['home_zone'] > max_zona)
    )
].reset_index(drop=True)

In [40]:
def add_person_to_graph(G, person):
    """Add one person as a node of ``G``.

    Parameters
    ----------
    G : graph-like
        Object exposing ``add_node(node, **attributes)``.
    person : mapping
        Must provide the keys ``id``, ``work``, ``school`` and ``home_zone``.

    The node is keyed by ``person['id']`` and carries the attributes
    ``work``, ``school`` and ``home`` (taken from ``home_zone``).
    """
    node_id = person['id']
    attributes = {
        'work': person['work'],
        'school': person['school'],
        'home': person['home_zone'],
    }
    G.add_node(node_id, **attributes)

def add_people_to_graph(G, df):
    """Add every row of ``df`` to ``G`` as a person node, then print a summary.

    Each row is handed to ``add_person_to_graph``. Afterwards the function
    prints a header line, the node count, the edge count and a separator.
    """
    def _register(row):
        # Row-wise callback for DataFrame.apply; the graph is captured from the enclosing scope.
        return add_person_to_graph(G, row)

    df.apply(_register, axis=1)

    report = ('Adicionando Pessoas', len(G.nodes), len(G.edges), '*' * 50)
    for line in report:
        print(line)

In [32]:


def add_houses_to_graph(G, trab_esc):
    """Connect every pair of people that share a household with a 'house' edge.

    Parameters
    ----------
    G : graph-like
        Graph that receives the edges through ``add_edge``.
    trab_esc : pandas.DataFrame
        Must provide the columns ``house_id``, ``home_zone`` and ``id``.

    Each edge is tagged with the most frequent ``home_zone`` among the rows of
    its household. Households with a single row produce no edge. Rows whose
    ``house_id`` is missing are skipped (groupby drops missing keys).

    Raises
    ------
    IndexError
        If a household with two or more rows has no non-missing ``home_zone``.
    """
    # One groupby pass replaces a full-frame boolean filter per household.
    # sort=False yields households in order of first appearance, which is the
    # order a loop over unique() would visit them, so edge insertion order is kept.
    households = trab_esc.groupby('house_id', sort=False)
    for _, members in tqdm(households):
        if len(members) < 2:
            continue
        zone = members['home_zone'].value_counts().index[0]
        for p1, p2 in itertools.combinations(members['id'].values, 2):
            add_edge(G, p1, p2, 'house', zone)

    print('Adicionando Vertices de Casas')
    print(len(G.nodes))
    print(len(G.edges))
    print(50*'*')

In [33]:
def add_work_to_graph(G, trab_esc):
    """Add 'work' edges between a random sample of people sharing a work zone.

    For every non-missing value of ``trab_esc['work']`` all pairs of ``id``
    values in that zone are enumerated, and ``int(0.05 * number_of_pairs)``
    pair indices are drawn with ``np.random.choice`` (its default draws with
    replacement, so the same pair can be picked more than once). Each drawn
    pair becomes an edge tagged with the zone. Finally the node count, edge
    count and a separator line are printed.
    """
    work_zones = trab_esc['work'].unique()
    for w in tqdm(work_zones):
        if pd.isna(w):
            continue

        coworkers = trab_esc.loc[trab_esc['work'] == w]
        zone = coworkers['work'].value_counts().index[0]
        if len(coworkers) <= 1:
            continue

        # Materialise all pairs as an array so they can be fancy-indexed by the sampled positions.
        all_pairs = np.array(list(itertools.combinations(coworkers['id'].values, 2)))
        n_pairs = int(len(all_pairs))
        n_sampled = int(n_pairs*.05)
        chosen = np.random.choice(n_pairs, size=n_sampled)
        for p1, p2 in all_pairs[chosen]:
            add_edge(G, p1, p2, 'work', zone)

    for line in (len(G.nodes), len(G.edges), '*' * 50):
        print(line)

In [34]:
def add_schools_to_graph(G, trab_esc):
    """Add 'school' edges between a random sample of people sharing a school zone.

    For every non-missing value of ``trab_esc['school']`` all pairs of ``id``
    values in that zone are enumerated, and ``int(0.25 * number_of_pairs)``
    pair indices are drawn with ``np.random.choice`` (its default draws with
    replacement, so the same pair can be picked more than once). Each drawn
    pair becomes an edge tagged with the zone. Finally the node count, edge
    count and a separator line are printed.
    """
    school_zones = trab_esc['school'].unique()
    for s in tqdm(school_zones):
        if pd.isna(s):
            continue

        classmates = trab_esc.loc[trab_esc['school'] == s]
        zone = classmates['school'].value_counts().index[0]
        if len(classmates) <= 1:
            continue

        # Materialise all pairs as an array so they can be fancy-indexed by the sampled positions.
        all_pairs = np.array(list(itertools.combinations(classmates['id'].values, 2)))
        n_pairs = int(len(all_pairs))
        n_sampled = int(n_pairs*.25)
        chosen = np.random.choice(n_pairs, size=n_sampled)
        for p1, p2 in all_pairs[chosen]:
            add_edge(G, p1, p2, 'school', zone)

    for line in (len(G.nodes), len(G.edges), '*' * 50):
        print(line)

In [37]:
def create_graph(trab_esc):
    """Build an undirected graph of people from ``trab_esc``.

    Nodes are added first, followed by household, work and school edges, in
    that order. Returns the populated ``networkx.Graph``.
    """
    graph = nx.Graph()

    # Nodes first, so every person exists before any edge layer is added.
    add_people_to_graph(graph, trab_esc)

    # Edge layers.
    add_houses_to_graph(graph, trab_esc)
    add_work_to_graph(graph, trab_esc)
    add_schools_to_graph(graph, trab_esc)

    return graph
 
    
# Build the graph three times from the same table; the work/school edges are sampled
# randomly, so the three results are not expected to be identical.
gs = [create_graph(trab_esc) for _ in range(3)]
#nx.write_gpickle(G, 'Grafo_Zonas_SP.gpickle')

  0%|          | 22/21708 [00:00<01:38, 219.73it/s]

Adicionando Pessoas
55492
0
**************************************************


100%|██████████| 21708/21708 [01:44<00:00, 208.10it/s]
  1%|          | 3/339 [00:00<00:13, 25.38it/s]

Adicionando Vertices de Casas
55492
61634
**************************************************


100%|██████████| 339/339 [00:01<00:00, 273.92it/s]
  5%|▌         | 17/336 [00:00<00:01, 167.87it/s]

55492
155840
**************************************************


100%|██████████| 336/336 [00:00<00:00, 464.03it/s]


55492
253546
**************************************************


  0%|          | 21/21708 [00:00<01:43, 209.40it/s]

Adicionando Pessoas
55492
0
**************************************************


100%|██████████| 21708/21708 [01:43<00:00, 209.50it/s]
  2%|▏         | 8/339 [00:00<00:04, 77.56it/s]

Adicionando Vertices de Casas
55492
61634
**************************************************


100%|██████████| 339/339 [00:01<00:00, 286.76it/s]
  5%|▍         | 16/336 [00:00<00:02, 158.30it/s]

55492
155695
**************************************************


100%|██████████| 336/336 [00:00<00:00, 458.57it/s]


55492
253574
**************************************************


  0%|          | 23/21708 [00:00<01:35, 226.54it/s]

Adicionando Pessoas
55492
0
**************************************************


100%|██████████| 21708/21708 [01:43<00:00, 208.99it/s]
  2%|▏         | 8/339 [00:00<00:04, 79.47it/s]

Adicionando Vertices de Casas
55492
61634
**************************************************


100%|██████████| 339/339 [00:01<00:00, 263.76it/s]
  5%|▌         | 17/336 [00:00<00:01, 167.74it/s]

55492
155818
**************************************************


100%|██████████| 336/336 [00:00<00:00, 464.48it/s]

55492
253324
**************************************************





In [38]:
# Display the edges of the first graph together with their attribute dictionaries.
gs[0].edges(data=True)

<networkx.classes.graph.Graph at 0x7fd63c7c28d0>

In [45]:
def add_edge(G, person1, person2, edge_type, edge_zone):
    """Add an edge between two people, labelled with its type and zone.

    The edge is stored with the attributes ``edge_type`` and ``zone``.
    """
    attributes = {'edge_type': edge_type, 'zone': edge_zone}
    G.add_edge(person1, person2, **attributes)

def add_relation_to_graph(G, df, relation, rewire_chance=1):
    """Add edges between people that share the same value in ``df[relation]``.

    Parameters
    ----------
    G : graph-like
        Graph that receives the edges through ``add_edge``.
    df : pandas.DataFrame
        Must provide the column named by ``relation`` and an ``id`` column.
    relation : str
        Column to group people by; also stored as the ``edge_type`` of each edge.
    rewire_chance : float, default 1
        When below 1, only ``int(number_of_pairs * rewire_chance)`` pair indices
        are drawn per group with ``np.random.choice``. When 1 or more, every
        pair in the group is connected.

    After processing all groups, the node count, edge count and a separator
    line are printed.
    """
    for rel in tqdm(df[relation].unique()):
        if pd.isna(rel):
            continue

        members = df[df[relation] == rel]
        if len(members) < 2:
            continue
        zone = members[relation].value_counts().index[0]

        combinations = np.array(list(itertools.combinations(members['id'].values, 2)))
        if rewire_chance < 1:
            size_combinations = int(len(combinations))
            size_sample = int(size_combinations*rewire_chance)
            # NOTE(review): np.random.choice draws with replacement by default, so a pair
            # can be selected more than once -- confirm this is the intended sampling.
            index_combs = np.random.choice(size_combinations, size=size_sample)
            final_combs = combinations[index_combs]
        else:
            # No sampling requested: connect every pair. Without this branch the
            # default rewire_chance=1 left final_combs unassigned.
            final_combs = combinations

        for p1, p2 in final_combs:
            add_edge(G, p1, p2, relation, zone)

    print(len(G.nodes))
    print(len(G.edges))
    print(50*'*')
    
def add_houses_to_graph(G, df):
    """Connect every pair of people that share a household with a 'house' edge.

    Parameters
    ----------
    G : graph-like
        Graph that receives the edges through ``add_edge``.
    df : pandas.DataFrame
        Must provide the columns ``house_id``, ``home_zone`` and ``id``.

    Each edge is tagged with the most frequent ``home_zone`` among the rows of
    its household. Households with a single row produce no edge. Rows whose
    ``house_id`` is missing are skipped (groupby drops missing keys).

    Raises
    ------
    IndexError
        If a household with two or more rows has no non-missing ``home_zone``.
    """
    # One groupby pass replaces a full-frame boolean filter per household.
    # sort=False yields households in order of first appearance, which is the
    # order a loop over unique() would visit them, so edge insertion order is kept.
    households = df.groupby('house_id', sort=False)
    for _, members in tqdm(households):
        if len(members) < 2:
            continue
        zone = members['home_zone'].value_counts().index[0]
        for p1, p2 in itertools.combinations(members['id'].values, 2):
            add_edge(G, p1, p2, 'house', zone)

    print('Adicionando Vertices de Casas')
    print(len(G.nodes))
    print(len(G.edges))
    print(50*'*')
    
def add_work_to_graph(G, df, rewire_chance=0.05):
    """Add 'work' edges by delegating to ``add_relation_to_graph``.

    ``rewire_chance`` is forwarded unchanged; the delegate's return value is returned.
    """
    return add_relation_to_graph(G, df, relation='work', rewire_chance=rewire_chance)


def add_school_to_graph(G, df, rewire_chance=0.25):
    """Add 'school' edges by delegating to ``add_relation_to_graph``.

    ``rewire_chance`` is forwarded unchanged; the delegate's return value is returned.
    """
    # The relation must be 'school'; passing 'work' here would build a second
    # layer of work edges and label them as such.
    return add_relation_to_graph(G, df, 'school', rewire_chance)

def create_graph(df, seed=None):
    """Build an undirected graph of people from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table passed to each of the node/edge builder functions.
    seed : int or None, default None
        Passed to ``np.random.seed`` before any edges are sampled, so the
        same seed reproduces the same sampled edges.

    Returns the populated ``networkx.Graph``.
    """
    # Seeds NumPy's global generator, which the edge-sampling helpers draw from.
    np.random.seed(seed)

    graph = nx.Graph()

    # Nodes first, so every person exists before any edge layer is added.
    add_people_to_graph(graph, df)

    # Edge layers.
    add_houses_to_graph(graph, df)
    add_work_to_graph(graph, df)
    add_schools_to_graph(graph, df)

    return graph

In [46]:
# Three builds without a seed, to compare how much the sampled edges vary between runs.
GS = [create_graph(trab_esc) for _ in range(3)]

  0%|          | 23/21708 [00:00<01:37, 222.34it/s]

Adicionando Pessoas
55492
0
**************************************************


100%|██████████| 21708/21708 [01:42<00:00, 210.77it/s]
  2%|▏         | 7/339 [00:00<00:04, 67.79it/s]

Adicionando Vertices de Casas
55492
61634
**************************************************


100%|██████████| 339/339 [00:01<00:00, 272.50it/s]
  4%|▍         | 13/336 [00:00<00:02, 128.93it/s]

55492
155885
**************************************************


100%|██████████| 336/336 [00:00<00:00, 450.15it/s]


55492
253670
**************************************************


  0%|          | 45/21708 [00:00<01:39, 217.73it/s]

Adicionando Pessoas
55492
0
**************************************************


100%|██████████| 21708/21708 [01:42<00:00, 211.46it/s]
  2%|▏         | 7/339 [00:00<00:04, 67.60it/s]

Adicionando Vertices de Casas
55492
61634
**************************************************


100%|██████████| 339/339 [00:01<00:00, 241.91it/s]
  4%|▍         | 14/336 [00:00<00:02, 139.52it/s]

55492
155803
**************************************************


100%|██████████| 336/336 [00:00<00:00, 453.03it/s]


55492
253545
**************************************************


  0%|          | 20/21708 [00:00<01:51, 193.81it/s]

Adicionando Pessoas
55492
0
**************************************************


100%|██████████| 21708/21708 [01:42<00:00, 211.56it/s]
  2%|▏         | 7/339 [00:00<00:04, 68.48it/s]

Adicionando Vertices de Casas
55492
61634
**************************************************


100%|██████████| 339/339 [00:01<00:00, 277.72it/s]
  4%|▍         | 14/336 [00:00<00:02, 138.16it/s]

55492
155725
**************************************************


100%|██████████| 336/336 [00:00<00:00, 450.10it/s]

55492
253536
**************************************************





In [56]:
# Without a seed, two builds differ: spot-check the edge at position 170 of each graph.
# This compares a single edge only; it is not a full graph comparison.
print(list(GS[0].edges(data=True))[170])
print(list(GS[1].edges(data=True))[170])
print(list(GS[0].edges(data=True))[170] == list(GS[1].edges(data=True))[170])

('00010009102', '00020046103', {'edge_type': 'school', 'zone': 23.0})
('00010009102', '01801479101', {'edge_type': 'school', 'zone': 23.0})
False


In [60]:
# Three builds that all use the same seed, to check that seeding makes the result repeatable.
GS_seeded = [create_graph(trab_esc, seed=420) for _ in range(3)]

  0%|          | 45/21708 [00:00<01:39, 216.92it/s]

Adicionando Pessoas
55492
0
**************************************************


100%|██████████| 21708/21708 [01:41<00:00, 213.64it/s]
  2%|▏         | 7/339 [00:00<00:04, 69.06it/s]

Adicionando Vertices de Casas
55492
61634
**************************************************


100%|██████████| 339/339 [00:01<00:00, 285.01it/s]
  5%|▍         | 16/336 [00:00<00:02, 156.94it/s]

55492
155855
**************************************************


100%|██████████| 336/336 [00:00<00:00, 458.85it/s]


55492
253580
**************************************************


  0%|          | 46/21708 [00:00<01:37, 223.10it/s]

Adicionando Pessoas
55492
0
**************************************************


100%|██████████| 21708/21708 [01:41<00:00, 213.51it/s]
  2%|▏         | 7/339 [00:00<00:04, 66.75it/s]

Adicionando Vertices de Casas
55492
61634
**************************************************


100%|██████████| 339/339 [00:01<00:00, 277.82it/s]
  4%|▍         | 13/336 [00:00<00:02, 130.00it/s]

55492
155855
**************************************************


100%|██████████| 336/336 [00:00<00:00, 448.34it/s]


55492
253580
**************************************************


  0%|          | 23/21708 [00:00<01:35, 227.65it/s]

Adicionando Pessoas
55492
0
**************************************************


100%|██████████| 21708/21708 [01:42<00:00, 211.09it/s]
  2%|▏         | 7/339 [00:00<00:04, 66.84it/s]

Adicionando Vertices de Casas
55492
61634
**************************************************


100%|██████████| 339/339 [00:01<00:00, 276.63it/s]
  4%|▍         | 13/336 [00:00<00:02, 129.73it/s]

55492
155855
**************************************************


100%|██████████| 336/336 [00:00<00:00, 448.27it/s]

55492
253580
**************************************************





In [61]:
# With the same seed, two builds agree: spot-check the edge at position 170 of each graph.
# This compares a single edge only; it is not a full graph comparison.
print(list(GS_seeded[0].edges(data=True))[170])
print(list(GS_seeded[1].edges(data=True))[170])
print(list(GS_seeded[0].edges(data=True))[170] == list(GS_seeded[1].edges(data=True))[170])

('00010013101', '02540498103', {'edge_type': 'work', 'zone': 3.0})
('00010013101', '02540498103', {'edge_type': 'work', 'zone': 3.0})
True
