In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import networkx as nx
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools

# Load the raw person table. Later cells read the columns 'id', 'home', 'work',
# 'school', their '<relation>_x' / '<relation>_y' coordinates, 'idade',
# 'private_healthcare' and 'home_id' from this frame.
df=pd.read_feather("../../data/raw/work_school_home_sp.feather")

In [2]:
def clusterize_relation(df, relation_name='home', x_coord='home_x', y_coord='home_y',
                            new_col_name = 'Neighbourhood', n_participants = 10, seed=13):
    """Split every zone of a relation into spatial sub-clusters.

    For each non-null value of ``df[relation_name]`` (a "zone"), the rows of that
    zone are clustered on ``(x_coord, y_coord)`` with KMeans, using
    ``int(len(zone_rows) / n_participants)`` clusters. Zones too small to yield
    at least two clusters are kept whole and labelled ``"<zone>_0"``; otherwise
    each row is labelled ``"<zone>_<kmeans label>"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'id'``, ``x_coord``, ``y_coord`` and
        ``relation_name``.
    relation_name : str
        Column holding the zone identifier. Values are cast with ``astype(int)``
        to build the label, so they must be integer-like.
    x_coord, y_coord : str
        Columns holding the coordinates passed to KMeans.
    new_col_name : str
        Name of the label column added to the result.
    n_participants : int
        Target number of rows per cluster.
    seed : int
        ``random_state`` forwarded to KMeans.

    Returns
    -------
    pandas.DataFrame
        Columns ``['id', x_coord, y_coord, relation_name, new_col_name]``, one
        row per input row whose ``relation_name`` is not null, grouped by zone
        in order of first appearance. An empty frame with those columns is
        returned when no row has a non-null zone.
    """
    columns = ['id', x_coord, y_coord, relation_name]
    clustered = []

    for zone in df[relation_name].dropna().unique():
        members = df.loc[df[relation_name] == zone, columns].copy()
        # Common "<zone>_" prefix shared by both labelling branches.
        zone_prefix = members[relation_name].astype(int).astype(str) + '_'
        n_clusters = int(len(members) / n_participants)

        if n_clusters < 2:
            # Not enough rows for two clusters: the whole zone is one cluster.
            members[new_col_name] = zone_prefix + '0'
        else:
            kmeans = KMeans(n_clusters=n_clusters, random_state=seed).fit(members[[x_coord, y_coord]])
            members[new_col_name] = kmeans.labels_
            members[new_col_name] = zone_prefix + members[new_col_name].astype(int).astype(str)

        clustered.append(members)

    if not clustered:
        # pd.concat([]) raises ValueError; return an empty frame of the same shape instead.
        empty = df[columns].iloc[0:0].copy()
        empty[new_col_name] = pd.Series(index=empty.index, dtype=object)
        return empty

    return pd.concat(clustered)

In [3]:
def make_clusters(df, home_cluster_n=10, work_cluster_n=20, school_cluster_n=30):
    """Cluster the home, work and school relations and attach the labels to ``df``.

    Each relation ``r`` is clustered on its ``r_x`` / ``r_y`` coordinates into a
    ``r_cluster`` column, with the matching ``*_cluster_n`` value as the target
    cluster size. The three results are then joined back onto ``df`` by
    ``merge_cluster_dataframes``, whose return value is returned unchanged.
    """
    target_sizes = (
        ('home', home_cluster_n),
        ('work', work_cluster_n),
        ('school', school_cluster_n),
    )
    # Same order as the positional arguments of merge_cluster_dataframes.
    per_relation = [
        clusterize_relation(df, relation, f'{relation}_x', f'{relation}_y',
                            f'{relation}_cluster', size)
        for relation, size in target_sizes
    ]
    return merge_cluster_dataframes(df, *per_relation)

def merge_cluster_dataframes(df, home_clusters, work_clusters, school_clusters):
    """Left-join the three cluster label columns onto ``df``.

    Each ``<relation>_clusters`` frame is stripped of its coordinate columns and
    joined on ``['id', <relation>]``. The six coordinate columns are then removed
    from the result. Finally, every row with a non-null relation is asserted to
    have received a non-null ``<relation>_cluster``.

    Returns the merged frame; ``df`` itself is not modified.
    """
    relations = (
        ('home', home_clusters),
        ('work', work_clusters),
        ('school', school_clusters),
    )

    merged = df
    for relation, clusters in relations:
        labels = clusters.drop(columns=[f'{relation}_x', f'{relation}_y'])
        merged = merged.merge(labels, on=['id', relation], how='left')

    coordinate_columns = [
        f'{relation}_{axis}' for relation, _ in relations for axis in ('x', 'y')
    ]
    merged = merged.drop(columns=coordinate_columns)

    # Everyone who has a relation must have been given a cluster for it.
    for relation, _ in relations:
        has_relation = merged[relation].notna()
        assert merged.loc[has_relation, f'{relation}_cluster'].isna().sum() == 0

    return merged

In [4]:
# Attach home/work/school cluster labels with target sizes 10, 20 and 30.
# NOTE: this cell is not idempotent. The returned frame no longer has the
# '<relation>_x' / '<relation>_y' columns (merge_cluster_dataframes drops them),
# and clusterize_relation selects those columns, so re-running the cell on the
# reassigned `df` raises a KeyError. Reload `df` first.
df = make_clusters(df, 10, 20, 30)

In [5]:
def add_person_to_graph(G, person):
    """Add one person to ``G`` as a node keyed by ``person['id']``.

    The node carries the attributes ``work``, ``school``, ``home``, ``age``
    (read from ``person['idade']``), ``private_healthcare`` and ``home_id``.
    """
    node_id = person['id']
    # (node attribute name, field of `person` it is read from)
    attribute_sources = (
        ('work', 'work'),
        ('school', 'school'),
        ('home', 'home'),
        ('age', 'idade'),
        ('private_healthcare', 'private_healthcare'),
        ('home_id', 'home_id'),
    )
    node_attributes = {name: person[field] for name, field in attribute_sources}
    G.add_node(node_id, **node_attributes)

def add_people_to_graph(G, df):
    """Add every row of ``df`` to ``G`` as a person node, printing a banner before and after."""
    def _register(person):
        # Called once per row by DataFrame.apply; only the side effect on G matters.
        add_person_to_graph(G, person)

    print('Adding People Nodes')
    df.apply(_register, axis=1)
    print(25*'*')

def add_edge(G, person1, person2, edge_type, relation_cluster, edge_zone):
    """Connect ``person1`` and ``person2`` in ``G``.

    The edge is tagged with the attributes ``edge_type``, ``cluster``
    (``relation_cluster``) and ``zone`` (``edge_zone``).
    """
    edge_attributes = {
        'edge_type': edge_type,
        'cluster': relation_cluster,
        'zone': edge_zone,
    }
    G.add_edge(person1, person2, **edge_attributes)

def add_edges(G, df, relation, cluster, edge_type):
    """Fully connect the people who share each value of ``df[cluster]``.

    For every non-null cluster value, an edge is added between each pair of
    ``'id'`` values in that cluster. Every edge is tagged with ``edge_type``,
    the cluster value, and the cluster's zone (the single value of
    ``df[relation]`` inside that cluster).

    Parameters
    ----------
    G : graph
        Graph receiving the edges (via ``add_edge``).
    df : pandas.DataFrame
        Must contain the columns ``'id'``, ``relation`` and ``cluster``.
    relation : str
        Column holding the zone; must be constant within each cluster.
    cluster : str
        Column whose equal values define who gets connected.
    edge_type : str
        Value stored in each new edge's ``edge_type`` attribute.

    Returns
    -------
    int
        Number of edges added by this call.

    Raises
    ------
    AssertionError
        If a cluster spans more than one value of ``relation``.
    """
    print(f'Adding {edge_type} Edges')
    total_edges = 0

    # A single grouping pass replaces one full-frame boolean filter per cluster.
    # sort=False keeps clusters in order of first appearance, and groupby's
    # default handling drops null cluster keys, as the original loop did.
    for c, members in tqdm(df.groupby(cluster, sort=False)):
        assert members[relation].nunique(dropna=False) == 1, (
            f'cluster {c!r} spans more than one {relation!r} value')
        zone = members[relation].iloc[0]
        ids = members['id'].values

        # Stream the pairs instead of materialising the whole list just to count it.
        for p1, p2 in itertools.combinations(ids, 2):
            add_edge(G, p1, p2, edge_type, c, zone)
        total_edges += len(ids) * (len(ids) - 1) // 2

    # Number of edges of this type now present in G.
    print(len([True for x,y,v in G.edges.data(data=True) if v['edge_type'] == edge_type]))
    print(25*'*')
    return total_edges

In [6]:
# Build the contact multigraph: one node per person, then one fully connected
# group per household, neighbourhood cluster, work cluster and school cluster.
G = nx.MultiGraph()
add_people_to_graph(G, df)

houses = add_edges(G, df, relation='home', cluster='home_id', edge_type='home')
neighbors = add_edges(G, df, relation='home', cluster='home_cluster', edge_type='neighbor')
works = add_edges(G, df, relation='work', cluster='work_cluster', edge_type='work')
schools = add_edges(G, df, relation='school', cluster='school_cluster', edge_type='school')

# Sanity checks: one node per row, and each edge type holds exactly the edges
# its add_edges call reported.
assert G.number_of_nodes() == len(df)
assert houses == sum(attrs['edge_type'] == 'home' for _, _, attrs in G.edges(data=True))
assert neighbors == sum(attrs['edge_type'] == 'neighbor' for _, _, attrs in G.edges(data=True))
assert works == sum(attrs['edge_type'] == 'work' for _, _, attrs in G.edges(data=True))
assert schools == sum(attrs['edge_type'] == 'school' for _, _, attrs in G.edges(data=True))


Adding People Nodes


  0%|          | 69/21708 [00:00<01:03, 339.79it/s]

*************************
Adding home Edges


100%|██████████| 21708/21708 [01:03<00:00, 340.28it/s]
  1%|▏         | 69/5401 [00:00<00:15, 339.09it/s]

61634
*************************
Adding neighbor Edges


100%|██████████| 5401/5401 [00:16<00:00, 322.67it/s]
  0%|          | 1/1201 [00:00<02:07,  9.40it/s]

348835
*************************
Adding work Edges


100%|██████████| 1201/1201 [00:03<00:00, 313.23it/s]
  1%|          | 3/423 [00:00<00:24, 16.84it/s]

342525
*************************
Adding school Edges


100%|██████████| 423/423 [00:01<00:00, 265.05it/s]


251454
*************************


In [14]:
# Inspect the school coordinates straight from the raw file; the clustered `df`
# returned by make_clusters no longer has its coordinate columns.
pd.read_feather("../../data/raw/work_school_home_sp.feather")[['id','school_x','school_y']]

Unnamed: 0,id,school_x,school_y
0,00010001101,,
1,00010001102,329431.0,7395939.0
2,00010002101,,
3,00010002102,,
4,00010002103,,
...,...,...,...
55487,03421904103,,
55488,03422006101,,
55489,03422006102,,
55490,03422109101,,


In [16]:
# Re-attach the raw school coordinates to the clustered frame (inner join on 'id').
# Run once: a second run would merge the same columns again and pandas would
# suffix the duplicates.
df = df.merge(
    pd.read_feather("../../data/raw/work_school_home_sp.feather")[['id','school_x','school_y']],
    on='id',
)

In [37]:
# Rows whose school coordinates fall strictly inside a small bounding box,
# shown with their assigned school cluster.
df.loc[
    df['school_y'].gt(7394800.0) & df['school_y'].lt(7394830.0)
    & df['school_x'].gt(331300.0) & df['school_x'].lt(331500.0),
    ['school_cluster', 'school_x', 'school_y'],
]

Unnamed: 0,school_cluster,school_x,school_y
498,31_1,331494.0,7394808.0
577,31_1,331434.0,7394817.0
2313,31_1,331496.0,7394807.0
2380,31_1,331434.0,7394817.0
2951,31_1,331434.0,7394817.0
...,...,...,...
54619,31_1,331434.0,7394817.0
54703,31_1,331434.0,7394817.0
55123,31_1,331434.0,7394817.0
55137,31_1,331434.0,7394817.0


In [28]:
# How many rows share each school_y value (most frequent first).
df.school_y.value_counts()

7394817.0    105
7394807.0     39
7394973.0     37
7395524.0     11
7394681.0     10
7394642.0      9
7395035.0      7
7394710.0      7
7395447.0      5
7394729.0      4
7395123.0      4
7394798.0      3
7394831.0      3
7395238.0      3
7394602.0      2
7394643.0      2
7394656.0      2
7394814.0      2
7394802.0      2
7394773.0      1
7394723.0      1
7394663.0      1
7394542.0      1
7394485.0      1
7394406.0      1
7394259.0      1
7394752.0      1
7394720.0      1
7394778.0      1
7394808.0      1
7394857.0      1
7394873.0      1
7394959.0      1
7394965.0      1
7394998.0      1
7395029.0      1
7395059.0      1
7395220.0      1
7395428.0      1
7395529.0      1
7394780.0      1
7395033.0      1
Name: school_y, dtype: int64

In [27]:
# How many rows share each school_x value (most frequent first).
df.school_x.value_counts()

331434.0    105
331496.0     39
331356.0     37
331723.0     11
331907.0     10
331265.0      9
331607.0      7
331306.0      7
331703.0      5
331895.0      4
331646.0      4
331396.0      4
331566.0      3
331147.0      2
331214.0      2
331248.0      2
331617.0      2
331280.0      2
331467.0      2
331585.0      2
331815.0      1
331587.0      1
331301.0      1
330778.0      1
331800.0      1
331569.0      1
331577.0      1
331275.0      1
331761.0      1
331082.0      1
331360.0      1
331111.0      1
331760.0      1
331887.0      1
331401.0      1
331699.0      1
331469.0      1
331494.0      1
330986.0      1
331502.0      1
331624.0      1
Name: school_x, dtype: int64

In [29]:
# Narrow `df` to the people attending school zone 31 and plot where their
# schools are, coloured by assigned school cluster.
# NOTE: this overwrites `df`; cells run afterwards only see zone 31.
df = df.loc[df['school'] == 31].dropna(subset=['school']).copy()
px.scatter(df, x='school_x', y='school_y', color='school_cluster')

In [7]:
# Persist the graph. nx.write_gpickle was deprecated in NetworkX 2.6 and removed
# in 3.0; the documented replacement is a plain pickle dump, which writes the
# same data to the same file name (write_gpickle also used HIGHEST_PROTOCOL).
import pickle

with open('SP_multiGraph.gpickle', 'wb') as graph_file:
    pickle.dump(G, graph_file, pickle.HIGHEST_PROTOCOL)