In [65]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import networkx as nx
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools

# Load the raw input table. Later cells read its 'id' column, the home/work/school
# zone columns and the matching *_x / *_y coordinate columns.
df=pd.read_feather("../../data/raw/work_school_home_sp_esc.feather")

In [66]:
def clusterize_relation(df, relation_name='home', x_coord='home_x', y_coord='home_y',
                            new_col_name = 'Neighbourhood', n_participants = 10, seed=13):
    """Split every zone of a relation into spatial clusters of about ``n_participants`` rows.

    Rows are grouped by ``df[relation_name]``; rows where that value is missing
    are skipped. A zone holding fewer than ``2 * n_participants`` rows becomes
    a single cluster labelled ``"<zone>_0"``. Larger zones are partitioned with
    KMeans on ``(x_coord, y_coord)`` into ``len(zone) // n_participants``
    clusters labelled ``"<zone>_<k>"``, where ``<zone>`` is the zone value cast
    to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'id'``, ``relation_name``, ``x_coord`` and
        ``y_coord``.
    relation_name : str
        Column holding the zone each row belongs to.
    x_coord, y_coord : str
        Columns holding the coordinates fed to KMeans.
    new_col_name : str
        Name of the column that receives the cluster label.
    n_participants : int
        Target number of rows per cluster.
    seed : int
        ``random_state`` handed to KMeans.

    Returns
    -------
    pandas.DataFrame
        Columns ``['id', x_coord, y_coord, relation_name, new_col_name]``, one
        row per input row with a non-missing relation. Original index labels
        are kept; rows are ordered zone by zone in order of first appearance.
        Empty (zero rows, same columns) when no row has the relation set.
    """
    columns = ['id', x_coord, y_coord, relation_name]
    parts = []

    # A single groupby pass replaces one boolean scan of df per zone. groupby
    # drops missing keys by default and sort=False keeps first-appearance order.
    for zone, members in df[columns].groupby(relation_name, sort=False):
        members = members.copy()
        zone_label = str(int(zone))
        n_clusters = len(members) // n_participants

        if n_clusters < 2:
            # Too few rows to split: the whole zone is one cluster.
            members[new_col_name] = zone_label + '_0'
        else:
            kmeans = KMeans(n_clusters=n_clusters, random_state=seed).fit(
                members[[x_coord, y_coord]])
            members[new_col_name] = [f'{zone_label}_{int(label)}'
                                     for label in kmeans.labels_]

        parts.append(members)

    if not parts:
        # pd.concat raises on an empty list; hand back an empty, correctly
        # shaped frame so callers can still merge on it.
        empty = df[columns].iloc[0:0].copy()
        empty[new_col_name] = np.array([], dtype=object)
        return empty

    return pd.concat(parts)

In [67]:
def make_clusters(df, home_cluster_n=10, work_cluster_n=20, school_cluster_n=30):
    """Cluster the home, work and school relations and attach the labels to ``df``.

    Each relation ``r`` is clustered by ``clusterize_relation`` on the columns
    ``r``, ``r_x`` and ``r_y`` with its own target cluster size, producing a
    ``r_cluster`` column. The three results are then joined back onto ``df``
    by ``merge_cluster_dataframes``, whose result is returned.
    """
    participants_per_cluster = {
        'home': home_cluster_n,
        'work': work_cluster_n,
        'school': school_cluster_n,
    }
    # Dict order (home, work, school) matches the positional order expected
    # by merge_cluster_dataframes.
    clustered = [
        clusterize_relation(df, relation, f'{relation}_x', f'{relation}_y',
                            f'{relation}_cluster', size)
        for relation, size in participants_per_cluster.items()
    ]
    return merge_cluster_dataframes(df, *clustered)

def merge_cluster_dataframes(df, home_clusters, work_clusters, school_clusters):
    """Left-join the three cluster-label frames onto ``df``.

    Each cluster frame has its ``<relation>_x`` / ``<relation>_y`` columns
    dropped and is merged on ``['id', <relation>]``. After all merges the
    function asserts that every row with a non-missing relation received a
    ``<relation>_cluster`` label, then returns the merged frame.
    """
    cluster_frames = (
        ('home', home_clusters),
        ('work', work_clusters),
        ('school', school_clusters),
    )

    merged = df
    for relation, clusters in cluster_frames:
        labels = clusters.drop([f'{relation}_x', f'{relation}_y'], axis=1)
        merged = pd.merge(merged, labels, on=['id', relation], how='left')

    # Checked only after all merges, so a failure is reported in the same
    # order as before: home, then work, then school.
    for relation, _ in cluster_frames:
        has_relation = merged[relation].notna()
        assert merged.loc[has_relation, f'{relation}_cluster'].isna().sum() == 0

    return merged

In [68]:
# Drop cluster columns left over from an earlier run of this cell. Merging them a
# second time would make pandas suffix the duplicates (_x / _y), and the
# '<relation>_cluster' lookups inside merge_cluster_dataframes would then fail.
df = make_clusters(
    df.drop(columns=['home_cluster', 'work_cluster', 'school_cluster'], errors='ignore'),
    10, 20, 30,
)

In [69]:
def add_person_to_graph(G, person):
    """Add one person to ``G`` as a node keyed by ``person['id']``.

    ``person`` is any mapping-like row. The node attributes are copied from
    the row fields listed below; note that the ``age`` attribute is read from
    the ``'idade'`` field.
    """
    # node attribute name -> field of `person` it is read from
    attribute_sources = {
        'work': 'work',
        'school': 'school',
        'home': 'home',
        'age': 'idade',
        'private_healthcare': 'private_healthcare',
        'home_id': 'home_id',
        'education': 'education',
    }
    node_attributes = {name: person[field] for name, field in attribute_sources.items()}
    G.add_node(person['id'], **node_attributes)

def add_people_to_graph(G, df):
    """Add every row of ``df`` to ``G`` as a node via ``add_person_to_graph``.

    Prints a header line before and a 25-asterisk separator after the pass.
    """
    def _register(row):
        add_person_to_graph(G, row)

    print('Adding People Nodes')
    # apply is used purely for its row-wise side effect; its result is discarded.
    df.apply(_register, axis=1)
    print('*' * 25)

def add_edge(G, person1, person2, edge_type, relation_cluster, edge_zone):
    """Connect ``person1`` and ``person2`` in ``G``.

    The edge carries three attributes: ``edge_type``, ``cluster`` (from
    ``relation_cluster``) and ``zone`` (from ``edge_zone``).
    """
    edge_attributes = {
        'edge_type': edge_type,
        'cluster': relation_cluster,
        'zone': edge_zone,
    }
    G.add_edge(person1, person2, **edge_attributes)

def add_edges(G, df, relation, cluster, edge_type):
    """Fully connect the people that share a value of ``df[cluster]``.

    For every non-missing cluster value with at least two rows, an edge is
    added (through ``add_edge``) between every pair of ``'id'`` values in that
    cluster. Each edge is tagged with ``edge_type``, the cluster value and the
    cluster's zone, i.e. the single value of ``df[relation]`` within it.

    Parameters
    ----------
    G : graph object accepted by ``add_edge``; must also expose ``edges.data``.
    df : pandas.DataFrame
        Must contain ``'id'``, ``relation`` and ``cluster`` columns.
    relation : str
        Column holding the zone; must be constant inside each cluster.
    cluster : str
        Column whose equal values define who gets connected.
    edge_type : str
        Value stored in the ``edge_type`` attribute of the new edges.

    Returns
    -------
    int
        Number of edges added by this call.

    Raises
    ------
    AssertionError
        If a cluster spans more than one distinct ``relation`` value.

    Side effects: prints a header, the number of edges in ``G`` whose
    ``edge_type`` attribute equals ``edge_type``, and a separator line.
    """
    print(f'Adding {edge_type} Edges')
    total_edges = 0

    # One groupby pass instead of a full boolean scan of df per cluster.
    # groupby skips missing cluster labels and, with sort=False, visits the
    # clusters in order of first appearance, so edges are inserted in the same
    # order as a loop over df[cluster].unique() would.
    grouped = df.groupby(cluster, sort=False)
    for c, members in tqdm(grouped, total=grouped.ngroups):
        assert len(np.unique(members[relation])) == 1, (
            f'cluster {c!r} spans more than one {relation!r} value')
        if len(members) < 2:
            continue  # a lone member has nobody to connect to

        zone = members[relation].iloc[0]
        # Iterate the pairs lazily instead of materialising them in a list.
        for p1, p2 in itertools.combinations(members['id'].values, 2):
            add_edge(G, p1, p2, edge_type, c, zone)
            total_edges += 1

    print(sum(1 for _, _, attrs in G.edges.data(data=True)
              if attrs['edge_type'] == edge_type))
    print(25*'*')
    return total_edges

In [70]:
# Build the contact multigraph: one node per person, then one edge layer per
# relation (household, neighbourhood cluster, work cluster, school cluster).
G = nx.MultiGraph()
add_people_to_graph(G, df)

houses = add_edges(G, df, 'home', 'home_id', 'home')
neighbors = add_edges(G, df, 'home', 'home_cluster', 'neighbor')
works = add_edges(G, df, 'work', 'work_cluster', 'work')
schools = add_edges(G, df, 'school', 'school_cluster', 'school')

# Sanity checks: one node per row, and every layer holds exactly the number
# of edges that add_edges reported for it.
assert G.number_of_nodes() == len(df)
assert all(
    expected == sum(1 for _, _, attrs in G.edges.data(data=True)
                    if attrs['edge_type'] == layer)
    for layer, expected in (('home', houses), ('neighbor', neighbors),
                            ('work', works), ('school', schools))
)


Adding People Nodes


  0%|          | 71/21708 [00:00<01:01, 349.34it/s]

*************************
Adding home Edges


100%|██████████| 21708/21708 [01:03<00:00, 340.42it/s]
  1%|▏         | 72/5401 [00:00<00:15, 347.69it/s]

61634
*************************
Adding neighbor Edges


100%|██████████| 5401/5401 [00:16<00:00, 331.24it/s]
  3%|▎         | 35/1201 [00:00<00:03, 343.12it/s]

348835
*************************
Adding work Edges


100%|██████████| 1201/1201 [00:03<00:00, 322.36it/s]
  0%|          | 0/423 [00:00<?, ?it/s]

342525
*************************
Adding school Edges


100%|██████████| 423/423 [00:01<00:00, 278.92it/s]


251454
*************************


In [71]:
# Preview the frame, which now carries the home_cluster / work_cluster /
# school_cluster columns (the notebook shows a truncated view).
df

Unnamed: 0,home,school,work,id,work_x,work_y,school_x,school_y,home_x,home_y,home_id,idade,criteriobr,renda_fa,education,private_healthcare,home_cluster,work_cluster,school_cluster
0,1.0,,3.0,00010001101,333104.0,7394476.0,,,333743.0,7394463.0,00010001,59.0,4.0,2732.575910,3.0,False,1_1,3_0,
1,1.0,84.0,82.0,00010001102,327503.0,7392159.0,329431.0,7395939.0,333743.0,7394463.0,00010001,21.0,4.0,2732.575910,4.0,False,1_1,82_16,84_1
2,1.0,,1.0,00010002101,333453.0,7394501.0,,,333814.0,7394428.0,00010002,37.0,5.0,3200.000000,4.0,False,1_4,1_7,
3,1.0,,1.0,00010002102,333539.0,7394387.0,,,333814.0,7394428.0,00010002,19.0,5.0,3200.000000,3.0,False,1_4,1_7,
4,1.0,,26.0,00010002103,332344.0,7393317.0,,,333814.0,7394428.0,00010002,18.0,5.0,3200.000000,3.0,False,1_4,26_1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55487,342.0,,90.0,03421904103,327737.0,7396374.0,,,323322.0,7392038.0,03421904,27.0,1.0,9461.630458,5.0,True,342_10,90_0,
55488,342.0,,341.0,03422006101,323653.0,7391801.0,,,322857.0,7390956.0,03422006,52.0,1.0,9461.630458,5.0,True,342_6,341_0,
55489,342.0,,341.0,03422006102,323653.0,7391801.0,,,322857.0,7390956.0,03422006,26.0,1.0,9461.630458,5.0,True,342_6,341_0,
55490,342.0,,342.0,03422109101,323034.0,7390476.0,,,323034.0,7390476.0,03422109,28.0,3.0,3000.000000,5.0,False,342_3,342_1,


In [72]:
# Save the clustered frame to the processed-data folder.
df.to_feather("../../data/processed/clusterized_df.feather")

In [73]:
# Rows whose school coordinates fall strictly inside a small bounding box,
# shown with the school cluster they were assigned to.
df.loc[
    df['school_y'].gt(7394800.0) & df['school_y'].lt(7394830.0)
    & df['school_x'].gt(331300.0) & df['school_x'].lt(331500.0),
    ['school_cluster', 'school_x', 'school_y'],
]

Unnamed: 0,school_cluster,school_x,school_y
498,31_1,331494.0,7394808.0
577,31_1,331434.0,7394817.0
2313,31_1,331496.0,7394807.0
2380,31_1,331434.0,7394817.0
2951,31_1,331434.0,7394817.0
...,...,...,...
54619,31_1,331434.0,7394817.0
54703,31_1,331434.0,7394817.0
55123,31_1,331434.0,7394817.0
55137,31_1,331434.0,7394817.0


In [75]:
# How many rows share each school_y coordinate.
df['school_y'].value_counts()

7394817.0    111
7396909.0     91
7396256.0     72
7383162.0     67
7393084.0     61
            ... 
7395693.0      1
7395681.0      1
7393600.0      1
7395640.0      1
7386691.0      1
Name: school_y, Length: 4850, dtype: int64

In [76]:
# How many rows share each school_x coordinate.
df['school_x'].value_counts()

331434.0    106
329905.0     91
340797.0     73
332783.0     68
325439.0     66
           ... 
316345.0      1
349084.0      1
349067.0      1
323391.0      1
327691.0      1
Name: school_x, Length: 4955, dtype: int64

In [79]:
# Rows attending school zone 31, plotted by school coordinates and coloured by
# the school cluster they were assigned to.
tmp = df.loc[df['school'] == 31].copy().dropna(subset=['school'])
px.scatter(tmp, x='school_x', y='school_y', color='school_cluster')

In [80]:
# Boolean mask of the zone-31 rows that landed in cluster 31_1.
tmp['school_cluster'].eq('31_1')

55       False
330      False
337      False
389      False
390      False
         ...  
54703     True
55123     True
55137     True
55199     True
55249    False
Name: school_cluster, Length: 280, dtype: bool

In [81]:
# Distribution of 'idade' per education level inside school cluster 31_1.
px.box(tmp.loc[tmp['school_cluster'].eq('31_1')], x='education', y='idade', points="all")

In [82]:
# Size of every school cluster among the zone-31 rows.
tmp['school_cluster'].value_counts()

31_1    149
31_0     52
31_2     18
31_6     17
31_3     16
31_5     16
31_4      4
31_7      4
31_8      4
Name: school_cluster, dtype: int64

In [83]:
# Distribution of 'idade' per education level inside school cluster 31_3.
px.box(tmp.loc[tmp['school_cluster'].eq('31_3')], x='education', y='idade', points="all")

In [84]:
# nx.write_gpickle was removed in NetworkX 3.0. Pickling the graph directly with
# the standard library works on both old and new NetworkX versions.
import pickle

with open('SP_multiGraph.gpickle', 'wb') as graph_file:
    pickle.dump(G, graph_file, pickle.HIGHEST_PROTOCOL)