In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import networkx as nx
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools

# Read the interim person table and store a 1-based id (index + 1) for each row.
df = (
    pd.read_feather("../../data/interim/work_school_home_sp_esc.feather")
      .assign(id=lambda frame: frame.index + 1)
)

In [2]:
def clusterize_relation(df, relation_name='home', x_coord='home_x', y_coord='home_y',
                            new_col_name = 'Neighbourhood', n_participants = 10, 
                            category_column = None, seed=13):
    """Label every person with a cluster inside their zone of ``relation_name``.

    Rows are grouped by the zone stored in ``df[relation_name]``; rows whose zone
    is missing are left out of the result. For each zone,
    ``n_clusters = int(len(zone) / n_participants)``:

    * ``n_clusters < 2``: the whole zone is one cluster labelled ``"<zone>_0"``
      (the category is not appended and missing categories are not filled).
    * otherwise: K-Means on ``(x_coord, y_coord)`` splits the zone into
      ``n_clusters`` groups labelled ``"<zone>_<k>"``, or
      ``"<zone>_<k>_<category>"`` when ``category_column`` is given.

    Missing values of ``category_column`` inside a split zone are imputed by
    sampling, independently per row, from the categories observed in that zone,
    weighted by their frequency. Sampling uses a generator seeded with ``seed``,
    so repeated calls with the same input give the same labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'id'``, ``x_coord``, ``y_coord``, ``relation_name`` and,
        when given, ``category_column``. It is not modified.
    relation_name : str
        Column holding the zone of the relation (cast to ``int`` for labels).
    x_coord, y_coord : str
        Coordinate columns fed to K-Means.
    new_col_name : str
        Name of the label column added to the result.
    n_participants : int
        Target number of people per cluster.
    category_column : str or None
        Optional column whose value (cast to ``int``) is appended to the label.
    seed : int
        Seed for K-Means and for the category imputation.

    Returns
    -------
    pandas.DataFrame
        Columns ``['id', x_coord, y_coord, relation_name[, category_column],
        new_col_name]``, one row per input row with a zone, ordered zone by zone
        in order of first appearance. Empty (with those columns) when no row has
        a zone.

    Raises
    ------
    ValueError
        If a split zone has missing categories but no observed category to
        sample from.
    """
    columns = ['id', x_coord, y_coord, relation_name]
    if category_column is not None:
        columns.append(category_column)

    # One generator per call keeps the imputation reproducible for a given seed.
    rng = np.random.RandomState(seed)

    parts = []
    # groupby skips rows with a missing zone; sort=False visits zones in order
    # of first appearance while scanning the frame only once.
    for zone, members in df.groupby(relation_name, sort=False):
        tmp = members[columns].copy()
        zone_label = tmp[relation_name].astype(int).astype(str)
        n_clusters = int(len(tmp) / n_participants)

        if n_clusters < 2:
            tmp[new_col_name] = zone_label + '_' + '0'
        else:
            # n_init is pinned because scikit-learn changed its default between
            # releases; 10 is the long-standing historical default.
            kmeans = KMeans(n_clusters=n_clusters, n_init=10,
                            random_state=seed).fit(tmp[[x_coord, y_coord]])
            cluster_label = (pd.Series(kmeans.labels_, index=tmp.index)
                             .astype(int).astype(str))

            if category_column is None:
                tmp[new_col_name] = zone_label + '_' + cluster_label
            else:
                missing = tmp[category_column].isna()
                if missing.any():
                    counts = (tmp[category_column].dropna()
                              .value_counts().sort_index())
                    if counts.empty:
                        raise ValueError(
                            f"zone {zone!r} of {relation_name!r} has no observed "
                            f"{category_column!r} value to impute from")
                    probabilities = counts.values / counts.values.sum()
                    # Assign through .loc so the fill really lands in tmp
                    # (an inplace fillna on a column selection may not).
                    tmp.loc[missing, category_column] = rng.choice(
                        counts.index.values, size=int(missing.sum()),
                        p=probabilities)

                tmp[new_col_name] = (zone_label + '_' + cluster_label + '_'
                                     + tmp[category_column].astype(int).astype(str))

        parts.append(tmp)

    if not parts:
        # Nothing to cluster: return an empty frame with the expected columns
        # instead of letting pd.concat fail on an empty list.
        empty = df[columns].iloc[0:0].copy()
        empty[new_col_name] = pd.Series(dtype=object)
        return empty

    return pd.concat(parts)

def make_clusters(df, home_cluster_n=10, work_cluster_n=20, school_cluster_n=30):
    """Cluster people by home, work and school and attach the labels to a copy of ``df``.

    Each ``*_cluster_n`` is forwarded as the ``n_participants`` of the matching
    ``clusterize_relation`` call. School clusters additionally use the
    ``'studies'`` column as category. The three results are combined by
    ``merge_cluster_dataframes``, whose return value is returned unchanged.
    """
    people = df.copy()

    home_clusters = clusterize_relation(
        people, relation_name='home', x_coord='home_x', y_coord='home_y',
        new_col_name='home_cluster', n_participants=home_cluster_n,
        category_column=None)

    work_clusters = clusterize_relation(
        people, relation_name='work', x_coord='work_x', y_coord='work_y',
        new_col_name='work_cluster', n_participants=work_cluster_n,
        category_column=None)

    school_clusters = clusterize_relation(
        people, relation_name='school', x_coord='school_x', y_coord='school_y',
        new_col_name='school_cluster', n_participants=school_cluster_n,
        category_column='studies')

    return merge_cluster_dataframes(people, home_clusters, work_clusters,
                                    school_clusters)

def merge_cluster_dataframes(df, home_clusters, work_clusters, school_clusters):
    """Left-join the home, work and school cluster labels onto ``df``.

    Coordinate columns (and ``'studies'`` for schools) are dropped from the
    cluster frames before joining on ``['id', <relation>]``.

    Raises ``AssertionError`` when, after the joins,
    * a row with a home has no ``home_cluster``;
    * a row with a work zone and ``job_level != -1`` has no ``work_cluster``;
    * a row with a school, or with ``job_level == -1``, has no ``school_cluster``.

    Returns the merged DataFrame.
    """
    home_labels = home_clusters.drop(columns=['home_x', 'home_y'])
    work_labels = work_clusters.drop(columns=['work_x', 'work_y'])
    school_labels = school_clusters.drop(columns=['school_x', 'school_y', 'studies'])

    merged = df.merge(home_labels, on=['id', 'home'], how='left')
    merged = merged.merge(work_labels, on=['id', 'work'], how='left')
    merged = merged.merge(school_labels, on=['id', 'school'], how='left')

    has_home = merged['home'].notna()
    assert merged.loc[has_home, 'home_cluster'].isna().sum() == 0

    unclustered_workers = (merged['work'].notna()
                           & merged['work_cluster'].isna()
                           & (merged['job_level'] != -1))
    assert unclustered_workers.sum() == 0

    should_have_school = merged['school'].notna() | (merged['job_level'] == -1)
    unclustered_students = should_have_school & merged['school_cluster'].isna()
    assert unclustered_students.sum() == 0

    return merged

In [3]:
# Positional arguments are home_cluster_n=10, work_cluster_n=20, school_cluster_n=30
# (target people per cluster for each relation).
# NOTE(review): this rebinds `df`, so re-running the cell feeds the already
# clustered table back into make_clusters.
df = make_clusters(df, 10, 20, 30)

In [4]:
# Display the clustered table, including the new home/work/school cluster columns.
df

Unnamed: 0,home,school,work,id,work_x,work_y,school_x,school_y,home_x,home_y,...,idade,criteriobr,renda_fa,education,job_level,studies,private_healthcare,home_cluster,work_cluster,school_cluster
0,1.0,,3.0,1,333104.0,7394476.0,,,333743.0,7394463.0,...,59.0,4.0,2732.575910,3.0,2.0,1.0,False,1_1,3_0,
1,1.0,84.0,82.0,2,327503.0,7392159.0,329431.0,7395939.0,333743.0,7394463.0,...,21.0,4.0,2732.575910,4.0,2.0,5.0,False,1_1,82_16,84_1_5
2,1.0,,1.0,3,333453.0,7394501.0,,,333814.0,7394428.0,...,37.0,5.0,3200.000000,4.0,2.0,1.0,False,1_4,1_7,
3,1.0,,1.0,4,333539.0,7394387.0,,,333814.0,7394428.0,...,19.0,5.0,3200.000000,3.0,2.0,1.0,False,1_4,1_7,
4,1.0,,26.0,5,332344.0,7393317.0,,,333814.0,7394428.0,...,18.0,5.0,3200.000000,3.0,2.0,1.0,False,1_4,26_1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55487,342.0,,90.0,55488,327737.0,7396374.0,,,323322.0,7392038.0,...,27.0,1.0,9461.630458,5.0,4.0,1.0,True,342_10,90_0,
55488,342.0,,341.0,55489,323653.0,7391801.0,,,322857.0,7390956.0,...,52.0,1.0,9461.630458,5.0,2.0,1.0,True,342_6,341_0,
55489,342.0,,341.0,55490,323653.0,7391801.0,,,322857.0,7390956.0,...,26.0,1.0,9461.630458,5.0,2.0,1.0,True,342_6,341_0,
55490,342.0,,342.0,55491,323034.0,7390476.0,,,323034.0,7390476.0,...,28.0,3.0,3000.000000,5.0,2.0,1.0,False,342_3,342_1,


In [22]:
# Zone prefix (text before the first '_') that labels the most work-cluster rows.
df['work_cluster'].dropna().str.split('_').str[0].value_counts().index[0]

'73'

In [106]:
# NOTE(review): `works` is only assigned in a later cell of this notebook, so this
# scratch expression fails on a fresh top-to-bottom run.
(.0001*works['work'])/73

17580    0.0001
131      0.0001
507      0.0001
7515     0.0001
8175     0.0001
          ...  
50723    0.0001
44736    0.0001
12048    0.0001
38612    0.0001
16913    0.0001
Name: work, Length: 560, dtype: float64

In [111]:
import plotly.express as px

# Scatter the work coordinates of zone 73, coloured by work cluster, and save
# the figure as clusters.png in the working directory.
works = df[df['work'] == 73].sort_values(by=['work_x', 'work_y', 'work_cluster'])
fig = px.scatter(x=works['work_x'], y=works['work_y'], color=works['work_cluster'])
fig.update_layout(
    plot_bgcolor='rgb(250, 250, 250)',
    showlegend=False,
    xaxis=dict(title='Longitude', showgrid=False, showticklabels=False),
    yaxis=dict(title='Latitude', showgrid=False, showticklabels=False),
)
fig.write_image("clusters.png", scale=3)


In [113]:
# NOTE(review): `dir` is a Windows shell command, so this cell is not portable;
# its captured output also exposes a local absolute path.
!dir

 Volume in drive C has no label.
 Volume Serial Number is 8ADA-5624

 Directory of C:\Users\Yamba\Desktop\CovidMDP\notebooks\Modelagem Grafo SP

13/09/2020  12:38    <DIR>          .
13/09/2020  12:38    <DIR>          ..
13/09/2020  11:57    <DIR>          .ipynb_checkpoints
20/06/2020  21:44         4.310.909 Avalia‡Æo atributos grafo.ipynb
13/09/2020  12:38           158.013 clusters.png
29/08/2020  17:05            68.248 egonet.json
29/08/2020  18:01    <DIR>          ego_nets
31/08/2020  00:27         1.413.457 GRAPH EDA.ipynb
31/08/2020  14:26           757.073 graph.html
24/08/2020  21:06            16.008 heatmap_age_all.pdf
09/07/2020  16:30            17.210 heatmap_age_neighborhood.pdf
24/08/2020  21:07            16.296 heatmap_age_schools.pdf
24/08/2020  21:08            18.179 heatmap_age_work.pdf
13/06/2020  22:23    <DIR>          images
23/08/2020  20:07           290.881 Modelagem Distancia.ipynb
13/09/2020  11:50            20.991 Modelagem grafo individuos.ipynb
13

In [None]:
# NOTE(review): unexecuted scratch cell (no execution count); looks like an
# unfinished filter expression. Confirm and remove.
df[df]

In [5]:
def add_person_to_graph(G, person):
    """Add ``person`` to ``G`` as a node keyed by ``person['id']``.

    ``person`` is any mapping-like row (e.g. a DataFrame row) providing the keys
    listed below; each one is copied onto the node under the attribute name on
    the left.
    """
    node_id = person['id']

    # (node attribute, key read from `person`)
    # NOTE(review): 'education' is filled from the 'studies' key; confirm that
    # this is the intended source.
    attribute_sources = (
        ('work', 'work'),
        ('school', 'school'),
        ('home', 'home'),
        ('job_level', 'job_level'),
        ('education', 'studies'),
        ('age', 'idade'),
        ('private_healthcare', 'private_healthcare'),
        ('home_id', 'home_id'),
        ('home_x', 'home_x'),
        ('home_y', 'home_y'),
        ('school_x', 'school_x'),
        ('school_y', 'school_y'),
        ('work_x', 'work_x'),
        ('work_y', 'work_y'),
        ('criterio_br', 'criteriobr'),
    )
    attributes = {name: person[source] for name, source in attribute_sources}
    G.add_node(node_id, **attributes)

def add_people_to_graph(G, df):
    """Add one node to ``G`` per row of ``df`` and print a start banner and a separator."""
    def _register(row):
        return add_person_to_graph(G, row)

    print('Adding People Nodes')
    # apply is used only for its side effect on G; its result is discarded.
    df.apply(_register, axis=1)
    print('*' * 25)

def add_edge(G, person1, person2, edge_type, relation_cluster, edge_zone):
    """Connect ``person1`` and ``person2`` in ``G``.

    The edge carries three attributes: ``edge_type``, ``cluster``
    (``relation_cluster``) and ``zone`` (``edge_zone``). Returns ``None``.
    """
    attributes = {
        'edge_type': edge_type,
        'cluster': relation_cluster,
        'zone': edge_zone,
    }
    G.add_edge(person1, person2, **attributes)

def add_edges(G, df, relation, cluster, edge_type):
    """Fully connect the members of every cluster in ``df[cluster]``.

    For each non-missing cluster value, every pair of ``'id'`` values in that
    cluster is joined through ``add_edge`` with ``edge_type``, the cluster value
    and the cluster's zone (``df[relation]``, which must be the same for all
    members). Progress is shown with ``tqdm``; afterwards the number of edges in
    ``G`` whose ``edge_type`` matches is printed, followed by a separator line.

    Parameters
    ----------
    G : graph
        Graph the edges are added to (every existing edge must carry an
        ``edge_type`` attribute for the final count).
    df : pandas.DataFrame
        Must contain ``'id'``, ``relation`` and ``cluster``.
    relation : str
        Column with the zone of the relation.
    cluster : str
        Column with the cluster label to connect within.
    edge_type : str
        Value stored on every created edge.

    Returns
    -------
    int
        Number of edges requested by this call.

    Raises
    ------
    AssertionError
        If the members of a cluster do not share a single zone.
    """
    print(f'Adding {edge_type} Edges')
    total_edges = 0

    # A single groupby pass replaces one full-frame boolean mask per cluster.
    # Rows with a missing cluster are skipped by groupby, and sort=False keeps
    # the clusters in order of first appearance.
    for c, members in tqdm(df.groupby(cluster, sort=False)):
        assert members[relation].nunique(dropna=False) == 1
        zone = members[relation].iloc[0]

        ids = members['id'].values
        if len(ids) > 1:
            # n choose 2, so the pairs never need to be materialised as a list.
            total_edges += len(ids) * (len(ids) - 1) // 2
            for p1, p2 in itertools.combinations(ids, 2):
                add_edge(G, p1, p2, edge_type, c, zone)

    print(sum(1 for _, _, attrs in G.edges.data(data=True)
              if attrs['edge_type'] == edge_type))
    print(25*'*')
    return total_edges

In [6]:
# Build the multigraph: one node per person, then one edge layer per relation.
# NOTE(review): `works` is also the name of the zone-73 DataFrame in the plotting
# cell; here it is rebound to an edge count.
G = nx.MultiGraph()
add_people_to_graph(G, df)
houses = add_edges(G, df, relation='home', cluster='home_id', edge_type='home')
neighbors = add_edges(G, df, relation='home', cluster='home_cluster', edge_type='neighbor')
works = add_edges(G, df, relation='work', cluster='work_cluster', edge_type='work')
schools = add_edges(G, df, relation='school', cluster='school_cluster', edge_type='school')

# Every person became a node, and each layer holds exactly the edges requested.
assert G.number_of_nodes() == len(df)
assert houses == sum(1 for _u, _v, attrs in G.edges.data(data=True) if attrs['edge_type'] == 'home')
assert neighbors == sum(1 for _u, _v, attrs in G.edges.data(data=True) if attrs['edge_type'] == 'neighbor')
assert works == sum(1 for _u, _v, attrs in G.edges.data(data=True) if attrs['edge_type'] == 'work')
assert schools == sum(1 for _u, _v, attrs in G.edges.data(data=True) if attrs['edge_type'] == 'school')


Adding People Nodes


  0%|▏                                                                             | 52/21708 [00:00<01:23, 258.22it/s]

*************************
Adding home Edges


100%|███████████████████████████████████████████████████████████████████████████| 21708/21708 [01:27<00:00, 246.72it/s]
  0%|▍                                                                              | 26/5401 [00:00<00:21, 250.39it/s]

61634
*************************
Adding neighbor Edges


100%|█████████████████████████████████████████████████████████████████████████████| 5401/5401 [00:23<00:00, 232.32it/s]
  2%|█▍                                                                             | 21/1201 [00:00<00:05, 205.26it/s]

348835
*************************
Adding work Edges


100%|█████████████████████████████████████████████████████████████████████████████| 1201/1201 [00:06<00:00, 189.81it/s]
  3%|██▍                                                                             | 22/734 [00:00<00:03, 218.32it/s]

342525
*************************
Adding school Edges


100%|███████████████████████████████████████████████████████████████████████████████| 734/734 [00:03<00:00, 219.94it/s]


209044
*************************


In [7]:
# Persist the clustered person table to the processed-data folder.
df.to_feather("../../data/processed/clusterized_df.feather")

In [8]:
# nx.write_gpickle was removed in NetworkX 3.0; pickling the graph directly with
# the highest protocol works on both old and new NetworkX releases.
import pickle

with open('../../data/processed/SP_multiGraph_Job_Edu_Level.gpickle', 'wb') as graph_file:
    pickle.dump(G, graph_file, pickle.HIGHEST_PROTOCOL)

In [14]:
# Number of nodes whose `school` attribute is not missing.
sum(1 for _node, attrs in G.nodes(data=True) if pd.notna(attrs['school']))

12605