In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import networkx as nx
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools

# Load the interim person-level table.
df = pd.read_feather("../../data/interim/sp_data_job_education_levels.feather")

# 1-based person identifier derived from the row index; it is the merge key
# for the cluster tables and the node key of the graph built further down.
df = df.assign(id=df.index + 1)

In [2]:
def clusterize_relation(df, relation_name='home', x_coord='home_x', y_coord='home_y',
                        new_col_name='Neighbourhood', n_participants=10,
                        category_column=None, seed=13):
    """Split every zone of a relation into spatial K-Means clusters.

    For each non-missing value of ``df[relation_name]`` (a "zone") the rows of
    that zone are clustered on ``(x_coord, y_coord)`` into
    ``int(len(zone) / n_participants)`` clusters, and a string label is written
    to ``new_col_name``:

    * fewer than 2 clusters: ``"<zone>_0"`` (no K-Means, and no category
      suffix even when ``category_column`` is given);
    * otherwise ``"<zone>_<kmeans label>"``, with ``"_<category>"`` appended
      when ``category_column`` is given.

    When ``relation_name == 'school'`` and a ``category_column`` is given,
    missing categories of a clustered zone are imputed before labelling. A
    single value is drawn per zone from the zone's observed category
    frequencies and used for all missing rows of that zone.
    NOTE(review): one draw per zone (not per row) is kept from the original
    implementation; confirm that this is the intended imputation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'id'``, ``x_coord``, ``y_coord``, ``relation_name`` and,
        if given, ``category_column``. It is not modified.
    relation_name : str
        Column holding the zone identifier (must be castable to ``int``).
    x_coord, y_coord : str
        Coordinate columns fed to K-Means.
    new_col_name : str
        Name of the label column added to the result.
    n_participants : int
        Target number of people per cluster.
    category_column : str or None
        Optional column appended to the label.
    seed : int
        Seed for both K-Means and the category imputation, so repeated calls
        on the same input return the same labels.

    Returns
    -------
    pandas.DataFrame
        Rows with a non-missing zone, restricted to the columns listed above
        plus ``new_col_name``.

    Raises
    ------
    ValueError
        If no row has a non-missing zone (nothing to concatenate), or if a
        zone needs imputation but has no observed category to draw from.
    """
    # Local, seeded generator: the imputation no longer depends on (or
    # disturbs) the global NumPy random state.
    rng = np.random.RandomState(seed)

    columns = ['id', x_coord, y_coord, relation_name]
    if category_column is not None:
        columns.append(category_column)

    data = []
    for zone in df[relation_name].unique():
        if pd.isna(zone):
            continue

        tmp = df.loc[df[relation_name] == zone, columns].copy()
        zone_label = tmp[relation_name].astype(int).astype(str)
        n_clusters = int(len(tmp) / n_participants)

        if n_clusters < 2:
            # Zone too small to split: everybody shares cluster "0".
            tmp[new_col_name] = zone_label + '_' + '0'
        else:
            kmeans = KMeans(n_clusters=n_clusters, random_state=seed).fit(
                tmp[[x_coord, y_coord]])
            cluster_label = pd.Series(kmeans.labels_, index=tmp.index).astype(int).astype(str)

            # The explicit `is not None` guard avoids a KeyError on `tmp[None]`
            # when the school relation is clustered without a category column.
            if (relation_name == 'school' and category_column is not None
                    and tmp[category_column].isna().any()):
                freq = tmp[category_column].dropna().value_counts().sort_index()
                counts = freq.to_numpy()
                fill_value = rng.choice(freq.index.to_numpy(), p=counts / counts.sum())
                # Assign back instead of a chained `inplace=True`, which does
                # not update `tmp` under pandas Copy-on-Write.
                tmp[category_column] = tmp[category_column].fillna(fill_value)

            if category_column is not None:
                tmp[new_col_name] = (zone_label + '_' + cluster_label + '_'
                                     + tmp[category_column].astype(str))
            else:
                tmp[new_col_name] = zone_label + '_' + cluster_label

        data.append(tmp)

    return pd.concat(data)

def make_clusters(df, home_cluster_n=10, work_cluster_n=20, school_cluster_n=30):
    """Attach home, work and school cluster labels to a copy of ``df``.

    Each relation is clustered independently with ``clusterize_relation`` and
    the three label tables are joined back onto the people table by
    ``merge_cluster_dataframes``.

    Parameters
    ----------
    df : pandas.DataFrame
        People table; it is copied, not modified.
    home_cluster_n, work_cluster_n, school_cluster_n : int
        Target number of people per home / work / school cluster.

    Returns
    -------
    pandas.DataFrame
        The copy of ``df`` with ``home_cluster``, ``work_cluster`` and
        ``school_cluster`` columns added.
    """
    people = df.copy()

    home_clusters = clusterize_relation(
        people,
        relation_name='home', x_coord='home_x', y_coord='home_y',
        new_col_name='home_cluster', n_participants=home_cluster_n,
        category_column=None,
    )
    work_clusters = clusterize_relation(
        people,
        relation_name='work', x_coord='work_x', y_coord='work_y',
        new_col_name='work_cluster', n_participants=work_cluster_n,
        category_column=None,
    )
    # School clusters are additionally keyed by the 'studies' column.
    school_clusters = clusterize_relation(
        people,
        relation_name='school', x_coord='school_x', y_coord='school_y',
        new_col_name='school_cluster', n_participants=school_cluster_n,
        category_column='studies',
    )

    return merge_cluster_dataframes(people, home_clusters, work_clusters, school_clusters)

def merge_cluster_dataframes(df, home_clusters, work_clusters, school_clusters):
    """Left-join the three cluster-label tables onto the people table.

    The coordinate columns (and ``'studies'`` for schools) are dropped from
    each cluster table before joining so that only the label column is added.
    Joins are keyed on ``'id'`` together with the relation column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the label columns of the three cluster tables appended.

    Raises
    ------
    AssertionError
        If any person with a non-missing home / work / school ends up without
        the corresponding cluster label.
    """
    home_labels = home_clusters.drop(columns=['home_x', 'home_y'])
    merged = df.merge(home_labels, on=['id', 'home'], how='left')

    work_labels = work_clusters.drop(columns=['work_x', 'work_y'])
    merged = merged.merge(work_labels, on=['id', 'work'], how='left')

    school_labels = school_clusters.drop(columns=['school_x', 'school_y', 'studies'])
    merged = merged.merge(school_labels, on=['id', 'school'], how='left')

    # Everyone who has a relation must have received a cluster for it.
    assert not (merged['home'].notna() & merged['home_cluster'].isna()).any()
    assert not (merged['work'].notna() & merged['work_cluster'].isna()).any()
    assert not (merged['school'].notna() & merged['school_cluster'].isna()).any()

    return merged

In [3]:
# Target people per cluster: 10 for homes, 20 for workplaces, 30 for schools.
# NOTE: this rebinds `df`, so the cell is meant to run once per kernel session;
# running it again on the already-clustered frame merges the label columns twice.
df = make_clusters(df, home_cluster_n=10, work_cluster_n=20, school_cluster_n=30)

In [4]:
def add_person_to_graph(G, person):
    """Add one person as a node of ``G``, keyed by ``person['id']``.

    ``person`` is any mapping-like row (e.g. a DataFrame row) exposing the
    columns listed below; each is copied onto the node as an attribute.
    """
    node_id = person['id']

    # (node attribute name, column of `person` it is read from)
    attribute_sources = (
        ('work', 'work'),
        ('school', 'school'),
        ('home', 'home'),
        ('job_level', 'job_level'),
        ('edu_level', 'studies'),
        ('age', 'idade'),
        ('private_healthcare', 'private_healthcare'),
        ('home_id', 'home_id'),
        ('home_x', 'home_x'),
        ('home_y', 'home_y'),
        ('school_x', 'school_x'),
        ('school_y', 'school_y'),
        ('work_x', 'work_x'),
        ('work_y', 'work_y'),
        ('criterio_br', 'criteriobr'),
    )
    attributes = {name: person[column] for name, column in attribute_sources}

    G.add_node(node_id, **attributes)

def add_people_to_graph(G, df):
    """Add every row of ``df`` to ``G`` as a person node, printing progress markers."""
    def _register(row):
        return add_person_to_graph(G, row)

    print('Adding People Nodes')
    # Row-wise apply is used only for its side effect on G; the result is discarded.
    df.apply(_register, axis=1)
    print('*' * 25)

def add_edge(G, person1, person2, edge_type, relation_cluster, edge_zone):
    """Connect two people in ``G`` and tag the edge with its type, cluster and zone."""
    attributes = {
        'edge_type': edge_type,
        'cluster': relation_cluster,
        'zone': edge_zone,
    }
    G.add_edge(person1, person2, **attributes)

def add_edges(G, df, relation, cluster, edge_type):
    """Fully connect the members of every cluster with edges of one type.

    For each non-missing value of ``df[cluster]`` an edge is added between
    every pair of people (``df['id']``) sharing that value. Each edge carries
    ``edge_type``, the cluster value and the zone, i.e. the single value of
    ``df[relation]`` shared by the cluster's members.

    Parameters
    ----------
    G : graph
        Graph receiving the edges (through ``add_edge``).
    df : pandas.DataFrame
        Must contain ``'id'``, ``relation`` and ``cluster`` columns.
    relation : str
        Column holding the zone of each person for this relation.
    cluster : str
        Column holding the cluster label; rows with a missing label are skipped.
    edge_type : str
        Value stored in the ``edge_type`` attribute of the new edges.

    Returns
    -------
    int
        Number of edges added by this call.

    Raises
    ------
    AssertionError
        If the members of a cluster do not all share the same ``relation`` value.
    """
    print(f'Adding {edge_type} Edges')
    total_edges = 0

    # One grouping pass instead of re-filtering the whole frame per cluster.
    # sort=False keeps clusters in order of first appearance, missing labels
    # are dropped by groupby, and observed=True skips empty categories.
    groups = df.groupby(cluster, sort=False, observed=True)
    for c, members in tqdm(groups, total=groups.ngroups):
        assert members[relation].nunique(dropna=False) == 1, (
            f'cluster {c!r} spans more than one {relation!r} value')
        zone = members[relation].iloc[0]

        # Stream the pairs; a single-member cluster simply yields none.
        for p1, p2 in itertools.combinations(members['id'].values, 2):
            add_edge(G, p1, p2, edge_type, c, zone)
            total_edges += 1

    # Report how many edges of this type the graph now holds.
    print(sum(1 for _, _, attrs in G.edges.data(data=True)
              if attrs.get('edge_type') == edge_type))
    print(25*'*')
    return total_edges

In [5]:
# Build the multigraph: one node per person, one edge layer per relation.
G = nx.MultiGraph()
add_people_to_graph(G, df)

# Each call returns the number of edges it created for its layer.
houses = add_edges(G, df, relation='home', cluster='home_id', edge_type='home')
neighbors = add_edges(G, df, relation='home', cluster='home_cluster', edge_type='neighbor')
works = add_edges(G, df, relation='work', cluster='work_cluster', edge_type='work')
schools = add_edges(G, df, relation='school', cluster='school_cluster', edge_type='school')

# Sanity checks: every person became a node, and the graph holds exactly as
# many edges of each type as the corresponding call reported.
assert G.number_of_nodes() == len(df)
assert houses == sum(1 for _u, _v, attrs in G.edges.data(data=True) if attrs['edge_type'] == 'home')
assert neighbors == sum(1 for _u, _v, attrs in G.edges.data(data=True) if attrs['edge_type'] == 'neighbor')
assert works == sum(1 for _u, _v, attrs in G.edges.data(data=True) if attrs['edge_type'] == 'work')
assert schools == sum(1 for _u, _v, attrs in G.edges.data(data=True) if attrs['edge_type'] == 'school')


Adding People Nodes


  0%|          | 72/21708 [00:00<01:01, 352.79it/s]

*************************
Adding home Edges


100%|██████████| 21708/21708 [01:02<00:00, 349.54it/s]
  1%|▏         | 72/5401 [00:00<00:15, 350.29it/s]

61634
*************************
Adding neighbor Edges


100%|██████████| 5401/5401 [00:16<00:00, 334.00it/s]
  0%|          | 1/1201 [00:00<02:11,  9.09it/s]

348835
*************************
Adding work Edges


100%|██████████| 1201/1201 [00:04<00:00, 295.28it/s]
  1%|          | 5/734 [00:00<00:28, 25.87it/s]

342525
*************************
Adding school Edges


100%|██████████| 734/734 [00:02<00:00, 355.20it/s]


209044
*************************


In [9]:
# Sizes of the school clusters; display those above the 98th percentile of sizes.
counts = df['school_cluster'].value_counts()
counts.loc[counts.gt(np.quantile(counts.values, 0.98))]


31_1_university     139
93_0_university     112
339_9_university     99
303_1_university     91
169_1_university     82
24_1_university      81
43_3_university      70
70_0_university      70
23_1_university      68
72_0                 59
41_0                 59
226_0                59
53_0                 59
311_0                58
300_1_university     58
Name: school_cluster, dtype: int64

In [10]:
# Persist the people table, which at this point also carries the
# home/work/school cluster label columns added by make_clusters.
df.to_feather("../../data/processed/clusterized_df_name_levels.feather")

In [11]:
# `nx.write_gpickle` was deprecated in networkx 2.6 and removed in 3.0; the
# documented replacement is a plain pickle dump with the highest protocol,
# which is also what write_gpickle did for an uncompressed path.
# NOTE: only unpickle this file from a trusted location; pickle can execute
# arbitrary code on load.
import pickle

with open('../../data/processed/SP_job_edu_level.gpickle', 'wb') as graph_file:
    pickle.dump(G, graph_file, protocol=pickle.HIGHEST_PROTOCOL)