In [2]:
import networkx as nx

from node2vec import Node2Vec

from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Toy graph: two complete 4-node graphs (nodes 0-3 and 6-9) joined by a
# 2-node path (nodes 4-5), giving 10 nodes in total.
G = nx.barbell_graph(m1=4,m2=2)
# nx.draw(G,with_labels = True,node_color='b',node_size=500);


In [4]:
# Show the edge list of the toy graph.
G.edges

EdgeView([(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (6, 8), (6, 9), (7, 8), (7, 9), (8, 9)])

In [5]:
# Show the node ids of the toy graph (integers 0..9).
G.nodes

NodeView((0, 1, 2, 3, 4, 5, 6, 7, 8, 9))

In [39]:
# Neighbours of node 0, materialised as a list for display.
list(G.neighbors(0))

[1, 2, 3]

In [14]:
def embed_nodes(
    G, 
    dimensions = 2, 
    walk_length = 20, 
    num_walks = 10,
    workers = 4,
    window = 10,
    min_count = 1
):
    """Learn a node2vec embedding of ``G`` and return one row per node.

    Row ``i`` of the result is the embedding of the ``i``-th node yielded by
    ``G.nodes()``, so the output can be indexed positionally alongside the
    graph's own node order.

    Args:
        G: graph to embed; must expose ``nodes()`` and be accepted by
            ``Node2Vec``.
        dimensions: size of each embedding vector.
        walk_length: forwarded to ``Node2Vec`` as ``walk_length``.
        num_walks: forwarded to ``Node2Vec`` as ``num_walks``.
        workers: forwarded to ``Node2Vec`` as ``workers``.
        window: forwarded to ``node2vec.fit`` as ``window``.
        min_count: forwarded to ``node2vec.fit`` as ``min_count``.

    Returns:
        numpy array with one row per node of ``G`` (in ``G.nodes()`` order)
        and ``dimensions`` columns.

    Raises:
        KeyError: if a node of ``G`` has no vector in the trained model
            (presumably possible when ``min_count`` > 1 drops rare nodes).
    """

    # Generate walks
    node2vec = Node2Vec(
        G, dimensions=dimensions, walk_length=walk_length, 
        num_walks=num_walks, workers=workers, quiet=False)
    # Learn embeddings 
    model = node2vec.fit(window=window, min_count=min_count)

    # model.wv.vectors is ordered by the model's own vocabulary
    # (model.wv.index_to_key), not by node id, so returning it directly would
    # silently misalign rows and nodes. The vocabulary keys are the node ids
    # as strings, so look every node up explicitly in graph order.
    return np.asarray([model.wv[str(node)] for node in G.nodes()])

In [6]:
# Fit a 2-dimensional node2vec embedding of the toy graph.

# Step 1: sample the random walks over G.
node2vec = Node2Vec(
    G,
    dimensions=2,
    walk_length=20,
    num_walks=10,
    workers=4,
    quiet=False,
)
# Step 2: learn the embeddings from those walks.
model = node2vec.fit(window=10, min_count=1)

Computing transition probabilities: 100%|██████████| 10/10 [00:00<00:00, 2879.52it/s]
Generating walks (CPU: 3): 100%|██████████| 2/2 [00:00<00:00, 674.38it/s]

Generating walks (CPU: 2): 100%|██████████| 3/3 [00:00<00:00, 735.89it/s]
Generating walks (CPU: 1): 100%|██████████| 3/3 [00:00<00:00, 691.44it/s]


In [7]:
# Inspect the raw attributes of the trained keyed vectors.
model.wv.__dict__

# Each entry of index_to_key is a node id stored as a string (a "word" from
# the model's point of view); key_to_index maps each key back to its position.

{'vector_size': 2,
 'index_to_key': ['3', '6', '9', '1', '0', '8', '7', '2', '4', '5'],
 'next_index': 0,
 'key_to_index': {'3': 0,
  '6': 1,
  '9': 2,
  '1': 3,
  '0': 4,
  '8': 5,
  '7': 6,
  '2': 7,
  '4': 8,
  '5': 9},
 'vectors': array([[-1.0795068 ,  0.62973034],
        [-0.8024769 ,  1.0214614 ],
        [-1.338594  ,  0.09568851],
        [-0.7111508 ,  1.0677968 ],
        [-1.1157986 ,  0.35030797],
        [-0.9055779 ,  0.5786976 ],
        [-1.0032897 ,  0.72942185],
        [-1.0784491 ,  0.37565202],
        [-0.9536215 ,  0.64058024],
        [-1.4322965 ,  0.09616857]], dtype=float32),
 'norms': None,
 'expandos': {'count': array([277, 245, 211, 207, 201, 199, 194, 184, 145, 137]),
  'sample_int': array([395961944, 423114646, 458862063, 463669521, 471162990, 473740015,
         480365870, 494465721, 563659255, 581637179], dtype=uint32)},
 'mapfile_path': None,
 'vectors_lockf': array([1.], dtype=float32)}

In [8]:
# Raw embedding matrix: one 2-D row per node of the toy graph.
# NOTE(review): rows presumably follow index_to_key order, not node-id order — confirm.
model.wv.vectors

array([[-1.0795068 ,  0.62973034],
       [-0.8024769 ,  1.0214614 ],
       [-1.338594  ,  0.09568851],
       [-0.7111508 ,  1.0677968 ],
       [-1.1157986 ,  0.35030797],
       [-0.9055779 ,  0.5786976 ],
       [-1.0032897 ,  0.72942185],
       [-1.0784491 ,  0.37565202],
       [-0.9536215 ,  0.64058024],
       [-1.4322965 ,  0.09616857]], dtype=float32)

In [9]:
# create clusters based on embedding


def get_even_clusters(X, cluster_size, random_state=None):
    """Cluster the rows of ``X`` into groups of at most ``cluster_size`` points.

    Runs KMeans with ``ceil(len(X) / cluster_size)`` clusters, then gives every
    cluster centre ``cluster_size`` "slots" and solves a minimum-cost
    assignment of points to slots. Each cluster therefore receives at most
    ``cluster_size`` points while the total point-to-centre distance is
    minimised.

    Args:
        X: array-like of shape (num_samples, num_features).
        cluster_size: maximum number of points per cluster; a positive integer.
        random_state: forwarded to ``KMeans`` so results can be reproduced;
            ``None`` keeps KMeans' default (unseeded) behaviour.

    Returns:
        Integer numpy array of length ``len(X)``; entry ``i`` is the cluster
        label assigned to row ``i`` of ``X``.

    Raises:
        ValueError: if ``cluster_size`` is smaller than 1.

    Note:
        The assignment step builds a dense distance matrix with
        ``len(X)`` rows and ``n_clusters * cluster_size`` columns, so the cost
        grows quickly with the number of samples.
    """
    X = np.asarray(X)
    if cluster_size < 1:
        raise ValueError(
            "cluster_size must be a positive integer, got %r" % (cluster_size,)
        )
    if len(X) == 0:
        # Nothing to cluster; KMeans cannot be fitted with zero clusters.
        return np.empty(0, dtype=int)

    n_clusters = int(np.ceil(len(X) / cluster_size))
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(X)

    # Duplicate each centre cluster_size times: slot j belongs to cluster
    # j // cluster_size, which caps the number of points per cluster.
    slots = np.repeat(kmeans.cluster_centers_, cluster_size, axis=0)
    distance_matrix = cdist(X, slots)

    # There are at least as many slots as points, so every row is matched and
    # the column indices come back in row order.
    _, slot_of_point = linear_sum_assignment(distance_matrix)
    clusters = slot_of_point // cluster_size

    return clusters

In [32]:
# Split the 10 toy embeddings into clusters of at most 5 points each.
# NOTE(review): label i refers to row i of model.wv.vectors, which presumably
# follows index_to_key order rather than node-id order — confirm before mapping to nodes.
get_even_clusters(model.wv.vectors, cluster_size=5)



array([0, 1, 0, 1, 0, 1, 1, 0, 1, 0])

In [1]:


from seirsplus.models import *
from seirsplus.networks import *
from seirsplus.sim_loops import *
from seirsplus.utilities import *

In [18]:
# Generate a 10,000-person contact network from US household data, plus one
# variant at distancing scale 0.7.
demographic_graphs, individual_ageGroups, households = generate_demographic_contact_network(
    N=10000,
    demographic_data=household_country_data('US'),
    distancing_scales=[0.7],
    isolation_groups=[],
)

# Pull out the two graphs by their keys in the returned mapping.
G_baseline = demographic_graphs['baseline']
G_quarantine = demographic_graphs['distancingScale0.7']

Generated overall age distribution:
0-9: 0.1091	(-0.0119 from target)
10-19: 0.1231	(-0.0079 from target)
20-29: 0.1404	(0.0034 from target)
30-39: 0.1346	(0.0016 from target)
40-49: 0.1254	(0.0014 from target)
50-59: 0.1351	(0.0041 from target)
60-69: 0.1222	(0.0072 from target)
70-79: 0.0705	(0.0005 from target)
80+: 0.0396	(0.0016 from target)

Generated household size distribution:
1: 0.2836	(-0.0001 from target)
2: 0.3334	(-0.0117 from target)
3: 0.1511	(0.0004 from target)
4: 0.1315	(0.0039 from target)
5: 0.0661	(0.0084 from target)
6: 0.0228	(0.0002 from target)
7: 0.0114	(-0.0011 from target)
Num households: 4037
mean household size: 2.448393594

Generating graph for 0-9...


  return adjacency_matrix(G, nodelist, dtype, weight)


Generating graph for 10-19...
Generating graph for 20-59...
Generating graph for 60+...


In [19]:
# Embed the 10,000-node baseline contact network with the default settings
# (2 dimensions); the walk generation dominates the runtime here.
embedding = embed_nodes(G_baseline)

Computing transition probabilities: 100%|██████████| 10000/10000 [00:44<00:00, 224.80it/s]
Generating walks (CPU: 1): 100%|██████████| 3/3 [00:08<00:00,  2.87s/it]
Generating walks (CPU: 2): 100%|██████████| 3/3 [00:08<00:00,  2.98s/it]
Generating walks (CPU: 3): 100%|██████████| 2/2 [00:05<00:00,  2.82s/it]
Generating walks (CPU: 4): 100%|██████████| 2/2 [00:05<00:00,  2.80s/it]


In [22]:
# Group the embedded nodes into clusters of at most 5 points each.
# Label i in the result refers to row i of `embedding`.
get_even_clusters(embedding, cluster_size=5)



array([1104,  144,  789, ...,   74,   32,  179])

In [None]:
# Clustering time (presumably get_even_clusters wall-clock), by input shape:
#   (1000, 2):  ~6 sec
#   (10000, 2): ~7 min