In [145]:
import pandas as pd
import json
from networkx import random_geometric_graph, set_node_attributes,betweenness_centrality
from pyvis.network import Network
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

In [146]:
august_df = pd.read_csv('data/during_august_merged.csv')

In [147]:
august_df = august_df.drop(august_df[august_df['geoid_o'] == august_df['geoid_d']].index)

In [148]:
august_df = august_df.sort_values(by=['pop_flows'], ascending=False)

In [149]:
august_df['pop_flows_normalized'] = august_df['pop_flows'] / august_df['pop_o']

In [150]:
august_df.pop_flows = august_df.pop_flows.astype(int)

In [151]:
august_migration_df = august_df[['county_o', 'county_d', 'pop_flows', 'pop_flows_normalized']]

august_migration_json = august_migration_df.to_json(orient='records', indent=4)

with open('table.json', 'w') as f:
    f.write(august_migration_json)


In [152]:
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)

In [153]:
august_migration_df.head(5)

Unnamed: 0,county_o,county_d,pop_flows,pop_flows_normalized
963,Los Angeles County,Orange County,677050,0.067758
969,Los Angeles County,San Bernardino County,369872,0.037016
1794,San Bernardino County,Los Angeles County,335835,0.153825
1808,San Bernardino County,Riverside County,256839,0.117642
966,Los Angeles County,Riverside County,185927,0.018607


In [154]:
august_migration_df = august_migration_df.drop(august_migration_df[august_migration_df['pop_flows_normalized'] <= 0.003].index)

august_migration_df

Unnamed: 0,county_o,county_d,pop_flows,pop_flows_normalized
963,Los Angeles County,Orange County,677050,0.067758
969,Los Angeles County,San Bernardino County,369872,0.037016
1794,San Bernardino County,Los Angeles County,335835,0.153825
1808,San Bernardino County,Riverside County,256839,0.117642
966,Los Angeles County,Riverside County,185927,0.018607
...,...,...,...,...
59,Alpine County,Amador County,4,0.003486
86,Alpine County,Sierra County,4,0.003380
69,Alpine County,Los Angeles County,4,0.003342
62,Alpine County,Contra Costa County,4,0.003322


In [155]:
august_migration_df.pop_flows_normalized = august_migration_df.pop_flows_normalized * 100

In [156]:
august_migration_df

Unnamed: 0,county_o,county_d,pop_flows,pop_flows_normalized
963,Los Angeles County,Orange County,677050,6.775764
969,Los Angeles County,San Bernardino County,369872,3.701603
1794,San Bernardino County,Los Angeles County,335835,15.382454
1808,San Bernardino County,Riverside County,256839,11.764164
966,Los Angeles County,Riverside County,185927,1.860717
...,...,...,...,...
59,Alpine County,Amador County,4,0.348644
86,Alpine County,Sierra County,4,0.338021
69,Alpine County,Los Angeles County,4,0.334157
62,Alpine County,Contra Costa County,4,0.332226


In [157]:
nodes_df = pd.DataFrame({'binary': pd.unique(august_migration_df[['county_o', 'county_d']].values.ravel('K'))})

network_data = {
    "directed": True,
    "nodes": [],
    "links": [],
    "multigraph": False,
    "graph": []
}

# initiate nodes
for idx, node in nodes_df.iterrows():
    network_data['nodes'].append({
        "id": idx,
        "binary": node['binary'],
        "value": None
    })

# links
for idx, row in august_migration_df.iterrows():
    source_idx = nodes_df[nodes_df['binary'] == row['county_o']].index[0]
    target_idx = nodes_df[nodes_df['binary'] == row['county_d']].index[0]

    network_data['links'].append({
        "source": source_idx,
        "target": target_idx
    })

    # set values
    network_data['nodes'][source_idx]['value'] = float(row['pop_flows_normalized'])

nodes = network_data['nodes']

# dictionary to store the sampled nodes and links
sampled_network_data = {
    "directed": network_data["directed"],
    "nodes": nodes,
    "links": [],
    "multigraph": network_data["multigraph"],
    "graph": network_data["graph"]
}

node_id_mapping = {node["id"]: idx for idx, node in enumerate(nodes)}

# re-index
for node in sampled_network_data["nodes"]:
    old_id = node["id"]
    new_id = node_id_mapping[old_id]
    node["id"] = new_id

for link in network_data["links"]:
    source_id = link["source"]
    target_id = link["target"]

    if source_id in node_id_mapping and target_id in node_id_mapping:
        sampled_network_data["links"].append({
                "source": node_id_mapping[source_id],
                "target": node_id_mapping[target_id]
            })


In [158]:
counties = ['Mendocino County', 'Humboldt County', 'Trinity County', 'Tehama County', 'Glenn County', 'Lake County', 'Colusa County']

# Filter links based on source nodes in the specified counties
filtered_links = [link for link in sampled_network_data['links'] if sampled_network_data['nodes'][link['source']]['binary'] in counties]

# Update the network_data with the filtered links
sampled_network_data['links'] = filtered_links

target_ids = set(link['target'] for link in sampled_network_data['links'])
sampled_network_data['nodes'] = [node for node in sampled_network_data['nodes'] if node['id'] in target_ids]


In [159]:
# Create a mapping of old id values to new id values
id_mapping = {node["id"]: index for index, node in enumerate(sampled_network_data["nodes"])}

# Update the source and target values in links
for link in sampled_network_data["links"]:
    link["source"] = id_mapping.get(link["source"])
    link["target"] = id_mapping.get(link["target"])

In [160]:
sampled_network_data

{'directed': True,
 'nodes': [{'id': 0,
   'binary': 'Los Angeles County',
   'value': 0.3980144835613161},
  {'id': 1, 'binary': 'San Bernardino County', 'value': 0.5091677474252384},
  {'id': 2, 'binary': 'Contra Costa County', 'value': 0.30652767421237137},
  {'id': 4, 'binary': 'Sacramento County', 'value': 0.31858602704462363},
  {'id': 5, 'binary': 'Alameda County', 'value': 0.34314220731357276},
  {'id': 6, 'binary': 'Santa Clara County', 'value': 0.30154532381496485},
  {'id': 7, 'binary': 'Placer County', 'value': 0.3165718293814662},
  {'id': 8, 'binary': 'San Mateo County', 'value': 0.34571670532972565},
  {'id': 9, 'binary': 'Kern County', 'value': 0.3198089046321336},
  {'id': 11, 'binary': 'San Joaquin County', 'value': 0.32579853279678217},
  {'id': 13, 'binary': 'El Dorado County', 'value': 0.3298465422756693},
  {'id': 14, 'binary': 'Yolo County', 'value': 0.30361228461268747},
  {'id': 15, 'binary': 'Merced County', 'value': 0.3191243903079072},
  {'id': 18, 'binary':

In [161]:
json_data = json.dumps(sampled_network_data, indent=4, ensure_ascii=False, separators=(",", ": "), cls=NpEncoder)

with open('data/network_data_sampled.json', 'w') as f:
    f.write(json_data)