In [1]:
import numpy as np
import sys
from scipy import sparse
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
import networkx as nx
from sklearn.preprocessing import StandardScaler

In [23]:
#load the data
df_migrations = pd.read_csv("NTDS_data/countyinflow1516.csv" )

# create the combined fips county number of destination
df_migrations['statefips_str'] = df_migrations['y2_statefips'].apply(lambda x : str(x).zfill(2))
df_migrations['countyfips_str'] = df_migrations['y2_countyfips'].apply(lambda x : str(x).zfill(3))
df_migrations['combined_fips-destination'] = df_migrations['statefips_str'].apply(lambda x: x.lstrip('0')) + df_migrations['countyfips_str']

# create the combined fips county number of source
df_migrations['statefips_str1'] = df_migrations['y1_statefips'].apply(lambda x : str(x).zfill(2))
df_migrations['countyfips_str1'] = df_migrations['y1_countyfips'].apply(lambda x : str(x).zfill(3))
df_migrations['combined_fips-source'] = df_migrations['statefips_str1'].apply(lambda x: x.lstrip('0')) + df_migrations['countyfips_str1']


# Cleaning the data to have only source and origin counties and unemployment rate as a new column
df_migrations = df_migrations[df_migrations['y1_statefips']<=56]
df_migrations["Unemployment rate"] = df_migrations["n1"]/(df_migrations["n2"] +df_migrations["n1"] )

# drop useless information 
df_migrations = df_migrations.drop(columns=["y1_countyname","y2_statefips", "y2_countyfips", "y1_statefips", "y1_countyfips", "y1_state", "statefips_str", "countyfips_str","statefips_str1", "countyfips_str1"])

# remove nodes where data is undefined undefined data by zero
df_migrations = df_migrations[df_migrations['n1'] != -1]

# convert combined fips to int64
df_migrations['combined_fips-destination'] = df_migrations['combined_fips-destination'].astype('int64')
df_migrations['combined_fips-source'] = df_migrations['combined_fips-source'].astype('int64')

#extracting the combined fips destination and combined fips source for graph in form of numpy arrays
df_graph= df_migrations.drop(columns=["n1","n2","agi","Unemployment rate"])

# extracting all the combinations that have happened in the US between county
dest_source = df_graph.to_numpy()

# reset index starting from 0 (because rows were dropped)
df_migrations = df_migrations.reset_index()
df_migrations = df_migrations.drop(columns=['index'])

In [46]:
nodes_index = np.unique(dest_source)
num_nodes = nodes_index.shape[0]
A_total = np.zeros((num_nodes, num_nodes))
A_returns = np.zeros((num_nodes, num_nodes))
A_exemptions = np.zeros((num_nodes, num_nodes))
count = 0
for dest, source in dest_source : 
    i = np.where(nodes_index == dest)
    j = np.where(nodes_index == source)
    total = df_migrations["n1"][count] + df_migrations["n2"][count]
    
    A_total[j[0], i[0]] = df_migrations["n1"][count] + df_migrations["n2"][count]
    A_returns[j[0], i[0]] = df_migrations["n1"][count]/total
    A_exemptions[j[0], i[0]] = df_migrations["n2"][count]/total
    count += 1

In [26]:
df_presidential_result = pd.read_csv("NTDS_data/2016_US_County_Level_Presidential_Results.csv" )
df_presidential_result = df_presidential_result.drop(columns=["Unnamed: 0","votes_dem", "votes_gop", "total_votes", "diff", "per_point_diff", "state_abbr", "county_name"])

#Sorting according to the fips code to be consistent with the migration data by IRS
df_presidential_result = df_presidential_result.sort_values(by=['combined_fips'])

#Adding a new column of the winners with -1 corresponding to democrat and 1 to republican

df_presidential_result["Winner"] =  np.where((df_presidential_result['per_dem'] > df_presidential_result['per_gop'])
                     , -1, 1)

df_presidential_result = df_presidential_result.drop(columns=["per_dem","per_gop"])

       per_dem   per_gop  combined_fips  Winner
29    0.239569  0.734358           1001       1
30    0.195653  0.773515           1003       1
31    0.466603  0.522714           1005       1
32    0.214220  0.769662           1007       1
33    0.084699  0.898519           1009       1
...        ...       ...            ...     ...
3136  0.194046  0.729428          56037       1
3137  0.600608  0.321945          56039      -1
3138  0.149261  0.764187          56041       1
3139  0.143203  0.783580          56043       1
3140  0.088182  0.869226          56045       1

[3141 rows x 4 columns]


In [96]:
# construct graph for flows with more then 45% returns 
# create adjacency matrix for flows with more then 45% returns 
def create_adjency_nonRGB_returns(threshold_returns, plot_adj_returns=False) : 
    adjacency_nonRGB_returns = A_returns.copy()
    adjacency_nonRGB_returns[adjacency_nonRGB_returns >= threshold_returns] = 1
    adjacency_nonRGB_returns[adjacency_nonRGB_returns < threshold_returns] = 0

    if plot_adj_returns : 
        plt.spy(adjacency_nonRGB_returns)
        plt.show()
        
    return adjacency_nonRGB_returns

def create_graph_nonRGB_returns(threshold_returns, plot_adj_returns=False) : 
    i = np.where(nodes_index == dest)
    graph_nonRGB_returns = nx.from_numpy_array(create_adjency_nonRGB_returns(threshold_returns, plot_adj_returns))
    nodes = np.zeros((nodes_index.shape[0], 2))
    for fips, result in df_presidential_result.values :
        print(fips)
        i = np.where(nodes_index == fips)
        index = i[0][0]
        nodes[index] = result
    print(nodes)
    node = pd.DataFrame(nodes, columns=["id", "result"])
    print(node)
    node_props = node.to_dict()
    for key in node_props:
        nx.set_node_attributes(graph_nonRGB_returns, node_props[key], key)

    nx.write_gexf(graph_nonRGB_returns, 'graph_nonRGB_returns_35.gexf')
    return graph_nonRGB_returns

In [97]:
create_graph_nonRGB_returns(0.6)

1001
1003
1005
1007
1009
1011
1013
1015
1017
1019
1021
1023
1025
1027
1029
1031
1033
1035
1037
1039
1041
1043
1045
1047
1049
1051
1053
1055
1057
1059
1061
1063
1065
1067
1069
1071
1073
1075
1077
1079
1081
1083
1085
1087
1089
1091
1093
1095
1097
1099
1101
1103
1105
1107
1109
1111
1113
1115
1117
1119
1121
1123
1125
1127
1129
1131
1133
2013
2016
2020
2050
2060
2068
2070
2090
2100
2105
2110
2122
2130
2150
2164
2170
2180
2185
2188
2195
2198
2220
2230
2240
2261
2270


IndexError: index 0 is out of bounds for axis 0 with size 0

In [104]:
nodes = np.zeros((nodes_index.shape[0], 2))
i = np.where(nodes_index == 2270)
print(i)

(array([], dtype=int64),)


In [119]:
test = nodes_index - df_presidential_result["combined_fips"].values
df_presidential_result["combined_fips"] = df_presidential_result["combined_fips"] + test
test2 = nodes_index - df_presidential_result["combined_fips"].values

In [122]:
df_presidential_result["combined_fips"]

29       1001
30       1003
31       1005
32       1007
33       1009
        ...  
3136    56037
3137    56039
3138    56041
3139    56043
3140    56045
Name: combined_fips, Length: 3141, dtype: int64

In [114]:
df_presidential_result

Unnamed: 0,combined_fips,Winner
29,1001,1
30,1003,1
31,1005,1
32,1007,1
33,1009,1
...,...,...
3136,56037,1
3137,56039,-1
3138,56041,1
3139,56043,1
