In [1]:
import numpy as np
import sys
from scipy import sparse
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
import networkx as nx
from sklearn.preprocessing import StandardScaler
%matplotlib inline
import scipy as sci
from sklearn.cluster import KMeans
import sklearn.metrics as sm
from utils import * # contains all helper functions used in the project

# Prepare data

In [46]:
# load the data
df_migrations = pd.read_csv("./NTDS_Data/countyinflow1516.csv" )

In [47]:
# load the data
df_migrations = pd.read_csv("./NTDS_Data/countyinflow1516.csv" )

# keep only summury information of each county
df_migrations = df_migrations[df_migrations['y1_countyname'].str.contains("County Total Migration")]

# create the combined fips county number 
df_migrations['statefips_str'] = df_migrations['y2_statefips'].apply(lambda x : str(x).zfill(2))
df_migrations['countyfips_str'] = df_migrations['y2_countyfips'].apply(lambda x : str(x).zfill(3))
df_migrations['combined_fips'] = df_migrations['statefips_str'].apply(lambda x: x.lstrip('0')) + df_migrations['countyfips_str']

# drop useless information 
df_migrations = df_migrations.drop(columns=["y2_statefips", "y2_countyfips", "y1_statefips", "y1_countyfips", "y1_state", "statefips_str", "countyfips_str"])

# seperate each possible migration into three dataframe 
df_migration_total = df_migrations[df_migrations['y1_countyname'].str.contains("County Total Migration-US and Foreign")]
df_migrations['y1_countyname'] = df_migrations['y1_countyname'].apply(lambda x : x if x.find("County Total Migration-US and Foreign") == -1 else "County Total Migration Both")
df_migration_us = df_migrations[df_migrations['y1_countyname'].str.contains("County Total Migration-US")]
df_migration_for = df_migrations[df_migrations['y1_countyname'].str.contains("County Total Migration-Foreign")]

# drop the name of the column 
df_migration_total = df_migration_total.drop(columns=["y1_countyname"])
df_migration_us = df_migration_us.drop(columns=["y1_countyname"])
df_migration_for = df_migration_for.drop(columns=["y1_countyname"])

# remove nodes where data is undefined undefined data by zero
df_migration_total = df_migration_total[df_migration_total['n1'] != -1]
df_migration_us = df_migration_us[df_migration_us['n1'] != -1]
df_migration_for = df_migration_for[df_migration_for['n1'] != -1]

# convert combined fips to int64
df_migration_total['combined_fips'] = df_migration_total['combined_fips'].astype('int64')
df_migration_us['combined_fips'] = df_migration_us['combined_fips'].astype('int64')
df_migration_for['combined_fips'] = df_migration_for['combined_fips'].astype('int64')

In [48]:
df_presidential_result = pd.read_csv("./NTDS_Data/2016_US_County_Level_Presidential_Results.csv" )
df_presidential_result = df_presidential_result.drop(columns=["Unnamed: 0","votes_dem", "votes_gop", "total_votes", "diff", "per_point_diff", "state_abbr", "county_name"])

In [49]:
# merge the two dataset and drop useless column, add a new column winner 
df_merged_total = pd.merge(df_migration_total, df_presidential_result, on="combined_fips", how='inner')
df_merged_us = pd.merge(df_migration_us, df_presidential_result, on="combined_fips", how='inner')
df_merged_for = pd.merge(df_migration_for, df_presidential_result, on="combined_fips", how='inner')
df_merged_total['difference'] = df_merged_total['per_dem'] - df_merged_total['per_gop']
df_merged_us['difference'] = df_merged_us['per_dem'] - df_merged_total['per_gop']
df_merged_for['difference'] = df_merged_for['per_dem'] - df_merged_total['per_gop']
df_merged_total['winner'] = df_merged_total['difference'].apply(lambda x : -1 if x > 0 else 1)
df_merged_us['winner'] = df_merged_us['difference'].apply(lambda x : -1 if x > 0 else 1)
df_merged_for['winner'] = df_merged_for['difference'].apply(lambda x : -1 if x > 0 else 1)
df_merged_total = df_merged_total.drop(columns=['difference'])
df_merged_us = df_merged_us.drop(columns=['difference'])
df_merged_for = df_merged_for.drop(columns=['difference'])

# Compute Adjacency matrix

### Division by zero in normalized lapacien computation

- some nodes do not have any connection in the graph => create new adjacency matrix A by deleting these nodes

In [6]:
X_total = df_merged_total.drop(columns=['combined_fips', 'per_dem', 'per_gop', 'winner'])
nodes_total = df_merged_total.drop(columns=['n1', 'n2', 'agi', 'per_dem', 'per_gop']).values
X_total['agi'] = (X_total['agi'] - X_total['agi'].mean()) / X_total['agi'].std()
X_total['prop_ret/exempt'] = X_total['n1'] / X_total['n2']
X_total = X_total.drop(columns=['n1', 'n2'])
adjacency_RGB_total = epsilon_similarity_graph(X_total, sigma=0.5284353963018223*0.1, epsilon=0.2)
#plt.spy(adjacency_RGB_total)
#plt.show()

# Randomly choose some points for prediction use

In [7]:
# prepare A and x(signal)
A = adjacency_RGB_total.copy()
#rows_to_be_deleted = np.where(np.sum(adjacency_RGB_total,1) == 0)
#A = np.delete(A, rows_to_be_deleted,0)
#A = np.delete(A, rows_to_be_deleted,1)
y = df_merged_total["winner"].copy()
#y = np.delete(np.array(y), rows_to_be_deleted,0) 
X_total = X_total.values
#X_total = np.delete(np.array(X_total), rows_to_be_deleted,0)
# compute lamb and U
laplacian = compute_laplacian(A, normalize=True)
lamb, U = spectral_decomposition(laplacian)

In [8]:
# prepare filter
n_nodes = A.shape[0]
ideal_lp = np.ones((n_nodes,)) 
ideal_lp[lamb >= 0.1] = 0   # to tune
# apply filter
x_lp = ideal_graph_filter(y.copy(),ideal_lp,U)

2963


In [9]:
iters = 10
n = int(len(y)*0.2)

In [10]:
accuracy_mean, accuracy_var = pred_iteration(A,iters, y, n, x_lp)

The mean is  0.9297656318913955
The variance is  0.005463285965821614


# FOR

In [50]:
X_for = df_merged_for.drop(columns=['combined_fips', 'per_dem', 'per_gop', 'winner'])
nodes_for = df_merged_for.drop(columns=['n1', 'n2', 'agi', 'per_dem', 'per_gop']).values
X_for['agi'] = (X_for['agi'] - X_for['agi'].mean()) / X_for['agi'].std()
X_for['prop_ret/exempt'] = X_for['n1'] / X_for['n2']
X_for = X_for.drop(columns=['n1', 'n2'])
adjacency_RGB_for = epsilon_similarity_graph(X_for, sigma=0.6675252605174871*0.1, epsilon=0.5)
#plt.spy(adjacency_RGB_for)
#plt.show()

In [51]:
# prepare A and x(signal)
A_for = adjacency_RGB_for.copy()
y_for = df_merged_for["winner"].copy()

X_for = X_for.values

# compute lamb and U
laplacian_for = compute_laplacian(A_for, normalize=True)
lamb_for, U_for = spectral_decomposition(laplacian_for)

In [52]:
# prepare filter
ideal_lp_for = np.ones((A_for.shape[0],)) 
ideal_lp_for[lamb_for >= 0.1] = 0   # to tune
# apply filter
x_lp_for = ideal_graph_filter(y_for.copy(),ideal_lp_for,U_for)

455


In [53]:
iters_for = 20
n_for = int(len(y_for)*0.2)

In [54]:
accuracy_mean_for, accuracy_var_for = pred_iteration(A_for,iters_for, y_for, n_for, x_lp_for)

455
0      1
1      1
2      1
3      1
4      1
      ..
450    1
451    1
452    1
453    1
454    1
Name: winner, Length: 455, dtype: int64
-99
The mean is  0.8486809408603457
The variance is  0.03414853457970139


# US

In [16]:
X_us = df_merged_us.drop(columns=['combined_fips', 'per_dem', 'per_gop', 'winner'])
nodes_us = df_merged_us.drop(columns=['n1', 'n2', 'agi', 'per_dem', 'per_gop']).values
X_us['agi'] = (X_us['agi'] - X_us['agi'].mean()) / X_us['agi'].std()
X_us['prop_ret/exempt'] = X_us['n1'] / X_us['n2']
X_us = X_us.drop(columns=['n1', 'n2'])
adjacency_RGB_us = epsilon_similarity_graph(X_us, sigma=0.5310405705207334*0.1, epsilon=0.5)
#plt.spy(adjacency_RGB_us)
#plt.show()

In [17]:
# prepare A and x(signal)
A_us = adjacency_RGB_us.copy()
y_us = df_merged_us["winner"].copy()

X_us = X_us.values
# compute lamb and U
laplacian_us = compute_laplacian(A_us, normalize=True)
lamb_us, U_us = spectral_decomposition(laplacian_us)

In [18]:
# prepare filter
ideal_lp_us = np.ones((A_us.shape[0],)) 
ideal_lp_us[lamb_us >= 0.1] = 0   # to tune
# apply filter
x_lp_us = ideal_graph_filter(y_us.copy(),ideal_lp_us,U_us)

2963


In [19]:
iters_us = 1
n_us = int(len(y_us)*0.2)

In [20]:
accuracy_mean_us, accuracy_var_us = pred_iteration(A_us,iters_us, y_us, n_us, x_lp_us)

The mean is  0.9283018867924528
The variance is  0.0


In [None]:
# comparer avec la version sans filtrage pour avoir combien on améliore
# avec quel method de coup

# GCN

In [21]:
import time

import networkx as nx
from sklearn.linear_model import LogisticRegression

import torch
import torch.nn as nn
import torch.nn.functional as F

import dgl.function as fn
from dgl import DGLGraph
from dgl.data.citation_graph import load_cora

np.random.seed(0)
torch.manual_seed(1)

<torch._C.Generator at 0x7f267024b690>

In [22]:
mean_for,var_for = apply_gcn(1,X_for,y_for,A_for,laplacian_for,lamb_for,U_for)

Computing
100 %
The mean of f1 score is  0.7619047619047619
The variance of f1 score is  0.0


In [23]:
mean_total,var_total = apply_gcn(1,X_total,y,A,laplacian,lamb,U)

Computing
100 %
The mean of f1 score is  0.9341983317886932
The variance of f1 score is  0.0


In [24]:
mean_us,var_us = apply_gcn(1,X_us,y_us,A_us,laplacian_us,lamb_us,U_us)

Computing
100 %
The mean of f1 score is  0.929368029739777
The variance of f1 score is  0.0


# Result 
accuracy: `mean-var` for 20 iterations

              Total         For           Us
            
`GCN :     0.925-0.008  0.835-0.025  0.929-0.008`

`Fourier : 0.931-0.008  0.855-0.023  0.928-0.007`




In [None]:
# overfit? => comparer les ecarts de scores entre train set et test set
# ne petmettre pas de trouver les patterns


# metric when data note balanced: F1 score

# Another adjacency matrix with migration flow

In [18]:
df_migrations = pd.read_csv("NTDS_Data/countyinflow1516.csv" )


# create the combined fips county number of destination
df_migrations['statefips_str'] = df_migrations['y2_statefips'].apply(lambda x : str(x).zfill(2))
df_migrations['countyfips_str'] = df_migrations['y2_countyfips'].apply(lambda x : str(x).zfill(3))
df_migrations['combined_fips-destination'] = df_migrations['statefips_str'].apply(lambda x: x.lstrip('0')) + df_migrations['countyfips_str']

# create the combined fips county number of source
df_migrations['statefips_str1'] = df_migrations['y1_statefips'].apply(lambda x : str(x).zfill(2))
df_migrations['countyfips_str1'] = df_migrations['y1_countyfips'].apply(lambda x : str(x).zfill(3))
df_migrations['combined_fips-source'] = df_migrations['statefips_str1'].apply(lambda x: x.lstrip('0')) + df_migrations['countyfips_str1']

# Experimenting with parametres(not important)
#new_df = df[(df['WindSpeed']>4) & (df['Temperature']>30)]
#df_migrations2 = df_migrations[df_migrations['y1_countyname'].str.contains("County Total Migration-US and Foreign")]
#df_migrations3 = df_migrations[(df_migrations['y1_statefips']== 96) & (df_migrations['y2_countyfips']!= 0)]

#Cleaning the data to have only source and origin counties and unemployment rate as a new column
df_migrations = df_migrations[df_migrations['y1_statefips']<=56]
df_migrations["Unemployment rate"] = df_migrations["n1"]/(df_migrations["n2"] +df_migrations["n1"] )
df_migrations1 = df_migrations[df_migrations['combined_fips-destination'] ==df_migrations['combined_fips-source']]  # keeping Unemployment rate of non migrants
  


 #drop useless information 
df_migrations = df_migrations.drop(columns=["y1_countyname","y2_statefips", "y2_countyfips", "y1_statefips", "y1_countyfips", "y1_state", "statefips_str", "countyfips_str","statefips_str1", "countyfips_str1"])
df_migrations1 = df_migrations1.drop(columns=["y1_countyname","y2_statefips", "y2_countyfips", "y1_statefips", "y1_countyfips", "y1_state", "statefips_str", "countyfips_str","statefips_str1", "countyfips_str1"])
df_migrations1 =df_migrations1.drop(columns=["agi","combined_fips-source"])



# remove nodes where data is undefined undefined data by zero
df_migrations = df_migrations[df_migrations['n1'] != -1]
df_migrations1 = df_migrations1[df_migrations1['n1'] != -1]
#df_migrations3 = df_migrations3[df_migrations3['n1'] != -1]

# convert combined fips to int64
df_migrations['combined_fips-destination'] = df_migrations['combined_fips-destination'].astype('int64')
df_migrations['combined_fips-source'] = df_migrations['combined_fips-source'].astype('int64')
df_migrations1['combined_fips-destination'] = df_migrations1['combined_fips-destination'].astype('int64')

#print(df_migrations1)
df_migrations1= df_migrations1.drop(columns=["n1","n2","combined_fips-destination"])

#extracting the combined fips destination and combined fips source for graph in form of numpy arrays
df_graph= df_migrations.drop(columns=["n1","n2","agi","Unemployment rate"])

arr = df_graph.to_numpy()

In [19]:
df_presidential_result = pd.read_csv("NTDS_Data/2016_US_County_Level_Presidential_Results.csv" )
df_presidential_result = df_presidential_result.drop(columns=["Unnamed: 0","votes_dem", "votes_gop", "total_votes", "diff", "per_point_diff", "state_abbr", "county_name"])

#Sorting according to the fips code to be consistent with the migration data by IRS
df_presidential_result = df_presidential_result.sort_values(by=['combined_fips'])

#Adding a new column of the winners with -1 corresponding to democrat and 1 to republican

df_presidential_result["Winner"] =  np.where((df_presidential_result['per_dem'] > df_presidential_result['per_gop'])
                     , -1, 1)

#print(df_presidential_result)
df_presidential_result = df_presidential_result.drop(columns=["per_dem","per_gop","combined_fips"])


In [21]:
possible_nodes = np.unique(arr)
#print(possible_nodes)
A_migr = np.zeros((len(possible_nodes), len(possible_nodes))) 

for dest, source in arr : 
     
    i = np.where(possible_nodes == dest)
    #print(i)
    j = np.where(possible_nodes == source)
    #print(j)
    A_migr[j[0], i[0]] = 1
 

# Creating a dictionary of attribute of Unemployment rate
#y_unemployment_rate = df_migrations1.to_numpy()
#d = dict(enumerate(z.flatten(), 0))

#Creating a dictionary of attribute of county election result
#y_presidential_result = df_presidential_result.to_numpy()
#d2 = dict(enumerate(z.flatten(), 0))

In [38]:
# prepare A and x(signal)
A_migration = A_migr.copy()
# define two different targets for migration adjacency matrix
y_unemployment_rate = df_migrations1["Unemployment rate"].copy()
y_presidential_result = df_presidential_result["Winner"].copy()
# compute lamb and U
laplacian_migration = compute_laplacian(A_migration, normalize=True)
lamb_migration, U_migration = spectral_decomposition(laplacian_migration)

In [39]:
# prepare filter
ideal_lp_migration = np.ones((A_migration.shape[0],)) 
ideal_lp_migration[lamb_migration >= 0.1] = 0   # to tune
# apply filter
x_lp_migration = ideal_graph_filter(y_unemployment_rate.copy(),ideal_lp_migration,U_migration)

3141


In [40]:
iters = 1
n = int(len(y_presidential_result)*0.2)

In [64]:

# Evaluation the prediction using Fourier analysis
def pred_iteration(A,iters, y, n, filtered_x_lp):
    f1_scores =[]
    y_ = y.copy() # this is training data
    
    for i in range(iters):
        # choose randomly n indices to masking, for evaluating use
        test_idx = np.random.choice(np.arange(len(y_)),n,replace = False)
        # masking some winner
        y_[2828]=0
        # prepare ground truth labels
        truth = (y[test_idx]).values
        # prepare for the prediction
        pred = []
        for i in test_idx:
            l = np.where(A[i] !=0)[0]  # searching neigbhours for a masked node
            if(len(l)!= 0):
                tmp = 0 # filtered_x_lp[i] => add initial node value ? or without mean
                for j in l:
                    # sum over values from neighbour nodes
                    tmp += filtered_x_lp[j]
                # compute mean according to total number of neighbours
                pred.append(tmp/len(l))   
            else:
                # if the node has no neighbour then the value will be its signal values
                pred.append(filtered_x_lp[i])

        # thresholding over the prediction so that only 1 or -1 will be returned
        pred_thres = np.array(pred)
        pred_thres [pred_thres >0 ] = 1
        pred_thres [pred_thres <0 ] = -1

        # compute the f1 score of the prediction and add to scores list
        f1_scores.append(sm.f1_score(truth,pred_thres))
        
    # compute mean of all obtained scores
    mean = np.mean(f1_scores)
    # compute variance of all obtained scores
    var = np.std(f1_scores)
    print("The mean is ",mean)
    print("The variance is ",var)
    return mean,var

In [66]:
y_unemployment_rate

10       0.302760
31       0.316083
81       0.310380
98       0.300111
115      0.293537
           ...   
86255    0.302733
86278    0.329863
86294    0.289025
86310    0.309895
86322    0.317928
Name: Unemployment rate, Length: 3141, dtype: float64

In [69]:
y_presidential_result

29      1
30      1
31      1
32      1
33      1
       ..
3136    1
3137   -1
3138    1
3139    1
3140    1
Name: Winner, Length: 3141, dtype: int64

In [65]:
accuracy_mean_migration, accuracy_var_migration = pred_iteration(A_migration,iters, y_unemployment_rate, n, x_lp_migration)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
accuracy_mean_migration, accuracy_var_migration = pred_iteration(A_migration,iters, y_presidential_result, n, x_lp_migration)

In [None]:
mean_unemployment_rate,var_unemployment_rate = apply_gcn(iters,X_us,unemployment_rate,A_migration,laplacian_migration,lamb_migration,U_migration)

In [None]:
mean_presidential_result,var_presidential_result = apply_gcn(iters,X_us,y_presidential_result,A_migration,laplacian_migration,lamb_migration,U_migration)