In [15]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
from os import listdir, environ
from os.path import isfile, join
from pathlib import Path
import scipy.stats as stats
from tqdm import tqdm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import RFE
from sklearn.base import BaseEstimator, RegressorMixin
from mlxtend.feature_selection import SequentialFeatureSelector
from scipy.stats import rankdata
import warnings
warnings.filterwarnings('ignore')

In [16]:
# Relative locations of the two input CSVs read by the loading cell below.
# Judging by the file names: election results aggregated per weighting area, and a
# queen adjacency matrix between weighting areas — TODO confirm against the data.
path_election = '../../data/ronaldo/data/TSE/RS/data/aggregated_by_weighting_area.csv'
path_matrix = '../../data/ronaldo/data/IBGE/adjacency_matrices/weighting_areas/queen.csv'

In [17]:
# Read both inputs. The adjacency matrix is keyed by its 'Unnamed: 0' column
# and its rows are ordered by that key (columns keep the CSV order).
data_election = pd.read_csv(path_election)
adjacency_matrix = (
    pd.read_csv(path_matrix)
    .set_index('Unnamed: 0')
    .sort_index()
)

# Optional: uncomment one line to restrict the analysis to a single municipality.
#data_election = data_election[data_election['NM_MUNICIPIO']=='PORTO ALEGRE']
#data_election = data_election[data_election['NM_MUNICIPIO']=='RIO GRANDE']
#data_election = data_election[data_election['NM_MUNICIPIO']=='CAXIAS DO SUL']

In [18]:
# Key the election table by 'Cod_ap' and order the rows by that key.
data_election = data_election.set_index('Cod_ap').sort_index()

In [19]:
# Two-candidate vote shares per area: Haddad's share of the combined count,
# and Bolsonaro's share as its complement (the two columns sum to 1).
candidate_counts = data_election[['JAIR BOLSONARO', 'FERNANDO HADDAD']]
haddad_share = candidate_counts['FERNANDO HADDAD'] / (
    candidate_counts['FERNANDO HADDAD'] + candidate_counts['JAIR BOLSONARO']
)
votes = pd.DataFrame({
    'JAIR BOLSONARO': 1 - haddad_share,
    'FERNANDO HADDAD': haddad_share,
})

In [20]:
# Square matrix of pairwise city-block distances between areas, computed on the
# single 'JAIR BOLSONARO' share column (i.e. absolute differences in that share).
feature_votes = votes['JAIR BOLSONARO'].values.reshape(-1, 1)
votes_sm = pd.DataFrame(
    distance.squareform(distance.pdist(feature_votes, 'cityblock')),
    index=votes.index,
    columns=votes.index,
)

In [21]:
# Areas that appear in the adjacency matrix but have no row in the vote data.
wa_remove = [wa for wa in adjacency_matrix.index if wa not in votes_sm.index]

# Row labels are the area codes, column labels are their string form (CSV header),
# so the same areas are removed from both axes in one call.
adjacency_matrix = adjacency_matrix.drop(
    index=wa_remove,
    columns=[str(wa) for wa in wa_remove],
)

# Every later use multiplies `adjacency_matrix.values` element-wise with a distance
# matrix ordered like `votes_sm`. Only the rows were sorted after loading; the columns
# are still in CSV order. Align both axes explicitly so the positional product cannot
# silently pair the wrong areas, and fail loudly if an area is missing.
expected_columns = votes_sm.index.astype(str)
missing_rows = votes_sm.index.difference(adjacency_matrix.index)
missing_columns = expected_columns.difference(adjacency_matrix.columns)
if len(missing_rows) or len(missing_columns):
    raise KeyError(
        'adjacency matrix lacks areas present in the votes: '
        'rows %s, columns %s' % (list(missing_rows), list(missing_columns))
    )
adjacency_matrix = adjacency_matrix.loc[votes_sm.index, expected_columns]

In [22]:
# Element-wise (positional) product of the distance matrix with the adjacency
# matrix; with 0/1 adjacency entries this keeps only distances between neighbours
# — TODO confirm the matrix is binary.
votes_sm = pd.DataFrame(
    votes_sm.values * adjacency_matrix.values,
    index=votes_sm.index,
    columns=votes_sm.index,
)

In [23]:
# "Strangeness" of an area = column sum of its masked distances.
votes_strangness = votes_sm.sum(axis=0)
# Ordinal ranking of the negated sums: rank 1 goes to the largest strangeness.
rank_votes = pd.DataFrame(
    rankdata(-votes_strangness, method='ordinal'),
    index=votes_strangness.index,
)

In [24]:

def forward_feature_selection(census_data, rank_votes, n):
    """Greedy forward selection of census columns against a reference ranking.

    In each round every not-yet-selected column is tentatively added to the
    current selection. For that candidate set the pairwise city-block distances
    between rows are computed, multiplied element-wise with the notebook-level
    ``adjacency_matrix``, summed per column and turned into an ordinal ranking
    (rank 1 = largest sum). The column whose ranking has the smallest
    ``spearman_footrule_normalized`` distance to ``rank_votes`` is kept.

    Parameters
    ----------
    census_data : pandas.DataFrame
        One row per area, one column per candidate feature.
    rank_votes : pandas.DataFrame
        Reference ranking in column ``0``, indexed like ``census_data``.
    n : int
        Number of features to select. Values larger than the number of columns
        are capped (previously this raised ``IndexError``).

    Returns
    -------
    list
        Selected column labels, in the order they were chosen.

    Notes
    -----
    Depends on the globals ``adjacency_matrix`` and
    ``spearman_footrule_normalized``. The adjacency values are applied
    positionally, so ``adjacency_matrix`` must have its rows and columns in the
    same order as ``census_data.index``.
    """
    # Loop-invariant: fetch the mask once instead of once per candidate.
    adjacency_values = adjacency_matrix.values
    area_index = census_data.index

    # There can never be more rounds than there are columns to pick from.
    n_rounds = min(n, len(census_data.columns))

    feature_set = []
    for _ in range(n_rounds):
        metric_list = []
        for feature in census_data.columns:
            if feature in feature_set:
                continue
            f_set = feature_set + [feature]
            census_sm = distance.squareform(
                distance.pdist(census_data[f_set], 'cityblock')
            )
            census_sm = pd.DataFrame(
                census_sm * adjacency_values,
                index=area_index,
                columns=area_index,
            )
            census_strangness = census_sm.sum(axis=0)
            sorted_rank = pd.DataFrame(
                rankdata(-census_strangness, method='ordinal'),
                index=census_strangness.index,
            )
            tau = spearman_footrule_normalized(rank_votes, sorted_rank)
            metric_list.append((tau, feature))

        if not metric_list:
            # No candidate left (e.g. duplicated column labels): stop early.
            break
        # The metric is a distance, so the smallest value wins. `min` returns the
        # first minimum, matching the previous stable ascending sort.
        best_feature = min(metric_list, key=lambda pair: pair[0])[1]
        feature_set.append(best_feature)
    return feature_set

def spearman_footrule(a,b):
    """Position-weighted footrule distance between two rankings.

    ``a`` and ``b`` are DataFrames holding their values in column ``0``; they are
    matched on their index. Each absolute difference ``|a - b|`` is multiplied by
    a weight that falls linearly from 1 (row with the smallest value in ``a``)
    down to ``1/len(a)`` (row with the largest value), and the products are summed.
    """
    size = len(a)
    ordered = a.sort_values(by=0)
    # Weights n/n, (n-1)/n, ..., 1/n attached to the rows of `a` in ascending order.
    weights = pd.DataFrame((np.arange(1, size + 1) / size)[::-1], index=ordered.index)
    displacement = (ordered - b).abs()
    return (displacement * weights)[0].sum()

def max_dist_spearman_footrule(a):
    """Weighted footrule distance between ``a`` and its reversed counterpart.

    The rows of ``a`` (values in column ``0``) are put in ascending order and
    compared with the sequence ``len(a), ..., 1`` laid out in that same order,
    using the same linearly decreasing weights as ``spearman_footrule``.
    Used as the normalising constant for the footrule distance.
    """
    size = len(a)
    ordered = a.sort_values(by=0)
    weights = pd.DataFrame((np.arange(1, size + 1) / size)[::-1], index=ordered.index)
    # Descending sequence n..1 assigned to the rows of `a` in ascending order.
    reversed_rank = pd.DataFrame(np.arange(size, 0, -1), index=ordered.index)
    return ((ordered - reversed_rank).abs() * weights)[0].sum()

def spearman_footrule_normalized(a,b):
    """Return ``spearman_footrule(a, b)`` divided by ``max_dist_spearman_footrule(a)``."""
    return float(spearman_footrule(a, b)) / max_dist_spearman_footrule(a)
    

In [25]:
# Directory holding one census CSV per theme; the trailing slash matters because
# the loading loop builds file paths by plain string concatenation.
path_census = '../../data/ronaldo/data/IBGE/census_2010/RS/aggregated_by_weighting_area/joined/'
# os.listdir returns entries in arbitrary order; sort them so the processing
# order (and the order of the result tables) is the same on every run and machine.
filenames = sorted(
    filename
    for filename in listdir(path_census)
    if isfile(join(path_census, filename))
)

In [26]:
# For every census file: select features against the vote ranking, then rank the
# areas by their summed distance to neighbours over the selected features.
census_rank = {}
attributes = []
for file in tqdm(filenames):
    # Load the table, key and order it by 'Cod_ap', drop the two municipality
    # columns and the areas that were removed from the adjacency matrix.
    census_data = (
        pd.read_csv(path_census + file)
        .set_index('Cod_ap')
        .sort_index()
        .drop(columns=['CD_GEOCODM', 'NM_MUNICIP'])
        .drop(index=wa_remove)
    )
    # Min-max scale every column; NaNs (e.g. 0/0 for constant columns) become 0.
    census_data = (
        (census_data - census_data.min()) / (census_data.max() - census_data.min())
    ).fillna(0)

    # Wrapper-style sequential forward selection.
    f_selected = census_data[forward_feature_selection(census_data, rank_votes, n=86)]
    attributes.append(f_selected.columns.values)

    # Same ranking procedure as for the votes, on the selected census columns.
    census_sm = distance.squareform(distance.pdist(f_selected, 'cityblock'))
    census_sm = pd.DataFrame(
        census_sm * adjacency_matrix.values,
        index=census_data.index,
        columns=census_data.index,
    )
    census_strangness = census_sm.sum(axis=0)
    sorted_rank = pd.DataFrame(
        rankdata(-census_strangness, method='ordinal'),
        index=census_strangness.index,
    )

    # Results are keyed by the file name up to its first dot.
    file = file.split('.')[0]
    census_rank[file] = sorted_rank

100%|████████████████████████████████████████████████████| 8/8 [3:38:22<00:00, 1637.81s/it]


In [27]:
# Compare the vote ranking with each census ranking using two measures:
# weighted Kendall tau (stored with the selected attributes) and the normalized
# weighted footrule distance.
kendal_rank_cor = dict()
spearman_rank_cor = dict()

# scipy's weightedtau gives more weight to elements with LARGER values. The arrays
# here are ranks where 1 marks the most important element (the footrule weights
# also peak at rank 1), so they are negated, as the scipy documentation requires
# when passing ranks instead of scores.
votes_scores = -np.asarray(rank_votes).ravel()

# `census_rank` and `attributes` were filled in the same loop, so iterating the
# dict (insertion order) alongside the list pairs each file with its attributes.
for file, attr in zip(census_rank,attributes):
    census_scores = -np.asarray(census_rank[file]).ravel()
    tau, p_value = stats.weightedtau(votes_scores, census_scores)
    dist = spearman_footrule_normalized(rank_votes, census_rank[file])
    kendal_rank_cor[file] = [attr,tau]
    spearman_rank_cor[file] = [dist]

In [28]:
# Turn both result dicts into DataFrames with one row per dict key (orient='index').
# Note: this rebinds the dict names to DataFrames, so the cell is meant to run once
# after the comparison loop.
kendal_rank_cor = pd.DataFrame.from_dict(kendal_rank_cor,orient='index')
spearman_rank_cor = pd.DataFrame.from_dict(spearman_rank_cor,orient='index')
# Last expression of the cell: display the normalized footrule distance per census file.
spearman_rank_cor

Unnamed: 0,0
Alfabetizacao,0.53527
Cor_e_Raca,0.497087
Domicilio,0.456021
Entorno,0.528036
Parentesco,0.495469
Pessoa,0.528872
Renda,0.474815
Responsavel,0.525045
