In [29]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
from os import listdir, environ
from os.path import isfile, join
from pathlib import Path
import scipy.stats as stats
from tqdm import tqdm
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from scipy.stats import rankdata
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [30]:
# Input CSV locations, relative to the notebook's working directory.
path_election, path_matrix = (
    '../../data/ronaldo/data/TSE/RS/data/aggregated_by_weighting_area.csv',  # election table
    '../../data/ronaldo/data/IBGE/adjacency_matrices/weighting_areas/queen.csv',  # adjacency matrix
)

In [31]:
# Election table as stored on disk (filtered to one municipality below).
data_election = pd.read_csv(path_election)

# Adjacency matrix: the unnamed first CSV column carries the row labels,
# and rows are ordered by that label.
adjacency_matrix = (
    pd.read_csv(path_matrix)
    .set_index('Unnamed: 0')
    .sort_index()
)

# Keep a single municipality.
# Alternatives tried earlier: 'PORTO ALEGRE', 'RIO GRANDE'.
in_municipality = data_election['NM_MUNICIPIO'] == 'CAXIAS DO SUL'
data_election = data_election[in_municipality]

In [32]:
# Use the 'Cod_ap' column as the row index and order rows by it.
data_election = data_election.set_index('Cod_ap').sort_index()

In [33]:
# Turn the two candidates' raw counts into shares of their combined total,
# so the two columns of each row sum to 1.
votes = (
    data_election[['JAIR BOLSONARO', 'FERNANDO HADDAD']]
    .assign(**{
        'FERNANDO HADDAD': lambda df: df['FERNANDO HADDAD']
        / (df['FERNANDO HADDAD'] + df['JAIR BOLSONARO'])
    })
    .assign(**{'JAIR BOLSONARO': lambda df: 1 - df['FERNANDO HADDAD']})
)

In [34]:
# One-column feature matrix (n_rows, 1) holding the 'JAIR BOLSONARO' share.
feature_votes = np.array(votes['JAIR BOLSONARO'].values)[:, np.newaxis]

# Pairwise city-block (absolute) differences between rows, expanded from
# scipy's condensed form to a square table labelled by the votes index.
votes_sm = pd.DataFrame(
    distance.squareform(distance.pdist(feature_votes, metric='cityblock')),
    index=votes.index,
    columns=votes.index,
)

In [35]:
# Row labels of the adjacency matrix that have no entry in the vote table.
# `wa_remove` is reused later to drop the same rows from the census tables.
# NOTE: re-running this cell on its own recomputes `wa_remove` as an empty
# list (the rows are already gone); re-run from the loading cell instead.
wa_remove = [wa for wa in adjacency_matrix.index if wa not in votes_sm.index]

# Drop every unwanted row in one call instead of copying the matrix twice
# per label inside a loop.
adjacency_matrix = adjacency_matrix.drop(wa_remove, axis=0)

# Column labels are the string form of the row labels. Selecting them in row
# order removes the unwanted columns and guarantees rows and columns share the
# same order (only the rows were sorted on load), which the positional
# `.values` products in later cells rely on.
adjacency_matrix = adjacency_matrix[[str(wa) for wa in adjacency_matrix.index]]

In [36]:
# Weight every pairwise vote distance by the matching adjacency entry.
# The product is positional, so both tables must list the same areas in the
# same row/column order.
votes_sm = pd.DataFrame(
    np.multiply(votes_sm.values, adjacency_matrix.values),
    index=votes_sm.index,
    columns=votes_sm.index,
)

In [37]:
# Per-area total of the adjacency-weighted vote distances (column sums).
votes_strangness = votes_sm.sum(axis=0)

# Rank the totals so that the largest total gets rank 1 (ranking the negated
# values); 'ordinal' gives every area a distinct rank, ties broken by order
# of appearance. Stored as a one-column frame (column label 0).
rank_votes = pd.DataFrame(
    rankdata(-votes_strangness, method='ordinal'),
    index=votes_strangness.index,
)

In [38]:

def forward_feature_selection(census_data, target, model, n):
    """Greedily pick up to ``n`` columns of ``census_data`` by forward selection.

    Each round fits ``model`` once per not-yet-selected column (together with
    the columns already selected), scores the fit with the mean squared error
    of its predictions on the same rows it was fitted on, and keeps the column
    with the lowest error. Ties go to the column that appears first in
    ``census_data.columns``.

    Parameters
    ----------
    census_data : pandas.DataFrame
        Candidate features, one column per feature.
    target : array-like
        Values to predict, one per row of ``census_data``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted in
        place many times.
    n : int
        Maximum number of features to select.

    Returns
    -------
    list
        Selected column labels in the order they were chosen. The length is
        ``min(n, number of columns)``: asking for more features than there are
        columns stops early instead of raising ``IndexError``.
    """
    # NOTE(review): the score is an in-sample error, so a model able to
    # reproduce its training data exactly makes every candidate tie at zero
    # and the selection degenerates to column order - confirm this is intended.
    feature_set = []
    for _round in range(n):
        candidates = [col for col in census_data.columns if col not in feature_set]
        if not candidates:
            # Every column is already selected; nothing left to add.
            break
        scored = []
        for feature in candidates:
            f_set = feature_set + [feature]
            model.fit(census_data[f_set], target)
            y_pred = model.predict(census_data[f_set])
            scored.append((mean_squared_error(target, y_pred), feature))
        # Lower error is better; min() keeps the first candidate on ties.
        _best_mse, best_feature = min(scored, key=lambda pair: pair[0])
        feature_set.append(best_feature)
    return feature_set
def spearman_footrule(a,b):
    """Weighted sum of absolute differences between two one-column frames.

    ``a`` and ``b`` are DataFrames whose values live in the column labelled
    ``0``. Rows of ``a`` are ordered by that column and given linearly
    decreasing weights ``n/n, (n-1)/n, ..., 1/n`` (``n = len(a)``), so the row
    with the smallest value in ``a`` weighs most. The absolute difference
    ``|a - b|`` is taken row by row (rows matched on index label), multiplied
    by the weight of the row, and summed.

    Returns the weighted sum as a scalar.
    """
    n = len(a)
    ordered = a.sort_values(by=0)
    weights = pd.DataFrame(np.arange(n, 0, -1) / n, index=ordered.index)
    weighted_gap = ordered.sub(b).abs().multiply(weights)
    return weighted_gap[0].sum()

def max_dist_spearman_footrule(a):
    """Weighted footrule distance between ``a`` and the sequence n, n-1, ..., 1.

    ``a`` is a one-column DataFrame (column label ``0``). Its rows are ordered
    by that column, given the weights ``n/n, (n-1)/n, ..., 1/n`` in that order,
    and compared against the values ``n, n-1, ..., 1`` assigned in the same
    order. The weighted absolute differences are summed and returned; the
    result is used as the denominator of ``spearman_footrule_normalized``.

    NOTE(review): with these position weights the reversed ranking is not
    always the farthest one. For ranks 1, 2, 3 the reversal scores 8/3 while
    the ranking 3, 1, 2 scores 3, so a distance normalised by this value can
    exceed 1 - confirm whether that is acceptable.
    """
    n = len(a)
    ordered = a.sort_values(by=0)
    labels = ordered.index
    weights = pd.DataFrame(np.arange(n, 0, -1) / n, index=labels)
    reversed_ranks = pd.DataFrame(np.arange(n, 0, -1), index=labels)
    weighted_gap = (ordered - reversed_ranks).abs().multiply(weights)
    return weighted_gap[0].sum()

def spearman_footrule_normalized(a,b):
    """Weighted footrule distance between ``a`` and ``b`` as a fraction of
    the value ``max_dist_spearman_footrule`` reports for ``a``.

    No guard is applied when that reference value is zero.
    """
    return float(spearman_footrule(a, b)) / max_dist_spearman_footrule(a)
    

In [39]:
# Directory holding one census CSV per theme (trailing slash required: file
# names are appended to it by plain string concatenation further down).
path_census = '../../data/ronaldo/data/IBGE/census_2010/RS/aggregated_by_weighting_area/joined/'

# os.listdir returns entries in arbitrary, platform-dependent order; sort them
# so the processing order (and the order of the result tables) is the same on
# every run and machine. Sub-directories are skipped.
filenames = sorted(
    filename for filename in listdir(path_census) if isfile(join(path_census, filename))
)

In [40]:
# Number of features the forward selection keeps per census theme.
N_SELECTED_FEATURES = 43
# Fixed seed: the tree breaks ties between equally good splits at random, so
# without it the fitted model can differ between runs.
TREE_SEED = 0

census_rank = dict()   # theme name -> one-column frame of ranks (1 = largest strangeness)
attributes = []        # selected column labels per theme, same order as census_rank
for file in filenames:
    print(file)
    census_data = (
        pd.read_csv(path_census + file)
        .set_index('Cod_ap')
        .sort_index()
        .drop(['CD_GEOCODM', 'NM_MUNICIP'], axis=1)
        # Remove the rows that were also removed from the adjacency matrix.
        .drop(wa_remove, axis=0)
    )
    # Min-max scale every column to [0, 1]; a constant column yields 0/0 = NaN
    # and is replaced by 0.
    census_data = (census_data - census_data.min()) / (census_data.max() - census_data.min())
    census_data = census_data.fillna(0)

    # Wrapper-style sequential forward selection against the vote strangeness.
    model = DecisionTreeRegressor(random_state=TREE_SEED)
    # Never ask for more features than the table has columns.
    n_features = min(N_SELECTED_FEATURES, census_data.shape[1])
    f_selected = forward_feature_selection(census_data, votes_strangness, model, n_features)
    f_selected = census_data[f_selected]
    attributes.append(f_selected.columns.values)

    # Pairwise city-block distances on the selected features, weighted by the
    # adjacency matrix. The product is positional: both tables must list the
    # same areas in the same order.
    census_sm = distance.squareform(distance.pdist(f_selected, 'cityblock'))
    census_sm = pd.DataFrame(
        census_sm * adjacency_matrix.values,
        columns=census_data.index,
        index=census_data.index,
    )
    census_strangness = census_sm.sum(axis=0)

    # Largest total gets rank 1; 'ordinal' assigns distinct ranks.
    sorted_rank = rankdata(-census_strangness, method='ordinal')
    sorted_rank = pd.DataFrame(sorted_rank, index=census_strangness.index)

    # Key the result by the file name up to its first dot.
    file = file.split('.')[0]
    census_rank[file] = sorted_rank

Alfabetizacao.csv
Cor_e_Raca.csv
Domicilio.csv
Entorno.csv
Parentesco.csv
Pessoa.csv
Renda.csv
Responsavel.csv


In [41]:
# Compare the vote ranking with the ranking obtained from each census theme.
# `attributes` was filled in the same order as `census_rank`, so pairing them
# with zip relies on dict insertion order.
# NOTE(review): per the SciPy docs, weightedtau's default weighting gives more
# importance to elements with larger values, while the inputs here are rank
# numbers where 1 marks the largest strangeness - confirm this is intended.
# The second value it returns is unused.
kendal_rank_cor, spearman_rank_cor = {}, {}
for (file, ranking), attr in zip(census_rank.items(), attributes):
    tau, p_value = stats.weightedtau(rank_votes, ranking)
    dist = spearman_footrule_normalized(rank_votes, ranking)
    kendal_rank_cor[file] = [attr, tau]
    spearman_rank_cor[file] = [attr, dist]

In [42]:
# Turn both result dicts into tables: one row per theme, column 0 holds the
# selected attributes and column 1 the score. The names are rebound from dict
# to DataFrame, so this cell cannot be re-run without re-running the previous one.
kendal_rank_cor, spearman_rank_cor = (
    pd.DataFrame.from_dict(scores, orient='index')
    for scores in (kendal_rank_cor, spearman_rank_cor)
)
spearman_rank_cor

Unnamed: 0,0,1
Alfabetizacao,"[01_V001, 01_V002, 01_V003, 01_V004, 01_V005, ...",0.626766
Cor_e_Raca,"[01_V001, 01_V002, 01_V003, 01_V004, 01_V005, ...",0.433477
Domicilio,"[01_V001, 01_V002, 01_V003, 01_V004, 01_V005, ...",0.649333
Entorno,"[01_V001, 01_V002, 01_V003, 01_V004, 01_V005, ...",0.560636
Parentesco,"[02_V001, 01_V001, 01_V002, 01_V003, 01_V004, ...",0.576727
Pessoa,"[01_V001, 01_V002, 01_V003, 01_V004, 01_V005, ...",0.551413
Renda,"[01_V002, 01_V001, 01_V003, 01_V004, 01_V005, ...",0.617151
Responsavel,"[01_V001, 01_V002, 01_V003, 01_V004, 01_V005, ...",0.667779
