In [13]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
from os import listdir, environ
from os.path import isfile, join
from pathlib import Path
import scipy.stats as stats
from tqdm import tqdm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import RFE
from sklearn.base import BaseEstimator, RegressorMixin
from mlxtend.feature_selection import SequentialFeatureSelector
from scipy.stats import rankdata
import warnings
warnings.filterwarnings('ignore')

In [14]:
path_election = '../../data/ronaldo/data/TSE/RS/data/aggregated_by_weighting_area.csv'
path_matrix = '../../data/ronaldo/data/IBGE/adjacency_matrices/weighting_areas/queen.csv'
ref = '../../data/ronaldo/data/IBGE/regioes_geograficas.csv'

In [15]:
data_election = pd.read_csv(path_election)
adjacency_matrix = pd.read_csv(path_matrix)
adjacency_matrix.set_index('Unnamed: 0', inplace=True)
adjacency_matrix.sort_index(inplace=True)

ref_regioes = pd.read_csv(ref, sep=';', encoding='latin') 
data_election = data_election.merge(ref_regioes, on='COD_LOCALIDADE_IBGE', how='left')
#data_election = data_election[data_election['NM_MUNICIPIO']=='PORTO ALEGRE']
#data_election = data_election[data_election['NM_MUNICIPIO']=='RIO GRANDE']
#data_election = data_election[data_election['NM_MUNICIPIO']=='CAXIAS DO SUL']

In [16]:
data_election.set_index('Cod_ap', inplace=True)
data_election.sort_index(inplace=True)
votes = data_election[['JAIR BOLSONARO', 'FERNANDO HADDAD']].copy()
votes['FERNANDO HADDAD'] = votes['FERNANDO HADDAD'] / (votes['FERNANDO HADDAD'] + votes['JAIR BOLSONARO'])
votes['JAIR BOLSONARO'] = 1 - votes['FERNANDO HADDAD']
feature_votes = np.array(votes['JAIR BOLSONARO'].values).reshape(-1,1)
votes_sm = distance.pdist(feature_votes,'cityblock')
votes_sm = pd.DataFrame(distance.squareform(votes_sm), index=votes.index, columns= votes.index)
wa_remove = [x for x in adjacency_matrix.index if x not in votes_sm.index]
for wa in wa_remove:
    adjacency_matrix.drop(wa, axis = 0, inplace=True)
    adjacency_matrix.drop(str(wa), axis = 1, inplace=True)
votes_sm = pd.DataFrame(votes_sm.values*adjacency_matrix.values, columns=votes_sm.index, index=votes_sm.index)
votes_strangness = votes_sm.sum(axis=0)
rank_votes = rankdata(-votes_strangness, method='ordinal')
rank_votes = pd.DataFrame(rank_votes, index=votes_strangness.index)

In [17]:
def dumb():
    city_ranks = dict()
    city_strangeness = dict()
    for index, data in tqdm(data_election.groupby('cod_rgi')):
        data.set_index('Cod_ap', inplace=True)
        data.sort_index(inplace=True)
        votes = data[['JAIR BOLSONARO', 'FERNANDO HADDAD',]].copy()
        votes['FERNANDO HADDAD'] = votes['FERNANDO HADDAD'] / (votes['FERNANDO HADDAD'] + votes['JAIR BOLSONARO'])
        votes['JAIR BOLSONARO'] = 1 - votes['FERNANDO HADDAD']
        feature_votes = np.array(votes['JAIR BOLSONARO'].values).reshape(-1,1)
        votes_sm = distance.pdist(feature_votes,'cityblock')
        votes_sm = pd.DataFrame(distance.squareform(votes_sm), index=votes.index, columns= votes.index)
        wa_remove = [x for x in adjacency_matrix.index if x not in votes_sm.index]
        adj_m = adjacency_matrix.copy()
        for wa in wa_remove:
            adj_m.drop(wa, axis = 0, inplace=True)
            adj_m.drop(str(wa), axis = 1, inplace=True)
        votes_sm = pd.DataFrame(votes_sm.values*adj_m.values, columns=votes_sm.index, index=votes_sm.index)
        votes_strangness = votes_sm.sum(axis=0)
        rank_votes = rankdata(-votes_strangness, method='ordinal')
        rank_votes = pd.DataFrame(rank_votes, index=votes_strangness.index)
        city_ranks[index] = rank_votes
        city_strangeness[index] = votes_strangness

In [18]:
def forward_feature_selection(census_data, rank_votes, n, adjacency_matrix, idx):
    feature_set = []
    for num_features in range(n):
        metric_list = [] # Choose appropriate metric based on business problem
        for feature in census_data.columns:
            if feature not in feature_set:
                f_set = feature_set.copy()
                f_set.append(feature)
                wa_remove_census = [x for x in census_data.index if x not in adjacency_matrix.index]
                census_data.drop(wa_remove_census, axis=0, inplace=True)
                feature_col = np.array(census_data[feature].values).reshape(-1,1)
                census_sm = distance.pdist(feature_col,'cityblock')
                census_sm = pd.DataFrame(distance.squareform(census_sm), index=census_data.index, columns= census_data.index)
                census_sm = pd.DataFrame(census_sm.values*adjacency_matrix.values, columns=census_sm.index, index=census_sm.index)
                census_strangness = census_sm.sum(axis=0)
            
                sorted_rank = rankdata(-census_strangness, method='ordinal')
                sorted_rank = pd.DataFrame(sorted_rank, index=census_strangness.index)
                sorted_rank = rankdata(sorted_rank.loc[idx], method = 'ordinal')
                sorted_rank = pd.DataFrame(sorted_rank, index=idx)
                #sorted_rank = rankdata(sorted_rank, method='ordinal')
                rank_votes = rankdata(rank_votes, method='ordinal')
                rank_votes = pd.DataFrame(rank_votes, index=idx)
                #tau, p_value = stats.weightedtau(rank_votes, sorted_rank)
                tau = spearman_footrule_normalized(rank_votes,sorted_rank)
                metric_list.append((tau,feature))

        metric_list.sort(key=lambda x : x[0], reverse = False) # In case metric follows "the more, the merrier"
        feature_set.append(metric_list[0][1])
    return feature_set 

def spearman_footrule(a,b):
    w = np.arange(1,len(a)+1)/len(a)
    w = w[::-1]
    a = a.sort_values(by=0)
    w = pd.DataFrame(w,index=a.index)
    diff = abs(a.sub(b))
    d = diff.multiply(w)
    return d[0].sum()

def max_dist_spearman_footrule(a):
    w = np.arange(1,len(a)+1)/len(a)
    w = w[::-1]
    a = a.sort_values(by=0)
    w = pd.DataFrame(w,index=a.index)
    inv_rank = np.arange(len(a),0,-1)
    inv_rank = pd.DataFrame(inv_rank,index=a.index)
    d = abs(a-inv_rank).multiply(w)
    return d[0].sum()

def spearman_footrule_normalized(a,b):
    num = spearman_footrule(a,b)
    denom = max_dist_spearman_footrule(a)
    return float(num)/denom    

In [19]:
path_census = '../../data/ronaldo/data/IBGE/census_2010/RS/aggregated_by_weighting_area/joined/'
filenames = [filename for filename in listdir(path_census) if isfile(join(path_census, filename))]

In [20]:
final_rank = []
scores = []
for region, data_votes in tqdm(data_election.groupby('cod_rgi')):
    census_rank = dict()
    census_stran = dict()
    attributes = []
    for file in filenames:
        census_data = pd.read_csv(path_census+file)
       # census_data = census_data.merge(ref_regioes[['CD_GEOCODM','cod_rgi']], on='CD_GEOCODM', how='left')
       # census_data = census_data[census_data['cod_rgi']==region]
        census_data.set_index('Cod_ap', inplace=True)
        census_data.sort_index(inplace=True)
        census_data.drop('CD_GEOCODM', axis=1, inplace=True)
        census_data.drop('NM_MUNICIP', axis=1, inplace=True)
        census_data=(census_data-census_data.min())/(census_data.max()-census_data.min())
        census_data.fillna(0,inplace=True)
        #remove regioes
        #wa_remove_adj = [x for x in adjacency_matrix.index if x not in data_votes.index]
        #wa_remove_census = [x for x in census_data.index if x not in data_votes.index]
        #census_data.drop(wa_remove_census, axis=0, inplace=True)
        adj_m = adjacency_matrix.copy()
        #adj_m.drop(wa_remove_adj, axis = 0, inplace=True)
        #for wa in wa_remove_adj:
        #    adj_m.drop(str(wa), axis = 1, inplace=True)
        #Feature Selection Wrapper SFS
        f_selected = forward_feature_selection(census_data, rank_votes.loc[data_votes.index], 1, adj_m, data_votes.index)
        f_selected = census_data[f_selected]
        attributes.append(f_selected.columns.values)
        # fim
        census_sm = distance.pdist(f_selected,'cityblock')
        census_sm = pd.DataFrame(distance.squareform(census_sm), index=census_data.index, columns= census_data.index)
        census_sm = pd.DataFrame(census_sm.values*adj_m.values, columns=census_sm.index, index=census_sm.index)
        census_strangness = census_sm.sum(axis=0)
        #normalize regarging votes
        census_strangness = census_strangness.loc[data_votes.index]
        b =  votes_strangness[data_votes.index].max()
        a =  votes_strangness[data_votes.index].min()
        census_strangness =  (b-a)*((census_strangness-census_strangness.min())/(census_strangness.max()-census_strangness.min()))+a  
        
        #
        sorted_rank = rankdata(-census_strangness, method='ordinal')
        sorted_rank = pd.DataFrame(sorted_rank, index=census_strangness.index)
        file = file.split('.')[0]
        census_rank[file] = sorted_rank
        census_stran[file] = census_strangness
    #Calculate Wkendal    
    kendal_rank_cor = dict()
    spearman_rank_cor = dict()
    for file, attr in zip(census_rank,attributes):
        tau, p_value = stats.weightedtau(rank_votes.loc[data_votes.index], census_rank[file])
        dist = spearman_footrule_normalized(rank_votes, census_rank[file])
        kendal_rank_cor[file] = [attr,tau]
        spearman_rank_cor[file] = [attr,dist]

    #Get max rank
    kendal_rank_cor = pd.DataFrame.from_dict(kendal_rank_cor,orient='index', columns = ['attr','eval'])
    spearman_rank_cor = pd.DataFrame.from_dict(spearman_rank_cor,orient='index', columns = ['attr','eval'])
    #max_categorie = kendal_rank_cor['eval'].idxmax()
    max_categorie = spearman_rank_cor['eval'].idxmax()
    max_stran = census_stran[max_categorie]
    #scores = scores + [spearman_rank_cor.max()]
    final_rank = final_rank + [max_stran]

100%|███████████████████████████████████████████████████| 43/43 [1:16:11<00:00, 106.31s/it]


In [21]:
#norm_final_rank = [(r-r.min())/(r.max()-r.min()) for r in final_rank]   
rank = pd.concat(final_rank, axis=0)
rank.sort_index(inplace=True)
rank_sorted = rankdata(-rank, method='ordinal')
rank_sorted = pd.DataFrame(rank_sorted, index=votes_strangness.index)

In [22]:
#rank_votes = rankdata(-votes_strangness, method='ordinal')
tau, p_value = stats.weightedtau(rank_votes, rank_sorted)
dist = spearman_footrule_normalized(rank_votes, rank_sorted)
#print(rank.loc[int(rank_votes.sort_values().index[0]),])
#rank_sorted.sort_values(ascending=False)
#rank.sort_values(ascending=False)
#rank_votes = pd.DataFrame(rank_votes, index=votes_strangness.index, columns=['pos'])
#rank_sorted = pd.DataFrame(rank_sorted, index=votes_strangness.index,columns=['pos'])
#rank_sorted.loc[int(rank_votes.sort_values(by='pos',ascending=False).index[2]),]
#rank_votes.loc[int(rank_sorted.sort_values(by='pos',ascending=False).index[0]),]

#print(rank_sorted.sort_values(by='pos',ascending=True))
#rank.sort_values(ascending=False)
#print(votes_strangness.sort_values(ascending=False))
#rank.sort_values(ascending=False)

In [23]:
tau

0.6006907355489031