In [46]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
from os import listdir, environ
from os.path import isfile, join
from pathlib import Path
import scipy.stats as stats
from tqdm import tqdm
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from scipy.stats import rankdata
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [47]:
path_election = '../../data/ronaldo/data/TSE/RS/data/aggregated_by_weighting_area.csv'
path_matrix = '../../data/ronaldo/data/IBGE/adjacency_matrices/weighting_areas/queen.csv'

In [48]:
data_election = pd.read_csv(path_election)
adjacency_matrix = pd.read_csv(path_matrix)
adjacency_matrix.set_index('Unnamed: 0', inplace=True)
adjacency_matrix.sort_index(inplace=True)

#data_election = data_election[data_election['NM_MUNICIPIO']=='PORTO ALEGRE']
#data_election = data_election[data_election['NM_MUNICIPIO']=='RIO GRANDE']
data_election = data_election[data_election['NM_MUNICIPIO']=='CAXIAS DO SUL']

In [49]:
data_election.set_index('Cod_ap', inplace=True)
data_election.sort_index(inplace=True)

In [50]:
votes = data_election[['JAIR BOLSONARO', 'FERNANDO HADDAD']].copy()
votes['FERNANDO HADDAD'] = votes['FERNANDO HADDAD'] / (votes['FERNANDO HADDAD'] + votes['JAIR BOLSONARO'])
votes['JAIR BOLSONARO'] = 1 - votes['FERNANDO HADDAD']

In [51]:
feature_votes = np.array(votes['JAIR BOLSONARO'].values).reshape(-1,1)
votes_sm = distance.pdist(feature_votes,'cityblock')
votes_sm = pd.DataFrame(distance.squareform(votes_sm), index=votes.index, columns= votes.index)

In [52]:
wa_remove = [x for x in adjacency_matrix.index if x not in votes_sm.index]
for wa in wa_remove:
    adjacency_matrix.drop(wa, axis = 0, inplace=True)
    adjacency_matrix.drop(str(wa), axis = 1, inplace=True)

In [53]:
votes_sm = pd.DataFrame(votes_sm.values*adjacency_matrix.values, columns=votes_sm.index, index=votes_sm.index)

In [54]:
votes_strangness = votes_sm.sum(axis=0)
#votes_strangness.sort_values(ascending=False, inplace=True)
rank_votes = rankdata(-votes_strangness, method='ordinal')

In [55]:

def forward_feature_selection(census_data, target, model, n):
    feature_set = []
    for num_features in range(n):
        metric_list = [] # Choose appropriate metric based on business problem
        model = model
        for feature in census_data.columns:
            if feature not in feature_set:
                f_set = feature_set.copy()
                f_set.append(feature)
                model.fit(census_data[f_set],target)
                y_pred = model.predict(census_data[f_set])
                mse = mean_squared_error(target,y_pred)
                metric_list.append((mse,feature))
        metric_list.sort(key=lambda x : x[0], reverse = False) # In case metric follows "the less, the merrier"
        feature_set.append(metric_list[0][1])
    return feature_set

    

In [56]:
path_census = '../../data/ronaldo/data/IBGE/census_2010/RS/aggregated_by_weighting_area/joined/'
filenames = [filename for filename in listdir(path_census) if isfile(join(path_census, filename))]
list_df = []
for file in filenames:
    census_data = pd.read_csv(path_census+file)
    census_data.Cod_ap = census_data.Cod_ap.astype('int64')
    census_data.set_index('Cod_ap', inplace=True)
    census_data.sort_index(inplace=True)
    census_data.drop('CD_GEOCODM', axis=1, inplace=True)
    census_data.drop('NM_MUNICIP', axis=1, inplace=True)
    columns = census_data.columns.values.tolist()
    census_data.columns = [file.split('.')[0] + '_' + col_name for col_name in columns]
    list_df.append(census_data)

census_data = pd.concat(list_df, axis=1)
census_data.head()

Unnamed: 0_level_0,Alfabetizacao_01_V001,Alfabetizacao_01_V002,Alfabetizacao_01_V003,Alfabetizacao_01_V004,Alfabetizacao_01_V005,Alfabetizacao_01_V006,Alfabetizacao_01_V007,Alfabetizacao_01_V008,Alfabetizacao_01_V009,Alfabetizacao_01_V010,...,Responsavel_02_V207,Responsavel_02_V208,Responsavel_02_V209,Responsavel_02_V210,Responsavel_02_V211,Responsavel_02_V212,Responsavel_02_V213,Responsavel_02_V214,Responsavel_02_V215,Responsavel_02_V216
Cod_ap,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4300034001001,3841,19,50,61,87,75,79,86,89,90,...,119,119,133,117,96,79,41,53,31,19
4300059001001,3277,25,40,38,46,50,61,62,62,67,...,97,111,125,110,96,71,56,25,15,14
4300109001001,14658,65,119,166,183,233,270,269,217,254,...,389,451,476,409,364,297,179,150,84,65
4300208001001,6417,21,38,77,68,77,107,76,112,115,...,188,223,242,185,169,132,111,99,53,38
4300307001001,6204,11,48,67,77,89,104,109,113,113,...,144,186,244,222,196,224,149,91,53,34


In [57]:
census_data = pd.read_csv(path_census+file)
census_data.set_index('Cod_ap', inplace=True)
census_data.sort_index(inplace=True)
census_data.drop('CD_GEOCODM', axis=1, inplace=True)
census_data.drop('NM_MUNICIP', axis=1, inplace=True)
census_data.drop(wa_remove, axis=0, inplace=True)
census_data=(census_data-census_data.min())/(census_data.max()-census_data.min())
census_data.fillna(0,inplace=True)
#Feature Selection Wrapper SFS
model = DecisionTreeRegressor()
f_selected = forward_feature_selection(census_data, votes['JAIR BOLSONARO'], model, 43)
f_selected = census_data[f_selected]
#
census_sm = distance.pdist(f_selected,'cityblock')
census_sm = pd.DataFrame(distance.squareform(census_sm), index=census_data.index, columns= census_data.index)
census_sm = pd.DataFrame(census_sm.values*adjacency_matrix.values, columns=census_sm.index, index=census_sm.index)
census_strangness = census_sm.sum(axis=0)
sorted_rank = rankdata(-census_strangness, method='ordinal')
tau, p_value = stats.weightedtau(rank_votes, sorted_rank)
tau

-0.009609478579322504

In [58]:
#kendal_rank_cor = dict()
#spearman_rank_cor = dict()
#for file, attr in zip(census_rank,attributes):
#    tau, p_value = stats.weightedtau(rank_votes, census_rank[file])
#    rho, pval = stats.spearmanr(rank_votes, census_rank[file])
#    kendal_rank_cor[file] = [attr,tau]
#    spearman_rank_cor[file] = [rho,pval]

In [59]:
#kendal_rank_cor = pd.DataFrame.from_dict(kendal_rank_cor,orient='index')
#kendal_rank_cor