In [2]:
import pandas as pd
from unidecode import unidecode

df = (
    pd.read_excel('~/Downloads/dados_censo.xlsx', sheet_name=1)
    .query('ano==ano.max()')
    .rename(columns=lambda x: unidecode(x))
    )
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 223 entries, 446 to 668
Columns: 238 entries, ano to idhm_r
dtypes: float64(169), int64(67), object(2)
memory usage: 416.4+ KB


In [3]:
variaveis = ['pop', 't_analf25m', 'pmpob', 'rdpc', 'mort1']
X = df[variaveis]
# Correlação
display(X.corr())
# Tipagem, Valores ausentes
print(X.info())
# Inspeção de valores atípicos (inliers ou outliers)
X.describe()

Unnamed: 0,pop,t_analf25m,pmpob,rdpc,mort1
pop,1.0,-0.367736,-0.392712,0.656641,-0.221032
t_analf25m,-0.367736,1.0,0.695516,-0.615839,0.380096
pmpob,-0.392712,0.695516,1.0,-0.753144,0.490604
rdpc,0.656641,-0.615839,-0.753144,1.0,-0.427142
mort1,-0.221032,0.380096,0.490604,-0.427142,1.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 223 entries, 446 to 668
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pop         223 non-null    int64  
 1   t_analf25m  223 non-null    float64
 2   pmpob       223 non-null    float64
 3   rdpc        223 non-null    float64
 4   mort1       223 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 10.5 KB
None


Unnamed: 0,pop,t_analf25m,pmpob,rdpc,mort1
count,223.0,223.0,223.0,223.0,223.0
mean,16792.475336,37.479821,39.112915,277.351166,26.688744
std,55127.83844,7.942775,7.932308,92.074926,5.607713
min,1256.0,9.98,11.59,166.28,15.27
25%,4322.5,32.66,34.0,230.12,22.8
50%,7118.0,38.13,39.34,263.84,26.0
75%,14309.5,43.1,44.995,293.82,30.1
max,713290.0,53.08,60.98,1036.21,44.0


- Inlier: Se $X < Q1 - 1.5 IQR$
- Outlier: Se $X > Q3 + 1.5 IQR$

onde, $IQR = Q3 - Q1$

In [4]:
outliers = (X>X.quantile(0.75) + 1.5*(X.quantile(0.75)-X.quantile(0.25))).astype(int)
outliers.describe()

Unnamed: 0,pop,t_analf25m,pmpob,rdpc,mort1
count,223.0,223.0,223.0,223.0,223.0
mean,0.071749,0.0,0.0,0.049327,0.022422
std,0.258652,0.0,0.0,0.217038,0.148383
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0
max,1.0,0.0,0.0,1.0,1.0


## Cenário com normalização (padrão)

$$N = \dfrac{x-min}{max-min}$$

In [5]:
# Normaliza DataFrame
Xnorm = (X - X.min())/(X.max() - X.min())

# Localizo o alvo
alvo = df[df['municipio']=="JOÃO PESSOA"].index
alvo = list(alvo)[0]

# Retirar alvo da matriz X
# Xnorm = Xnorm[Xnorm.index!=alvo]

# Converte em matrizes
xj = Xnorm[Xnorm.index==alvo].values
Xnorm = Xnorm.values

# Calcula os 10-vizinhos mais proximos
from sklearn.neighbors import NearestNeighbors
k=11
knn = NearestNeighbors(n_neighbors=k, metric='euclidean')
knn.fit(Xnorm)
dist, indice = knn.kneighbors(xj)
df.iloc[indice[0]]['municipio']

541       JOÃO PESSOA
497    CAMPINA GRANDE
487          CABEDELO
581             PATOS
493        CAJAZEIRAS
471            BAYEUX
656             SOUSA
529         GUARABIRA
620        SANTA RITA
666            VÁRZEA
616       SANTA LUZIA
Name: municipio, dtype: object

In [6]:
# Localizo o alvo
alvo = df[df['municipio']=="JOÃO PESSOA"].index
alvo = list(alvo)[0]

# Converte em matrizes
xj = X[X.index==alvo].values
Xm = X.values

# Calcula os 10-vizinhos mais proximos
from sklearn.neighbors import NearestNeighbors
k=11
knn = NearestNeighbors(n_neighbors=k, metric='euclidean')
knn.fit(Xm)
dist, indice = knn.kneighbors(xj)
pd.DataFrame({'vizinho':df.iloc[indice[0]]['municipio'],
              'dist': dist[0]}).query('dist>0')


Unnamed: 0,vizinho,dist
497,CAMPINA GRANDE,330719.169523
620,SANTA RITA,593416.341931
581,PATOS,613124.16997
471,BAYEUX,613718.281921
656,SOUSA,647950.209835
493,CAJAZEIRAS,655168.157028
487,CABEDELO,656407.003902
529,GUARABIRA,658311.216875
644,SAPÉ,663255.36919
556,MAMANGUAPE,671046.324965


In [7]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
escala = MinMaxScaler()
# Normalização dos dados
Xnorm = escala.fit_transform(X)
xj = df.query('municipio=="JOÃO PESSOA"')[variaveis]
xjnorm = escala.fit_transform(xj)
xjnorm

array([[0., 0., 0., 0., 0.]])