In [6]:
import numpy as np
import pandas as pd

# pacotes para obter as distâncias
from scipy.spatial.distance import squareform, pdist
from sklearn.metrics.pairwise import euclidean_distances

pd.options.display.float_format = '{:.2f}'.format    # pandas: para todos os números aparecerem com duas casas decimais
np.set_printoptions(precision=2, suppress=True)      # numpy: para todos os números aparecerem com duas casas decimais

**Matriz de dados $\boldsymbol{X}_{n \times p}$**

Seja $X_{4 \times 3}$ =
- $7 \quad 3 \quad 9$  
- $4 \quad 6 \quad 11$  
- $4 \quad 2 \quad 5$  
- $5 \quad 5 \quad 7$  

In [7]:
# montar a matriz
X = np.array([[7, 3, 9], [4, 6, 11], [4, 2, 5], [5, 5, 7]])
print(X)

[[ 7  3  9]
 [ 4  6 11]
 [ 4  2  5]
 [ 5  5  7]]


Obter o vetor de médias: $\boldsymbol{\bar{X}}_{p \times 1}$

In [8]:
Xb = np.mean(X, axis=0).reshape(3, 1)
print(Xb)

[[5.]
 [4.]
 [8.]]


Matriz de covariâncias amostrais: $\boldsymbol{S}_{p \times p}$

In [9]:
S = np.cov(X.T)
print(S)

[[ 2.   -0.67  0.67]
 [-0.67  3.33  3.33]
 [ 0.67  3.33  6.67]]


# Distâncias entre as observações

## Distância euclidiana

$d_{ij} = \sqrt{\sum_{k=1}^{p} (X_{ik} - X_{jk})^2}$

In [10]:
# usando scipy
squareform(pdist(X, metric='euclidean'))

array([[0.  , 4.69, 5.1 , 3.46],
       [4.69, 0.  , 7.21, 4.24],
       [5.1 , 7.21, 0.  , 3.74],
       [3.46, 4.24, 3.74, 0.  ]])

In [11]:
# usando scikit-learn
D = euclidean_distances(X, X)
D

array([[0.  , 4.69, 5.1 , 3.46],
       [4.69, 0.  , 7.21, 4.24],
       [5.1 , 7.21, 0.  , 3.74],
       [3.46, 4.24, 3.74, 0.  ]])

## Dessa forma não precisa, é apenas curiosidade

Matricialmente:

$d^2_{ij} = (\boldsymbol{X_{i \cdot}} - \boldsymbol{X_{j \cdot}})^T (\boldsymbol{X_{i \cdot}} - \boldsymbol{X_{j \cdot}})$

Obter os vetores $\boldsymbol{X}_{1\cdot}$, $\boldsymbol{X}_{2\cdot}$, $\cdots$

In [12]:
X

array([[ 7,  3,  9],
       [ 4,  6, 11],
       [ 4,  2,  5],
       [ 5,  5,  7]])

In [13]:
X1p = X[0,:].reshape(3, 1)
X2p = X[1,:].reshape(3, 1)
X3p = X[2,:].reshape(3, 1)
X4p = X[3,:].reshape(3, 1)

In [14]:
# distância entre as observações 1 e 2
d12 = ((X1p - X2p).T @ (X1p - X2p)) ** 0.5
d12

array([[4.69]])

In [15]:
# da mesma forma obteríamos as outras: d13, d14, d23, d24

## Distância euclidiana padronizada

$d_{ij} = \sqrt{\sum_{k=1}^{p}\dfrac{(X_{ik} - X_{jk})^2}{S_{kk}}}$

In [16]:
# usando scipy
squareform(pdist(X, metric='seuclidean'))

array([[0.  , 2.79, 2.68, 1.95],
       [2.79, 0.  , 3.19, 1.79],
       [2.68, 3.19, 0.  , 1.95],
       [1.95, 1.79, 1.95, 0.  ]])

Matricialmente (não precisa):

$d^2_{ij} = (\boldsymbol{X_{i \cdot}} - \boldsymbol{X_{j \cdot}})^T \boldsymbol{D}^{-1} (\boldsymbol{X_{i \cdot}} - \boldsymbol{X_{j \cdot}})$,

em que $\boldsymbol{D}^{-1} = diag(1/S_{ii})$

In [17]:
S

array([[ 2.  , -0.67,  0.67],
       [-0.67,  3.33,  3.33],
       [ 0.67,  3.33,  6.67]])

In [18]:
# D**{-1}
D1 = np.diag(1 / np.diag(S))
print(D1)

[[0.5  0.   0.  ]
 [0.   0.3  0.  ]
 [0.   0.   0.15]]


In [19]:
# distância entre as observações 1 e 2
d12 = ((X1p - X2p).T @ D1 @ (X1p - X2p)) ** 0.5
d12

array([[2.79]])

In [20]:
# as outras distâncias

## Distância de Mahalanobis

$d^2_{ij} = (\boldsymbol{X_{i \cdot}} - \boldsymbol{X_{j \cdot}})^T \boldsymbol{S}^{-1} (\boldsymbol{X_{i \cdot}} - \boldsymbol{X_{j \cdot}})$,

em que $\boldsymbol{S}^{-1}$ é a inversa de $\boldsymbol{S}$

In [21]:
# usando scipy
squareform(pdist(X, metric='mahalanobis'))

array([[0.  , 2.45, 2.45, 2.45],
       [2.45, 0.  , 2.45, 2.45],
       [2.45, 2.45, 0.  , 2.45],
       [2.45, 2.45, 2.45, 0.  ]])

In [22]:
# S**{-1}
S1 = np.linalg.inv(S)
print(S1)

[[ 0.75  0.45 -0.3 ]
 [ 0.45  0.87 -0.48]
 [-0.3  -0.48  0.42]]


In [23]:
# distância entre as observações 1 e 2
# curiosidade
d12 = ((X1p - X2p).T @ S1 @ (X1p - X2p)) ** 0.5
d12

array([[2.45]])

In [24]:
# e as outras distâncias da mesma forma

# Ler um csv (dataframe)

In [25]:
medidas = pd.read_csv('https://patriciasiqueira.github.io/arquivos/medidas.csv')

In [26]:
medidas.shape

(20, 4)

In [27]:
# apenas variáveis numéricas
medidas = medidas.loc[:, ['torax', 'cintura', 'quadril']]

In [28]:
# primeiras observações
medidas.head()

Unnamed: 0,torax,cintura,quadril
0,34,30,32
1,37,32,37
2,38,30,36
3,36,33,39
4,38,29,33


In [29]:
# médias das variáveis usando o pacote pandas
medidas.mean()

Unnamed: 0,0
torax,37.0
cintura,28.0
quadril,37.05


In [30]:
# covariâncias das variáveis usando o pacote pandas
medidas.cov()

Unnamed: 0,torax,cintura,quadril
torax,6.63,6.37,3.0
cintura,6.37,12.53,3.58
quadril,3.0,3.58,5.94


In [31]:
# correlações entre as variáveis usando o pacote pandas
medidas.corr()

Unnamed: 0,torax,cintura,quadril
torax,1.0,0.7,0.48
cintura,0.7,1.0,0.41
quadril,0.48,0.41,1.0


## Transformando em matriz

In [32]:
# subselecionar variáveis e transformar em matriz
X = medidas.values[:, 0:3]
# mudar o tipo dos dados
X = np.asarray(X, dtype=float)

In [33]:
# ou
X = medidas.to_numpy()
X

array([[34, 30, 32],
       [37, 32, 37],
       [38, 30, 36],
       [36, 33, 39],
       [38, 29, 33],
       [43, 32, 38],
       [40, 33, 42],
       [38, 30, 40],
       [40, 30, 37],
       [41, 32, 39],
       [36, 24, 35],
       [36, 25, 37],
       [34, 24, 37],
       [33, 22, 34],
       [36, 26, 38],
       [37, 26, 37],
       [34, 25, 38],
       [36, 26, 37],
       [38, 28, 40],
       [35, 23, 35]])

In [34]:
# número de observações
n = X.shape[0]
n

20

In [35]:
# número de variáveis
p = X.shape[1]
p

3

Obter as matrizes de distâncias (dos três tipos vistos) entre as observações:

In [72]:
# apenas mostrar a matriz de distâncias euclidiana
squareform(pdist(X, metric='euclidean'))

array([[  0.  ,  42.74,  63.81,  34.85,  46.82,  66.56,  34.5 ,  24.08,
         77.25,  69.  ,  69.72,  42.57,  72.79,  25.28,  36.36,  37.91],
       [ 42.74,   0.  ,  46.49,  30.15,  31.26,  38.99,  20.52,  34.74,
         70.66,  31.78,  35.46,  59.03,  43.58,  29.08,   9.23,  27.97],
       [ 63.81,  46.49,   0.  ,  38.52,  49.59,  33.48,  38.92,  70.13,
         37.96,  37.8 ,  36.91,  93.76,  38.82,  47.92,  42.86,  61.24],
       [ 34.85,  30.15,  38.52,   0.  ,  46.2 ,  50.32,  23.19,  40.98,
         48.8 ,  48.89,  51.78,  62.81,  57.68,  26.87,  24.76,  40.73],
       [ 46.82,  31.26,  49.59,  46.2 ,   0.  ,  38.97,  24.56,  40.  ,
         79.19,  35.88,  33.25,  59.82,  32.96,  37.43,  30.17,  40.6 ],
       [ 66.56,  38.99,  33.48,  50.32,  38.97,   0.  ,  42.1 ,  64.65,
         70.06,  26.92,  22.95,  91.1 ,  22.9 ,  53.52,  40.18,  60.59],
       [ 34.5 ,  20.52,  38.92,  23.19,  24.56,  42.1 ,   0.  ,  34.06,
         60.86,  36.45,  37.89,  55.74,  42.44,  20.03,  1

In [37]:
# salvar a matriz de distâncias euclidiana
De = squareform(pdist(X, metric='euclidean'))

In [38]:
# salvar a matriz de distâncias euclidiana padronizada
Dep = squareform(pdist(X, metric='seuclidean'))
print(Dep)

[[0.   2.43 2.26 3.09 1.63 4.31 4.79 3.63 3.1  3.99 2.23 2.61 2.66 2.44
  2.82 2.62 2.84 2.47 3.67 2.36]
 [2.43 0.   0.8  0.95 1.89 2.37 2.38 1.41 1.29 1.76 2.44 2.02 2.54 3.45
  1.79 1.7  2.33 1.74 1.72 2.78]
 [2.26 0.8  0.   1.68 1.26 2.18 2.72 1.64 0.88 1.79 1.91 1.66 2.34 3.09
  1.6  1.26 2.25 1.43 1.74 2.33]
 [3.09 0.95 1.68 0.   2.82 2.76 1.98 1.22 1.95 1.96 3.03 2.4  2.78 3.9
  2.02 2.18 2.43 2.14 1.66 3.29]
 [1.63 1.89 1.26 2.82 0.   2.95 3.94 2.88 1.84 2.85 1.81 2.14 2.66 2.8
  2.35 1.89 2.81 2.   2.88 2.21]
 [4.31 2.37 2.18 2.76 2.95 0.   2.03 2.18 1.36 0.88 3.74 3.39 4.18 5.07
  3.2  2.91 4.02 3.23 2.39 4.2 ]
 [4.79 2.38 2.72 1.98 3.94 2.03 0.   1.41 2.22 1.32 4.14 3.42 4.01 5.27
  3.   3.08 3.64 3.24 1.81 4.47]
 [3.63 1.41 1.64 1.22 2.88 2.18 1.41 0.   1.46 1.36 2.77 2.03 2.61 3.86
  1.6  1.72 2.25 1.84 0.57 3.08]
 [3.1  1.29 0.88 1.95 1.84 1.36 2.22 1.46 0.   1.07 2.44 2.1  2.88 3.74
  1.96 1.62 2.76 1.92 1.56 2.89]
 [3.99 1.76 1.79 1.96 2.85 0.88 1.32 1.36 1.07 0.   3.4  

In [39]:
# salvar a matriz de distâncias de Mahalanobis
Dm = squareform(pdist(X, metric='mahalanobis'))
print(Dm)

[[0.   2.14 2.49 2.95 2.47 4.54 4.4  3.79 3.53 3.88 3.51 3.64 3.56 3.19
  3.66 3.61 3.66 3.32 4.25 3.55]
 [2.14 0.   1.34 1.36 2.49 3.3  2.28 1.83 2.29 2.19 2.83 2.47 2.5  2.9
  2.24 2.39 2.4  2.08 2.47 2.91]
 [2.49 1.34 0.   2.63 1.42 2.23 2.51 1.88 1.1  1.47 1.8  1.79 2.28 2.33
  1.86 1.52 2.46 1.51 2.14 2.04]
 [2.95 1.36 2.63 0.   3.84 4.38 2.3  2.11 3.43 3.1  3.83 3.2  2.91 3.65
  2.77 3.25 2.54 2.83 2.87 3.79]
 [2.47 2.49 1.42 3.84 0.   2.49 3.87 3.27 1.73 2.47 2.1  2.67 3.23 2.69
  2.97 2.38 3.57 2.53 3.37 2.43]
 [4.54 3.3  2.23 4.38 2.49 0.   3.02 2.91 1.22 1.35 2.77 2.91 3.78 3.9
  3.1  2.47 4.   2.9  2.78 3.17]
 [4.4  2.28 2.51 2.3  3.87 3.02 0.   1.   2.5  1.75 3.36 2.6  2.81 3.8
  2.17 2.5  2.54 2.4  1.49 3.44]
 [3.79 1.83 1.88 2.11 3.27 2.91 1.   0.   2.05 1.62 2.48 1.66 1.83 2.81
  1.2  1.64 1.62 1.43 0.8  2.5 ]
 [3.53 2.29 1.1  3.43 1.73 1.22 2.5  2.05 0.   0.88 1.77 1.86 2.66 2.77
  2.05 1.43 2.91 1.78 2.   2.14]
 [3.88 2.19 1.47 3.1  2.47 1.35 1.75 1.62 0.88 0.   2.41 2

## Outro dataframe: imrs.csv

In [40]:
mg = pd.read_csv('https://patriciasiqueira.github.io/arquivos/imrs.csv')

In [41]:
mg.head()

Unnamed: 0,ibge7,meso,nome_meso,micro,nome_micro,mun,tx_mort_acid_15_29,tx_mort_hom_15_29,tx_mort_mama,tx_mort_inf,...,rdpc,renocup,perc_rdpc_140,fam_pbf,tx_emp_form,rend_pc_form,pib_pc,perc_agro,perc_serv,tx_ativ_18m
0,3100104,3105,Triângulo Mineiro/Alto Paranaíba,31019,Patrocínio,Abadia dos Dourados,66.53,0.0,0.0,0.0,...,596.18,920.1,7.94,589,22.9,114.28,11744.69,37.9,26.86,65.97
1,3100203,3106,Central Mineira,31024,Três Marias,Abaeté,56.22,0.0,8.73,27.27,...,707.24,930.85,6.69,1428,27.5,172.19,11466.39,22.08,40.06,65.9
2,3100302,3112,Zona da Mata,31061,Manhuaçu,Abre Campo,30.9,0.0,0.0,27.21,...,444.32,685.72,23.33,1293,20.1,117.72,7998.32,45.8,31.92,63.8
3,3100401,3112,Zona da Mata,31060,Ponte Nova,Acaiaca,0.0,0.0,0.0,20.83,...,357.03,646.71,26.53,389,11.7,61.49,6049.45,25.16,41.87,57.71
4,3100500,3108,Vale do Rio Doce,31039,Ipatinga,Açucena,0.0,34.13,0.0,19.61,...,325.42,514.6,30.7,1172,10.3,52.78,5603.93,41.22,29.89,54.21


In [42]:
mg.shape

(853, 42)

In [43]:
# selecionar microrregião de Varginha
vgn = mg.query('nome_micro == "Varginha"')
vgn

Unnamed: 0,ibge7,meso,nome_meso,micro,nome_micro,mun,tx_mort_acid_15_29,tx_mort_hom_15_29,tx_mort_mama,tx_mort_inf,...,rdpc,renocup,perc_rdpc_140,fam_pbf,tx_emp_form,rend_pc_form,pib_pc,perc_agro,perc_serv,tx_ativ_18m
75,3107109,3110,Sul/Sudoeste de Minas,31050,Varginha,Boa Esperança,60.75,19.07,25.98,14.93,...,599.17,903.37,8.43,2835,24.8,140.03,13127.48,28.65,39.28,64.92
117,3110905,3110,Sul/Sudoeste de Minas,31050,Varginha,Campanha,25.91,0.0,13.03,6.21,...,686.16,1021.62,6.55,786,27.3,149.06,10611.67,33.56,29.17,70.14
122,3111309,3110,Sul/Sudoeste de Minas,31050,Varginha,Campo do Meio,0.0,32.32,17.71,26.09,...,472.71,709.35,8.82,1111,18.5,105.89,7836.89,48.22,28.09,67.53
125,3111606,3110,Sul/Sudoeste de Minas,31050,Varginha,Campos Gerais,29.9,26.98,22.16,3.04,...,503.91,744.47,9.73,2422,18.3,93.65,11985.82,57.93,25.34,70.96
150,3113909,3110,Sul/Sudoeste de Minas,31050,Varginha,Carmo da Cachoeira,32.41,0.0,0.0,33.11,...,525.35,822.31,9.56,987,27.7,133.98,11871.57,47.49,26.25,71.98
205,3118700,3110,Sul/Sudoeste de Minas,31050,Varginha,Coqueiral,0.0,0.0,21.47,32.97,...,495.77,714.57,13.64,862,15.1,71.0,11402.89,53.59,24.72,66.81
263,3123601,3110,Sul/Sudoeste de Minas,31050,Varginha,Elói Mendes,31.99,15.38,7.85,16.08,...,577.5,921.97,10.46,1194,25.0,155.01,13057.12,31.12,34.04,66.07
318,3128105,3110,Sul/Sudoeste de Minas,31050,Varginha,Guapé,59.26,0.0,15.17,7.94,...,487.68,742.73,7.41,1194,18.4,102.09,8945.31,43.65,27.0,66.59
345,3130507,3110,Sul/Sudoeste de Minas,31050,Varginha,Ilicínea,0.0,65.0,17.77,7.14,...,500.79,784.59,6.93,869,28.5,156.11,9364.19,45.01,22.27,68.14
491,3142601,3110,Sul/Sudoeste de Minas,31050,Varginha,Monsenhor Paulo,0.0,0.0,0.0,18.52,...,555.78,854.92,5.17,492,32.8,194.62,14457.2,40.17,22.52,71.95


In [44]:
vgn.shape

(16, 42)

In [45]:
# zerar os índices
vgn = vgn.reset_index().drop('index', axis=1)

In [46]:
vgn

Unnamed: 0,ibge7,meso,nome_meso,micro,nome_micro,mun,tx_mort_acid_15_29,tx_mort_hom_15_29,tx_mort_mama,tx_mort_inf,...,rdpc,renocup,perc_rdpc_140,fam_pbf,tx_emp_form,rend_pc_form,pib_pc,perc_agro,perc_serv,tx_ativ_18m
0,3107109,3110,Sul/Sudoeste de Minas,31050,Varginha,Boa Esperança,60.75,19.07,25.98,14.93,...,599.17,903.37,8.43,2835,24.8,140.03,13127.48,28.65,39.28,64.92
1,3110905,3110,Sul/Sudoeste de Minas,31050,Varginha,Campanha,25.91,0.0,13.03,6.21,...,686.16,1021.62,6.55,786,27.3,149.06,10611.67,33.56,29.17,70.14
2,3111309,3110,Sul/Sudoeste de Minas,31050,Varginha,Campo do Meio,0.0,32.32,17.71,26.09,...,472.71,709.35,8.82,1111,18.5,105.89,7836.89,48.22,28.09,67.53
3,3111606,3110,Sul/Sudoeste de Minas,31050,Varginha,Campos Gerais,29.9,26.98,22.16,3.04,...,503.91,744.47,9.73,2422,18.3,93.65,11985.82,57.93,25.34,70.96
4,3113909,3110,Sul/Sudoeste de Minas,31050,Varginha,Carmo da Cachoeira,32.41,0.0,0.0,33.11,...,525.35,822.31,9.56,987,27.7,133.98,11871.57,47.49,26.25,71.98
5,3118700,3110,Sul/Sudoeste de Minas,31050,Varginha,Coqueiral,0.0,0.0,21.47,32.97,...,495.77,714.57,13.64,862,15.1,71.0,11402.89,53.59,24.72,66.81
6,3123601,3110,Sul/Sudoeste de Minas,31050,Varginha,Elói Mendes,31.99,15.38,7.85,16.08,...,577.5,921.97,10.46,1194,25.0,155.01,13057.12,31.12,34.04,66.07
7,3128105,3110,Sul/Sudoeste de Minas,31050,Varginha,Guapé,59.26,0.0,15.17,7.94,...,487.68,742.73,7.41,1194,18.4,102.09,8945.31,43.65,27.0,66.59
8,3130507,3110,Sul/Sudoeste de Minas,31050,Varginha,Ilicínea,0.0,65.0,17.77,7.14,...,500.79,784.59,6.93,869,28.5,156.11,9364.19,45.01,22.27,68.14
9,3142601,3110,Sul/Sudoeste de Minas,31050,Varginha,Monsenhor Paulo,0.0,0.0,0.0,18.52,...,555.78,854.92,5.17,492,32.8,194.62,14457.2,40.17,22.52,71.95


In [47]:
# selecionar colunas específicas
vgn = vgn.iloc[:, [5, 6, 7, 8, 9, 11, 12]]

In [48]:
vgn.columns

Index(['mun', 'tx_mort_acid_15_29', 'tx_mort_hom_15_29', 'tx_mort_mama',
       'tx_mort_inf', 'tx_analf_25m', 'prop_fund_25m'],
      dtype='object')

In [49]:
# pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.select_dtypes.html
# selecionar apenas as variáveis do tipo float
vgn.select_dtypes(include=['float64'])

Unnamed: 0,tx_mort_acid_15_29,tx_mort_hom_15_29,tx_mort_mama,tx_mort_inf,tx_analf_25m,prop_fund_25m
0,60.75,19.07,25.98,14.93,9.6,36.9
1,25.91,0.0,13.03,6.21,9.8,39.3
2,0.0,32.32,17.71,26.09,12.5,34.7
3,29.9,26.98,22.16,3.04,14.1,32.0
4,32.41,0.0,0.0,33.11,12.4,33.4
5,0.0,0.0,21.47,32.97,10.4,31.5
6,31.99,15.38,7.85,16.08,14.0,36.9
7,59.26,0.0,15.17,7.94,10.5,30.0
8,0.0,65.0,17.77,7.14,11.8,31.0
9,0.0,0.0,0.0,18.52,13.8,38.1


In [50]:
# mudando o dataframe para ter só as variáveis float
vgn_num = vgn.select_dtypes(include=['float64'])

In [51]:
vgn_num.shape

(16, 6)

In [52]:
# transformar em matriz
X = vgn_num.values

In [53]:
X.dtype

dtype('float64')

Distância euclidiana

In [54]:
De = squareform(pdist(X, metric='euclidean'))
De

array([[  0.  ,  42.74,  63.81,  34.85,  46.82,  66.56,  34.5 ,  24.08,
         77.25,  69.  ,  69.72,  42.57,  72.79,  25.28,  36.36,  37.91],
       [ 42.74,   0.  ,  46.49,  30.15,  31.26,  38.99,  20.52,  34.74,
         70.66,  31.78,  35.46,  59.03,  43.58,  29.08,   9.23,  27.97],
       [ 63.81,  46.49,   0.  ,  38.52,  49.59,  33.48,  38.92,  70.13,
         37.96,  37.8 ,  36.91,  93.76,  38.82,  47.92,  42.86,  61.24],
       [ 34.85,  30.15,  38.52,   0.  ,  46.2 ,  50.32,  23.19,  40.98,
         48.8 ,  48.89,  51.78,  62.81,  57.68,  26.87,  24.76,  40.73],
       [ 46.82,  31.26,  49.59,  46.2 ,   0.  ,  38.97,  24.56,  40.  ,
         79.19,  35.88,  33.25,  59.82,  32.96,  37.43,  30.17,  40.6 ],
       [ 66.56,  38.99,  33.48,  50.32,  38.97,   0.  ,  42.1 ,  64.65,
         70.06,  26.92,  22.95,  91.1 ,  22.9 ,  53.52,  40.18,  60.59],
       [ 34.5 ,  20.52,  38.92,  23.19,  24.56,  42.1 ,   0.  ,  34.06,
         60.86,  36.45,  37.89,  55.74,  42.44,  20.03,  1

In [55]:
# usando scipy
squareform(pdist(X, metric='euclidean'))

array([[  0.  ,  42.74,  63.81,  34.85,  46.82,  66.56,  34.5 ,  24.08,
         77.25,  69.  ,  69.72,  42.57,  72.79,  25.28,  36.36,  37.91],
       [ 42.74,   0.  ,  46.49,  30.15,  31.26,  38.99,  20.52,  34.74,
         70.66,  31.78,  35.46,  59.03,  43.58,  29.08,   9.23,  27.97],
       [ 63.81,  46.49,   0.  ,  38.52,  49.59,  33.48,  38.92,  70.13,
         37.96,  37.8 ,  36.91,  93.76,  38.82,  47.92,  42.86,  61.24],
       [ 34.85,  30.15,  38.52,   0.  ,  46.2 ,  50.32,  23.19,  40.98,
         48.8 ,  48.89,  51.78,  62.81,  57.68,  26.87,  24.76,  40.73],
       [ 46.82,  31.26,  49.59,  46.2 ,   0.  ,  38.97,  24.56,  40.  ,
         79.19,  35.88,  33.25,  59.82,  32.96,  37.43,  30.17,  40.6 ],
       [ 66.56,  38.99,  33.48,  50.32,  38.97,   0.  ,  42.1 ,  64.65,
         70.06,  26.92,  22.95,  91.1 ,  22.9 ,  53.52,  40.18,  60.59],
       [ 34.5 ,  20.52,  38.92,  23.19,  24.56,  42.1 ,   0.  ,  34.06,
         60.86,  36.45,  37.89,  55.74,  42.44,  20.03,  1

In [56]:
# distância entre Varginha e Elói Mendes - pelo índice
De[15, 6]

np.float64(29.598118183425107)

In [57]:
# incluir nomes dos municípios nas distâncias - criar novo dataframe
Dedf = pd.DataFrame(De, index=vgn.mun, columns=vgn.mun)

In [58]:
Dedf

mun,Boa Esperança,Campanha,Campo do Meio,Campos Gerais,Carmo da Cachoeira,Coqueiral,Elói Mendes,Guapé,Ilicínea,Monsenhor Paulo,Santana da Vargem,São Bento Abade,São Tomé das Letras,Três Corações,Três Pontas,Varginha
mun,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Boa Esperança,0.0,42.74,63.81,34.85,46.82,66.56,34.5,24.08,77.25,69.0,69.72,42.57,72.79,25.28,36.36,37.91
Campanha,42.74,0.0,46.49,30.15,31.26,38.99,20.52,34.74,70.66,31.78,35.46,59.03,43.58,29.08,9.23,27.97
Campo do Meio,63.81,46.49,0.0,38.52,49.59,33.48,38.92,70.13,37.96,37.8,36.91,93.76,38.82,47.92,42.86,61.24
Campos Gerais,34.85,30.15,38.52,0.0,46.2,50.32,23.19,40.98,48.8,48.89,51.78,62.81,57.68,26.87,24.76,40.73
Carmo da Cachoeira,46.82,31.26,49.59,46.2,0.0,38.97,24.56,40.0,79.19,35.88,33.25,59.82,32.96,37.43,30.17,40.6
Coqueiral,66.56,38.99,33.48,50.32,38.97,0.0,42.1,64.65,70.06,26.92,22.95,91.1,22.9,53.52,40.18,60.59
Elói Mendes,34.5,20.52,38.92,23.19,24.56,42.1,0.0,34.06,60.86,36.45,37.89,55.74,42.44,20.03,14.33,29.6
Guapé,24.08,34.74,70.13,40.98,40.0,64.65,34.06,0.0,88.02,62.69,64.17,29.08,68.42,34.12,33.22,34.93
Ilicínea,77.25,70.66,37.96,48.8,79.19,70.06,60.86,88.02,0.0,68.74,70.25,106.42,74.21,64.97,66.03,79.32
Monsenhor Paulo,69.0,31.78,37.8,48.89,35.88,26.92,36.45,62.69,68.74,0.0,8.53,84.44,19.71,50.95,35.05,51.2


In [59]:
# escolher municípios para saber as distâncias
Dedf.loc['Varginha', 'Elói Mendes']

np.float64(29.598118183425107)

In [60]:
Dedf.loc['Varginha', 'Três Corações']

np.float64(21.22158099671182)

In [61]:
def compara_dist(df, op='max'):
    if op == 'max':
        maximo = np.amax(df.values)
        a = np.where(df.values == maximo)
    else:
        df = df[df > 0]
        minimo = np.nanmin(df.values)
        a = np.where(df.values == minimo)
    return [df.index[a[0][0]], df.index[a[0][1]]]

In [62]:
# municípios mais semelhantes de acordo com a distância euclidiana
compara_dist(Dedf,'min')

['Monsenhor Paulo', 'Santana da Vargem']

In [63]:
# municípios mais diferentes de acordo com a distância euclidiana
compara_dist(Dedf)

['Ilicínea', 'São Bento Abade']

Distância euclidiana padronizada

In [64]:
Dep = squareform(pdist(X, metric='seuclidean'))

In [65]:
# incluir nomes dos municípios nas distâncias - euclidiana padronizada
Dedf_ep = pd.DataFrame(Dep, index=vgn.mun, columns=vgn.mun)

In [66]:
# municípios mais semelhantes de acordo com a distância euclidiana padronizada
compara_dist(Dedf_ep,'min')

['Campanha', 'Três Pontas']

In [67]:
# municípios mais diferentes de acordo com a distância euclidiana padronizada
compara_dist(Dedf_ep)

['São Tomé das Letras', 'Varginha']

Distância de Mahalanobis

In [68]:
Dm = squareform(pdist(X, metric='mahalanobis'))

In [69]:
# incluir nomes dos municípios nas distâncias - Mahalanobis
Dedf_m = pd.DataFrame(Dm, index=vgn.mun, columns=vgn.mun)

In [70]:
# municípios mais semelhantes de acordo com a distância de Mahalanobis
compara_dist(Dedf_m,'min')

['Campanha', 'Três Pontas']

In [71]:
# municípios mais diferentes de acordo com a distância de Mahalanobis
compara_dist(Dedf_m)

['Coqueiral', 'São Bento Abade']