# Proximity measures

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import io
import requests
import numpy as np

## Distance Matrix

In [None]:
from scipy.spatial.distance import cdist
def get_distance_matrix(X):
    dist_mat = cdist(X, X, metric="minkowski", p=1)
    return pd.DataFrame(dist_mat)

## L Norms

In [4]:
def l1(a,b):
    res = 0
    for x,y in zip(a,b):
        res += abs(x-y)
    return res

def l2(a,b):
    res = 0
    for x,y in zip(a,b):
        res += (x-y)**2
    return np.sqrt(res)

def lmax(a,b):
    res = []
    for x,y in zip(a,b):
        res.append(abs(x-y))
    return max(res)

In [22]:
a = np.array([4,5,9])
b = np.array([3,3,7])
print(l2(a,b))
print(np.linalg.norm(a-b,ord=2))

3.0
3.0


In [24]:
df = pd.read_table('../data/table3.txt', delim_whitespace=True, index_col=0)
# df = df.apply(np.linalg.norm, axis=1)
df

Unnamed: 0_level_0,x,y
dataset1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,10,4
CM,5,15
c,3,-6
d,1,11


In [26]:
print("l1(a,b) = |x_a - x_b| + |y_a - y_b|")
for i in range(len(df)):
    for rest in range(len(df)-i):
        a = df.iloc[i,:]
        b = df.iloc[rest+i,:]
        print(f"l1({df.index[i]},{df.index[rest+i]}) = {l1(a,b)}")

print("\nl2(a,b) = sqrt((x_a - x_b)**2 + (y_a - y_b)**2)")
for i in range(len(df)):
    for rest in range(len(df)-i):
        a = df.iloc[i,:]
        b = df.iloc[rest+i,:]
        print(f"l2({df.index[i]},{df.index[rest+i]}) = {l2(a,b)}")

l1(a,b) = |x_a - x_b| + |y_a - y_b|
l1(a,a) = 0
l1(a,CM) = 16
l1(a,c) = 17
l1(a,d) = 16
l1(CM,CM) = 0
l1(CM,c) = 23
l1(CM,d) = 8
l1(c,c) = 0
l1(c,d) = 19
l1(d,d) = 0

l2(a,b) = sqrt((x_a - x_b)**2 + (y_a - y_b)**2)
l2(a,a) = 0.0
l2(a,CM) = 12.083045973594572
l2(a,c) = 12.206555615733702
l2(a,d) = 11.40175425099138
l2(CM,CM) = 0.0
l2(CM,c) = 21.095023109728988
l2(CM,d) = 5.656854249492381
l2(c,c) = 0.0
l2(c,d) = 17.11724276862369
l2(d,d) = 0.0


## jaccard, simple match

In [27]:
df = pd.read_table('../data/table4.txt', delim_whitespace=True, index_col=0)
df

Unnamed: 0_level_0,test1,test2,test3,test4,test5
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alice,1,0,1,0,0
Bob,0,0,1,0,0
Charlie,1,1,0,1,1


In [28]:
def simple_match(a,b):
    q,t,s,r = 0,0,0,0
    for x,y in zip(a,b):
        if x==1 and y==1: q+=1
        if x==0 and y==0: t+=1
        if x==1 and y==0: r+=1
        if x==0 and y==1: s+=1
    print(f"q={q},t={t},r={r},s={s}")
    return (r+s)/(q+r+s)

def jaccard(a,b):
    q,t,s,r = 0,0,0,0
    for x,y in zip(a,b):
        if x==1 and y==1: q+=1
        if x==0 and y==0: t+=1
        if x==1 and y==0: r+=1
        if x==0 and y==1: s+=1
    print(f"q={q},t={t},r={r},s={s}")
    return (q)/(q+r+s)

In [29]:
print("simple_match = (r+s)/(q+r+s)")
for i in range(len(df)):
    for rest in range(len(df)):
            a = df.iloc[i,:]
            b = df.iloc[rest,:]
            print(f"simple_match({df.index[i]},{df.index[rest]}) = {simple_match(a,b)}")

simple_match = (r+s)/(q+r+s)
q=2,t=3,r=0,s=0
simple_match(Alice,Alice) = 0.0
q=1,t=3,r=1,s=0
simple_match(Alice,Bob) = 0.5
q=1,t=0,r=1,s=3
simple_match(Alice,Charlie) = 0.8
q=1,t=3,r=0,s=1
simple_match(Bob,Alice) = 0.5
q=1,t=4,r=0,s=0
simple_match(Bob,Bob) = 0.0
q=0,t=0,r=1,s=4
simple_match(Bob,Charlie) = 1.0
q=1,t=0,r=3,s=1
simple_match(Charlie,Alice) = 0.8
q=0,t=0,r=4,s=1
simple_match(Charlie,Bob) = 1.0
q=4,t=1,r=0,s=0
simple_match(Charlie,Charlie) = 0.0


In [30]:
print("jaccard = (q)/(q+r+s)")
for i in range(len(df)):
    for rest in range(len(df)):
            a = df.iloc[i,:]
            b = df.iloc[rest,:]
            print(f"jaccard({df.index[i]},{df.index[rest]}) = {jaccard(a,b)}")

jaccard = (q)/(q+r+s)
q=2,t=3,r=0,s=0
jaccard(Alice,Alice) = 1.0
q=1,t=3,r=1,s=0
jaccard(Alice,Bob) = 0.5
q=1,t=0,r=1,s=3
jaccard(Alice,Charlie) = 0.2
q=1,t=3,r=0,s=1
jaccard(Bob,Alice) = 0.5
q=1,t=4,r=0,s=0
jaccard(Bob,Bob) = 1.0
q=0,t=0,r=1,s=4
jaccard(Bob,Charlie) = 0.0
q=1,t=0,r=3,s=1
jaccard(Charlie,Alice) = 0.2
q=0,t=0,r=4,s=1
jaccard(Charlie,Bob) = 0.0
q=4,t=1,r=0,s=0
jaccard(Charlie,Charlie) = 1.0


## simple match categorical

In [31]:
df = pd.read_table('../data/table5.txt', delim_whitespace=True, index_col=0)
df

Unnamed: 0_level_0,EyeColor,Gender,Age,Department
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alice,green,male,21,Maths
Bob,blue,male,19,Maths
Charlie,brown,diverse,25,Maths


In [32]:
def simple_match_categorical(a,b):
    m,p = 0,0
    for x,y in zip(a,b):
        if x==y: 
            m +=1
    p = len(a)
    print(f"p={p},m={m}")
    return (p-m)/p

In [33]:
print("simple_match_categorical = (p-m)/p")
for i in range(len(df)):
    for rest in range(len(df)):
            a = df.iloc[i,:]
            b = df.iloc[rest,:]
            print(f"simple_match_categorical({df.index[i]},{df.index[rest]}) = {simple_match_categorical(a,b)}")

simple_match_categorical = (p-m)/p
p=4,m=4
simple_match_categorical(Alice,Alice) = 0.0
p=4,m=2
simple_match_categorical(Alice,Bob) = 0.5
p=4,m=1
simple_match_categorical(Alice,Charlie) = 0.75
p=4,m=2
simple_match_categorical(Bob,Alice) = 0.5
p=4,m=4
simple_match_categorical(Bob,Bob) = 0.0
p=4,m=1
simple_match_categorical(Bob,Charlie) = 0.75
p=4,m=1
simple_match_categorical(Charlie,Alice) = 0.75
p=4,m=1
simple_match_categorical(Charlie,Bob) = 0.75
p=4,m=4
simple_match_categorical(Charlie,Charlie) = 0.0
