In [119]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean

In [120]:
import os

os.getcwd()

'C:\\Users\\Lance\\Documents\\Homework\\CS470\\HW1'

In [121]:
df_orig = pd.read_csv("data/data.csv")
df_orig.head()

Unnamed: 0,person ID,age,gender,chest pain type,resting blood pressure,serum cholesterol in mg/dl,fasting blood sugar > 120 mg/dl,resting electrocardiographic results,maximum heart rate achieved,exercise induced angina,oldpeak = ST depression induced by exercise relative to rest,the slope of the peak exercise ST segment,number of major vessels (0-3) colored by flourosopy,thal: 3 = normal; 6 = fixed defect; 7 = reversable defect,Has heart disease?
0,1,70,M,Type 4,130,322,No,hypertrophy of heart,109,No,2.4,2,3,3,Yes
1,2,67,F,Type 3,115,564,No,hypertrophy of heart,160,No,1.6,2,0,7,No
2,3,57,M,Type 2,124,261,No,myocardial infarction,141,No,0.3,1,0,7,Yes
3,4,64,M,Type 4,128,263,No,myocardial infarction,105,Yes,0.2,2,1,7,No
4,5,74,F,Type 2,120,269,No,hypertrophy of heart,121,Yes,0.2,1,1,3,No


In [122]:
print(df_orig.columns)
print()
print(df_orig.shape)

Index(['person ID', 'age', 'gender', 'chest pain type',
       'resting blood pressure', 'serum cholesterol in mg/dl',
       'fasting blood sugar > 120 mg/dl',
       'resting electrocardiographic results', 'maximum heart rate achieved',
       'exercise induced angina',
       'oldpeak = ST depression induced by exercise relative to rest',
       'the slope of the peak exercise ST segment',
       'number of major vessels (0-3) colored by flourosopy',
       'thal: 3 = normal; 6 = fixed defect; 7 = reversable defect',
       'Has heart disease?'],
      dtype='object')

(100, 15)


## Computing proximity metrics

In [123]:
df = df_orig.select_dtypes(include=["number"])
print(df.columns)
X = df.values
n, d = df.shape
print(f"Shape of df:\t({n},{d})")

Index(['person ID', 'age', 'resting blood pressure',
       'serum cholesterol in mg/dl', 'maximum heart rate achieved',
       'oldpeak = ST depression induced by exercise relative to rest',
       'the slope of the peak exercise ST segment',
       'number of major vessels (0-3) colored by flourosopy',
       'thal: 3 = normal; 6 = fixed defect; 7 = reversable defect'],
      dtype='object')
Shape of df:	(100,9)


### Computations

We will compute the pair-wise correlation between all possible pairs and store them in a matrix. We can compute just one triangular section of the matrix since these metrics are commutative.

In [124]:
corrmat = np.zeros((n, n))
cosmat = np.zeros((n,n))
distmat = np.zeros((n,n))

In [125]:
def compute_corr(x, y, n, d):    
    xbar = np.mean(x)
    ybar = np.mean(y)

    cov = 0
    std_x = 0
    std_y = 0
    
    for k in range(d):
        cov += (x[k] - xbar) * (y[k] - ybar)
        std_x += (x[k] - xbar) ** 2
        std_y += (y[k] - ybar) ** 2
        
    cov /= n-1
    std_x = np.sqrt(std_x / (n-1))
    std_y = np.sqrt(std_y / (n-1))

    corr = cov / (std_x * std_y)

    return corr

In [126]:
def compute_cos_sim(x, y, d):
    top = 0

    for i in range(d):
        top += x[i] * y[i]

    bx, by = 0, 0
    
    for i in range(d):
        bx += x[i] ** 2
        by += y[i] ** 2
    bottom = np.sqrt(bx) * np.sqrt(by)

    return top/bottom

In [127]:
for i in range(n):
    corrmat[i, i] = 1
    cosmat[i, i] = 1
    for j in range(i+1, n):
        # Correlation
        x, y = X[i], X[j]
        corr = compute_corr(x, y, n, d)
        corrmat[i, j] = corr
        corrmat[j, i] = corr

        # Cosine
        cos = compute_cos_sim(x, y, d)
        cosmat[i, j] = cos
        cosmat[j, i] = cos

        # Euclidean Distance
        dist = 0
        for k in range(d):
            dist += (x[k] - y[k]) ** 2
        dist = np.sqrt(dist)
        distmat[i, j] = dist
        distmat[j, i] = dist
        
# c2 = np.zeros((n, n))
# for i in range(n):
#     for j in range(i+1, n):
#         c2[i][j] = pearsonr(X[i], X[j], )[0]
#         c2[j][i] = c2[i][j]
# print(c2)

# corr_matrix = np.corrcoef(X, rowvar=True)
# print(corr_matrix)

In [128]:
print(corrmat)
np.savetxt("data/correlation_matrix.csv", corrmat, delimiter=",")
print(corr_check)

[[1.         0.97831831 0.98160717 ... 0.88937292 0.94992342 0.89318022]
 [0.97831831 1.         0.94695949 ... 0.87570185 0.9330628  0.87063766]
 [0.98160717 0.94695949 1.         ... 0.92201783 0.94554064 0.92814045]
 ...
 [0.88937292 0.87570185 0.92201783 ... 1.         0.97246885 0.99695477]
 [0.94992342 0.9330628  0.94554064 ... 0.97246885 1.         0.97554645]
 [0.89318022 0.87063766 0.92814045 ... 0.99695477 0.97554645 1.        ]]
[[1.         0.96371798 0.89524298 ... 0.88937292 0.94992342 0.89318022]
 [0.96371798 1.         0.92395532 ... 0.87008178 0.92723929 0.87017587]
 [0.89524298 0.92395532 1.         ... 0.87203904 0.86024171 0.84836876]
 ...
 [0.88937292 0.87008178 0.87203904 ... 1.         0.97246885 0.99695477]
 [0.94992342 0.92723929 0.86024171 ... 0.97246885 1.         0.97554645]
 [0.89318022 0.87017587 0.84836876 ... 0.99695477 0.97554645 1.        ]]


In [129]:
print(cosmat)
np.savetxt("data/cosine_matrix.csv", cosmat, delimiter=",")

[[1.         0.98144694 0.98742449 ... 0.92477917 0.96443456 0.92774033]
 [0.98144694 1.         0.95683736 ... 0.89823719 0.94063923 0.89636215]
 [0.98742449 0.95683736 1.         ... 0.94966677 0.96539584 0.95376808]
 ...
 [0.92477917 0.89823719 0.94966677 ... 1.         0.98397454 0.99831606]
 [0.96443456 0.94063923 0.96539584 ... 0.98397454 1.         0.98589518]
 [0.92774033 0.89636215 0.95376808 ... 0.99831606 0.98589518 1.        ]]


In [130]:
print(distmat)
np.savetxt("data/dist_matrix.csv", distmat, delimiter=",")

[[  0.         247.84196578  70.60035411 ... 142.94156149 102.11679588
  138.5845951 ]
 [247.84196578   0.         303.89914445 ... 344.15439849 271.61730431
  335.17047901]
 [ 70.60035411 303.89914445   0.         ... 102.74745739 110.85580725
  101.09718097]
 ...
 [142.94156149 344.15439849 102.74745739 ...   0.          91.69520162
   24.93992783]
 [102.11679588 271.61730431 110.85580725 ...  91.69520162   0.
   78.84040842]
 [138.5845951  335.17047901 101.09718097 ...  24.93992783  78.84040842
    0.        ]]


In [131]:
corr_check = np.eye(n)
cos_check = np.eye(n)
dist_check = np.zeros((n, n))

for i in range(n):
    for j in range(i + 1, n):
        x, y = X[i], X[j]
        
        # Pearson correlation
        corr, _ = pearsonr(x, y)
        corr_check[i, j] = corr_check[j, i] = corr

        # Cosine similarity
        cos = cosine_similarity([x], [y])[0, 0]
        cos_check[i, j] = cos_check[j, i] = cos

        # Euclidean distance
        dist = euclidean(x, y)
        dist_check[i, j] = dist_check[j, i] = dist

# Check if your matrices match the library results
assert np.allclose(corrmat, corr_check, atol=1e-6), "Correlation matrix mismatch!"
assert np.allclose(cosmat, cos_check, atol=1e-6), "Cosine similarity matrix mismatch!"
assert np.allclose(distmat, dist_check, atol=1e-6), "Euclidean distance matrix mismatch!"

## Summary Statistics

In [138]:
# print(df_orig.columns)

curated_features = [
    'age',
    'resting blood pressure',
    'serum cholesterol in mg/dl', 'maximum heart rate achieved',
    'oldpeak = ST depression induced by exercise relative to rest',
    'the slope of the peak exercise ST segment',
    'number of major vessels (0-3) colored by flourosopy',
    'thal: 3 = normal; 6 = fixed defect; 7 = reversable defect'
]

df_stats = df_orig[curated_features]
print(df_stats.columns)
print(df_stats.shape)

Index(['age', 'resting blood pressure', 'serum cholesterol in mg/dl',
       'maximum heart rate achieved',
       'oldpeak = ST depression induced by exercise relative to rest',
       'the slope of the peak exercise ST segment',
       'number of major vessels (0-3) colored by flourosopy',
       'thal: 3 = normal; 6 = fixed defect; 7 = reversable defect'],
      dtype='object')
(100, 8)
