In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.widgets import Slider
from IPython.display import display
import glob
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

In [23]:
# PCA over all the frames - flattened

# importing all the data, getting all the frames

TRAINING_DATA_PATH = "../data/training_data/"
files = sorted(glob.glob(TRAINING_DATA_PATH + "mixed_samples/*.npy"))

heart_samples = []

for f in files:
    data = np.load(f)
    heart_samples.append(data)
heart_samples = np.array(heart_samples)
 
print(heart_samples.shape)

heart_xyz = heart_samples[..., :3]
X = heart_xyz.reshape(len(heart_xyz), -1)
print(X.shape)

# Importing demographic data
demographics = pd.read_csv(TRAINING_DATA_PATH + "mixed_demographics.csv")

# Doing PCA
pca = PCA(n_components = 50)
X_pca = pca.fit_transform(X)
print(pca.explained_variance_ratio_)
print(X_pca.shape)

(900, 10, 18000, 4)
(900, 540000)
[0.30516536 0.2456048  0.09653941 0.04462171 0.02640237 0.01987551
 0.01842039 0.01475463 0.01414272 0.01354737 0.0124781  0.01121955
 0.01086073 0.00971639 0.00848699 0.00800505 0.00715335 0.00680103
 0.00578201 0.00484285 0.00412508 0.00342057 0.00253414 0.00245197
 0.00238891 0.00235307 0.00233917 0.00232312 0.00230992 0.00229459
 0.0022599  0.00222637 0.00221538 0.00210933 0.00210195 0.00203267
 0.00201275 0.00197166 0.00188905 0.00183544 0.00179916 0.00175936
 0.00171406 0.00169335 0.00167131 0.00162858 0.0016075  0.00158349
 0.00153157 0.00149262]
(900, 50)


In [None]:
# plots for PCA - simular to Cayla's done


Unnamed: 0,age,BMI,height,weight,diastolic_BP,systolic_BP,sex
0,62.593012,25.312186,161.647749,64.949107,76.057670,137.765801,False
1,58.037959,32.100795,172.543391,94.715477,84.272113,143.459426,False
2,65.840496,26.099055,166.521430,71.650562,75.340419,137.100866,False
3,68.518300,27.672836,166.176884,75.779414,75.697258,140.518093,True
4,55.591381,27.266429,161.314821,70.185424,79.973837,133.740780,False
...,...,...,...,...,...,...,...
895,68.332345,26.048104,162.473175,68.271982,77.094380,138.305894,True
896,64.392429,28.965176,167.947292,81.322657,79.736772,141.375206,False
897,63.653352,26.534891,168.596297,75.347960,77.308576,137.919129,True
898,66.775260,27.679240,175.418123,84.825322,78.065167,141.577103,True


In [72]:
# linear regression on point cloud data
y = demographics["MI"]

X_demographic = demographics.iloc[:, 2:]

X_demographic = X_demographic.to_numpy()

X_combined = np.concatenate((X_demographic, X_pca[:, :2]), axis=1)

print(X_combined.shape)
print(X_combined)

X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)  # fit on train only
X_test_std  = scaler.transform(X_test)       # transform test with same params

std_logreg = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000))
])
std_logreg.fit(X_train, y_train)

# Evaluate
y_pred = std_logreg.predict(X_test)

print(classification_report(y_test, y_pred))


(900, 9)
[[62.5930124021224 25.3121861630296 161.647749043888 ... False
  4156.627826479258 -1628.2332542895815]
 [58.0379591747181 32.100795024584 172.54339126402 ... False
  2019.779457071236 -738.4617728987699]
 [65.8404960124373 26.099054502369 166.521429978982 ... False
  1221.0679816844329 -235.0910859194748]
 ...
 [63.6533519003217 26.5348912809169 168.596296693957 ... True
  -2381.9165380271547 -2715.431826003892]
 [66.7752601611179 27.6792404474642 175.418122947101 ... True
  3807.3213540189477 -961.1933815234325]
 [62.6491378887632 27.5465146410527 159.997560083595 ... False
  623.0745550058365 -1044.0661725200005]]
              precision    recall  f1-score   support

     healthy       0.77      0.79      0.78        90
         pMI       0.78      0.77      0.78        90

    accuracy                           0.78       180
   macro avg       0.78      0.78      0.78       180
weighted avg       0.78      0.78      0.78       180



In [58]:
# K-means clusting to try and figure out hearts that aren't anatomicall possible
y = demographics["MI"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize KNN model (k = 2), try and cluster "anatomically possible"
knn = KNeighborsClassifier(n_neighbors=2)

# Train the model
knn.fit(X_train, y_train)

# Predict on test data
y_pred = knn.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5222222222222223
Classification Report:
               precision    recall  f1-score   support

     healthy       0.51      0.77      0.62        90
         pMI       0.54      0.28      0.37        90

    accuracy                           0.52       180
   macro avg       0.53      0.52      0.49       180
weighted avg       0.53      0.52      0.49       180

