In [32]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import minkowski

In [4]:
dataset = pd.read_csv('audio_parkinsons_data.csv')
dataset.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [9]:
data = dataset[['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
       'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
       'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
       'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA',
       'spread1', 'spread2', 'D2', 'PPE', 'status']]

In [19]:
# Standardising the data from 0 to 1

scaler = StandardScaler()
scaler.fit(data.drop('status', axis=1))
scaled_features = scaler.transform(data.drop('status', axis=1)) 
data_scaled = pd.DataFrame(scaled_features, columns=data.columns[:-1])

# Adding the status column back to the scaled data
data_scaled['status'] = data['status']

Matrix column entries (attributes):  
  
*name* - ASCII subject name and recording number  
  
*MDVP:Fo(Hz)* - Average vocal fundamental frequency  
  
*MDVP:Fhi(Hz)* - Maximum vocal fundamental frequency  
  
*MDVP:Flo(Hz)* - Minimum vocal fundamental frequency  
  
*MDVP:Jitter(%), MDVP:Jitter(Abs), MDVP:RAP, MDVP:PPQ, Jitter:DDP* - Several measures of variation in fundamental frequency  
  
*MDVP:Shimmer, MDVP:Shimmer(dB), Shimmer:APQ3, Shimmer:APQ5, MDVP:APQ, Shimmer:DDA* - Several measures of variation in amplitude  
  
*NHR, HNR* - Two measures of ratio of noise to tonal components in the voice  
  
*status* - (one) : Parkinson's, (zero) : healthy  
  
*RPDE, D2* - Two nonlinear dynamical complexity measures  
  
*DFA* - Signal fractal scaling exponent  
  
*spread1, spread2, PPE* - Three nonlinear measures of fundamental frequency variation

In [27]:
# Splitting the data into patient and healthy based on the status column
patient = data_scaled[data_scaled['status'] == 1]
healthy = (data_scaled[data_scaled['status'] == 0]).drop('status', axis=1)
print("Number of patients: ", len(patient))
print("Number of healthy: ", len(healthy))

Number of patients:  147
Number of healthy:  48


In [36]:
# Splitting the healthy data into training and testing data

train = healthy.sample(frac=0.8, random_state=200)
test = healthy.drop(train.index)

# Adding the status column back to the testing data
test = test.assign(status = 0)
test

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE,status
34,1.185825,0.158028,1.839115,-0.918269,-1.00652,-0.799421,-0.86717,-0.798374,-1.070213,-1.014787,...,-1.084978,-0.600205,2.528321,-1.528505,0.431207,-2.097268,-0.757184,-2.510472,-1.802384,0
48,-0.776107,-0.750564,-0.012892,-0.202755,-0.113985,-0.546056,-0.514685,-0.546136,-0.721967,-0.716398,...,-0.702172,-0.408216,0.28906,0.781882,0.281915,-0.694312,0.479395,-0.790705,-0.808614,0
50,-0.721437,-0.679796,0.026961,-0.395076,-0.401899,-0.640646,-0.645504,-0.641851,-0.418381,-0.438587,...,-0.321011,-0.497513,0.736006,0.526632,1.047835,-0.888419,0.336424,-1.418557,-1.145767,0
51,-0.675438,-0.688979,-0.081817,-0.35992,-0.113985,-0.674428,-0.638237,-0.674507,-0.498664,-0.500323,...,-0.418112,-0.498753,0.71222,0.086506,0.765702,-0.923989,0.786662,0.131026,-1.118711,0
52,-0.635301,-0.647108,0.132583,-0.384736,-0.401899,-0.654158,-0.649138,-0.654238,-0.358302,-0.371707,...,-0.240368,-0.497017,0.635653,-0.373953,0.871565,-1.276449,-0.07304,-1.072582,-0.970582,0
166,2.012759,0.510692,2.601525,-0.659774,-0.977729,-0.532543,-0.674575,-0.532623,-0.919218,-0.855303,...,-0.859178,-0.484118,0.632708,-1.096385,-1.655786,-1.226457,-0.603358,-1.386594,-1.278788,0
167,2.564598,0.743114,2.786899,-0.585328,-0.977729,-0.424441,-0.576461,-0.423394,-0.500259,-0.438587,...,-0.37598,-0.390604,-0.181899,-0.555077,-1.631363,-1.686153,-0.794507,-1.309685,-1.456977,0
168,1.049815,0.224883,-0.588132,0.374205,-0.113985,0.538347,0.375611,0.538265,-0.422103,-0.479745,...,-0.284146,-0.284687,-0.592827,-1.218362,0.128945,-0.048173,-0.745043,1.311165,-0.040858,0
188,-0.960804,-0.854052,-0.683664,-0.610143,-0.401899,-0.623755,-0.583729,-0.622708,-0.949523,-0.90675,...,-0.996435,-0.44195,0.654455,0.547553,-1.068364,-0.943364,-0.923366,-1.225291,-0.943115,0
189,1.151671,0.718875,-0.877607,0.148798,-0.401899,0.274847,0.18665,0.273639,-0.211028,-0.140198,...,-0.110682,0.485256,-0.5704,0.09591,-0.622136,-1.149593,-0.800806,-0.171496,-1.042561,0


In [37]:
n_features = len(data.columns) - 1 # number of features
n_features

22

In [38]:
class psa:
    '''
    Positive Selection Algorithm takes the features of the healthy patients and create a healthy sphere around the self points to cover the self space. 
    The points which are not covered by the healthy sphere are considered as the outliers i.e. unhealthy datapoints 
    '''
    
    def __init__(self, radius):
        self.radius = radius
        self.detectors = []
        
    def fit(self, data):
        for subject in data:
            self.detectors.append(subject)
            
    def predict(self, data):
        predictions = []
        min_dis = []
        for subject in data:
            distances = []
            for detector in self.detectors:
                distance = minkowski(subject, detector, p=n_features) # minkowski distance
                distances.append(distance)
                
            min_dis.append(min(distances))
            if min(distances) <= self.radius:
                predictions.append(0) # healthy point
                continue
            else:
                predictions.append(1) # patient point
                continue
        
        return predictions, min_dis

In [39]:
train = train.to_numpy()

In [45]:
combined_test = pd.concat([test, patient], axis=0) # Adding both the healthy and patient data to the testing data
combined_test

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE,status
34,1.185825,0.158028,1.839115,-0.918269,-1.006520,-0.799421,-0.867170,-0.798374,-1.070213,-1.014787,...,-1.084978,-0.600205,2.528321,-1.528505,0.431207,-2.097268,-0.757184,-2.510472,-1.802384,0
48,-0.776107,-0.750564,-0.012892,-0.202755,-0.113985,-0.546056,-0.514685,-0.546136,-0.721967,-0.716398,...,-0.702172,-0.408216,0.289060,0.781882,0.281915,-0.694312,0.479395,-0.790705,-0.808614,0
50,-0.721437,-0.679796,0.026961,-0.395076,-0.401899,-0.640646,-0.645504,-0.641851,-0.418381,-0.438587,...,-0.321011,-0.497513,0.736006,0.526632,1.047835,-0.888419,0.336424,-1.418557,-1.145767,0
51,-0.675438,-0.688979,-0.081817,-0.359920,-0.113985,-0.674428,-0.638237,-0.674507,-0.498664,-0.500323,...,-0.418112,-0.498753,0.712220,0.086506,0.765702,-0.923989,0.786662,0.131026,-1.118711,0
52,-0.635301,-0.647108,0.132583,-0.384736,-0.401899,-0.654158,-0.649138,-0.654238,-0.358302,-0.371707,...,-0.240368,-0.497017,0.635653,-0.373953,0.871565,-1.276449,-0.073040,-1.072582,-0.970582,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,-0.131738,-0.424582,0.522127,-0.647367,-0.689814,-0.603485,-0.576461,-0.603565,-0.742702,-0.721543,...,-0.717971,-0.495281,0.572224,-0.925437,0.804601,-0.577143,-0.529517,-0.341193,-0.514481,1
179,-0.147410,-0.450629,0.431157,-0.475726,-0.401899,-0.427819,-0.412937,-0.427899,-0.808630,-0.778134,...,-0.830871,-0.482382,0.407083,-0.964906,1.091608,-0.024653,0.172669,1.213366,-0.153765,1
180,-0.091771,-0.368893,0.654494,-0.467454,-0.401899,-0.421063,-0.405669,-0.420016,-0.223257,-0.232802,...,-0.242014,-0.464771,0.282490,-1.409595,0.746841,-0.530665,-0.520393,-0.308002,-0.694716,1
181,-0.139683,-0.394787,0.591420,-0.465386,-0.401899,-0.434575,-0.398402,-0.435781,-0.606062,-0.613505,...,-0.609021,-0.457825,0.222006,-0.867496,0.919415,-0.018076,-0.123887,0.778801,-0.098363,1


In [47]:
combined_test_np = combined_test.drop(['status'], axis = 1).to_numpy()
combined_test_np.shape

(157, 22)

In [74]:
model = psa(0.8)
model.fit(train)
predictions, min_dis = model.predict(combined_test_np)
actual = combined_test['status'].to_numpy()

comparison = pd.DataFrame({'Actual': actual, 'Predicted': predictions, 'Min Distance': min_dis, 'Result': np.equal(actual, predictions)})
comparison_healthy = comparison[comparison['Actual'] == 0]
comparison_patient = comparison[comparison['Actual'] == 1]

In [76]:

# Calculating accuracy of the model
total_accuracy = (np.sum(comparison['Result']) / len(comparison))*100
patient_accuracy = (np.sum(comparison_patient['Result']) / len(comparison_patient))*100 # True Positive
healthy_accuracy = (np.sum(comparison_healthy['Result']) / len(comparison_healthy))*100 # False Positive

print(f"Total Accuracy: {total_accuracy} %")
print(f"Patient Accuracy: {patient_accuracy} %")
print(f"Healthy Accuracy: {healthy_accuracy} %")

Total Accuracy: 93.63057324840764 %
Patient Accuracy: 95.91836734693877 %
Healthy Accuracy: 60.0 %


In [80]:
# Computing the ROC score (receiver operating characteristic score) to evaluate the model

from sklearn.metrics import roc_auc_score

roc_score = roc_auc_score(actual, predictions)
print(f"ROC Score: {roc_score}")

ROC Score: 0.7795918367346938
