In [None]:
import glob
import os
import numpy as np
import pandas as pd
import parselmouth
import librosa

from parselmouth.praat import call
from sklearn.preprocessing import StandardScaler

In [None]:
from pyrpde import rpde
from pydub import AudioSegment
import speech_recognition as sr
import io

In [None]:
# Display option: show every column of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [None]:
# author: Dominik Krzeminski (dokato)

import numpy as np
import matplotlib.pyplot as plt
import scipy.signal as ss

# detrended fluctuation analysis

def calc_rms(x, scale):
    """
    Windowed Root Mean Square (RMS) with linear detrending.

    The signal is cut into ``len(x)//scale`` consecutive, non-overlapping
    windows of ``scale`` samples each; trailing samples that do not fill a
    complete window are ignored. A straight line is fitted to every window
    and the RMS of the residual (window minus fitted line) is returned.

    Args:
    -----
      *x* : numpy.array
        one dimensional data vector (any memory layout)
      *scale* : int
        length of the window in which RMS will be calculated
    Returns:
    --------
      *rms* : numpy.array
        RMS data in each window with length len(x)//scale
    """
    x = np.asarray(x)
    n_windows = x.shape[0] // scale
    # Trim + reshape instead of as_strided without explicit strides: the
    # latter silently assumes a C-contiguous buffer and breaks on views such
    # as x[::2]; reshape copies when it has to and is otherwise free.
    X = x[:n_windows * scale].reshape(n_windows, scale)
    # vector of x-axis points to regression
    scale_ax = np.arange(scale)
    rms = np.zeros(n_windows)
    for e, xcut in enumerate(X):
        coeff = np.polyfit(scale_ax, xcut, 1)
        xfit = np.polyval(coeff, scale_ax)
        # detrending and computing RMS of each window
        rms[e] = np.sqrt(np.mean((xcut - xfit)**2))
    return rms

def dfa(x, scale_lim=[5,9], scale_dens=0.25, show=False):
    """
    Detrended Fluctuation Analysis - measures power law scaling coefficient
    of the given signal *x*.

    More details about the algorithm you can find e.g. here:
    Hardstone, R. et al. Detrended fluctuation analysis: A scale-free
    view on neuronal oscillations, (2012).

    Args:
    -----
      *x* : numpy.array
        one dimensional data vector
      *scale_lim* = [5,9] : list of length 2
        boundaries of the scale, where scale means windows among which RMS
        is calculated. Numbers from list are exponents of 2 to the power
        of X, eg. [5,9] is in fact [2**5, 2**9] (the upper bound is
        exclusive, as in numpy.arange).
        You can think of it that if your signal is sampled with F_s = 128 Hz,
        then the lowest considered scale would be 2**5/128 = 32/128 = 0.25,
        so 250 ms.
      *scale_dens* = 0.25 : float
        density of scale divisions, eg. for 0.25 we get 2**[5, 5.25, 5.5, ... ]
      *show* = False
        if True it shows matplotlib log-log plot.
    Returns:
    --------
      *scales* : numpy.array
        vector of scales (x axis), integer window lengths
      *fluct* : numpy.array
        fluctuation function values (y axis)
      *alpha* : float
        estimation of DFA exponent (slope of log2(fluct) vs log2(scales))
    """
    # cumulative sum of data with subtracted offset
    y = np.cumsum(x - np.mean(x))
    # builtin int: the alias np.int was deprecated in NumPy 1.20 and removed
    # in 1.24, where it raises AttributeError
    scales = (2**np.arange(scale_lim[0], scale_lim[1], scale_dens)).astype(int)
    fluct = np.zeros(len(scales))
    # computing RMS for each window
    for e, sc in enumerate(scales):
        fluct[e] = np.sqrt(np.mean(calc_rms(y, sc)**2))
    # fitting a line to rms data
    coeff = np.polyfit(np.log2(scales), np.log2(fluct), 1)
    if show:
        fluctfit = 2**np.polyval(coeff,np.log2(scales))
        plt.loglog(scales, fluct, 'bo')
        plt.loglog(scales, fluctfit, 'r', label=r'$\alpha$ = %0.2f'%coeff[0])
        plt.title('DFA')
        plt.xlabel(r'$\log_{10}$(time window)')
        plt.ylabel(r'$\log_{10}$<F(t)>')
        plt.legend()
        plt.show()
    return scales, fluct, coeff[0]


# Leftover demo stub from the original DFA script: when this code runs as the
# main module it only creates a random test signal `x` of length `n` in the
# global namespace; dfa() itself is not called here.
if __name__=='__main__':
    n = 1000
    x = np.random.randn(n)


# # Using code modified from https://github.com/drfeinberg/PraatScripts

In [None]:
# This is the function to measure voice pitch
def measurePitch(voiceID, f0min, f0max, unit):
    """Measure pitch, harmonicity, jitter and shimmer of one recording via Praat.

    Args:
        voiceID: anything accepted by ``parselmouth.Sound`` (the callers in
            this notebook pass an already loaded ``parselmouth.Sound``).
        f0min: pitch floor handed to "To Pitch" and to the point-process
            extraction.
        f0max: pitch ceiling handed to the same two commands.
        unit: unit string forwarded to the pitch "Get mean" /
            "Get standard deviation" queries (e.g. "Hertz").

    Returns:
        A 14-tuple in this order: meanF0, stdevF0, hnr, local jitter,
        local absolute jitter, rap, ppq5, ddp jitter, local shimmer,
        local_dB shimmer, apq3, apq5, apq11 and dda shimmer.
    """
    snd = parselmouth.Sound(voiceID)

    # Fundamental-frequency statistics.
    pitch_track = call(snd, "To Pitch", 0.0, f0min, f0max)
    f0_mean = call(pitch_track, "Get mean", 0, 0, unit)
    f0_sd = call(pitch_track, "Get standard deviation", 0, 0, unit)

    # Harmonics-to-noise ratio. NOTE(review): the minimum pitch here is the
    # literal 75, not f0min -- kept exactly as in the original.
    harmonicity_track = call(snd, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
    hnr_mean = call(harmonicity_track, "Get mean", 0, 0)

    # Glottal pulses shared by all jitter and shimmer queries.
    pulses = call(snd, "To PointProcess (periodic, cc)", f0min, f0max)

    jitter_args = (0, 0, 0.0001, 0.02, 1.3)
    jitter_commands = (
        "Get jitter (local)",
        "Get jitter (local, absolute)",
        "Get jitter (rap)",
        "Get jitter (ppq5)",
        "Get jitter (ddp)",
    )
    jitter_values = tuple(call(pulses, command, *jitter_args) for command in jitter_commands)

    # Shimmer queries take the same arguments plus one extra trailing factor.
    shimmer_args = jitter_args + (1.6,)
    shimmer_commands = (
        "Get shimmer (local)",
        "Get shimmer (local_dB)",
        "Get shimmer (apq3)",
        "Get shimmer (apq5)",
        "Get shimmer (apq11)",
        "Get shimmer (dda)",
    )
    shimmer_values = tuple(call([snd, pulses], command, *shimmer_args) for command in shimmer_commands)

    return (f0_mean, f0_sd, hnr_mean) + jitter_values + shimmer_values

In [None]:
def featuresVoiceNonTraditional(y):
    """Return the two non-traditional voice features of signal *y*.

    Args:
        y: signal passed unchanged to ``rpde`` and ``dfa``.

    Returns:
        ``(entropy, alpha)``: the first value returned by ``rpde`` (called
        with dim=4, tau=35, epsilon=0.12, tmax=1500) and the third value
        returned by ``dfa`` (called with default scales, no plot).
    """
    rpde_settings = dict(dim=4, tau=35, epsilon=0.12, tmax=1500)
    rpde_entropy, _histogram = rpde(y, **rpde_settings)
    _scales, _fluct, dfa_alpha = dfa(y, show=False)
    return rpde_entropy, dfa_alpha

In [None]:
def audiosegment_to_librosawav(audiosegment):
    """Convert an audio segment's integer samples to a flat float32 array.

    Args:
        audiosegment: object exposing ``get_array_of_samples()`` that returns
            an ``array.array`` of integer samples (its ``typecode`` selects
            the integer range used for scaling).

    Returns:
        1-D ``numpy.float32`` array holding the samples divided by the
        largest positive value of the sample integer type.
    """
    raw = audiosegment.get_array_of_samples()
    full_scale = np.iinfo(raw.typecode).max

    scaled = np.array(raw).astype(np.float32)
    scaled /= full_scale
    return scaled

In [None]:
def create_voice_df_pd(path, csv_name):
  """Extract voice features for every recording matching *path* and save them as CSV.

  Each file is labelled with class 1. For every file the pitch / HNR /
  jitter / shimmer measures come from ``measurePitch`` (75-500 Hz, "Hertz")
  and RPDE / DFA from ``featuresVoiceNonTraditional``.

  Args:
    path: glob pattern of the wav files (e.g. ``'some/folder/*.wav'``).
    csv_name: destination CSV file; written without the index column.

  Returns:
    None. The result is the CSV file with columns ``voiceName``, ``class``,
    the 14 Praat measures, ``RPDE`` and ``DFA``; every column except
    ``voiceName`` is stored as float.
  """
  praat_cols = ['meanF0Hz', 'stdevF0Hz', 'HNR', 'localJitter', 'localabsoluteJitter', 'rapJitter',
                'ppq5Jitter', 'ddpJitter', 'localShimmer', 'localdbShimmer', 'apq3Shimmer', 'apq5Shimmer',
                'apq11Shimmer', 'ddaShimmer']
  col = ['voiceName', 'class'] + praat_cols + ['RPDE', 'DFA']
  col_float = col[1:]

  # One record per file instead of 18 parallel lists; this also avoids the
  # detour through np.column_stack, which turned every number into a string
  # (because of the file-name column) before it was cast back to float.
  rows = []
  # sorted(): glob order is filesystem dependent; sorting makes the CSV reproducible.
  for wave_file in sorted(glob.glob(path)):
    sound = parselmouth.Sound(wave_file)
    y = audiosegment_to_librosawav(AudioSegment.from_wav(wave_file))
    entropy, alpha = featuresVoiceNonTraditional(y)
    praat_values = measurePitch(sound, 75, 500, "Hertz")
    rows.append([wave_file, 1, *praat_values, entropy, alpha])

  df = pd.DataFrame(rows, columns=col)
  # keep the historical dtypes: everything but the file name is float (class included)
  df[col_float] = df[col_float].astype(float)

  # Write out the dataframe
  df.to_csv(csv_name, index=False)

In [None]:
def create_voice_df_control(path, csv_name):
  """Extract voice features for every recording matching *path* and save them as CSV.

  Each file is labelled with class 0. For every file the pitch / HNR /
  jitter / shimmer measures come from ``measurePitch`` (75-500 Hz, "Hertz")
  and RPDE / DFA from ``featuresVoiceNonTraditional``.

  Args:
    path: glob pattern of the wav files (e.g. ``'some/folder/*.wav'``).
    csv_name: destination CSV file; written without the index column.

  Returns:
    None. The result is the CSV file with columns ``voiceName``, ``class``,
    the 14 Praat measures, ``RPDE`` and ``DFA``; every column except
    ``voiceName`` is stored as float.
  """
  praat_cols = ['meanF0Hz', 'stdevF0Hz', 'HNR', 'localJitter', 'localabsoluteJitter', 'rapJitter',
                'ppq5Jitter', 'ddpJitter', 'localShimmer', 'localdbShimmer', 'apq3Shimmer', 'apq5Shimmer',
                'apq11Shimmer', 'ddaShimmer']
  col = ['voiceName', 'class'] + praat_cols + ['RPDE', 'DFA']
  col_float = col[1:]

  # One record per file instead of 18 parallel lists; this also avoids the
  # detour through np.column_stack, which turned every number into a string
  # (because of the file-name column) before it was cast back to float.
  rows = []
  # sorted(): glob order is filesystem dependent; sorting makes the CSV reproducible.
  for wave_file in sorted(glob.glob(path)):
    sound = parselmouth.Sound(wave_file)
    y = audiosegment_to_librosawav(AudioSegment.from_wav(wave_file))
    entropy, alpha = featuresVoiceNonTraditional(y)
    praat_values = measurePitch(sound, 75, 500, "Hertz")
    rows.append([wave_file, 0, *praat_values, entropy, alpha])

  df = pd.DataFrame(rows, columns=col)
  # keep the historical dtypes: everything but the file name is float (class included)
  df[col_float] = df[col_float].astype(float)

  # Write out the dataframe
  df.to_csv(csv_name, index=False)

#Import Data

In [None]:
def create_csv(directory):
    """Build one feature CSV per speaker folder found in *directory*.

    Folders whose name contains 'Control' are processed with
    ``create_voice_df_control`` and folders containing 'PD' with
    ``create_voice_df_pd``. The CSV is named after the lower-cased folder,
    prefixed with ``dataset2_`` when *directory* contains 'Dataset2' (no
    prefix for 'Dataset1'). Directories matching neither marker are ignored.

    Args:
        directory: path of the dataset root; a trailing slash is optional.
    """
    # (marker expected in the directory name, prefix of the produced CSV names)
    dataset_prefixes = (('Dataset1', ''), ('Dataset2', 'dataset2_'))

    for marker, prefix in dataset_prefixes:
        if marker not in directory:
            continue
        for folder in os.listdir(directory):
            # os.path.join instead of string concatenation, so the pattern is
            # also right when `directory` has no trailing separator
            wav_pattern = os.path.join(directory, folder, '*.wav')
            csv_name = f'{prefix}{folder.lower()}.csv'
            if 'Control' in folder:
                create_voice_df_control(wav_pattern, csv_name)
            if 'PD' in folder:
                create_voice_df_pd(wav_pattern, csv_name)

In [None]:
#create_csv('DATASET_FULL/Dataset1_half_toserver/')

In [None]:
#create_csv('DATASET_FULL/Dataset2_half_toserver/')

In [None]:
# Load the per-speaker feature tables of the PD group (pd_01 ... pd_25), one
# DataFrame per CSV. The file names follow the '<folder>.csv' pattern that
# create_csv() uses for Dataset1 folders.
df_pd_01 = pd.read_csv('pd_01.csv')
df_pd_02 = pd.read_csv('pd_02.csv')
df_pd_03 = pd.read_csv('pd_03.csv')
df_pd_04 = pd.read_csv('pd_04.csv')
df_pd_05 = pd.read_csv('pd_05.csv')
df_pd_06 = pd.read_csv('pd_06.csv')
df_pd_07 = pd.read_csv('pd_07.csv')
df_pd_08 = pd.read_csv('pd_08.csv')
df_pd_09 = pd.read_csv('pd_09.csv')
df_pd_10 = pd.read_csv('pd_10.csv')
df_pd_11 = pd.read_csv('pd_11.csv')
df_pd_12 = pd.read_csv('pd_12.csv')
df_pd_13 = pd.read_csv('pd_13.csv')
df_pd_14 = pd.read_csv('pd_14.csv')
df_pd_15 = pd.read_csv('pd_15.csv')
df_pd_16 = pd.read_csv('pd_16.csv')
df_pd_17 = pd.read_csv('pd_17.csv')
df_pd_18 = pd.read_csv('pd_18.csv')
df_pd_19 = pd.read_csv('pd_19.csv')
df_pd_20 = pd.read_csv('pd_20.csv')
df_pd_21 = pd.read_csv('pd_21.csv')
df_pd_22 = pd.read_csv('pd_22.csv')
df_pd_23 = pd.read_csv('pd_23.csv')
df_pd_24 = pd.read_csv('pd_24.csv')
df_pd_25 = pd.read_csv('pd_25.csv')

In [None]:
# Load the per-speaker feature tables of the control group, one DataFrame per CSV.
# NOTE(review): there is no control_14 in this list (numbering jumps from 13
# to 15) -- presumably that speaker has no CSV; confirm.
df_control_01 = pd.read_csv('control_01.csv')
df_control_02 = pd.read_csv('control_02.csv')
df_control_03 = pd.read_csv('control_03.csv')
df_control_04 = pd.read_csv('control_04.csv')
df_control_05 = pd.read_csv('control_05.csv')
df_control_06 = pd.read_csv('control_06.csv')
df_control_07 = pd.read_csv('control_07.csv')
df_control_08 = pd.read_csv('control_08.csv')
df_control_09 = pd.read_csv('control_09.csv')
df_control_10 = pd.read_csv('control_10.csv')
df_control_11 = pd.read_csv('control_11.csv')
df_control_12 = pd.read_csv('control_12.csv')
df_control_13 = pd.read_csv('control_13.csv')
df_control_15 = pd.read_csv('control_15.csv')
df_control_16 = pd.read_csv('control_16.csv')
df_control_17 = pd.read_csv('control_17.csv')
df_control_18 = pd.read_csv('control_18.csv')
df_control_19 = pd.read_csv('control_19.csv')
df_control_20 = pd.read_csv('control_20.csv')
df_control_21 = pd.read_csv('control_21.csv')
df_control_22 = pd.read_csv('control_22.csv')
df_control_23 = pd.read_csv('control_23.csv')
df_control_24 = pd.read_csv('control_24.csv')

In [None]:
# Preview the first rows of one control-speaker table.
df_control_04.head()

In [None]:
# Preview the first rows of one PD-speaker table.
df_pd_01.head()

In [None]:
# Build five speaker-wise folds for dataset 1. Each fold concatenates the
# feature tables of several control and PD speakers; every speaker appears in
# exactly one fold, and rows containing NaN are dropped.
df_set1_1 = pd.concat([
    df_control_01,df_control_06,df_control_11,df_control_20,df_control_15,
    df_pd_25,df_pd_19,df_pd_01,df_pd_06
]).dropna(axis=0)

df_set1_2 = pd.concat([
    df_control_02,df_control_07,df_control_19,df_control_13,
    df_pd_23,df_pd_18,df_pd_02,df_pd_07,df_pd_11
]).dropna(axis=0)

df_set1_3 = pd.concat([
    df_control_04,df_control_08,df_control_23,df_control_18,
    df_pd_24,df_pd_15,df_pd_03,df_pd_08,df_pd_12
]).dropna(axis=0)

df_set1_4 = pd.concat([
    df_control_03,df_control_09,df_control_22,df_control_17,
    df_pd_22,df_pd_16,df_pd_04,df_pd_09,df_pd_13
]).dropna(axis=0)

df_set1_5 = pd.concat([
    df_control_10,df_control_12,df_control_21,df_control_16,df_control_05,
    df_pd_21,df_pd_17,df_pd_05,df_pd_10
]).dropna(axis=0)

# pd_14 and pd_20 are omitted from these folds due to file shortage
# (control_24 is not part of any fold above either).

In [None]:
# "Test" variants of the five folds: each one holds the same speakers as the
# corresponding df_set1_<k> plus the three speakers left out of those folds
# (pd_14, pd_20 and control_24); rows containing NaN are dropped.
df_set1_test_1 = pd.concat([
    df_control_01,df_control_06,df_control_11,df_control_20,df_control_15,
    df_pd_25,df_pd_19,df_pd_01,df_pd_06,df_pd_14,df_pd_20,df_control_24
]).dropna(axis=0)

df_set1_test_2 = pd.concat([
    df_control_02,df_control_07,df_control_19,df_control_13,
    df_pd_23,df_pd_18,df_pd_02,df_pd_07,df_pd_11,df_pd_14,df_pd_20,df_control_24
]).dropna(axis=0)

df_set1_test_3 = pd.concat([
    df_control_04,df_control_08,df_control_23,df_control_18,
    df_pd_24,df_pd_15,df_pd_03,df_pd_08,df_pd_12,df_pd_14,df_pd_20,df_control_24
]).dropna(axis=0)

df_set1_test_4 = pd.concat([
    df_control_03,df_control_09,df_control_22,df_control_17,
    df_pd_22,df_pd_16,df_pd_04,df_pd_09,df_pd_13,df_pd_14,df_pd_20,df_control_24
]).dropna(axis=0)

df_set1_test_5 = pd.concat([
    df_control_10,df_control_12,df_control_21,df_control_16, df_control_05,
    df_pd_21,df_pd_17,df_pd_05,df_pd_10,df_pd_14,df_pd_20,df_control_24
]).dropna(axis=0)

In [None]:
# Remove the 'stdevF0Hz' column from every fold.
# NOTE(review): presumably because the UCI table used later has no matching
# column -- confirm.
df_1 = df_set1_1.drop(['stdevF0Hz'],axis=1)
df_2 = df_set1_2.drop(['stdevF0Hz'],axis=1)
df_3 = df_set1_3.drop(['stdevF0Hz'],axis=1)
df_4 = df_set1_4.drop(['stdevF0Hz'],axis=1)
df_5 = df_set1_5.drop(['stdevF0Hz'],axis=1)

In [None]:
# Same column removal ('stdevF0Hz') for the five test variants.
df_test_1 = df_set1_test_1.drop(['stdevF0Hz'],axis=1)
df_test_2 = df_set1_test_2.drop(['stdevF0Hz'],axis=1)
df_test_3 = df_set1_test_3.drop(['stdevF0Hz'],axis=1)
df_test_4 = df_set1_test_4.drop(['stdevF0Hz'],axis=1)
df_test_5 = df_set1_test_5.drop(['stdevF0Hz'],axis=1)

In [None]:
# Report the (rows, columns) shape of every fold, one per line.
print(*(fold.shape for fold in (df_1, df_2, df_3, df_4, df_5)), sep='\n')

In [None]:
# Report the (rows, columns) shape of every test fold, one per line.
print(*(fold.shape for fold in (df_test_1, df_test_2, df_test_3, df_test_4, df_test_5)), sep='\n')

In [None]:
# Show what is available in the dataset root folder.
os.listdir('DATASET_FULL')

In [None]:
# Read the UCI Parkinson's data file with the fixed-width reader.
# NOTE(review): the following cell splits one single column on ',', which
# suggests the file is comma-separated and every line ends up in one column
# here. pd.read_csv would presumably parse it directly -- confirm before
# changing, because the later cells rely on this one-column layout.
UCI = pd.read_fwf('DATASET_FULL/parkinsons.data')

In [None]:
# The only column is named after the whole comma-separated header line;
# split each row's text on ',' into a list of field strings.
UCI['split'] = UCI['name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE'].str.split(',')

In [None]:
# Expand the per-row field lists into a proper table (one column per field,
# integer column labels, all values still strings).
UCI = pd.DataFrame(UCI['split'].tolist())

In [None]:
# Rename the UCI columns to the names used by the locally extracted feature
# tables (e.g. 'status' becomes 'class', 'name' becomes 'voiceName'); columns
# without a local counterpart keep their UCI names.
UCI.columns = ['voiceName','meanF0Hz','MDVP:Fhi(Hz)','MDVP:Flo(Hz)','localJitter','localabsoluteJitter','rapJitter','ppq5Jitter','ddpJitter','localShimmer','localdbShimmer','apq3Shimmer','apq5Shimmer','apq11Shimmer','ddaShimmer','NHR','HNR','class','RPDE','DFA','spread1','spread2','D2','PPE']
UCI.head(5)

In [None]:
# Keep only the UCI columns that also exist in the locally extracted tables.
UCI_trim = UCI.drop(['spread1','spread2','D2','PPE','MDVP:Fhi(Hz)','MDVP:Flo(Hz)','NHR'], axis =1)

In [None]:
# Preview the trimmed UCI table.
UCI_trim.head()

In [None]:
# Separate the UCI table into feature columns, class labels and recording names.
x_UCI = UCI_trim.drop(['voiceName','class'], axis = 1)
y_UCI = UCI_trim['class']
name = UCI_trim['voiceName']

In [None]:
# The values are still strings (they came from str.split); convert them to
# numbers. errors='coerce' turns anything unparsable into NaN instead of raising.
x_UCI = x_UCI.apply(pd.to_numeric, errors = 'coerce')
y_UCI = y_UCI.apply(pd.to_numeric, errors = 'coerce')

In [None]:
# Reassemble names, numeric features and numeric class side by side.
UCI_new = pd.concat([name,x_UCI,y_UCI],axis = 1)

In [None]:
# Preview the numeric UCI table.
UCI_new.head()

In [None]:
# Subject codes of the UCI speakers; each code is matched as a substring
# (regex) of the recording name.
subjects = ['S01','S34','S44','S20','S24','S26','S08','S39',
            'S33','S32','S02','S22','S37','S21','S04','S19',
            'S35','S05','S18','S16','S27','S25','S06','S10',
            'S07','S13','S43','S17','S42','S50','S49']

# One DataFrame per subject, in the order of `subjects`
# (all_dataframes[i] holds the rows of subjects[i]).
all_dataframes = list()
for s in subjects:
  # note: `s` is rebound from the subject code to the boolean row mask
  s = UCI_new.voiceName.str.contains(s)
  df_UCI = UCI_new[s]
  all_dataframes.append(df_UCI)

In [None]:
# Group the per-subject UCI tables into four cross-validation partitions of
# seven subjects each (indices refer to all_dataframes / subjects).
# Subjects at index 2, 7 and 8 are omitted because of imbalance & being close to the mean.
cv1 = pd.concat([all_dataframes[0],all_dataframes[21],all_dataframes[23],
                 all_dataframes[5],all_dataframes[20],all_dataframes[22],all_dataframes[28]])
cv2 = pd.concat([all_dataframes[1],all_dataframes[11],all_dataframes[25],
                 all_dataframes[6],all_dataframes[14],all_dataframes[16],all_dataframes[26]])
cv3 = pd.concat([all_dataframes[3],all_dataframes[19],all_dataframes[27],
                 all_dataframes[9],all_dataframes[13],all_dataframes[30],all_dataframes[18]])
cv4 = pd.concat([all_dataframes[4],all_dataframes[12],all_dataframes[24],
                 all_dataframes[10],all_dataframes[15],all_dataframes[29],all_dataframes[17]])

In [None]:
def drop(df):
  """Split a feature table into predictors, target and recording names.

  Args:
    df: DataFrame containing at least the columns 'voiceName' and 'class'.

  Returns:
    ``(x, y, names)``: *df* without the 'voiceName' and 'class' columns,
    the 'class' column, and the 'voiceName' column. *df* is not modified.
  """
  predictors = df.drop(columns=['voiceName', 'class'])
  return predictors, df['class'], df['voiceName']

In [None]:
# Replace each UCI partition by its (features, labels, names) tuple.
# NOTE: this rebinds cv1..cv4 from DataFrames to tuples, so the cell cannot be
# run twice without re-running the cell that builds the partitions.
cv1 = drop(cv1)
cv2 = drop(cv2)
cv3 = drop(cv3)
cv4 = drop(cv4)

In [None]:
# Unpack features and labels of the four UCI partitions (the names are not kept).
# x_cv1..x_cv4 / y_cv1..y_cv4 are read as globals by l1_feature_selection() and rfe().
x_cv1, y_cv1 = cv1[0], cv1[1]
x_cv2, y_cv2 = cv2[0], cv2[1]
x_cv3, y_cv3 = cv3[0], cv3[1]
x_cv4, y_cv4 = cv4[0], cv4[1]

In [None]:
# Replace every fold (and test fold) by its (features, labels, names) tuple.
# NOTE: this rebinds df_1..df_5 / df_test_1..df_test_5 from DataFrames to
# tuples, so the cell cannot be run twice without rebuilding the folds first.
df_1 = drop(df_1)
df_2 = drop(df_2)
df_3 = drop(df_3)
df_4 = drop(df_4)
df_5 = drop(df_5)

df_test_1 = drop(df_test_1)
df_test_2 = drop(df_test_2)
df_test_3 = drop(df_test_3)
df_test_4 = drop(df_test_4)
df_test_5 = drop(df_test_5)

In [None]:
# Unpack features and labels of the five folds.
x_df1, y_df1 = df_1[0], df_1[1]
x_df2, y_df2 = df_2[0], df_2[1]
x_df3, y_df3 = df_3[0], df_3[1]
x_df4, y_df4 = df_4[0], df_4[1]
x_df5, y_df5 = df_5[0], df_5[1]

# For the test folds the recording names are kept as well (test_label_<k>).
x_df_test_1, y_df_test_1, test_label_1 = df_test_1[0], df_test_1[1], df_test_1[2]
x_df_test_2, y_df_test_2, test_label_2 = df_test_2[0], df_test_2[1], df_test_2[2]
x_df_test_3, y_df_test_3, test_label_3 = df_test_3[0], df_test_3[1], df_test_3[2]
x_df_test_4, y_df_test_4, test_label_4 = df_test_4[0], df_test_4[1], df_test_4[2]
x_df_test_5, y_df_test_5, test_label_5 = df_test_5[0], df_test_5[1], df_test_5[2]

In [None]:
#remove unneccessary strings from name
# Strip the (Google Drive) directory prefix from the recording names and turn
# the labels into numpy string arrays.
# NOTE(review): the prefix is a hard-coded absolute path; names that do not
# contain it are left unchanged. After this cell test_label_<k> is a numpy
# array (no .str accessor), so the cell cannot be re-run on its own output.
test_label_1 = np.array(test_label_1.str.replace(r'/content/drive/MyDrive/pd thesis/PD_DATA_1/PD_data_final_set1', '', regex=True).tolist())
test_label_2 = np.array(test_label_2.str.replace(r'/content/drive/MyDrive/pd thesis/PD_DATA_1/PD_data_final_set1', '', regex=True).tolist())
test_label_3 = np.array(test_label_3.str.replace(r'/content/drive/MyDrive/pd thesis/PD_DATA_1/PD_data_final_set1', '', regex=True).tolist())
test_label_4 = np.array(test_label_4.str.replace(r'/content/drive/MyDrive/pd thesis/PD_DATA_1/PD_data_final_set1', '', regex=True).tolist())
test_label_5 = np.array(test_label_5.str.replace(r'/content/drive/MyDrive/pd thesis/PD_DATA_1/PD_data_final_set1', '', regex=True).tolist())

#Make Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC

In [None]:
# Stand-alone SVM estimators (RBF and linear kernel) with probability outputs enabled.
svm_rbf = SVC(kernel='rbf', probability= True)
svm_linear = SVC(kernel='linear',probability = True)

# Candidate values for C: a single value, or 20 log-spaced values from 1 to 100.
C_range1 = [1]
C_range2 = np.logspace(0,2,20,endpoint=True)

# Candidate values for gamma: two fixed values, or 100 evenly spaced values from 0.01 to 1.
gamma_range1 = [0.009, 0.015]
gamma_range2 = np.linspace(0.01,1,100,endpoint=True)

# Grid-search parameter grids; the 'SVC__' prefix addresses the pipeline step named "SVC".
param_grid1 = dict(SVC__C=C_range1, SVC__gamma = gamma_range1)
param_grid2 = dict(SVC__C=C_range2, SVC__gamma = gamma_range2)

In [None]:
import collections
from numpy import sqrt, argmax
from matplotlib import pyplot as plt

In [None]:
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support, roc_curve, precision_recall_curve, auc, f1_score

In [None]:
def roc(yhat,y_test):
  """Plot the ROC curve and return the threshold with the largest G-mean.

  Args:
    yhat: predicted scores / probabilities of the positive class.
    y_test: true binary labels.

  Returns:
    The threshold (as reported by ``roc_curve``) at which
    sqrt(TPR * (1 - FPR)) is maximal. The curve is shown with matplotlib and
    the chosen threshold and its G-mean are printed.
  """
  false_pos_rate, true_pos_rate, cutoffs = roc_curve(y_test, yhat)
  area = auc(false_pos_rate, true_pos_rate)

  # geometric mean of sensitivity and specificity for every threshold,
  # and the position of its maximum
  g_mean = sqrt(true_pos_rate * (1-false_pos_rate))
  best = argmax(g_mean)

  plt.figure(figsize=(4,3))
  # chance line, the model's ROC curve, and the selected operating point
  plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
  plt.plot(false_pos_rate, true_pos_rate, marker='.', label='AUC = %0.2f' % area)
  plt.scatter(false_pos_rate[best], true_pos_rate[best], marker='o', color='black', label='Best')
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  plt.legend()
  plt.show()

  best_threshold = cutoffs[best]
  print('Best Threshold=%f, G-Mean=%.3f' % (best_threshold, g_mean[best]))
  return best_threshold

In [None]:
def prt(yhat,y_test):
  """Plot the precision-recall curve and return the threshold with the best F1 score.

  Args:
    yhat: predicted scores / probabilities of the positive class.
    y_test: true binary labels as an array-like that supports boolean
      masking (numpy array or pandas Series).

  Returns:
    The threshold (as reported by ``precision_recall_curve``) with the
    largest F-score. The curve is shown with matplotlib and the chosen
    threshold and its F-score are printed.
  """
  precision, recall, thresholds = precision_recall_curve(y_test, yhat)

  # F1 per curve point. Where precision + recall == 0 the plain formula gives
  # 0/0 = NaN, and argmax would then pick that NaN point; define F1 = 0 there.
  denom = precision + recall
  fscore = np.divide(2 * precision * recall, denom,
                     out=np.zeros_like(denom, dtype=float), where=denom > 0)

  # precision/recall carry one more entry than thresholds (the final
  # precision=1, recall=0 point has no threshold), so search only the part
  # of the curve that maps to a threshold.
  ix = np.argmax(fscore[:len(thresholds)])
  plt.figure(figsize=(4,3))

  # plot the precision-recall curve for the model
  no_skill = len(y_test[y_test==1]) / len(y_test)
  plt.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
  plt.plot(recall, precision, marker='.', label='SVC')
  plt.scatter(recall[ix], precision[ix], marker='o', color='black', label='Best')
  # axis labels
  plt.xlabel('Recall')
  plt.ylabel('Precision')
  plt.legend()
  # show the plot
  plt.show()
  print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))
  return thresholds[ix]

In [None]:
def plot(x,y,label1):
  """Draw *y* against *x* as a labelled line of score vs. number of features.

  Args:
    x: x values (number of features used).
    y: y values (scores).
    label1: legend label of the line.
  """
  axes = plt.gca()
  axes.plot(x, y, label=label1)
  axes.set_xlabel('Number of features used')
  axes.set_ylabel('Score')
  axes.legend(loc='best')
  # vertical grid lines only
  axes.grid(axis='x')
  plt.show()

In [None]:
def classifierSVM_PD(x_one,y_one,x_two,y_two,x_three,y_three,x_four,y_four,param):
  """Grid-search an RBF-SVM pipeline using the four given partitions as CV folds.

  Args:
    x_one, y_one, ..., x_four, y_four: features and labels of the four
      partitions; each partition serves once as the validation fold.
    param: parameter grid for ``GridSearchCV`` (keys address the pipeline
      steps, e.g. ``SVC__C``).

  Returns:
    ``(best_score, best_params)`` of the grid search, scored with macro F1.
  """
  x_parts = (x_one, x_two, x_three, x_four)
  y_parts = (y_one, y_two, y_three, y_four)

  # Fold id (0..3) of every sample, in the same order as the stacked data.
  fold_ids = np.concatenate(
      [np.repeat(fold_no, labels.shape) for fold_no, labels in enumerate(y_parts)]
  ).astype(float)
  validation_split = list(PredefinedSplit(fold_ids).split())

  x_input = np.concatenate(x_parts)
  y_input = np.concatenate(y_parts)

  # Scaling lives inside the pipeline, so it is re-fitted on the training
  # part of every split.
  pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("SVC", SVC(kernel = 'rbf', probability = True)),
  ])

  search = GridSearchCV(
      estimator = pipe,
      param_grid = param,
      verbose = 1,
      cv = validation_split,
      n_jobs = -1,
      scoring = 'f1_macro'
  ).fit(x_input, y_input)

  return search.best_score_, search.best_params_

In [None]:
def clf_test(c, g, x_train, y_train, x_test, y_test,threshold):
  """Fit the scaled RBF-SVM on the training data and evaluate it on the test set.

  Args:
    c: SVM regularisation parameter C.
    g: RBF kernel gamma.
    x_train, y_train: training features and labels.
    x_test, y_test: test features and labels.
    threshold: probability cut-off; a sample is predicted positive when its
      positive-class probability is >= threshold.

  Returns:
    ``(y_pred, scores, f1_macro)``: boolean predictions, the result of
    ``precision_recall_fscore_support(..., average='binary')`` and the macro
    F1 score. ROC / precision-recall plots, the classification report and
    the confusion matrix are printed as a side effect.
  """
  model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("SVC", SVC(C = c, gamma = g, kernel = 'rbf', probability = True)),
  ])
  model.fit(x_train, y_train)

  # probability of the positive class (second column of predict_proba)
  positive_proba = model.predict_proba(x_test)[:,1]
  # an explicit cut-off is required: casting the probabilities themselves to
  # bool would mark every non-zero probability as positive
  y_pred = (positive_proba >= threshold).astype(bool)

  scores = precision_recall_fscore_support(y_test, y_pred, average='binary')
  f1_macro = f1_score(y_test, y_pred, average = 'macro')

  separator = '-------------------------------------------'
  for heading, draw_curve in (('Test set ROC curve result', roc),
                              ('Test set Precision-recall curve result', prt)):
    print(heading)
    draw_curve(positive_proba, y_test)
    print(separator)

  print('Test report')
  print(classification_report(y_test, y_pred))
  print(confusion_matrix(y_test, y_pred))

  return y_pred, scores, f1_macro

In [None]:
def right_wrong_clf(y_test,y_pred_test, x_name_list):
  """Split the recording names into correctly and wrongly classified ones.

  Args:
    y_test: true labels (list, numpy array or pandas Series).
    y_pred_test: predicted labels, same length as *y_test*.
    x_name_list: recording names, same length and order as *y_test*
      (numpy array or pandas Series; other sequences are converted to a
      numpy array).

  Returns:
    ``(right_clf, wrong_clf)``: the names whose prediction equals / differs
    from the true label. The number of misclassifications is printed.
  """
  # Compare plain arrays (position by position). The original converted
  # y_test to an array but then compared the unconverted object, so a pandas
  # Series produced an index-labelled mask that fails to align with a names
  # Series carrying a different index.
  y_test_nparr = np.asarray(y_test)
  y_pred_nparr = np.asarray(y_pred_test)

  classification_is_right = y_test_nparr == y_pred_nparr
  classification_is_wrong = y_test_nparr != y_pred_nparr

  # lists/tuples cannot be indexed with a boolean mask
  if not isinstance(x_name_list, (np.ndarray, pd.Series)):
    x_name_list = np.asarray(x_name_list)

  right_clf = x_name_list[classification_is_right]
  wrong_clf = x_name_list[classification_is_wrong]
  print(f'There are {len(wrong_clf)} misclassification from {len(wrong_clf)+len(right_clf)} voice files.')
  return right_clf, wrong_clf

In [None]:
def l1_feature_selection(x_name_list,x_one,y_one,x_two,y_two,x_three,y_three,x_four,y_four,x_test,y_test,num_features,param,name):
  """Pick the best L1-selected feature subset and evaluate it on the test set.

  For n = 1..num_features the n top features are selected with an
  L1-penalised LinearSVC (``SelectFromModel``), the RBF-SVM grid search
  (``classifierSVM_PD``) is run on the four folds restricted to those
  features, and the n with the highest macro-F1 is kept. The winning
  configuration is then refitted on all four folds and evaluated on the test
  set with a fixed 0.5 threshold (``clf_test``).

  Every fold is first extended with the matching UCI partition, which is read
  from the notebook globals ``x_cv1..x_cv4`` / ``y_cv1..y_cv4``.

  Args:
    x_name_list: recording names of the test samples (for the
      right/wrong report).
    x_one, y_one, ..., x_four, y_four: features (DataFrames) and labels of
      the four folds.
    x_test, y_test: held-out test features and labels.
    num_features: largest number of features to try.
    param: parameter grid passed on to ``classifierSVM_PD``.
    name: row label used in the returned summary tables.

  Returns:
    A 10-tuple: feature-occurrence counts (one-row DataFrame), correctly
    classified names, misclassified names, best CV score per n (one-row
    DataFrame), best C per n, best gamma per n, the raw list of best scores,
    the second element of the test precision/recall/F-score tuple, the test
    macro F1, and the feature names of the best model.
  """
  features_list = []      # every selected feature name over all n (occurrence counting)
  sel_feat_list = []      # selected feature names for each n
  folds_list = []         # the four transformed folds for each n
  x_test_list = []        # the transformed test set for each n

  best_score_list = []
  best_params_list = []

  x_one = pd.concat([x_one,x_cv1])
  x_two = pd.concat([x_two,x_cv2])
  x_three = pd.concat([x_three,x_cv3])
  x_four = pd.concat([x_four,x_cv4])

  y_one = pd.concat([y_one,y_cv1])
  y_two = pd.concat([y_two,y_cv2])
  y_three = pd.concat([y_three,y_cv3])
  y_four = pd.concat([y_four,y_cv4])

  #We won't scale anything in feature selection process -> summing train/val datasets together
  x_for_cl = pd.concat([x_one, x_two, x_three, x_four])
  y_for_cl = pd.concat([y_one, y_two, y_three, y_four])

  # Take the names from the data the selector is fitted on (instead of a
  # hard-coded list rebuilt every iteration), so the support mask always
  # lines up with the right column names.
  feature_names = x_for_cl.columns

  #L1-norm feature selection algorithm
  for n in range(1,num_features+1):
    sel = SelectFromModel(LinearSVC(penalty = 'l1', max_iter = 10000, dual = False), threshold=-np.inf, max_features = n)
    sel.fit(x_for_cl, y_for_cl)

    #caching feature lists in each iteration
    sel_feat = feature_names[sel.get_support()]
    sel_feat_list.append(sel_feat)

    #transform data and caching each iteration
    folds = tuple(sel.transform(part) for part in (x_one, x_two, x_three, x_four))
    folds_list.append(folds)
    x_test_list.append(sel.transform(x_test))

    print(f'Results when using {n} features')
    print(f'Selected features are {sel_feat}.')

    cc = classifierSVM_PD(folds[0], y_one, folds[1], y_two, folds[2], y_three, folds[3], y_four, param)
    print(' ')
    print(f'The parameters are {cc[1]} with f1-macro score at {cc[0]}')
    print('-------------------------------------------------------------------------- ')

    #caching best_score, best_param of each iteration
    best_score_list.append(cc[0])
    best_params_list.append(cc[1])

    #caching independent features for counting occurrences at the end of the program
    features_list.extend(sel_feat)

  #counting feature occurrences
  feature_dict = collections.Counter(features_list)
  df_feature_count = pd.DataFrame.from_dict(feature_dict, orient='index').transpose()
  df_rename = df_feature_count.rename(index={0:name})

  print(' ')

  best_index = np.argmax(best_score_list)
  print(f'The best model use {best_index+1} features.')

  b_params = best_params_list[best_index]
  b_score = best_score_list[best_index]
  print(f'The parameter of this model are {b_params} with f1-macro score at {b_score}.')

  best_sel_feat = sel_feat_list[best_index]
  print(f'The features of this model consist of {best_sel_feat}')

  print('---------------------------------------------------------')

  plot(np.arange(1, num_features + 1), best_score_list, 'F1-macro score')

  #selecting datasets that create the best classifier
  best_folds = folds_list[best_index]
  x_test_best = x_test_list[best_index]

  # Sanity check: every fold must have exactly best_index+1 columns. (The
  # original `(a or b or c or d) != k` only ever compared one operand.)
  if any(part.shape[1] != best_index + 1 for part in best_folds):
    print('Error')

  print('---------------------------------------------------------')

  #summing train/val data to feed into a test classifier
  x_all = np.concatenate(best_folds)
  y_all = np.concatenate((y_one, y_two, y_three, y_four))

  pred_test = clf_test(b_params['SVC__C'], b_params['SVC__gamma'], x_all, y_all, x_test_best, y_test,0.5)
  y_pred_test = pred_test[0]

  #checking the number/id of incorrect classification
  rw_fixthreshold = right_wrong_clf(y_test,y_pred_test,x_name_list)

  print('---------------------------------------------------------')

  # Column labels '1 feature', '2 features', ... generated for the actual
  # num_features (a fixed list of 15 labels broke every other value).
  summary_columns = [f'{k} feature' if k == 1 else f'{k} features'
                     for k in range(1, num_features + 1)]

  df_best_score = pd.DataFrame(
      data = best_score_list,
      columns = [name]
  ).T

  df_best_score.columns = summary_columns

  df_best_params = pd.DataFrame(best_params_list)

  df_best_c = pd.DataFrame(df_best_params['SVC__C'])
  df_best_gamma = pd.DataFrame(df_best_params['SVC__gamma'])

  df_best_c.columns = [name]
  df_best_gamma.columns = [name]

  df_best_c = df_best_c.T
  df_best_gamma = df_best_gamma.T

  df_best_c.columns = summary_columns
  df_best_gamma.columns = summary_columns

  return (df_rename, rw_fixthreshold[0], rw_fixthreshold[1], df_best_score, df_best_c,
          df_best_gamma, best_score_list, pred_test[1][1], pred_test[2], best_sel_feat)

In [None]:
def rfe(x_name_list,x_one,y_one,x_two,y_two,x_three,y_three,x_four,y_four,x_test,y_test,num_features,param,name):
  """
  Recursive-feature-elimination sweep followed by a test-set evaluation.

  For every n in 1..num_features an RFE selector wrapped around a LinearSVC is
  fitted on the four train/validation folds stacked together; the folds and the
  test set are reduced to the n selected columns and classifierSVM_PD is run on
  the reduced folds. The n with the highest returned score is then used to call
  clf_test on the stacked folds versus the test set with a 0.5 threshold.

  Args:
    x_name_list: forwarded to right_wrong_clf with the true and predicted test labels.
    x_one..x_four, y_one..y_four: features / labels of the four folds. Each one is
      first extended with the module-level x_cv1..x_cv4 / y_cv1..y_cv4 data.
    x_test, y_test: test features and labels.
    num_features: largest number of features to try (1..num_features).
    param: forwarded unchanged to classifierSVM_PD.
    name: label used for the row index of the returned summary frames.

  Returns:
    A 10-tuple:
      0. one-row frame (index = name) counting how often each feature was selected,
      1. rw_fixthreshold[0] and
      2. rw_fixthreshold[1] from right_wrong_clf,
      3. one-row frame of the best score per feature count,
      4. one-row frame of the chosen SVC__C per feature count,
      5. one-row frame of the chosen SVC__gamma per feature count,
      6. the raw list of best scores,
      7. pred_test[1][1] and
      8. pred_test[2] from clf_test,
      9. the feature names of the best model.
  """
  features_list = []
  sel_feat_list = []

  x_one_list = []
  x_two_list = []
  x_three_list = []
  x_four_list = []
  x_test_list = []

  best_score_list = []
  best_params_list = []

  # NOTE(review): x_cv1..x_cv4 / y_cv1..y_cv4 are module-level names and are always
  # appended in this fixed order, whichever folds the caller passes in - confirm
  # this pairing is intended for the rotated runs.
  x_one = pd.concat([x_one,x_cv1])
  x_two = pd.concat([x_two,x_cv2])
  x_three = pd.concat([x_three,x_cv3])
  x_four = pd.concat([x_four,x_cv4])

  y_one = pd.concat([y_one,y_cv1])
  y_two = pd.concat([y_two,y_cv2])
  y_three = pd.concat([y_three,y_cv3])
  y_four = pd.concat([y_four,y_cv4])

  #We won't scale anything in feature selection process -> summing train/val datasets together
  x_for_cl = pd.concat([x_one, x_two, x_three, x_four])
  y_for_cl = pd.concat([y_one, y_two, y_three, y_four])

  # Column labels used to name the selected features. Built once: the stacked
  # data does not change inside the loop.
  feature_names = pd.DataFrame(x_for_cl, columns = ['meanF0Hz', 'HNR', 'localJitter', 'localabsoluteJitter', 'rapJitter',
                                      'ppq5Jitter', 'ddpJitter', 'localShimmer', 'localdbShimmer', 'apq3Shimmer', 'apq5Shimmer',
                                      'apq11Shimmer', 'ddaShimmer', 'RPDE', 'DFA']).columns

  #RFE feature selection algorithm

  for n in range(1,num_features+1):
    # Named `selector` so the local does not shadow this function's own name.
    selector = RFE(estimator= LinearSVC(penalty = 'l2', max_iter = 10000, dual = False), n_features_to_select = n, step = 1)
    selector.fit(x_for_cl,y_for_cl)

    #caching feature lists in each iteration
    rfe_feat = feature_names[(selector.support_)]
    sel_feat_list.append(rfe_feat)

    #transform data and caching each iteration
    x_one_selected = selector.transform(x_one)
    x_one_list.append(x_one_selected)

    x_two_selected = selector.transform(x_two)
    x_two_list.append(x_two_selected)

    x_three_selected = selector.transform(x_three)
    x_three_list.append(x_three_selected)

    x_four_selected = selector.transform(x_four)
    x_four_list.append(x_four_selected)

    x_test_selected = selector.transform(x_test)
    x_test_list.append(x_test_selected)

    print(f'Results when using {n} features')
    print(f'Selected features are {rfe_feat}.')

    cc = classifierSVM_PD(x_one_selected, y_one, x_two_selected, y_two, x_three_selected, y_three, x_four_selected, y_four, param)
    print(' ')
    print(f'The parameters are {cc[1]} with f1-macro score at {cc[0]}')
    print('-------------------------------------------------------------------------- ')

    #caching best_score, best_param of each iteration
    best_score_list.append(cc[0])
    best_params_list.append(cc[1])

    #caching independent features for counting occurrences at the end of the program
    features_list.extend(rfe_feat)

  #counting feature occurrences
  feature_dict = collections.Counter(features_list)
  df_feature_count = pd.DataFrame.from_dict(feature_dict, orient='index').transpose()
  df_rename = df_feature_count.rename(index={0:name})

  print(' ')

  best_index = np.argmax(best_score_list)
  print(f'The best model use {best_index+1} features.')

  b_params = best_params_list[best_index]
  b_score = best_score_list[best_index]
  print(f'The parameter of this model are {b_params} with f1-macro score at {b_score}.')

  best_sel_feat = sel_feat_list[best_index]
  print(f'The features of this model consist of {best_sel_feat}')

  print('---------------------------------------------------------')

  plot(np.linspace(1,num_features,num_features,endpoint =True,dtype=int),best_score_list,'F1-macro score')

  #selecting datasets that create the best classifier
  x_one_best = x_one_list[best_index]
  x_two_best = x_two_list[best_index]
  x_three_best = x_three_list[best_index]
  x_four_best = x_four_list[best_index]
  x_test_best = x_test_list[best_index]

  # Every fold must have exactly best_index+1 columns. The previous
  # `(a or b or c or d) != k` form only compared the first truthy width.
  fold_widths = (x_one_best.shape[1], x_two_best.shape[1], x_three_best.shape[1], x_four_best.shape[1])
  if any(width != (best_index + 1) for width in fold_widths):
    print('Error')

  print('---------------------------------------------------------')

  #summing train/val data to feed into a test classifier
  x_all = np.concatenate((x_one_best, x_two_best, x_three_best, x_four_best))
  y_all = np.concatenate((y_one, y_two, y_three, y_four))

  pred_test = clf_test(b_params['SVC__C'], b_params['SVC__gamma'], x_all, y_all, x_test_best, y_test,0.5)
  y_pred_test = pred_test[0]

  #checking the number/id of incorrect classification
  rw_fixthreshold = right_wrong_clf(y_test,y_pred_test,x_name_list)

  print('---------------------------------------------------------')

  ##thresh = clf_varied_threshold(b_params['SVC__C'], b_params['SVC__gamma'],
                       #x_one_best,y_one,x_two_best,y_two,x_three_best,y_three,x_four_best,y_four)

  ##pred_test_thresh = clf_test(b_params['SVC__C'], b_params['SVC__gamma'], x_all, y_all, x_test_best, y_test,thresh)

  # One label per tried feature count ('1 feature', '2 features', ...). Derived
  # from num_features so the frames below work for any sweep length, not only 15.
  summary_columns = [f'{k} feature' if k == 1 else f'{k} features' for k in range(1, num_features + 1)]

  df_best_score = pd.DataFrame(
      data = best_score_list,
      columns = [name]
  ).T

  df_best_score.columns = summary_columns

  df_best_params = pd.DataFrame(best_params_list)

  df_best_c = pd.DataFrame(df_best_params['SVC__C'])
  df_best_gamma = pd.DataFrame(df_best_params['SVC__gamma'])

  df_best_c.columns = [name]
  df_best_gamma.columns = [name]

  df_best_c = df_best_c.T
  df_best_gamma = df_best_gamma.T

  df_best_c.columns = summary_columns
  df_best_gamma.columns = summary_columns

  return (df_rename, rw_fixthreshold[0], rw_fixthreshold[1], df_best_score, df_best_c,
          df_best_gamma, best_score_list, pred_test[1][1], pred_test[2], best_sel_feat)

In [None]:
# Recording-folder prefixes of every subject: PD_01..PD_25 followed by
# Control_01..Control_24. There is no Control_14 entry.
subject_list = (
    [f'Dataset1_half_toserver/PD_{idx:02d}' for idx in range(1, 26)]
    + [f'Dataset1_half_toserver/Control_{idx:02d}' for idx in range(1, 25) if idx != 14]
)

In [None]:
# Maps each subject folder prefix to its report label ('<subject>_1').
# Same subjects and order as the folder list: PD_01..PD_25, then
# Control_01..Control_24 without Control_14.
subject_dict = {
    f'Dataset1_half_toserver/{subject}': f'{subject}_1'
    for subject in (
        [f'PD_{idx:02d}' for idx in range(1, 26)]
        + [f'Control_{idx:02d}' for idx in range(1, 25) if idx != 14]
    )
}

In [None]:
def check_mistake_subjects(classifier,name):
  """
  Build a per-subject misclassification report and write it to '<name>.zip'.

  Args:
    classifier: result tuple of a feature-selection run. Item 1 and item 2 are
      iterated as collections of strings; a sample is attributed to a subject
      when the subject's folder prefix (from subject_list) is a substring of it.
      Item 1 is counted as correct samples, item 2 as incorrect samples.
    name: output file stem; the report is saved with DataFrame.to_csv to
      f'{name}.zip'.

  Returns:
    DataFrame indexed by the subject label from subject_dict with the columns
    'Incorrect Samples', 'Total Samples' and 'Misclassification (%)'. Subjects
    with no samples in either collection are left out.
  """
  records = []

  for subject_name in subject_list:
    correct = sum(subject_name in s for s in classifier[1])
    incorrect = sum(subject_name in s for s in classifier[2])
    total = correct + incorrect
    if total == 0:
      # Subject not present in this test set.
      continue
    records.append({
        'Name': subject_dict.get(subject_name),
        'Incorrect Samples': incorrect,
        'Total Samples': total,
        'Misclassification (%)': round((incorrect/total)*100, 3),
    })

  # Built from records rather than a mixed-type np.array: stacking names and
  # numbers in one array converted every count and percentage to a string.
  df_report = pd.DataFrame(
      records,
      columns = ['Name', 'Incorrect Samples', 'Total Samples', 'Misclassification (%)']
  ).set_index('Name')

  # NOTE(review): with the '.zip' suffix pandas infers zip compression, so this
  # is a zipped CSV rather than a plain text file - confirm that is intended.
  df_report.to_csv(f'{name}.zip', index = True)
  return df_report

In [None]:
# Number of columns in x_df1; the runs below pass this value as num_features.
x_df1.shape[1]

In [None]:
#sample = l1_feature_selection(test_label_5,x_df1,y_df1,x_df2,y_df2,x_df3,y_df3,x_df4,y_df4,x_df_test_5,y_df_test_5,x_df1.shape[1],param_grid1,'l1norm_rbf_0')

In [None]:
#sample_no_yhat = l1_feature_selection(test_label_5,x_df1,y_df1,x_df2,y_df2,x_df3,y_df3,x_df4,y_df4,x_df_test_5,y_df_test_5,x_df1.shape[1],param_grid1,'l1norm_rbf_0')

In [None]:
#sample2 = rfe(test_label_5,x_df1,y_df1,x_df2,y_df2,x_df3,y_df3,x_df4,y_df4,x_df_test_5,y_df_test_5,x_df1.shape[1],param_grid1,'rfe_0')

In [None]:
# Accumulators filled by the per-run cells below.

# Feature-selection counts, one row per run (item 0 of every run's result tuple).
df_all_feature_count = pd.DataFrame()

# Best-score tables per run (item 3), kept separately for L1 and RFE.
df_best_val_score_l1 = pd.DataFrame()
df_best_val_score_rfe = pd.DataFrame()

# Chosen SVC C values per run (item 4).
df_best_c_l1 = pd.DataFrame()
df_best_c_rfe = pd.DataFrame()

# Chosen SVC gamma values per run (item 5).
df_best_gamma_l1 = pd.DataFrame()
df_best_gamma_rfe = pd.DataFrame()

# Each list starts with a row label that later fills the 'Algorithm' column;
# items 7 (recall lists) and 8 (f1 lists) of each run are appended after it.
l1norm_recall = ['L1_norm_recall']
l1norm_f1 = ['L1_norm_f1']
rfe_recall = ['RFE_recall']
rfe_f1 = ['RFE_f1']

# Selected feature names of the best model of each run (item 9).
l1_sel_feat_list = []
rfe_sel_feat_list = []

In [None]:
# L1 run 1: inputs x_df1..x_df4, test data x_df_test_5 / y_df_test_5, grid param_grid2.
l1_1 = l1_feature_selection(test_label_5,x_df1,y_df1,x_df2,y_df2,x_df3,y_df3,x_df4,y_df4,x_df_test_5,y_df_test_5,x_df1.shape[1],param_grid2,'l1norm_rbf_1')

In [None]:
# Record items 7, 8 and 9 of the L1 run-1 result in the recall, f1 and selected-feature lists.
l1norm_recall.append(l1_1[7])
l1norm_f1.append(l1_1[8])
l1_sel_feat_list.append(l1_1[9])

In [None]:
# Per-subject misclassification report for L1 run 1 (also saved as 'L1_1.zip').
cms1 = check_mistake_subjects(l1_1,'L1_1')
cms1

In [None]:
# Add the L1 run-1 summaries as new rows. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
# NOTE: re-running this cell adds the same rows a second time.
df_all_feature_count = pd.concat([df_all_feature_count, l1_1[0]])
df_best_val_score_l1 = pd.concat([df_best_val_score_l1, l1_1[3]])
df_best_c_l1 = pd.concat([df_best_c_l1, l1_1[4]])
df_best_gamma_l1 = pd.concat([df_best_gamma_l1, l1_1[5]])

In [None]:
# L1 run 2: inputs x_df2..x_df5, test data x_df_test_1 / y_df_test_1, grid param_grid2.
l1_2 = l1_feature_selection(test_label_1,x_df2,y_df2,x_df3,y_df3,x_df4,y_df4,x_df5,y_df5,x_df_test_1,y_df_test_1,x_df1.shape[1],param_grid2,'l1norm_rbf_2')

In [None]:
# Record items 7, 8 and 9 of the L1 run-2 result in the recall, f1 and selected-feature lists.
l1norm_recall.append(l1_2[7])
l1norm_f1.append(l1_2[8])
l1_sel_feat_list.append(l1_2[9])

In [None]:
# Per-subject misclassification report for L1 run 2 (also saved as 'L1_2.zip').
cms2 = check_mistake_subjects(l1_2,'L1_2')
cms2

In [None]:
# Add the L1 run-2 summaries as new rows. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
# NOTE: re-running this cell adds the same rows a second time.
df_all_feature_count = pd.concat([df_all_feature_count, l1_2[0]])
df_best_val_score_l1 = pd.concat([df_best_val_score_l1, l1_2[3]])
df_best_c_l1 = pd.concat([df_best_c_l1, l1_2[4]])
df_best_gamma_l1 = pd.concat([df_best_gamma_l1, l1_2[5]])

In [None]:
# L1 run 3: inputs x_df3, x_df4, x_df5, x_df1, test data x_df_test_2 / y_df_test_2, grid param_grid2.
l1_3 = l1_feature_selection(test_label_2,x_df3,y_df3,x_df4,y_df4,x_df5,y_df5,x_df1,y_df1,x_df_test_2,y_df_test_2,x_df1.shape[1],param_grid2,'l1norm_rbf_3')

In [None]:
# Record items 7, 8 and 9 of the L1 run-3 result in the recall, f1 and selected-feature lists.
l1norm_recall.append(l1_3[7])
l1norm_f1.append(l1_3[8])
l1_sel_feat_list.append(l1_3[9])

In [None]:
# Per-subject misclassification report for L1 run 3 (also saved as 'L1_3.zip').
cms3 = check_mistake_subjects(l1_3,'L1_3')
cms3

In [None]:
# Add the L1 run-3 summaries as new rows. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
# NOTE: re-running this cell adds the same rows a second time.
df_all_feature_count = pd.concat([df_all_feature_count, l1_3[0]])
df_best_val_score_l1 = pd.concat([df_best_val_score_l1, l1_3[3]])
df_best_c_l1 = pd.concat([df_best_c_l1, l1_3[4]])
df_best_gamma_l1 = pd.concat([df_best_gamma_l1, l1_3[5]])

In [None]:
# L1 run 4: inputs x_df4, x_df5, x_df1, x_df2, test data x_df_test_3 / y_df_test_3, grid param_grid2.
l1_4 = l1_feature_selection(test_label_3,x_df4,y_df4,x_df5,y_df5,x_df1,y_df1,x_df2,y_df2,x_df_test_3,y_df_test_3,x_df1.shape[1],param_grid2,'l1norm_rbf_4')

In [None]:
# Record items 7, 8 and 9 of the L1 run-4 result in the recall, f1 and selected-feature lists.
l1norm_recall.append(l1_4[7])
l1norm_f1.append(l1_4[8])
l1_sel_feat_list.append(l1_4[9])

In [None]:
# Per-subject misclassification report for L1 run 4 (also saved as 'L1_4.zip').
cms4 = check_mistake_subjects(l1_4,'L1_4')
cms4

In [None]:
# Add the L1 run-4 summaries as new rows. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
# NOTE: re-running this cell adds the same rows a second time.
df_all_feature_count = pd.concat([df_all_feature_count, l1_4[0]])
df_best_val_score_l1 = pd.concat([df_best_val_score_l1, l1_4[3]])
df_best_c_l1 = pd.concat([df_best_c_l1, l1_4[4]])
df_best_gamma_l1 = pd.concat([df_best_gamma_l1, l1_4[5]])

In [None]:
# L1 run 5: inputs x_df5, x_df1, x_df2, x_df3, test data x_df_test_4 / y_df_test_4, grid param_grid2.
l1_5 = l1_feature_selection(test_label_4,x_df5,y_df5,x_df1,y_df1,x_df2,y_df2,x_df3,y_df3,x_df_test_4,y_df_test_4,x_df1.shape[1],param_grid2,'l1norm_rbf_5')

In [None]:
# Record items 7, 8 and 9 of the L1 run-5 result in the recall, f1 and selected-feature lists.
l1norm_recall.append(l1_5[7])
l1norm_f1.append(l1_5[8])
l1_sel_feat_list.append(l1_5[9])

In [None]:
# Per-subject misclassification report for L1 run 5 (also saved as 'L1_5.zip').
cms5 = check_mistake_subjects(l1_5,'L1_5')
cms5

In [None]:
# Add the L1 run-5 summaries as new rows. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
# NOTE: re-running this cell adds the same rows a second time.
df_all_feature_count = pd.concat([df_all_feature_count, l1_5[0]])
df_best_val_score_l1 = pd.concat([df_best_val_score_l1, l1_5[3]])
df_best_c_l1 = pd.concat([df_best_c_l1, l1_5[4]])
df_best_gamma_l1 = pd.concat([df_best_gamma_l1, l1_5[5]])

In [None]:
def f1_allplot(x1,y1,label1,x2,y2,label2,x3,y3,label3,x4,y4,label4,x5,y5,label5):
  """
  Draw five labelled curves on one matplotlib figure and show it.

  Each (xK, yK, labelK) triple is plotted as one line, in order 1..5. The
  figure gets a fixed title, axis labels, a legend and vertical grid lines.
  Returns nothing.
  """
  curves = (
      (x1, y1, label1),
      (x2, y2, label2),
      (x3, y3, label3),
      (x4, y4, label4),
      (x5, y5, label5),
  )
  for xs, ys, curve_label in curves:
    plt.plot(xs, ys, label = curve_label)

  plt.title('F1-macro score in each iteration of feature selection process')
  plt.xlabel('Number of features used')
  plt.ylabel('Score')
  plt.legend(loc='best')
  plt.grid(axis = 'x')
  plt.show()

# Integer x axis 1..x_df1.shape[1] (one point per feature count) for the score plots.
x_range = np.linspace(1,x_df1.shape[1],x_df1.shape[1],endpoint =True,dtype=int)

In [None]:
# Plot item 6 (the per-feature-count score list) of the five L1 runs together.
f1_allplot(
    x_range, l1_1[6],'L1_1',
    x_range, l1_2[6], 'L1_2',
    x_range, l1_3[6],'L1_3',
    x_range, l1_4[6],'L1_4',
    x_range, l1_5[6],'L1_5'    
)

In [None]:
# RFE run 1: folds x_df1..x_df4, test data x_df_test_5 / y_df_test_5, grid param_grid2.
rfe_1 = rfe(test_label_5,x_df1,y_df1,x_df2,y_df2,x_df3,y_df3,x_df4,y_df4,x_df_test_5,y_df_test_5,x_df1.shape[1],param_grid2,'rfe_1')

In [None]:
# Record items 7, 8 and 9 of the RFE run-1 result in the recall, f1 and selected-feature lists.
rfe_recall.append(rfe_1[7])
rfe_f1.append(rfe_1[8])
rfe_sel_feat_list.append(rfe_1[9])

In [None]:
# Per-subject misclassification report for RFE run 1 (also saved as 'rfe_1.zip').
cms1_r = check_mistake_subjects(rfe_1,'rfe_1')
cms1_r

In [None]:
def cms_summary(l1,rfe,name):
    """
    Put an L1 report and an RFE report side by side and save them as CSV.

    The two frames are joined column-wise and the six resulting columns get a
    second header level ('', 'L1_norm', '', '', 'RFE', '') above their original
    names. The combined frame is written to f'{name}.csv' and returned.
    """
    group_header = ['','L1_norm','','','RFE','']
    combined = pd.concat([l1,rfe],axis=1)
    header_pairs = list(zip(group_header, combined.columns))
    combined.columns = pd.MultiIndex.from_tuples(header_pairs)
    combined.to_csv(f'{name}.csv')
    return combined

In [None]:
# Side-by-side L1 vs RFE mistake report for run 1, saved under 09102023_final_reports/ (directory assumed to exist).
cms_summary(cms1,cms1_r,'09102023_final_reports/Classifying mistakes summary #1')

In [None]:
# Add the RFE run-1 summaries as new rows. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
# NOTE: re-running this cell adds the same rows a second time.
df_all_feature_count = pd.concat([df_all_feature_count, rfe_1[0]])
df_best_val_score_rfe = pd.concat([df_best_val_score_rfe, rfe_1[3]])
df_best_c_rfe = pd.concat([df_best_c_rfe, rfe_1[4]])
df_best_gamma_rfe = pd.concat([df_best_gamma_rfe, rfe_1[5]])

In [None]:
# RFE run 2: folds x_df2..x_df5, test data x_df_test_1 / y_df_test_1, grid param_grid2.
rfe_2 = rfe(test_label_1,x_df2,y_df2,x_df3,y_df3,x_df4,y_df4,x_df5,y_df5,x_df_test_1,y_df_test_1,x_df1.shape[1],param_grid2,'rfe_2')

In [None]:
# Record items 7, 8 and 9 of the RFE run-2 result in the recall, f1 and selected-feature lists.
rfe_recall.append(rfe_2[7])
rfe_f1.append(rfe_2[8])
rfe_sel_feat_list.append(rfe_2[9])

In [None]:
# Per-subject misclassification report for RFE run 2 (also saved as 'rfe_2.zip').
cms2_r = check_mistake_subjects(rfe_2,'rfe_2')
cms2_r

In [None]:
# Side-by-side L1 vs RFE mistake report for run 2, saved under 09102023_final_reports/ (directory assumed to exist).
cms_summary(cms2,cms2_r,'09102023_final_reports/Classifying mistakes summary #2')

In [None]:
# Add the RFE run-2 summaries as new rows. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
# NOTE: re-running this cell adds the same rows a second time.
df_all_feature_count = pd.concat([df_all_feature_count, rfe_2[0]])
df_best_val_score_rfe = pd.concat([df_best_val_score_rfe, rfe_2[3]])
df_best_c_rfe = pd.concat([df_best_c_rfe, rfe_2[4]])
df_best_gamma_rfe = pd.concat([df_best_gamma_rfe, rfe_2[5]])

In [None]:
# RFE run 3: folds x_df3, x_df4, x_df5, x_df1, test data x_df_test_2 / y_df_test_2, grid param_grid2.
rfe_3 = rfe(test_label_2,x_df3,y_df3,x_df4,y_df4,x_df5,y_df5,x_df1,y_df1,x_df_test_2,y_df_test_2,x_df1.shape[1],param_grid2,'rfe_3')

In [None]:
# Record items 7, 8 and 9 of the RFE run-3 result in the recall, f1 and selected-feature lists.
rfe_recall.append(rfe_3[7])
rfe_f1.append(rfe_3[8])
rfe_sel_feat_list.append(rfe_3[9])

In [None]:
# Per-subject misclassification report for RFE run 3 (also saved as 'rfe_3.zip').
cms3_r = check_mistake_subjects(rfe_3,'rfe_3')
cms3_r

In [None]:
# Side-by-side L1 vs RFE mistake report for run 3, saved under 09102023_final_reports/ (directory assumed to exist).
cms_summary(cms3,cms3_r,'09102023_final_reports/Classifying mistakes summary #3')

In [None]:
# Add the RFE run-3 summaries as new rows. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
# NOTE: re-running this cell adds the same rows a second time.
df_all_feature_count = pd.concat([df_all_feature_count, rfe_3[0]])
df_best_val_score_rfe = pd.concat([df_best_val_score_rfe, rfe_3[3]])
df_best_c_rfe = pd.concat([df_best_c_rfe, rfe_3[4]])
df_best_gamma_rfe = pd.concat([df_best_gamma_rfe, rfe_3[5]])

In [None]:
# RFE run 4: folds x_df4, x_df5, x_df1, x_df2, test data x_df_test_3 / y_df_test_3, grid param_grid2.
rfe_4 = rfe(test_label_3,x_df4,y_df4,x_df5,y_df5,x_df1,y_df1,x_df2,y_df2,x_df_test_3,y_df_test_3,x_df1.shape[1],param_grid2,'rfe_4')

In [None]:
# Record items 7, 8 and 9 of the RFE run-4 result in the recall, f1 and selected-feature lists.
rfe_recall.append(rfe_4[7])
rfe_f1.append(rfe_4[8])
rfe_sel_feat_list.append(rfe_4[9])

In [None]:
# Per-subject misclassification report for RFE run 4 (also saved as 'rfe_4.zip').
cms4_r = check_mistake_subjects(rfe_4,'rfe_4')
cms4_r

In [None]:
# Side-by-side L1 vs RFE mistake report for run 4, saved under 09102023_final_reports/ (directory assumed to exist).
cms_summary(cms4,cms4_r,'09102023_final_reports/Classifying mistakes summary #4')

In [None]:
# Add the RFE run-4 summaries as new rows. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
# NOTE: re-running this cell adds the same rows a second time.
df_all_feature_count = pd.concat([df_all_feature_count, rfe_4[0]])
df_best_val_score_rfe = pd.concat([df_best_val_score_rfe, rfe_4[3]])
df_best_c_rfe = pd.concat([df_best_c_rfe, rfe_4[4]])
df_best_gamma_rfe = pd.concat([df_best_gamma_rfe, rfe_4[5]])

In [None]:
# RFE run 5: folds x_df5, x_df1, x_df2, x_df3, test data x_df_test_4 / y_df_test_4, grid param_grid2.
rfe_5 = rfe(test_label_4,x_df5,y_df5,x_df1,y_df1,x_df2,y_df2,x_df3,y_df3,x_df_test_4,y_df_test_4,x_df1.shape[1],param_grid2,'rfe_5')

In [None]:
# Record items 7, 8 and 9 of the RFE run-5 result in the recall, f1 and selected-feature lists.
rfe_recall.append(rfe_5[7])
rfe_f1.append(rfe_5[8])
rfe_sel_feat_list.append(rfe_5[9])

In [None]:
# Per-subject misclassification report for RFE run 5 (also saved as 'rfe_5.zip').
cms5_r = check_mistake_subjects(rfe_5,'rfe_5')
cms5_r

In [None]:
# Side-by-side L1 vs RFE mistake report for run 5, saved under 09102023_final_reports/ (directory assumed to exist).
cms_summary(cms5,cms5_r,'09102023_final_reports/Classifying mistakes summary #5')

In [None]:
# Add the RFE run-5 summaries as new rows. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
# NOTE: re-running this cell adds the same rows a second time.
df_all_feature_count = pd.concat([df_all_feature_count, rfe_5[0]])
df_best_val_score_rfe = pd.concat([df_best_val_score_rfe, rfe_5[3]])
df_best_c_rfe = pd.concat([df_best_c_rfe, rfe_5[4]])
df_best_gamma_rfe = pd.concat([df_best_gamma_rfe, rfe_5[5]])

In [None]:
# Plot item 6 (the per-feature-count score list) of the five RFE runs together.
f1_allplot(
    x_range, rfe_1[6],'rfe_1',
    x_range, rfe_2[6], 'rfe_2',
    x_range, rfe_3[6],'rfe_3',
    x_range, rfe_4[6],'rfe_4',
    x_range, rfe_5[6],'rfe_5'    
)

In [None]:
# Display the accumulated feature counts (one row per run at this point).
df_all_feature_count

In [None]:
# Display the accumulated L1 score table.
df_best_val_score_l1

In [None]:
# Display the accumulated RFE score table.
df_best_val_score_rfe

In [None]:
# Column names for the recall / f1 tables: the row label followed by one column per run.
dataset_col = ['Algorithm','Dataset 1','Dataset 2','Dataset 3','Dataset 4','Dataset 5']

In [None]:
# One row per algorithm; the leading label of each list fills 'Algorithm', which becomes the index.
df_recall = pd.DataFrame(data = [l1norm_recall, rfe_recall],
                        columns = dataset_col).set_index(['Algorithm'])

In [None]:
# One row per algorithm; the leading label of each list fills 'Algorithm', which becomes the index.
df_f1 = pd.DataFrame(data = [l1norm_f1, rfe_f1],
                    columns = dataset_col).set_index(['Algorithm'])

In [None]:
# Add the row-wise mean over the dataset columns.
df_recall['mean'] = df_recall.mean(axis=1)
# NOTE(review): from here on df_recall is the Styler returned by .style, not a
# DataFrame, so this cell is presumably not re-runnable without rebuilding
# df_recall first - TODO confirm.
df_recall = df_recall.style.highlight_max(color = 'pink',axis =0)
# Export the styled table to Excel.
df_recall.to_excel('09102023_final_reports/recall.xlsx')
df_recall

In [None]:
# Add the row-wise mean over the dataset columns.
df_f1['mean'] = df_f1.mean(axis=1)
# NOTE(review): from here on df_f1 is the Styler returned by .style, not a
# DataFrame, so this cell is presumably not re-runnable without rebuilding
# df_f1 first - TODO confirm.
df_f1 = df_f1.style.highlight_max(color = 'lightblue',axis =0)
# Export the styled table to Excel.
df_f1.to_excel('09102023_final_reports/f1.xlsx')
df_f1

In [None]:
from collections import Counter

In [None]:
def sel_feat_count(sel_feat_list):
    """
    Count how often each item occurs across a list of collections.

    Every element of sel_feat_list is turned into a Counter and the counters
    are added up. An empty list gives an empty Counter.
    """
    total = Counter()
    for selected in sel_feat_list:
        total = total + Counter(selected)
    return total

def sel_feat_count_df(l1,rfe):
    """
    Tabulate how often each feature was selected by the L1 and the RFE runs.

    Args:
      l1: list of selected-feature collections from the L1 runs.
      rfe: list of selected-feature collections from the RFE runs.

    Returns:
      DataFrame with the rows 'l1_count' and 'rfe_count' and one column per
      feature; features selected by only one method get 0 for the other. The
      frame is also written to
      '09102023_final_reports/selected_feature_count.csv'.
    """
    # Count from the arguments; the previous version ignored them and read the
    # module-level lists directly.
    l1_count = sel_feat_count(l1)
    rfe_count = sel_feat_count(rfe)
    one = pd.DataFrame.from_dict(l1_count, columns = ['l1_count'], orient='index').transpose()
    two = pd.DataFrame.from_dict(rfe_count,columns = ['rfe_count'], orient='index').transpose()
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    combined = pd.concat([one, two]).fillna(0)
    combined.to_csv('09102023_final_reports/selected_feature_count.csv')
    return combined

In [None]:
# Build and save the selected-feature count table for both methods.
sel_feat_count_df(l1_sel_feat_list, rfe_sel_feat_list)

In [None]:
# Highlight the maximum of each row (axis=1).
# NOTE(review): both names are rebound to Styler objects here, so re-running
# this cell or the accumulation cells afterwards will presumably fail - TODO confirm.
df_best_val_score_l1 = df_best_val_score_l1.style.highlight_max(color = 'pink',axis =1)
df_best_val_score_rfe = df_best_val_score_rfe.style.highlight_max(color = 'pink',axis =1)

In [None]:
# Display the styled L1 score table.
df_best_val_score_l1

In [None]:
# Display the styled RFE score table.
df_best_val_score_rfe

In [None]:
# Export the score tables to Excel. (These names hold Styler objects if the
# highlighting cell has been run.)
df_best_val_score_l1.to_excel('09102023_final_reports/best_score_l1.xlsx')
df_best_val_score_rfe.to_excel('09102023_final_reports/best_score_rfe.xlsx')

# Chosen C values per run and feature count.
df_best_c_l1.to_csv('09102023_final_reports/best_c_l1.csv')
df_best_c_rfe.to_csv('09102023_final_reports/best_c_rfe.csv')

# Chosen gamma values per run and feature count.
df_best_gamma_l1.to_csv('09102023_final_reports/best_gamma_l1.csv')
df_best_gamma_rfe.to_csv('09102023_final_reports/best_gamma_rfe.csv')

In [None]:
# Flip the table so rows are features and columns are runs.
# NOTE: not idempotent - re-running this cell transposes the table back.
df_all_feature_count = df_all_feature_count.transpose()
# Average per feature across runs, i.e. along the columns (axis=1). With axis=0
# the result is indexed by run name, does not align with the feature index and
# leaves the 'average' column entirely NaN.
df_all_feature_count['average'] =df_all_feature_count.mean(axis=1)
df_all_feature_count = df_all_feature_count.sort_values(by = ['average'],ascending=False)
df_all_feature_count

In [None]:
# Feature counts of the five L1 runs with their row-wise mean, sorted by that mean.
# .assign builds a new frame, which avoids the SettingWithCopyWarning raised
# when a column is added to a selection of another frame.
feat_count_l1 = (
    df_all_feature_count[['l1norm_rbf_1','l1norm_rbf_2','l1norm_rbf_3','l1norm_rbf_4', 'l1norm_rbf_5']]
    .assign(avg = lambda counts: counts.mean(axis=1))
    .sort_values(by = ['avg'],ascending=False)
)
feat_count_l1

In [None]:
# Feature counts of the five RFE runs with their row-wise mean, sorted by that mean.
# .assign builds a new frame, which avoids the SettingWithCopyWarning raised
# when a column is added to a selection of another frame.
feat_count_rfe = (
    df_all_feature_count[['rfe_1','rfe_2','rfe_3','rfe_4', 'rfe_5']]
    .assign(avg = lambda counts: counts.mean(axis=1))
    .sort_values(by = ['avg'],ascending=False)
)
feat_count_rfe

In [None]:
# Save the per-method feature count tables.
feat_count_l1.to_csv('09102023_final_reports/features_count_l1.csv')
feat_count_rfe.to_csv('09102023_final_reports/features_count_rfe.csv')