In [1]:
#Parkinsons Telemonitoring Data Set  

#Abstract: Oxford Parkinson's Disease Telemonitoring Dataset

#============================================================

#Data Set Characteristics:  Multivariate
#Attribute Characteristics:  Integer, Real
#Associated Tasks:  Regression
#Number of Instances:  5875
#Number of Attributes:  26
#Area:  Life
#Date Donated:  2009-10-29

#============================================================

#SOURCE:

#The dataset was created by Athanasios Tsanas (tsanasthanasis '@' gmail.com) 
#and Max Little (littlem '@' physics.ox.ac.uk) of the University of Oxford, in 
#collaboration with 10 medical centers in the US and Intel Corporation who 
#developed the telemonitoring device to record the speech signals. The 
#original study used a range of linear and nonlinear regression methods to 
#predict the clinicians Parkinsons disease symptom score on the UPDRS scale.


#============================================================

#DATA SET INFORMATION:

#This dataset is composed of a range of biomedical voice measurements from 42 
#people with early-stage Parkinson's disease recruited to a six-month trial of 
#a telemonitoring device for remote symptom progression monitoring. The 
#recordings were automatically captured in the patient's homes.

#Columns in the table contain subject number, subject age, subject gender, 
#time interval from baseline recruitment date, motor UPDRS, total UPDRS, and 
#16 biomedical voice measures. Each row corresponds to one of 5,875 voice 
#recordings from these individuals. The main aim of the data is to predict the 
#motor and total UPDRS scores ('motor_UPDRS' and 'total_UPDRS') from the 16 
#voice measures.

#The data is in ASCII CSV format. The rows of the CSV file contain an instance 
#corresponding to one voice recording. There are around 200 recordings per 
#patient, the subject number of the patient is identified in the first column. 
#For further information or to pass on comments, please contact Athanasios 
#Tsanas (tsanasthanasis '@' gmail.com) or Max Little (littlem '@' 
#physics.ox.ac.uk).

#Further details are contained in the following reference -- if you use this 
#dataset, please cite:
#Athanasios Tsanas, Max A. Little, Patrick E. McSharry, Lorraine O. Ramig (2009),
#'Accurate telemonitoring of Parkinson's disease progression by non-invasive speech tests',
#IEEE Transactions on Biomedical Engineering (to appear).

#Further details about the biomedical voice measures can be found in:
#Max A. Little, Patrick E. McSharry, Eric J. Hunter, Lorraine O. Ramig (2009),
#'Suitability of dysphonia measurements for telemonitoring of Parkinsons disease',
#IEEE Transactions on Biomedical Engineering, 56(4):1015-1022 

 
#===========================================================

#ATTRIBUTE INFORMATION:

#subject# - Integer that uniquely identifies each subject
#age - Subject age
#sex - Subject gender '0' - male, '1' - female
#test_time - Time since recruitment into the trial. The integer part is the 
#number of days since recruitment.
#motor_UPDRS - Clinicians motor UPDRS score, linearly interpolated
#total_UPDRS - Clinicians total UPDRS score, linearly interpolated
#Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP - Several measures of 
#variation in fundamental frequency
#Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA - 
#Several measures of variation in amplitude
#NHR,HNR - Two measures of ratio of noise to tonal components in the voice
#RPDE - A nonlinear dynamical complexity measure
#DFA - Signal fractal scaling exponent
#PPE - A nonlinear measure of fundamental frequency variation 


#===========================================================

#RELEVANT PAPERS:

#Little MA, McSharry PE, Hunter EJ, Ramig LO (2009),
#'Suitability of dysphonia measurements for telemonitoring of Parkinsons disease',
#IEEE Transactions on Biomedical Engineering, 56(4):1015-1022

#Little MA, McSharry PE, Roberts SJ, Costello DAE, Moroz IM.
#'Exploiting Nonlinear Recurrence and Fractal Scaling Properties for Voice Disorder Detection',
#BioMedical Engineering OnLine 2007, 6:23 (26 June 2007) 

#===========================================================

#CITATION REQUEST:

#If you use this dataset, please cite the following paper:
#A Tsanas, MA Little, PE McSharry, LO Ramig (2009)
#'Accurate telemonitoring of Parkinsons disease progression by non-invasive speech tests',
#IEEE Transactions on Biomedical Engineering (to appear). 





In [2]:
import pandas as pd
import numpy as np
import scipy

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth
from mlxtend.frequent_patterns import association_rules


In [3]:
# Load the raw telemonitoring table into a DataFrame.
# The path is relative, so the file is looked up in the notebook's working directory.
RAW_DATA_PATH = "parkinsons_updrs.data"
dataFileA = pd.read_csv(RAW_DATA_PATH)


In [4]:
# Convert the table to a plain array and rescale the analysed columns to [0, 1].
dataFileA_np_raw = np.array(dataFileA)

# Raw columns 4 onward. The truth cell further down reads raw columns 4:6 as the
# clinician scores, so within this slice columns 0-1 are those scores and the
# remaining columns are the machine-measured voice features.
# NOTE(review): relies on the CSV column order given in the header comment - confirm.
dataFileA_np = dataFileA_np_raw[:, 4:]

# Min-max scale each column on its own. A single min/max over the whole matrix
# would let the largest-valued column dictate the range, squeezing every
# small-valued measure into the lowest band of the later binning.
column_min = np.min(dataFileA_np, axis=0)
column_span = np.max(dataFileA_np, axis=0) - column_min
# A constant column has zero span; divide by 1 so it maps to 0.0 instead of NaN.
column_span = np.where(column_span == 0, 1, column_span)
normalizedA = (dataFileA_np - column_min) / column_span

# Machine-measured features only: drop the two clinician-score columns, which are
# encoded separately as the "truth" items. (Previously a second slice taken from
# normalizedA rather than normalizedA_data kept the score columns and dropped
# the trailing measures.)
normalizedA_data = normalizedA[:, 2:]



In [5]:
#WAYS OF ENCODING THE DATA
#
# Equal size categories
# Low (<=0.2) , Medium-Low (>0.2, <=0.4), Medium (>0.4, <=0.6), Medium-High (>0.6, <=0.8), High(>0.8)
#
# Normalized distribution based
# Low (<=0.024) , Medium-Low (>0.024, <=0.1583), Medium (>0.1583, <=0.8409), Medium-High (>0.8409, <=0.9806), High(>0.9806)
#
# Extreme magnitudes
# Outlier (<=0.024 or >=0.976), Different(<=0.1583 or >=0.8417), Central(>0.1583 and <0.8417)


In [6]:
#Equal size categories
# Low (<=0.2) , Medium-Low (>0.2, <=0.4), Medium (>0.4, <=0.6), Medium-High (>0.6, <=0.8), High(>0.8)
def _equal_size_label(value, column):
    """Return the equal-width band tag for `value`, suffixed with `column`.

    Each band includes its upper edge. A value that satisfies none of the
    comparisons (e.g. NaN) yields the empty string.
    """
    if value <= 0.2:
        tag = "LOW"
    elif value <= 0.4:
        tag = "MEDLOW"
    elif value <= 0.6:
        tag = "MED"
    elif value <= 0.8:
        tag = "MEDHIGH"
    elif value >= 0.8:
        tag = "HIGH"
    else:
        return ''
    return tag + str(column)


# One list of labels per row of normalizedA_data, one label per column.
dataAsEqualSize = [
    [_equal_size_label(value, column) for column, value in enumerate(row)]
    for row in normalizedA_data
]


In [7]:
#Normalized distribution size categories
# Low (<=0.024) , Medium-Low (>0.024, <=0.1583), Medium (>0.1583, <=0.8409), Medium-High (>0.8409, <=0.9806), High(>0.9806)
# Inclusive upper edge of every band except the top one, in ascending order.
_NORMAL_DIST_UPPER_BOUNDS = (
    (0.024, "LOW"),
    (0.1583, "MEDLOW"),
    (0.8409, "MED"),
    (0.9806, "MEDHIGH"),
)

normalDistribution = []
for row_values in normalizedA_data:
    row_labels = []
    for column, value in enumerate(row_values):
        label = ''
        for upper_bound, tag in _NORMAL_DIST_UPPER_BOUNDS:
            # The first band whose upper edge covers the value wins.
            if value <= upper_bound:
                label = tag + str(column)
                break
        else:
            # Above every listed edge; anything that fails this too stays ''.
            if value >= 0.9806:
                label = "HIGH" + str(column)
        row_labels.append(label)
    normalDistribution.append(row_labels)


In [8]:
#Extreme magnitude categories
# Outlier (<=0.024 or >=0.976), Different(<=0.1583 or >=0.8417), Central(everything else)
# The outlier test is evaluated first, so "DIFF" only applies to values that
# are not already tagged "EXTR".
extremeMagnitude = [
    [
        (
            "EXTR" if (value <= 0.024 or value >= 0.976)
            else "DIFF" if (value <= 0.1583 or value >= 0.8417)
            else "MED"
        ) + str(column)
        for column, value in enumerate(row)
    ]
    for row in normalizedA_data
]
            


In [9]:
#NOW TO DO THE TRUTH DATA COLUMNS
# Raw columns 4 and 5 are used as the clinician-assessed targets.
rawTruth = dataFileA_np_raw[:,4:6]

# Min-max scale each target column on its own range. The later band labels are
# suffixed per column, so a min/max shared by both columns would place one
# column's bands relative to the other column's extremes.
truth_min = np.min(rawTruth, axis=0)
truth_span = np.max(rawTruth, axis=0) - truth_min
# A constant column has zero span; divide by 1 so it maps to 0.0 instead of NaN.
truth_span = np.where(truth_span == 0, 1, truth_span)
normalizedTruth = (rawTruth - truth_min) / truth_span


In [10]:
#Equal size categories
# Low (<=0.2) , Medium-Low (>0.2, <=0.4), Medium (>0.4, <=0.6), Medium-High (>0.6, <=0.8), High(>0.8)
# Inclusive upper edge of every truth band except the top one, in ascending order.
_TRUTH_EQUAL_SIZE_BOUNDS = (
    (0.2, "LOWT"),
    (0.4, "MEDLOWT"),
    (0.6, "MEDT"),
    (0.8, "MEDHIGHT"),
)


def _truth_equal_size_label(value, column):
    """Return the equal-width truth tag for `value`, suffixed with `column`.

    Yields the empty string when no comparison holds (e.g. NaN).
    """
    tag = next(
        (name for bound, name in _TRUTH_EQUAL_SIZE_BOUNDS if value <= bound),
        None,
    )
    if tag is None:
        if not value >= 0.8:
            return ''
        tag = "HIGHT"
    return tag + str(column)


# One list of labels per row of normalizedTruth, one label per column.
normalizedTruthAsEqualSize = [
    [_truth_equal_size_label(value, column) for column, value in enumerate(row)]
    for row in normalizedTruth
]


In [11]:
#Normalized distribution size categories for the truth columns
# Low (<=0.024) , Medium-Low (>0.024, <=0.1583), Medium (>0.1583, <=0.8409), Medium-High (>0.8409, <=0.9806), High(>0.9806)
truth_row_count, truth_col_count = normalizedTruth.shape
# Pre-fill with '' so a cell that matches no band keeps the empty label.
normalizedTruthAsNormalDistribution = [
    [''] * truth_col_count for _ in range(truth_row_count)
]

for (row, column), value in np.ndenumerate(normalizedTruth):
    # Bands are tested from the top down; each band includes its upper edge.
    if value > 0.9806:
        tag = "HIGHT"
    elif value > 0.8409:
        tag = "MEDHIGHT"
    elif value > 0.1583:
        tag = "MEDT"
    elif value > 0.024:
        tag = "MEDLOWT"
    elif value <= 0.024:
        tag = "LOWT"
    else:
        continue
    normalizedTruthAsNormalDistribution[row][column] = tag + str(column)




In [12]:
#Extreme magnitude categories
# Outlier (<=0.024 or >=0.976), Different(<=0.1583 or >=0.8417), Central(everything else)
# Both the low-side and high-side tests read normalizedTruth. The low-side
# tests previously read normalizedA_data, i.e. the machine measurements, so the
# truth labels partly described the wrong data.
# Tags carry the same "T" suffix as the other truth encodings so they cannot
# collide with the machine-side "EXTR<n>" / "DIFF<n>" / "MED<n>" items when both
# are merged into one transaction.
normalizedTruthAsExtremeMagnitude = []
for truth_row in normalizedTruth:
    row_labels = []
    for column, value in enumerate(truth_row):
        if value <= 0.024 or value >= 0.976:
            tag = "EXTRT"
        elif value <= 0.1583 or value >= 0.8417:
            tag = "DIFFT"
        else:
            tag = "MEDT"
        row_labels.append(tag + str(column))
    normalizedTruthAsExtremeMagnitude.append(row_labels)



In [13]:
# SO NOW WE HAVE TWO SETS OF DATA: ONE MEASURED BY MACHINES AND ONE MEASURED BY HUMANS.
# EACH SET OF DATA IS ENCODED IN THREE WAYS:
#
# Equal size categories
# Low (<=0.2) , Medium-Low (>0.2, <=0.4), Medium (>0.4, <=0.6), Medium-High (>0.6, <=0.8), High(>0.8)
#
# Normalized distribution based
# Low (<=0.024) , Medium-Low (>0.024, <=0.1583), Medium (>0.1583, <=0.8409), Medium-High (>0.8409, <=0.9806), High(>0.9806)
#
# Extreme magnitudes
# Outlier (<=0.024 or >=0.976), Different(<=0.1583 or >=0.8417), Central(>0.1583 and <0.8417)
#

#TRUTH (desired future purchase in the transaction model):
#normalizedTruthAsEqualSize,    normalizedTruthAsNormalDistribution,   normalizedTruthAsExtremeMagnitude

#MACHINES OBSERVED (items purchased in the transaction model):
#dataAsEqualSize,    normalDistribution,      extremeMagnitude

#ADDITIONAL DATA POINTS THAT CAN BE USED
#Different frequency calculations (fpgrowth, fpmax, apriori)
#min_support 


In [22]:
#EXAMPLES TAKEN FROM 
#https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/

# One transaction per recording: the truth items followed by the machine items.
dataset = np.concatenate((normalizedTruthAsEqualSize,dataAsEqualSize), axis=1)

# One-hot encode the transactions (one boolean column per distinct item label).
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
df_max = pd.DataFrame(te_ary, columns=te.columns_)

# Tunable thresholds, named once instead of repeated inline.
MIN_SUPPORT = 0.6
RULE_THRESHOLDS = (("lift", 1.2), ("confidence", 0.7))

# use_colnames=True keeps the item labels in the itemsets; without it the
# itemsets (and every rule derived from them) hold column indices only.
frequent_itemsets_grow = fpgrowth(df, min_support=MIN_SUPPORT, use_colnames=True)
frequent_itemsets_ap = apriori(df, min_support=MIN_SUPPORT, use_colnames=True)
frequent_itemsets_max = fpmax(df_max, min_support=MIN_SUPPORT, use_colnames=True)

# Derive rules for every (miner, metric) pair and keep all of them, keyed by
# that pair, rather than overwriting a single variable.
# The fpmax itemsets are passed with support_only=True, the others with the
# default False, exactly as before.
rules_by_run = {}
for miner_name, itemsets, support_only in (
    ("fpgrowth", frequent_itemsets_grow, False),
    ("apriori", frequent_itemsets_ap, False),
    ("fpmax", frequent_itemsets_max, True),
):
    for metric, threshold in RULE_THRESHOLDS:
        rules_by_run[(miner_name, metric)] = association_rules(
            itemsets,
            metric=metric,
            min_threshold=threshold,
            support_only=support_only,
        )

print(rules_by_run[("fpgrowth", "lift")])

# `rules` ends up bound to the same run as before (fpmax / confidence).
rules = rules_by_run[("fpmax", "confidence")]



Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction, zhangs_metric]
Index: []
