In [2]:
import numpy as np

import pandas as pd
import statistics
import math

In [3]:
def func_precision(stringList, answer):
    """Precision of one prediction list against a single true goal.

    Parameters
    ----------
    stringList : list of str
        Predicted goal ids as strings. The denominator is ``len - 1``, i.e.
        the final element is not counted as a prediction (presumably it is
        the empty string left by a trailing "/" separator -- confirm against
        the result files).
    answer : value accepted by ``int()``
        The true goal id.

    Returns
    -------
    float
        ``1 / (len(stringList) - 1)`` if any non-empty entry equals
        ``answer``, otherwise ``0.0``. Also ``0.0`` when the list holds no
        prediction slots at all (length 0 or 1), instead of raising
        ``ZeroDivisionError``.
    """
    goal_count = 0
    found = 0
    for result in stringList:
        # Empty strings are skipped by the truthiness check before int().
        if result and int(result) == int(answer):
            found = 1
        goal_count += 1

    predicted = goal_count - 1
    if predicted <= 0:
        # No predictions were returned: define precision as 0 rather than
        # dividing by zero.
        return 0.0
    return found / predicted

def func_recall(stringList, answer):
    """Return 1 if any non-empty entry of ``stringList`` equals ``answer``, else 0.

    Entries and ``answer`` are compared after conversion with ``int()``;
    empty strings are ignored. Scanning stops at the first match.
    """
    matched = any(
        entry and int(entry) == int(answer)
        for entry in stringList
    )
    return 1 if matched else 0

def func_f1(total, stringList, answer):
    """F1 score of one prediction list against a single true goal.

    Parameters
    ----------
    total : int
        Number of all candidate goals. Kept for signature parity with the
        other metric functions; F1 does not use true negatives.
    stringList : list of str
        Predicted goal ids; the last element is dropped before scoring
        (presumably the empty remainder of a trailing "/" -- confirm).
    answer : value accepted by ``int()``
        The true goal id.

    Returns
    -------
    float
        ``2*tp / (2*tp + fp + fn)``. Every scored entry that is empty or
        differs from ``answer`` counts as a false positive.
    """
    tp = 0
    fp = 0
    for result in stringList[0:-1]:
        if result and int(result) == int(answer):
            tp += 1
        else:
            fp += 1

    # There is exactly one true goal, so it is missed only when tp == 0.
    # Clamping keeps fn from going negative (and F1 from exceeding 1) when
    # the correct goal appears more than once in the predictions.
    fn = max(0, 1 - tp)

    # The denominator is at least 1: either tp >= 1 or fn == 1.
    return 2*tp/(2*tp + fp + fn)

def func_accuracy(total, stringList, answer):
    """Accuracy of one prediction list against a single true goal.

    Parameters
    ----------
    total : int
        Number of all candidate goals.
    stringList : list of str
        Predicted goal ids; the last element is dropped before scoring
        (presumably the empty remainder of a trailing "/" -- confirm).
    answer : value accepted by ``int()``
        The true goal id.

    Returns
    -------
    float
        ``(tp + tn) / (tp + tn + fp + fn)``, or ``0.0`` when that
        denominator is zero (e.g. ``total == 0``) instead of raising
        ``ZeroDivisionError``.
    """
    tp = 0
    fp = 0
    for result in stringList[0:-1]:
        if result and int(result) == int(answer):
            tp += 1
        else:
            # Empty or non-matching entries count as false positives.
            fp += 1

    # One true goal: it is missed only when no prediction hit it. Clamping
    # keeps fn non-negative if the correct goal is listed more than once.
    fn = max(0, 1 - tp)

    # total is the number of all goals
    tn = total - tp - fp - fn

    denominator = tn + tp + fp + fn
    if denominator == 0:
        return 0.0
    return (tp + tn)/denominator


def func_bacc(total, stringList, answer):
    """Balanced accuracy of one prediction list against a single true goal.

    Parameters
    ----------
    total : int
        Number of all candidate goals.
    stringList : list of str
        Predicted goal ids; the last element is dropped before scoring
        (presumably the empty remainder of a trailing "/" -- confirm).
    answer : value accepted by ``int()``
        The true goal id.

    Returns
    -------
    float
        ``(TPR + TNR) / 2``. When there are no negatives to score
        (``tn + fp == 0``, e.g. a single candidate goal) the true-negative
        rate is undefined and the true-positive rate alone is returned
        instead of raising ``ZeroDivisionError``.
    """
    tp = 0
    fp = 0
    for result in stringList[0:-1]:
        if result and int(result) == int(answer):
            tp += 1
        else:
            # Empty or non-matching entries count as false positives.
            fp += 1

    # One true goal: it is missed only when no prediction hit it. Clamping
    # keeps fn non-negative (and TPR <= 1) if the goal is listed repeatedly.
    fn = max(0, 1 - tp)

    # total is the number of all goals
    tn = total - tp - fp - fn

    # tp + fn >= 1 always holds: either tp >= 1 or fn == 1.
    tpr = tp/(tp + fn)

    negatives = tn + fp
    if negatives == 0:
        return tpr

    tnr = tn/negatives
    bacc = (tpr + tnr)/2

    return bacc




# return a list of each statistics for every testing case
def calculate_statistics(rows):
    """Score every row of a results table with the five metric functions.

    For each row the "Results" and "Cost" cells are split on "/"; the number
    of candidate goals is the number of "Cost" pieces minus one (the piece
    after the trailing separator). "Real_Goal" is the true goal.

    Returns a 5-tuple of lists, one value per row, in the order
    (precision, recall, f1, accuracy, balanced accuracy).
    """
    collected = ([], [], [], [], [])

    for _, record in rows.iterrows():
        answer = record["Real_Goal"]
        results = record["Results"].split("/")
        # the piece after the final "/" is not a candidate
        total = len(record["Cost"].split("/")) - 1

        row_scores = (
            func_precision(results, answer),
            func_recall(results, answer),
            func_f1(total, results, answer),
            func_accuracy(total, results, answer),
            func_bacc(total, results, answer),
        )
        for bucket, score in zip(collected, row_scores):
            bucket.append(score)

    precision, recall, f1_score, accuracy, b_accuracy = collected
    return precision, recall, f1_score, accuracy, b_accuracy

In [17]:
sub_ids = list(range(1, 11))

# Result files that can be evaluated here (swap the pattern in read_csv):
# prim_results/f1_def_results_%s.csv
# lda/dynamic_lda_sub_%s.csv
# lstm/lstm_sub_%s.csv
for sid in sub_ids:
    dfr = pd.read_csv("lstm/lstm_sub_%s.csv" % sid)
    p, r, f1, a, bacc = calculate_statistics(dfr)
    metric_list = r

    # Rows cycle through five groups: row i belongs to group i % 5.
    # NOTE(review): the names suggest 10/30/50/70/100% observation levels;
    # confirm against the row order of the CSV files.
    p10, p30, p50, p70, p100 = (metric_list[offset::5] for offset in range(5))

    # One mean per group, printed in group order for this subject.
    for level_scores in (p10, p30, p50, p70, p100):
        print(statistics.mean(level_scores))


0.3333333333333333
0.3888888888888889
0.4888888888888889
0.5555555555555556
0.6888888888888889
0.37777777777777777
0.34444444444444444
0.36666666666666664
0.36666666666666664
0.6777777777777778
0.45555555555555555
0.5
0.5444444444444444
0.7111111111111111
0.7111111111111111
0.4444444444444444
0.4111111111111111
0.4222222222222222
0.4777777777777778
0.5333333333333333
0.35555555555555557
0.4444444444444444
0.3888888888888889
0.4888888888888889
0.6333333333333333
0.3111111111111111
0.35555555555555557
0.34444444444444444
0.37777777777777777
0.6333333333333333
0.32222222222222224
0.43333333333333335
0.32222222222222224
0.43333333333333335
0.5333333333333333
0.35555555555555557
0.3888888888888889
0.4666666666666667
0.6
0.6777777777777778
0.35555555555555557
0.3888888888888889
0.3888888888888889
0.45555555555555555
0.6111111111111112
0.4111111111111111
0.4444444444444444
0.5111111111111111
0.6222222222222222
0.7111111111111111


In [27]:
# overall of each subject: one mean score per subject file
for subject_id in range(1, 11):
    #output_results: lstm/lstm_subr_%s.csv
    results_path = "prim_results/f1_def_results_%s.csv" % subject_id
    # only the first five columns are loaded
    data = pd.read_csv(results_path, usecols=list(range(5)))
    p, r, f1, a, bacc = calculate_statistics(data)

    # metric reported below; rebind to p / r / a / bacc to report another one
    metric = f1
    print(statistics.mean(metric))

0.6314814814814814
0.5014814814814815
0.6548148148148148
0.5418518518518518
0.6885185185185185
0.5581481481481482
0.51
0.6059259259259259
0.47888888888888886
0.7425925925925926


In [24]:
## each level of observation averaged over all subjects
sub_ids = list(range(1, 11))
p10, p30, p50, p70, p100 = [], [], [], [], []
# prim_results/f1_def_results_%s.csv
# lda/last_lda_sub_%s.csv
# lstm/lstm_sub_%s.csv
for sid in sub_ids:
    dfr = pd.read_csv("lda/dynamic_lda_sub_%s.csv" % sid)
    p, r, f1, a, bacc = calculate_statistics(dfr)
    metric_list = f1

    # Row i of a subject's file goes to group i % 5; the groups accumulate
    # across all subjects before averaging.
    for offset, level_scores in enumerate((p10, p30, p50, p70, p100)):
        level_scores.extend(metric_list[offset::5])

for level_scores in (p10, p30, p50, p70, p100):
    print(statistics.mean(level_scores))

0.48444444444444446
0.56
0.6311111111111111
0.6511111111111111
0.6622222222222223


In [5]:
# Load one result file and score every row of it.
# Alternative inputs, kept for reference:
# dft = pd.read_csv("PMGR/sub1/kmeans60_f32_results_1.0.csv")
# dft = pd.read_csv("PMGR_new/sub8/kmeans100_f34_sub8.csv")
dft = pd.read_csv("lstm/lstm_sub_1.csv")
# Each returned list holds one value per row of dft.
p,r,f1,a,bacc = calculate_statistics(dft)

In [19]:
# Number of per-row precision values, i.e. how many rows were scored.
len(p)

450

In [18]:
from scipy import stats

# Toy samples standing in for two clusters
cluster1_data = list(range(1, 6))
cluster2_data = list(range(6, 11))

# Independent two-sample t-test on the two samples
ttest_result = stats.ttest_ind(cluster1_data, cluster2_data)
t_statistic, p_value = ttest_result

# Report whether the p-value falls below the significance level
alpha = 0.05
verdict = (
    "The clusters are significantly different."
    if p_value < alpha
    else "The clusters are not significantly different."
)
print(verdict)

The clusters are significantly different.


In [22]:
# Display the p-value produced by the t-test.
p_value

0.001052825793366539

# Select N features and M clusters

In [15]:
import os
import sys

# "manual", "kmeans"
classifier_option = "kmeans"
# candidate feature counts: 21 .. 40
selected_num_features = list(range(21, 41))
# diff
# candidate cluster counts: 10 .. 200 in steps of 10
clusters_num = list(range(10, 201, 10))

# One entry per subject: best feature count, best cluster count, best score.
selected_nf = []
selected_nc = []
max_metric_list = []

for sid in range(1, 11):
    results_dir = "PMGR_new/sub%s"%sid
    cwd = os.getcwd()

    # Best combination seen so far for this subject (0 means "none found").
    max_nf, max_nc, max_metric = 0, 0, 0

    for num_features in selected_num_features:
        for num_clusters in clusters_num:
            csv_file = "%s%s_f%s_sub%s.csv" % (classifier_option, str(num_clusters), str(num_features), str(sid) )
            abs_path_csv_file = os.path.join(cwd, results_dir, csv_file)

            # Missing result files are reported and skipped.
            if not os.path.exists(abs_path_csv_file):
                print(csv_file + "    Not Exist")
                continue

            data = pd.read_csv(abs_path_csv_file)
            p, r, f1, a, bacc = calculate_statistics(data)
            metric = statistics.mean(f1)  # selection criterion: mean F1
            # Strict comparison: on ties the earliest combination is kept.
            if metric > max_metric:
                max_nf, max_nc, max_metric = num_features, num_clusters, metric

    selected_nf.append(max_nf)
    selected_nc.append(max_nc)
    max_metric_list.append(max_metric)


# Printed in order: features, clusters, best metric per subject.
for per_subject_summary in (selected_nf, selected_nc, max_metric_list):
    print(per_subject_summary)

[29, 32, 34, 34, 32, 28, 22, 34, 23, 28]
[70, 10, 120, 50, 90, 160, 80, 100, 100, 170]
[0.5766666666666667, 0.5011111111111111, 0.6259259259259259, 0.5214814814814814, 0.5840740740740741, 0.5622222222222222, 0.5307407407407407, 0.5844444444444444, 0.5574074074074074, 0.6037037037037037]


In [16]:
# Average of the per-subject best scores collected in max_metric_list.
statistics.mean(max_metric_list)

0.5647777777777777

# Remove the last 10 points from source data

In [44]:
## remove last 10 datapoints

# sort the df to traces for every subject and goals
def extract_traces(dfn):
    """Split a table into one DataFrame per (Subject, Loc, Iteration) trace.

    The rows are scanned in order. Each time the (Subject, Loc, Iteration)
    key differs from the previous row's key, all rows of ``dfn`` carrying
    the new key are collected into a fresh DataFrame with a 0-based index.

    Parameters
    ----------
    dfn : pandas.DataFrame
        Must contain the columns "Subject", "Loc" and "Iteration".

    Returns
    -------
    list of pandas.DataFrame
        Traces in order of first appearance. A key whose rows are not
        contiguous is emitted once per contiguous run, each time with all
        of its rows.
    """
    traces = []

    # None can never equal a real key tuple, so the first row always starts
    # a trace -- including one whose key is exactly (0, 0, 0).
    previous_key = None

    for current_key in zip(dfn["Subject"], dfn["Loc"], dfn["Iteration"]):
        if current_key == previous_key:
            continue
        previous_key = current_key

        subject, loc, iteration = current_key
        belongs_to_trace = (
            (dfn["Subject"] == subject)
            & (dfn["Loc"] == loc)
            & (dfn["Iteration"] == iteration)
        )
        # reset_index returns a new frame; no in-place edit of a filtered slice.
        traces.append(dfn[belongs_to_trace].reset_index(drop=True))

    return traces


input_data_file = "Corrected_UpperLimbReachingData_AddVelocity&FinalStatic.csv"
output_data_file = "corrected_final_input_data.csv"

# Number of trailing samples dropped from every trace.
TAIL_POINTS_TO_DROP = 10

df = pd.read_csv(input_data_file)
df_traces = extract_traces(df)

# Trim every trace first, then concatenate once. Growing a DataFrame with
# pd.concat inside the loop copies all accumulated rows on every iteration.
# Traces with TAIL_POINTS_TO_DROP rows or fewer contribute no rows.
trimmed_traces = [
    df_a_trace.iloc[0:-TAIL_POINTS_TO_DROP, :] for df_a_trace in df_traces
]

if trimmed_traces:
    # ignore_index=True gives the same 0..n-1 index as reset_index(drop=True).
    df_remove_10_last = pd.concat(trimmed_traces, axis=0, ignore_index=True)
else:
    # pd.concat rejects an empty list; keep writing an empty table as before.
    df_remove_10_last = pd.DataFrame([])

df_remove_10_last.to_csv(output_data_file, index=False)