In [15]:
import numpy as np

import pandas as pd
import statistics
import math

from sklearn.cluster import KMeans, AgglomerativeClustering

import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

import os
import shutil
import random

In [16]:
def convert_goal_to_binary(goal, length):
    """Build a one-hot style list for a goal index.

    Args:
        goal: index of the position that should be marked with 1.
        length: number of entries in the returned list.

    Returns:
        A list of `length` integers: 1 at position `goal`, 0 elsewhere.
        If `goal` is not in ``range(length)`` every entry is 0.
    """
    return [1 if position == goal else 0 for position in range(length)]



def bacc4binary(pred, answer):
    """Balanced accuracy of binary predictions.

    Balanced accuracy is the mean of the true-positive rate (recall on the
    samples whose answer is 1) and the true-negative rate (recall on the
    samples whose answer is 0).

    When `answer` contains only one of the two classes, the rate of the
    missing class is undefined (its denominator is zero). In that case the
    score is the recall of the class that is present, i.e. the average is
    taken over the classes that actually occur, instead of failing with a
    ZeroDivisionError.

    Args:
        pred: sequence of predicted labels (0 or 1).
        answer: sequence of true labels (0 or 1); indexed by the positions
            of `pred`.

    Returns:
        The balanced accuracy as a float in [0, 1].

    Raises:
        ValueError: if no (prediction, answer) pair made of 0/1 labels is
            found (e.g. empty input), so that no rate can be computed.
    """
    tp = tn = fp = fn = 0

    # Pairs containing anything other than 0/1 are ignored.
    for i, predicted in enumerate(pred):
        actual = answer[i]
        if predicted == 1 and actual == 1:
            tp += 1
        elif predicted == 0 and actual == 0:
            tn += 1
        elif predicted == 1 and actual == 0:
            fp += 1
        elif predicted == 0 and actual == 1:
            fn += 1

    # Recall per class, only for classes that occur in `answer`.
    per_class_recall = []
    if tp + fn > 0:
        per_class_recall.append(tp / (tp + fn))
    if tn + fp > 0:
        per_class_recall.append(tn / (tn + fp))

    if not per_class_recall:
        raise ValueError("bacc4binary: no 0/1 labels to score; balanced accuracy is undefined")

    return sum(per_class_recall) / len(per_class_recall)

def metrics4binary(pred, answer, metric):
    """Precision, recall or F1 score of binary predictions.

    Args:
        pred: sequence of predicted labels (0 or 1).
        answer: sequence of true labels (0 or 1); indexed by the positions
            of `pred`.
        metric: "p" for precision, "r" for recall, "f1" for the F1 score.

    Returns:
        The requested metric. When the metric's denominator is zero (for
        example precision when nothing was predicted positive) 0.0 is
        returned instead of raising ZeroDivisionError. An unrecognised
        `metric` string yields 0, as before.
    """
    tp = fp = fn = 0

    # True negatives are not needed by any of the three metrics.
    for i, predicted in enumerate(pred):
        actual = answer[i]
        if predicted == 1 and actual == 1:
            tp += 1
        elif predicted == 1 and actual == 0:
            fp += 1
        elif predicted == 0 and actual == 1:
            fn += 1

    if metric == "p":
        numerator, denominator = tp, tp + fp
    elif metric == "r":
        numerator, denominator = tp, tp + fn
    elif metric == "f1":
        numerator, denominator = 2 * tp, 2 * tp + fp + fn
    else:
        # Unknown metric name: keep the historical "return 0" behaviour.
        return 0

    if denominator == 0:
        return 0.0
    return numerator / denominator


def select_features(df, labels, features, n_features):
    """Pick one representative feature out of every feature cluster.

    Args:
        df: DataFrame holding a column for every name in `features`.
        labels: cluster index of each feature; ``labels[i]`` is the cluster
            of ``features[i]`` and must lie in ``range(n_features)``.
        features: list of feature (column) names.
        n_features: number of clusters, and therefore of returned features.

    Returns:
        A list with one feature name per cluster, in cluster-index order.
        Within a cluster the chosen feature is the one whose summed absolute
        correlation with the cluster's members is largest.
    """
    # Bucket the feature names by their cluster index.
    clusters = [[] for _ in range(n_features)]
    for position, feature_name in enumerate(features):
        clusters[labels[position]].append(feature_name)

    def _most_central(members):
        # Row sums of the absolute correlation matrix restricted to the cluster.
        abs_corr = df[members].corr().abs()
        return members[abs_corr.sum(axis=1).argmax()]

    return [_most_central(members) for members in clusters]

# Split the frame into traces: one trace per (Subject, Loc, Iteration) key.
def extract_traces(dfn):
    """Split a frame into one DataFrame per (Subject, Loc, Iteration) key.

    Every distinct key yields exactly one trace containing all rows with
    that key, in their original row order and with a fresh 0..n-1 index.
    Traces are returned in order of first appearance of their key.

    Compared with scanning the rows against a ``(0, 0, 0)`` start value,
    grouping avoids three problems: a first trace whose key really is
    ``(0, 0, 0)`` is no longer skipped, a key whose rows are not contiguous
    no longer produces duplicate traces, and the frame is not re-filtered
    once per trace.

    Args:
        dfn: DataFrame with at least the columns "Subject", "Loc" and
            "Iteration".

    Returns:
        A list of DataFrames (all columns of `dfn` kept). Empty input gives
        an empty list. Rows whose key contains a missing value are left out
        by pandas' default grouping behaviour.
    """
    key_columns = ["Subject", "Loc", "Iteration"]

    # sort=False keeps the groups in the order their keys first appear.
    return [
        group.reset_index(drop=True)
        for _, group in dfn.groupby(key_columns, sort=False)
    ]


def convert2subtraces(traces, goals, subject_id):
    """Group the traces of one subject by goal.

    Args:
        traces: iterable of traces; each trace is indexed as
            ``trace["Subject"][0]`` and ``trace["Loc"][0]`` to read its
            subject and goal.
        goals: goal values, in the order the groups should be returned.
        subject_id: only traces of this subject are kept.

    Returns:
        A list with one entry per goal (same order as `goals`); each entry
        is the list of traces of `subject_id` whose goal equals that goal.
    """
    return [
        [
            trace
            for trace in traces
            if trace["Subject"][0] == subject_id and trace["Loc"][0] == goal
        ]
        for goal in goals
    ]


def reCreateDir(dirName):
    """Ensure `dirName` exists as an empty directory.

    An existing directory tree at that path is deleted first, then the
    directory (and any missing parents) is created again.

    Args:
        dirName: path of the directory to (re)create.
    """
    if os.path.exists(dirName):
        # Wipe the previous contents before starting over.
        shutil.rmtree(dirName)

    os.makedirs(dirName)
    
    
def pick_a_point_by_per(trace, header_len, per=None):
    """Extract feature vectors (and goals) from a trace.

    Args:
        trace: DataFrame of one trace; must contain a "Loc" column.
        header_len: number of leading columns to drop from each row; the
            remaining values form the feature vector.
        per: fraction of the trace that has been observed. ``None`` means
            "return every row of the trace".

    Returns:
        ``(feature_values, goal)``.

        * ``per is None``: two lists with one entry per row, the feature
          vector and the "Loc" value of that row.
        * otherwise: the feature vector and "Loc" value of the single row at
          position ``ceil(len(trace) * per) - 1``. The position is clamped
          to 0 so that a fraction that rounds to zero rows selects the first
          point rather than wrapping around to the last one.

    Raises:
        IndexError: if `per` is given and the trace is empty or `per`
            points beyond the end of the trace.
    """
    if per is None:
        feature_values = []
        goal = []
        for idx in range(len(trace)):
            row = trace.iloc[idx]
            feature_values.append(row.values[header_len:])
            goal.append(row["Loc"])
        return feature_values, goal

    # Without the clamp, per == 0 would give index -1, i.e. the LAST row.
    idx = max(math.ceil(len(trace) * per) - 1, 0)
    row = trace.iloc[idx]
    return row.values[header_len:], row["Loc"]


    
def prepare_data(traces, test_id, percent_list, header_len):
    """Build a leave-one-trace-out split from a list of traces.

    The trace at position `test_id` is held out: one test sample is taken
    from it for every fraction in `percent_list`. Every row of every other
    trace becomes a training sample.

    Args:
        traces: list of traces (DataFrames).
        test_id: position in `traces` of the held-out trace. If it matches
            no position, the test lists stay empty.
        percent_list: fractions of the held-out trace at which a test point
            is taken.
        header_len: number of leading non-feature columns to drop per row.

    Returns:
        ``(training_set_x, training_set_y, test_x, test_y)`` as lists.
    """
    train_x, train_y = [], []
    held_out_x, held_out_y = [], []

    for position, trace in enumerate(traces):
        if position == test_id:
            # Held-out trace: one point per observation fraction.
            for fraction in percent_list:
                point_x, point_y = pick_a_point_by_per(trace, header_len, fraction)
                held_out_x.append(point_x)
                held_out_y.append(point_y)
            continue

        # Training trace: use all of its rows.
        rows_x, rows_y = pick_a_point_by_per(trace, header_len)
        train_x.extend(rows_x)
        train_y.extend(rows_y)

    return train_x, train_y, held_out_x, held_out_y

def random_features(features, select_nums):
    """Draw `select_nums` distinct features at random.

    The draw is made on a copy, so the caller's `features` list keeps its
    order. (Shuffling the argument itself would silently reorder the
    caller's master feature list on every call.) For a given state of the
    `random` module the returned selection is the same as shuffling the
    list and taking its head.

    Args:
        features: sequence of feature names.
        select_nums: number of features to return.

    Returns:
        A new list with the first `select_nums` entries of a shuffled copy
        of `features` (all of them if `select_nums` exceeds the length).
    """
    shuffled = list(features)
    random.shuffle(shuffled)
    return shuffled[0:select_nums]

In [21]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


# parameters
sub_ids = [1,2,3,4,5,6,7,8,9,10]
# number of feature clusters per subject (one feature is kept per cluster),
# looked up as features_num_list[subject_id - 1]
features_num_list = [29, 1, 2, 34, 32, 28, 22, 34, 23, 28]
num_candidates = 3
## number of leading non-feature columns in the input CSV
index_headers = 4
## identifier columns kept in front of the selected features in every trace
id_columns = ["Subject", "Loc", "Iteration"]
## number of leading non-feature columns in a trace. It is derived from
## id_columns: the traces carry 3 identifier columns, so slicing with 4 would
## silently throw away the first selected feature of every sample.
header_len = len(id_columns)
## fractions of the held-out trace at which a prediction is made
obs = [0.1, 0.3, 0.5, 0.7, 1.0]
## number of leave-one-out rounds; round k holds out the k-th trace of every goal
num_test_rounds = 30



#############################################################################
# main script
input_data = "corrected_final_input_data.csv"
df = pd.read_csv(input_data)
all_features = df.columns.values.tolist()[index_headers::]
df_context = df[all_features]

############################## select features ##############################

# The feature/feature correlation matrix does not depend on the subject, so
# it is computed once rather than once per loop iteration.
corr_matrix = df_context.corr().abs()

baccListAllSub = []
wrongProbTupleAllSub = []
for subject_id in sub_ids:
    n_features = features_num_list[subject_id-1]

    # Ward linkage works on Euclidean distances, which is the default. The
    # `affinity` keyword is left out because recent scikit-learn releases
    # no longer accept it.
    hierarchical_cluster = AgglomerativeClustering(n_clusters=n_features, linkage='ward').fit(corr_matrix)
    labels = hierarchical_cluster.labels_

    selected_features = select_features(df, labels, all_features, n_features)
    output_results = "lda/dynamic_lda_sub_%s.csv"%subject_id

    df_a_subject = df.loc[df['Subject'] == subject_id].reset_index(drop=True)
    reduced = df_a_subject[id_columns + selected_features]
    traces = extract_traces(reduced)

    goals = sorted(df_a_subject["Loc"].unique())
    subtraces = convert2subtraces(traces, goals, subject_id)

    # Make sure the output folder exists and no stale result file survives
    # if this run stops before writing.
    os.makedirs(os.path.dirname(output_results), exist_ok=True)
    if os.path.exists(output_results):
        os.remove(output_results)

    # One result frame per leave-one-out round; concatenated once at the end.
    fold_frames = []

    for test_id in range(num_test_rounds):
        training_set_x = []
        training_set_y = []
        test_x = []
        test_y = []
        for goal_traces in subtraces:  # one entry per goal
            X, Y, x, y = prepare_data(goal_traces, test_id, obs, header_len)
            training_set_x += X
            training_set_y += Y

            test_x += x
            test_y += y

        clf = LinearDiscriminantAnalysis()
        clf.fit(training_set_x, training_set_y)
        pred = clf.predict(test_x)
        prob = clf.predict_proba(test_x)

        # Results are stored as "/"-terminated strings.
        pred_str = [str(int(p))+"/" for p in pred]
        real_str = [str(int(ty)) for ty in test_y]
        prob_str = ["".join(str(pv)+"/" for pv in pl) for pl in prob]

        # NOTE(review): 'Cost' carries the same strings as 'Prob' - presumably
        # to match the column layout of other result files; confirm.
        new_rows = {'Real_Goal': real_str, 'Cost': prob_str, 'Prob': prob_str, 'Results': pred_str}
        fold_frames.append(pd.DataFrame(new_rows))

    df_for_output = pd.concat(fold_frames, ignore_index=True)
    df_for_output.to_csv(output_results, index=False)
        



# Train on the last 10 points of each trace, test on the points before them

In [12]:
def prepare_data_split10(traces, percent_list, header_len, n_train_points=10):
    """Split every trace into a training tail and a test head.

    For each trace the last `n_train_points` rows are used for training
    (every row becomes a sample). The remaining leading rows form the test
    part, from which one sample is taken per fraction in `percent_list`.

    Args:
        traces: list of traces (DataFrames).
        percent_list: fractions of the test part at which a test point is
            taken.
        header_len: number of leading non-feature columns to drop per row.
        n_train_points: number of trailing rows of each trace used for
            training. Defaults to 10, the value that used to be hard-coded.

    Returns:
        ``(training_set_x, training_set_y, test_x, test_y)`` as lists.

    Raises:
        ValueError: if `n_train_points` is not positive (a value of 0 would
            put the whole trace in the training part and leave nothing to
            test on).
    """
    if n_train_points <= 0:
        raise ValueError("n_train_points must be a positive integer")

    training_set_x = []
    training_set_y = []
    test_x = []
    test_y = []

    for trace in traces:
        # Training samples: all rows of the trace's tail.
        x, y = pick_a_point_by_per(trace[-n_train_points:], header_len)
        training_set_x += x
        training_set_y += y

        # Test samples: one point per fraction of the rows before the tail.
        test_part = trace[:-n_train_points]
        for percent in percent_list:
            x, y = pick_a_point_by_per(test_part, header_len, percent)
            test_x.append(x)
            test_y.append(y)

    return training_set_x, training_set_y, test_x, test_y

In [23]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


# parameters
sub_ids = [1,2,3,4,5,6,7,8,9,10]
# number of features drawn per subject, looked up as features_num_list[subject_id - 1]
features_num_list = [29, 1, 2, 34, 32, 28, 22, 34, 23, 28]
num_candidates = 3
## number of leading non-feature columns in the input CSV
index_headers = 4
## identifier columns kept in front of the selected features in every trace
id_columns = ["Subject", "Loc", "Iteration"]
## number of leading non-feature columns in a trace. It is derived from
## id_columns: the traces carry 3 identifier columns, so slicing with 4 would
## silently throw away the first selected feature of every sample.
header_len = len(id_columns)
## fractions of the test part of each trace at which a prediction is made
obs = [0.1, 0.3, 0.5, 0.7, 1.0]
## seed for the random feature draw, so a re-run selects the same features
random_seed = 42



#############################################################################
# main script
input_data = "Corrected_UpperLimbReachingData_AddVelocity&FinalStatic.csv"
df = pd.read_csv(input_data)
all_features = df.columns.values.tolist()[index_headers::]
df_context = df[all_features]

############################## select features ##############################

random.seed(random_seed)

baccListAllSub = []
wrongProbTupleAllSub = []
for subject_id in sub_ids:
    n_features = features_num_list[subject_id-1]

    # Random-feature baseline: features are drawn at random, so no
    # correlation clustering is computed in this cell. A copy is passed so
    # that the master feature list keeps its column order whatever the
    # helper does to its argument.
    selected_features = random_features(list(all_features), n_features)
    output_results = "lda/static_lda_subr_%s.csv"%subject_id

    df_a_subject = df.loc[df['Subject'] == subject_id].reset_index(drop=True)
    reduced = df_a_subject[id_columns + selected_features]
    traces = extract_traces(reduced)

    goals = sorted(df_a_subject["Loc"].unique())
    subtraces = convert2subtraces(traces, goals, subject_id)

    # Make sure the output folder exists and no stale result file survives
    # if this run stops before writing.
    os.makedirs(os.path.dirname(output_results), exist_ok=True)
    if os.path.exists(output_results):
        os.remove(output_results)

    # train the model with the last 10 points of every trace
    training_set_x = []
    training_set_y = []
    test_x = []
    test_y = []
    for goal_traces in subtraces:  # one entry per goal
        X, Y, x, y = prepare_data_split10(goal_traces, obs, header_len)
        training_set_x += X
        training_set_y += Y

        test_x += x
        test_y += y

    clf = LinearDiscriminantAnalysis()
    clf.fit(training_set_x, training_set_y)
    pred = clf.predict(test_x)
    prob = clf.predict_proba(test_x)

    # Results are stored as "/"-terminated strings.
    pred_str = [str(int(p))+"/" for p in pred]
    real_str = [str(int(ty)) for ty in test_y]
    prob_str = ["".join(str(pv)+"/" for pv in pl) for pl in prob]

    # NOTE(review): 'Cost' carries the same strings as 'Prob' - presumably to
    # match the column layout of other result files; confirm.
    new_rows = {'Real_Goal': real_str, 'Cost': prob_str, 'Prob': prob_str, 'Results': pred_str}
    df_for_output = pd.DataFrame(new_rows)

    df_for_output.to_csv(output_results, index=False)

