In [2]:
import pandas as pd
import numpy as np
import itertools
import random
import ast
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import spearmanr, wilcoxon, friedmanchisquare, kendalltau, pearsonr
from matplotlib import animation
from mpl_toolkits import mplot3d
from scipy.spatial import distance
from pyxdameraulevenshtein import damerau_levenshtein_distance

pd.set_option('display.max_rows', 500)

### Prepare Data

In [3]:
# Load the experiment table; the first CSV row holds the column names.

df = pd.read_csv(
    'automatica_task_environments_no_outliers.csv',
    header=0,
    skip_blank_lines=True,
)

In [5]:
# Show the loaded frame for a visual sanity check of the parsed columns.
df

Unnamed: 0,ID,sequence_original,sequence_no_duplicates,sequence_clustered_silverware,sequence_clustered_drawers,coordinates_original,coordinates_clustered_silverware,coordinates_clustered_drawers,start_coordinates,strong_k,mid_k,food_k,containment,error,length,error_clustered_silverware,length_clustered_silverware,error_clustered_drawers,length_clustered_drawers
0,a1,pocgkr,pocgkr,pocgz,pocgba,"p: (0.008034,0.957082,0.6890539999999999);o: (...","p: (0.008034,0.957082,0.6890539999999999);o: (...","p: (0.008034,0.957082,0.6890539999999999);o: (...","[-0.451354,-0.413918,0.156247],[0.513,-0.531,0...",t,"p,o",0,0,0.723,6,0.666,5,0.723,5
1,a5,kfsfksppwwggg,kfsfkspwg,zpwg,bpwg,"k: (-0.07699199999999999,0.733425,0.531662);f:...","k: (-0.07699199999999999,0.733425,0.531662);f:...","k: (-0.07699199999999999,0.733425,0.531662);f:...","[-0.451354,-0.413918,0.156247],[0.513,-0.531,0...",t,"p,o",0,0,0.806,9,0.605,4,0.605,4
2,a11,ppfkswwkfsococggg,pfkswkfsococg,pzwzocndg,pbwbocndg,"p: (0.008034,0.957082,0.6890539999999999);p: (...","p: (0.008034,0.957082,0.6890539999999999);p1: ...","p: (0.008034,0.957082,0.6890539999999999);p1: ...","[-0.451354,-0.413918,0.156247],[0.513,-0.531,0...",t,"p,o",0,0,0.862,13,0.806,9,0.806,9
3,a13,wptgkfsoc,wptgkfsoc,wptgzoc,wptgboc,"w: (-0.196022,1.35572,0.466005);p: (0.008034,0...","w: (-0.196022,1.35572,0.466005);p: (0.008034,0...","w: (-0.196022,1.35572,0.466005);p: (0.008034,0...","[-0.451354,-0.413918,0.156247],[0.513,-0.531,0...",t,"p,o",0,0,0.806,9,0.756,7,0.756,7
4,a19,ppoowwcceerr,powcer,powcz,powca,"p: (0.008034,0.957082,0.6890539999999999);p: (...","p: (0.008034,0.957082,0.6890539999999999);p1: ...","p: (0.008034,0.957082,0.6890539999999999);p1: ...","[-0.451354,-0.413918,0.156247],[0.513,-0.531,0...",t,"p,o",0,0,0.723,6,0.666,5,0.666,5
5,a26,wwoppoffkkssskf,wopofkkskf,wopnz,wopnb,"w: (-0.196022,1.35572,0.466005);w: (-0.0570049...","w: (-0.196022,1.35572,0.466005);w1: (-0.057004...","w: (-0.196022,1.35572,0.466005);w1: (-0.057004...","[-0.451354,-0.413918,0.156247],[0.513,-0.531,0...",t,"p,o",0,0,0.824,10,0.666,5,0.666,5
6,a31,ppooccrrfksfks,pocrfksfks,pocz,pocab,"p: (0.008034,0.957082,0.6890539999999999);p: (...","p: (0.008034,0.957082,0.6890539999999999);p1: ...","p: (0.008034,0.957082,0.6890539999999999);p1: ...","[-0.451354,-0.413918,0.156247],[0.513,-0.531,0...",t,"p,o",0,0,0.824,10,0.605,4,0.666,4
7,a37,ppwwooccggfkkfssrere,pwocgfkfsrere,pwocgz,pwocgba,"p: (0.008034,0.957082,0.6890539999999999);p: (...","p: (0.008034,0.957082,0.6890539999999999);p1: ...","p: (0.008034,0.957082,0.6890539999999999);p1: ...","[-0.451354,-0.413918,0.156247],[0.513,-0.531,0...",t,"p,o",0,0,0.862,13,0.723,6,0.756,6
8,a40,wpgsfk,wpgsfk,wpgz,wpgb,"w: (-0.196022,1.35572,0.466005);p: (0.008034,0...","w: (-0.196022,1.35572,0.466005);p: (0.008034,0...","w: (-0.196022,1.35572,0.466005);p: (0.008034,0...","[-0.451354,-0.413918,0.156247],[0.513,-0.531,0...",t,"p,o",0,0,0.723,6,0.605,4,0.605,4
9,a43,pwofsg,pwofsg,pwozg,pwobg,"p: (0.008034,0.957082,0.6890539999999999);w: (...","p: (0.008034,0.957082,0.6890539999999999);w: (...","p: (0.008034,0.957082,0.6890539999999999);w: (...","[-0.451354,-0.413918,0.156247],[0.513,-0.531,0...",t,"p,o",0,0,0.723,6,0.666,5,0.666,5


### Define functions for prediction

In [6]:
def predict_sequence(objects, coordinates, start_coordinates, c, k, dimension=[3,]):
    ''' Predicts a sequence by greedily picking the cheapest remaining object at every step.

        The cost of an object at a step is ``distance ** k[obj] * c[obj]``, where ``distance``
        is the Euclidean distance between that step's start position and the object, measured
        in the selected sub-space. If several objects share the lowest cost, one of them is
        chosen with ``random.choice``.

        Input:
            objects: iterable of object keys (duplicates collapse to a single entry)
            coordinates: dict mapping object key -> (x, y, z)
            start_coordinates: sequence of (x, y, z) start positions, indexed by step; it
                must hold at least as many entries as there are distinct objects
            c: dict mapping object key -> cost multiplier
            k: dict mapping object key -> distance exponent
            dimension: [3] / [3, 'xyz'] to use the coordinates unchanged,
                [2, 'xy' | 'xz' | 'yz'] or [1, 'x' | 'y' | 'z'] to project them first
        Output: list of object keys in predicted order
        Raises: ValueError if `dimension` is none of the combinations above
    '''
    axis_index = {'x': 0, 'y': 1, 'z': 2}
    valid_axes = {1: ('x', 'y', 'z'), 2: ('xy', 'xz', 'yz')}

    if dimension[0] == 3: # 3D: coordinates are used exactly as given
        new_coords = coordinates
        new_start_coords = start_coordinates
    elif (dimension[0] in valid_axes and len(dimension) > 1
          and dimension[1] in valid_axes[dimension[0]]):
        # 1D/2D: keep only the named axes. Projections are always tuples (also in 1D) so that
        # distance.euclidean receives 1-D vectors rather than bare scalars.
        axes = [axis_index[axis] for axis in dimension[1]]
        new_coords = {key: tuple(value[i] for i in axes) for key, value in coordinates.items()}
        new_start_coords = [tuple(position[i] for i in axes) for position in start_coordinates]
    else:
        raise ValueError('unsupported dimension: %r' % (dimension,))

    prediction = []
    possible_items = dict.fromkeys(objects, 0) # remaining objects -> cost at the current step
    coord_index = 0

    while possible_items: # until every object has been placed
        origin = new_start_coords[coord_index]
        for obj in possible_items:
            possible_items[obj] = (distance.euclidean(origin, new_coords[obj]) ** k[obj]) * c[obj]

        lowest_cost = min(possible_items.values())
        candidates = [obj for obj, cost in possible_items.items() if cost == lowest_cost]
        chosen = random.choice(candidates) # choose prediction randomly if multiple items have same cost
        prediction.append(chosen)
        del possible_items[chosen]
        coord_index += 1 # the next pick is measured from the next start position

    return prediction

In [7]:
def get_median(objects, coordinates, start_coordinates, c, k, dimension, sequence,n=10):
    ''' Returns the median Damerau-Levenshtein edit distance between `sequence` and
        `n` independently predicted sequences (each prediction joined into one string).
    '''
    distances = [
        damerau_levenshtein_distance(
            sequence,
            ''.join(predict_sequence(objects, coordinates, start_coordinates, c, k, dimension)),
        )
        for _ in range(n)
    ]
    return np.median(distances)

In [8]:
def get_avg_editdist(data, dimensions=[[1,'x'],[1,'y'],[1,'z'],[2,'xy'],[2,'xz'],[2,'yz'],[3,'xyz']],n=10,
                    seq='sequence_original', coords='coordinates_original', error='error'):
    ''' Calculates the normalised median edit distance for all combinations of k and dimension.

        For every row of `data` and every (k, dimension) combination, `get_median` is called
        with `n` trials and its result is divided by the length of the observed sequence.
        c is fixed at 1.2 for every object. Objects listed in the row's 'strong_k' get exponent
        k, objects listed in 'mid_k' get k + 0.1, all other objects get 1.0; k runs over
        np.arange(0, 0.9, 0.1).

        Input:
            data: Dataframe with the columns named by `seq`, `coords` and `error`, plus
                  'strong_k', 'mid_k', 'start_coordinates' and 'ID'
            dimensions: list of [number of axes, axis names] pairs passed to the prediction
            n: number of prediction trials per combination
            seq / coords / error: names of the sequence, coordinate and baseline-error columns
        Output: Dataframe with one row per input row (index 0..len(data)-1), one column per
                combination named 'c: 1.2; k: <strong>,<mid>; <dim>', followed by
                'sequence', 'error' and 'ID'
    '''
    # Rows are collected as dicts and turned into a frame once at the end, instead of
    # enlarging a DataFrame one cell at a time.
    records = []

    for row in range(len(data)):
        entry = data.iloc[row] # positional access, so a filtered (non 0..n-1) index also works
        sequence = str(entry[seq])
        objects = list(entry[seq])
        strong_k = entry['strong_k'].split(',')
        mid_k = entry['mid_k'].split(',')
        # coordinate cells look like 'p: (x, y, z);o: (x, y, z);...'
        coordinates = {key: ast.literal_eval(value)
                       for key, value in (elem.split(': ') for elem in entry[coords].split(';'))}
        start_coordinates = list(ast.literal_eval(entry['start_coordinates']))
        c1 = {obj: 1.2 for obj in objects}

        record = {}
        for k in np.arange(0,0.9,0.1):
            k_strong = round(k,2)
            k_mid = round(k + 0.1,2)
            k1 = {obj: k_strong if obj in strong_k else k_mid if obj in mid_k else 1.0
                  for obj in objects}

            for dim in dimensions:
                # median edit distance over n trials, normalised by sequence length
                median = get_median(objects, coordinates, start_coordinates, c1, k1, dim, sequence, n)
                median = median / len(sequence)
                params = 'c: 1.2' + '; k: ' + str(k_strong) + ',' + str(k_mid) + '; ' + str(dim[1])
                record[params] = median

        record['sequence'] = sequence
        record['error'] = entry[error]
        record['ID'] = entry['ID']
        records.append(record)

    return pd.DataFrame(records)

### Calculate edit distances, create df for original sequences

In [9]:
# ~3 min for n=100
%time results_original = get_avg_editdist(df,n=100)

CPU times: user 2min 44s, sys: 764 µs, total: 2min 44s
Wall time: 2min 44s


In [15]:
# One row per sequence, one column per (k, dimension) combination, plus sequence/error/ID.
results_original

Unnamed: 0,"c: 1.2; k: 0.0,0.1; x","c: 1.2; k: 0.0,0.1; y","c: 1.2; k: 0.0,0.1; z","c: 1.2; k: 0.0,0.1; xy","c: 1.2; k: 0.0,0.1; xz","c: 1.2; k: 0.0,0.1; yz","c: 1.2; k: 0.0,0.1; xyz","c: 1.2; k: 0.1,0.2; x","c: 1.2; k: 0.1,0.2; y","c: 1.2; k: 0.1,0.2; z",...,"c: 1.2; k: 0.8,0.9; x","c: 1.2; k: 0.8,0.9; y","c: 1.2; k: 0.8,0.9; z","c: 1.2; k: 0.8,0.9; xy","c: 1.2; k: 0.8,0.9; xz","c: 1.2; k: 0.8,0.9; yz","c: 1.2; k: 0.8,0.9; xyz",sequence,error,ID
0,0.666667,0.833333,0.833333,0.833333,0.833333,0.833333,0.833333,0.666667,0.833333,0.833333,...,0.666667,0.833333,0.833333,0.666667,0.833333,0.833333,0.666667,pocgkr,0.723,a1
1,0.615385,0.769231,0.692308,0.692308,0.615385,0.769231,0.692308,0.615385,0.769231,0.692308,...,0.615385,0.615385,0.692308,0.538462,0.615385,0.615385,0.538462,kfsfksppwwggg,0.806,a5
2,0.647059,0.647059,0.705882,0.647059,0.705882,0.647059,0.647059,0.647059,0.647059,0.705882,...,0.647059,0.647059,0.647059,0.647059,0.764706,0.647059,0.647059,ppfkswwkfsococggg,0.862,a11
3,0.888889,0.777778,0.888889,0.666667,0.888889,0.777778,0.666667,0.888889,0.666667,0.888889,...,0.777778,0.888889,0.666667,1.0,0.777778,0.888889,1.0,wptgkfsoc,0.806,a13
4,0.833333,0.833333,0.833333,0.75,0.833333,0.833333,0.75,0.833333,0.833333,0.833333,...,0.75,0.833333,0.833333,0.666667,0.75,0.833333,0.666667,ppoowwcceerr,0.723,a19
5,0.866667,0.733333,0.733333,0.733333,0.866667,0.733333,0.733333,0.866667,0.733333,0.733333,...,0.8,0.866667,0.666667,0.866667,0.733333,0.866667,0.866667,wwoppoffkkssskf,0.824,a26
6,0.857143,0.714286,0.785714,0.785714,0.857143,0.785714,0.785714,0.857143,0.714286,0.785714,...,0.785714,0.785714,0.785714,0.857143,0.785714,0.857143,0.857143,ppooccrrfksfks,0.824,a31
7,0.8,0.85,0.8,0.8,0.75,0.85,0.8,0.8,0.85,0.8,...,0.75,0.85,0.8,0.85,0.75,0.9,0.85,ppwwooccggfkkfssrere,0.862,a37
8,0.833333,0.833333,0.666667,0.833333,0.833333,0.833333,0.833333,0.833333,0.833333,0.666667,...,0.833333,1.0,0.666667,1.0,0.833333,1.0,1.0,wpgsfk,0.723,a40
9,1.0,0.666667,0.666667,0.666667,1.0,0.666667,0.666667,1.0,0.666667,0.666667,...,0.833333,0.833333,0.5,0.833333,0.833333,0.833333,0.833333,pwofsg,0.723,a43


### Calculate edit distances, create df for clustered sequences (silverware)

In [12]:
# Ties between equally cheap objects are broken with random.choice, so the RNG is seeded
# here to make this table identical on every run of the cell.
random.seed(42)
results_silverware = get_avg_editdist(df,n=100, seq='sequence_clustered_silverware',
                                    coords='coordinates_clustered_silverware', 
                                    error='error_clustered_silverware')

In [16]:
# Same layout as the original-sequence table, computed on the silverware-clustered columns.
results_silverware

Unnamed: 0,"c: 1.2; k: 0.0,0.1; x","c: 1.2; k: 0.0,0.1; y","c: 1.2; k: 0.0,0.1; z","c: 1.2; k: 0.0,0.1; xy","c: 1.2; k: 0.0,0.1; xz","c: 1.2; k: 0.0,0.1; yz","c: 1.2; k: 0.0,0.1; xyz","c: 1.2; k: 0.1,0.2; x","c: 1.2; k: 0.1,0.2; y","c: 1.2; k: 0.1,0.2; z",...,"c: 1.2; k: 0.8,0.9; x","c: 1.2; k: 0.8,0.9; y","c: 1.2; k: 0.8,0.9; z","c: 1.2; k: 0.8,0.9; xy","c: 1.2; k: 0.8,0.9; xz","c: 1.2; k: 0.8,0.9; yz","c: 1.2; k: 0.8,0.9; xyz",sequence,error,ID
0,0.8,0.6,1.0,0.6,0.8,0.6,0.6,0.8,0.6,1.0,...,0.6,0.6,0.8,0.4,0.6,0.6,0.4,pocgz,0.666,a1
1,0.25,0.5,1.0,0.25,1.0,0.5,0.25,0.25,0.5,1.0,...,0.0,0.25,0.5,0.0,0.5,0.25,0.0,zpwg,0.605,a5
2,0.777778,0.777778,0.777778,0.666667,0.777778,0.777778,0.666667,0.777778,0.777778,0.777778,...,0.777778,0.666667,0.777778,0.555556,0.777778,0.666667,0.555556,pzwzocndg,0.806,a11
3,0.857143,1.0,0.857143,0.857143,0.857143,1.0,0.857143,0.857143,0.857143,0.857143,...,0.714286,0.857143,0.571429,0.857143,0.714286,0.857143,0.857143,wptgzoc,0.756,a13
4,0.8,0.6,0.8,0.6,1.0,0.6,0.6,0.8,0.6,0.8,...,0.6,0.6,0.8,0.6,0.6,0.6,0.6,powcz,0.666,a19
5,1.0,0.6,0.6,0.6,1.0,0.6,0.6,1.0,0.6,0.6,...,0.8,0.4,0.6,0.6,0.8,0.4,0.6,wopnz,0.666,a26
6,1.0,0.5,1.0,0.5,1.0,0.5,0.5,1.0,0.5,1.0,...,0.5,0.75,1.0,0.5,0.5,0.75,0.5,pocz,0.605,a31
7,0.833333,0.666667,1.0,0.833333,0.833333,0.666667,0.833333,0.833333,0.666667,1.0,...,0.5,0.666667,1.0,0.666667,0.5,0.666667,0.666667,pwocgz,0.723,a37
8,0.5,0.75,0.5,1.0,1.0,0.75,1.0,0.5,0.75,0.5,...,0.75,0.5,0.25,0.75,0.5,0.5,0.75,wpgz,0.605,a40
9,0.6,0.8,0.8,0.8,0.8,0.8,0.8,0.6,0.8,0.8,...,0.4,0.8,0.6,0.6,0.4,0.8,0.6,pwozg,0.666,a43


### Calculate edit distances, create df for clustered sequences (drawers)

In [17]:
# Ties between equally cheap objects are broken with random.choice, so the RNG is seeded
# here to make this table identical on every run of the cell.
random.seed(42)
results_drawers = get_avg_editdist(df, n = 100, seq = 'sequence_clustered_drawers',
                                    coords = 'coordinates_clustered_drawers',
                                    error = 'error_clustered_drawers')

In [18]:
# Same layout as the original-sequence table, computed on the drawer-clustered columns.
results_drawers

Unnamed: 0,"c: 1.2; k: 0.0,0.1; x","c: 1.2; k: 0.0,0.1; y","c: 1.2; k: 0.0,0.1; z","c: 1.2; k: 0.0,0.1; xy","c: 1.2; k: 0.0,0.1; xz","c: 1.2; k: 0.0,0.1; yz","c: 1.2; k: 0.0,0.1; xyz","c: 1.2; k: 0.1,0.2; x","c: 1.2; k: 0.1,0.2; y","c: 1.2; k: 0.1,0.2; z",...,"c: 1.2; k: 0.8,0.9; x","c: 1.2; k: 0.8,0.9; y","c: 1.2; k: 0.8,0.9; z","c: 1.2; k: 0.8,0.9; xy","c: 1.2; k: 0.8,0.9; xz","c: 1.2; k: 0.8,0.9; yz","c: 1.2; k: 0.8,0.9; xyz",sequence,error,ID
0,0.666667,0.833333,0.833333,0.833333,0.666667,0.833333,0.833333,0.666667,0.833333,0.833333,...,0.666667,0.833333,0.833333,0.666667,0.666667,0.833333,0.666667,pocgba,0.723,a1
1,0.75,0.5,1.0,0.25,0.75,0.5,0.25,0.75,0.5,1.0,...,0.5,0.25,0.5,0.0,0.5,0.25,0.0,bpwg,0.605,a5
2,0.777778,0.777778,0.777778,0.666667,0.777778,0.777778,0.666667,0.777778,0.777778,0.777778,...,0.777778,0.666667,0.777778,0.555556,0.777778,0.666667,0.555556,pbwbocndg,0.806,a11
3,0.571429,1.0,0.857143,0.857143,0.571429,1.0,0.857143,0.571429,0.857143,0.857143,...,0.857143,0.857143,0.571429,0.857143,0.714286,0.857143,0.857143,wptgboc,0.756,a13
4,0.8,0.6,0.8,0.6,0.8,0.6,0.6,0.8,0.6,0.8,...,0.8,0.6,0.8,0.6,0.8,0.6,0.6,powca,0.666,a19
5,1.0,0.6,0.6,0.4,0.4,0.6,0.4,1.0,0.6,0.6,...,0.8,0.4,0.6,0.6,0.4,0.4,0.6,wopnb,0.666,a26
6,0.8,0.6,1.0,0.6,0.8,0.6,0.6,0.8,0.6,1.0,...,0.8,1.0,0.8,0.8,0.8,1.0,0.8,pocab,0.666,a31
7,0.714286,0.714286,1.0,0.857143,0.714286,0.714286,0.857143,0.714286,0.714286,1.0,...,0.571429,0.714286,0.857143,0.857143,0.571429,0.714286,0.857143,pwocgba,0.756,a37
8,0.5,0.75,0.5,1.0,0.5,0.75,1.0,0.5,0.75,0.5,...,0.5,0.5,0.25,0.75,0.5,0.5,0.75,wpgb,0.605,a40
9,0.6,0.8,0.8,0.8,0.6,0.8,0.8,0.6,0.8,0.8,...,0.4,0.8,0.6,0.6,0.4,0.8,0.6,pwobg,0.666,a43


### Get lowest error, compare edit distances for all variants

In [19]:
def get_lowest_error(results):
    ''' Adds a 'mean' row to `results` and finds the parameter column(s) with the lowest mean.

        Every column except 'sequence', 'error' and 'ID' is treated as a parameter column.
        NOTE: `results` is modified in place (the 'mean' row is appended as the last row, or
        refreshed if it already exists) and the very same object is returned; the metadata
        columns hold NaN in that row.

        Input: Dataframe as produced by get_avg_editdist
        Output: tuple of
            - lowest mean over all parameter columns (NaN means are ignored)
            - Index with the name(s) of the column(s) reaching that mean
            - the complete 'mean' row as a list (NaN for the metadata columns)
            - `results` itself
    '''
    meta_cols = ('sequence', 'error', 'ID')
    param_cols = [col for col in results.columns if col not in meta_cols]

    # Average only the observed rows, so calling the function again does not fold an
    # already existing 'mean' row into the new mean.
    observed = results.drop(index='mean', errors='ignore')
    for col in param_cols:
        results.loc['mean', col] = observed[col].mean()

    mean_row = results.loc['mean']
    # Restrict the minimum to the parameter columns: the built-in min() over the whole row
    # returns NaN whenever a (NaN) metadata entry happens to come first.
    lowest = pd.to_numeric(mean_row[param_cols]).min()
    is_lowest = (mean_row == lowest).to_numpy(dtype=bool)

    return lowest, results.columns[is_lowest], list(mean_row), results

In [28]:
# original sequences
# get_lowest_error appends a 'mean' row to results_original in place and returns that same
# frame, so results_mean and results_original refer to one object from here on; the plotting
# cells below drop the extra row again with [:-1].

lowest, lowest_idx, list_mean, results_mean = get_lowest_error(results_original)
# displayed: lowest mean edit distance, the column(s) reaching it, mean of the 'error' column
lowest, lowest_idx, np.mean(results_original['error'])

(0.7254395188887167,
 Index(['c: 1.2; k: 0.7,0.8; z', 'c: 1.2; k: 0.8,0.9; z'], dtype='object'),
 0.7903272727272728)

In [29]:
# silverware clustered
# get_lowest_error appends a 'mean' row to results_silverware in place and returns that same
# frame, so results_mean_silverware and results_silverware refer to one object from here on.

lowest_s, lowest_idx_s, list_mean_s, results_mean_silverware = get_lowest_error(results_silverware)
# displayed: lowest mean edit distance, the column(s) reaching it, mean of the 'error' column
lowest_s, lowest_idx_s, np.mean(results_silverware['error'])

(0.6003823953823954,
 Index(['c: 1.2; k: 0.8,0.9; xy'], dtype='object'),
 0.6809636363636364)

In [31]:
# drawers clustered
# get_lowest_error appends a 'mean' row to results_drawers in place and returns that same
# frame, so results_mean_drawers and results_drawers refer to one object from here on.

lowest_d, lowest_idx_d, list_mean_d, results_mean_drawers = get_lowest_error(results_drawers)
# displayed: lowest mean edit distance, the column(s) reaching it, mean of the 'error' column
lowest_d, lowest_idx_d, np.mean(results_drawers['error'])

(0.6550131181949365,
 Index(['c: 1.2; k: 0.7,0.8; xz', 'c: 1.2; k: 0.8,0.9; xz'], dtype='object'),
 0.7041818181818181)

### Plot best model for all variants

In [101]:
#%matplotlib inline
%matplotlib qt

error_original = df['error']
error_silverware = df['error_clustered_silverware']
error_drawers = df['error_clustered_drawers']

seqs_original = results_mean['sequence'][:-1].values
seqs_silverware = results_mean_silverware['sequence'][:-1].values
seqs_drawers = results_mean_drawers['sequence'][:-1].values

res_original = results_original['c: 1.2; k: 0.7,0.8; z'][:-1].values
res_silverware = results_silverware['c: 1.2; k: 0.8,0.9; xy'][:-1].values
res_drawers = results_drawers['c: 1.2; k: 0.7,0.8; xz'][:-1].values


x_original = [x for x in range (0,len(seqs_original))]
x_silverware = [x for x in range (0,len(seqs_silverware))]
x_drawers = [x for x in range (0,len(seqs_drawers))]

# create subplots
fig, axs = plt.subplots(3,1)

# plot scatter + lines for all simulations
axs[0].scatter(x_original, res_original, marker='o', s=20, c='blue', alpha=0.5, label='model-generated (avg. 0.72)')
#axs[0].plot(x_original, res_original, c='blue', alpha=0.5)
axs[1].scatter(x_silverware, res_silverware, marker='o', s=20, c='green', alpha=0.5, label='model-generated (avg. 0.60)')
#axs[1].plot(x_silverware, res_silverware, c='green', alpha=0.5)
axs[2].scatter(x_drawers, res_drawers, marker='o', s=20, c='magenta', alpha=0.5, label='model-generated (avg. 0.66)')
#axs[2].plot(x_drawers, res_drawers, c='magenta', alpha=0.5)

# error function + connection for original
axs[0].scatter(x_original, error_original, c='red', s=20, marker='o', alpha=0.5, label='baseline (avg. 0.79)')
axs[0].plot(x_original, error_original, c='red', alpha=0.5)
axs[0].plot((x_original,x_original),(error_original,res_original), '--', c='black', alpha=0.6)

# error function + connection for silverware
axs[1].scatter(x_silverware, error_silverware, c='red', s=20, marker='o', alpha=0.5, label='baseline (avg. 0.68)')
axs[1].plot(x_silverware, error_silverware, c='red', alpha=0.5)
axs[1].plot((x_silverware,x_silverware),(error_silverware,res_silverware), '--', c='black', alpha=0.6)

# error function + connection for drawers
axs[2].scatter(x_drawers, error_drawers, c='red', s=20, marker='o', alpha=0.5, label='baseline (avg. 0.70)')
axs[2].plot(x_drawers, error_drawers, c='red', alpha=0.5)
axs[2].plot((x_drawers,x_drawers),(error_drawers,res_drawers), '--', c='black', alpha=0.6)

plt.suptitle('Comparison of simulation variants', fontsize=14)
axs[2].set_xticks(x_original)
axs[2].set_xticklabels(seqs_original, rotation=90, fontsize=6)

axs[0].set_ylabel('item type', fontsize=10)
axs[1].set_ylabel('silverware', fontsize=10)
axs[2].set_ylabel('drawers', fontsize=10)
axs[2].set_xlabel('sequence', fontsize=14)
#plt.legend()

axs[0].legend()
axs[1].legend()
axs[2].legend()

#plt.savefig('plot_median_editdist_individualerrors_diff.png', bbox_inches='tight')
plt.show()

In [137]:
# Compare avg edit distance for xy and xyz
list_xy = []
list_xyz = []

for col in results_mean:
    parts = col.split(';')
    if len(parts) != 3:
        # 'sequence', 'error' and 'ID' are not 'c; k; dim' columns and carry no dimension label
        continue
    dim_label = parts[2].strip()
    if dim_label == 'xy':
        list_xy.append(results_mean.at['mean',col])
    elif dim_label == 'xyz':
        list_xyz.append(results_mean.at['mean',col])
        
avg_xy = np.mean(list_xy)
med_xy = np.median(list_xy)
std_xy = np.std(list_xy)
print('Average xy: ' + str(avg_xy)  + ', stdev: ' + str(std_xy) + ', median: ' + str(med_xy))
        
avg_xyz = np.mean(list_xyz)
med_xyz = np.median(list_xyz)
std_xyz = np.std(list_xyz)
print('Average xyz: ' + str(avg_xyz) + ', stdev: ' + str(std_xyz) + ', median: ' + str(med_xyz))

Average xy: 0.6900369243196247, stdev: 0.011328124693162715, median: 0.6936019676526005
Average xyz: 0.6918003897961703, stdev: 0.011819520043020365, median: 0.6964794347705741


In [138]:
# Compare avg edit distances for x, y, z
list_x = []
list_y = []
list_z = []
list_xz = []
list_yz = []

# dimension label -> list collecting the per-column means for that label
lists_by_dim = {'x': list_x, 'y': list_y, 'z': list_z, 'xz': list_xz, 'yz': list_yz}

for col in results_mean:
    parts = col.split(';')
    if len(parts) != 3:
        # 'sequence', 'error' and 'ID' are not 'c; k; dim' columns and carry no dimension label
        continue
    dim_label = parts[2].strip()
    if dim_label in lists_by_dim:
        lists_by_dim[dim_label].append(results_mean.at['mean',col])

avg_xz = np.mean(list_xz)
avg_yz = np.mean(list_yz)
        
avg_x = np.mean(list_x)
lowest_x = min(list_x)
print('Average x: ' + str(avg_x) + ', min: ' + str(lowest_x)) 
        
avg_y = np.mean(list_y)
lowest_y = min(list_y)
print('Average y: ' + str(avg_y) + ', min: ' + str(lowest_y)) 
        
avg_z = np.mean(list_z)
lowest_z = min(list_z)
print('Average z: ' + str(avg_z) + ', min: ' + str(lowest_z)) 

print('Average xz: ' + str(avg_xz)) 
print('Average yz: ' + str(avg_yz)) 

Average x: 0.7063103641162712, min: 0.6769747130506624
Average y: 0.7121979754891147, min: 0.6737235338501161
Average z: 0.7603876862315682, min: 0.7048700771802038
Average xz: 0.6801632692982904
Average yz: 0.7091010606833392


### Statistical analysis

In [139]:
# Compare all error predictions for dimensions
# Friedman test over the seven dimension variants; each list holds the column means of one
# variant, in column order (one value per k setting).
stat, p = friedmanchisquare(list_x,list_y,list_z,list_xy,list_xz,list_yz,list_xyz)
print('Friedman: stat = %.3f, p = % 10.3E' % (stat, p))

Friedman: stat = 20.784, p =  2.006E-03


In [140]:
# Wilcoxon signed-rank test on the paired column means of the xz and xy variants.
# NOTE(review): this comment used to read "2D vs 3D median", but both lists compared here
# are 2D projections -- confirm whether list_xyz was meant to be one of the two samples.
stat, p = wilcoxon(list_xz, list_xy, zero_method='wilcox')
print('Wilcoxon: W = %.3f, p = %.5f' % (stat, p))

Wilcoxon: W = 12.000, p = 0.25000


### Plots for dataframe

In [141]:
# Define input for plots

# NOTE(review): this cell used to read the column names from `results_new`, which is not
# defined anywhere in the notebook. get_avg_editdist produces the same parameter columns
# for every variant, so they are taken from results_mean here -- confirm this is the
# intended frame.
param_cols = [col for col in results_mean.columns if col not in ('sequence', 'error', 'ID')]

# Column names look like 'c: 1.2; k: 0.7,0.8; xz'; split them instead of slicing at fixed
# character positions.
param_parts = [[part.strip() for part in col.split(';')] for col in param_cols]
k_pairs = [parts[1].split(':')[1].split(',') for parts in param_parts]

c = [float(parts[0].split(':')[1]) for parts in param_parts]
k_strong = [float(pair[0]) for pair in k_pairs]
k_mid = [float(pair[1]) for pair in k_pairs]
dim = [parts[2] for parts in param_parts]
# mean normalised edit distance per parameter column, aligned with the lists above
median = [results_mean.at['mean', col] for col in param_cols]

In [142]:
%matplotlib qt

cm = ['red','blue','green','magenta','cyan','orange','grey']
dim_num = [0 if x=='x' else 1 if x=='y' else 2 if x=='z' else 3 if x=='xy' else 4 if x=='xz' else 5 if x=='yz' else 6 for x in dim]

cmap = matplotlib.colors.ListedColormap(cm)

ticks = ['x', 'y', 'z', 'xy', 'xz', 'yz', 'xyz']
norm = matplotlib.colors.BoundaryNorm(ticks, cmap.N)

# create figure, 3d grid, set background to white
fig2 = plt.figure(figsize=(12,8))
ax2 = fig2.add_subplot(111, projection='3d')
ax2.w_xaxis.set_pane_color((1.0,1.0,1.0,1.0))
ax2.w_yaxis.set_pane_color((1.0,1.0,1.0,1.0))
ax2.w_zaxis.set_pane_color((1.0,1.0,1.0,1.0))

# create plot w/ median edit distance
#img = ax2.scatter(c, k, median, alpha=0.5, s=38, c=dim_num, cmap=cmap)
img = ax2.scatter(k_strong, k_mid, median, alpha=0.5, s=38, c=dim_num, cmap=cmap)

# plot horizontal plane for baseline
#xx, yy = np.meshgrid(np.linspace(1,1.9), np.linspace(0,0.9))
xx, yy = np.meshgrid(np.linspace(0,0.9), np.linspace(0,0.9))
#yy = np.meshgrid(range(2), range(2))
zz = xx * 0 + 0.706
ax2.plot_surface(xx, yy, zz, alpha=0.5)

# set labels
ax2.set_ylabel('strong k', fontsize=20, labelpad=10)
ax2.set_xlabel('mid k', fontsize=20, labelpad=10)
ax2.set_zlabel('normalized edit distance', fontsize=20, labelpad=7)
#plt.title('Average edit distance (1 step)', fontsize=24)

# create colorbar
cb = plt.colorbar(img, cax = fig2.add_axes([0.9,0.3,0.03,0.4]))
cb.ax.set_yticklabels(ticks, fontsize=16)
plt.show()



In [152]:
#%matplotlib inline

error_individual = df['error_clustered_drawers']
# [:-1] drops the 'mean' row that get_lowest_error appended to the results frames
seqs = results_mean['sequence'][:-1].values
# NOTE(review): this cell used to read from `results_new`, which is not defined anywhere in
# the notebook. The title, the baseline column and the xz column all point to the
# drawer-clustered results, so results_drawers is used -- confirm.
res = results_drawers['c: 1.2; k: 0.7,0.8; xz'][:-1].values
x = list(range(len(seqs)))

plt.figure(figsize=(18,6))
plt.scatter(x, res, marker='o', label='model-generated sequences')
plt.scatter(x, error_individual, c='red', marker='o', linewidth=2, label='baseline for sequence length')
plt.plot(x, error_individual, c='red', alpha=0.5)
# dashed vertical connector between baseline and model result of each sequence
plt.plot((x,x),(error_individual,res), '--', c='black', alpha=0.6)


plt.title('silverware clustered for drawers, 0.659, c=1.2; k=0.7, 0.8; xz', fontsize=16)
plt.xticks(x, seqs, rotation=90)
plt.ylabel('median edit distance', fontsize=14)
plt.xlabel('sequence', fontsize=14)
plt.legend()

#plt.savefig('plot_median_editdist_individualerrors_diff.png', bbox_inches='tight')
plt.show()