In [1]:
import pandas as pd
import numpy as np
import itertools
import random
import ast
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import spearmanr, wilcoxon, friedmanchisquare, kendalltau, pearsonr
#from matplotlib import animation
#from mpl_toolkits import mplot3d
from scipy.spatial import distance
from pyxdameraulevenshtein import damerau_levenshtein_distance
from matplotlib.ticker import PercentFormatter

pd.set_option('display.max_rows', 500)

### Prepare Data

In [2]:
# read data

df = pd.read_csv('h03_task_environments.csv', header=0,
                 skip_blank_lines=True, skiprows=[])

In [3]:
df

Unnamed: 0,ID,sequence,coordinates,start_coordinates,length,error,strong_k,mid_k,food_k,containment
0,h0,ocsfgkbp,"o: (6,8);c: (4,8);s: (3,8);f: (5,8);g: (2,8);k...","[8.5,6.5],[5.5,4],[5.5,4],[5.5,4],[5.5,4],[5.5...",8,0.785,p,0,0,0
1,h1,posfkcgb,"p: (6,8);o: (5,8);s: (3,8);f: (4,8);k: (7,8);c...","[8.5,6.5],[5.5,4],[5.5,4],[5.5,4],[5.5,4],[5.5...",8,0.785,p,0,0,0
2,h2,pogbfkcs,"p: (2,8);o: (5,8);g: (3,8);b: (4,8);f: (6,8);k...","[8.5,6.5],[5.5,4],[5.5,4],[5.5,4],[5.5,4],[5.5...",8,0.785,p,0,0,0
3,h3,gpocksfb,"g: (4,8);p: (8,8);o: (7,8);c: (6,8);k: (5,8);s...","[8.5,6.5],[5.5,4],[5.5,4],[5.5,4],[5.5,4],[5.5...",8,0.785,p,0,0,0
4,h4,pofkscgb,"p: (5,8);o: (8,8);f: (7,8);k: (4,8);s: (2,8);c...","[8.5,6.5],[5.5,4],[5.5,4],[5.5,4],[5.5,4],[5.5...",8,0.785,p,0,0,0
5,h5,posgkbfc,"p: (4,8);o: (3,8);s: (2,8);g: (5,8);k: (6,8);b...","[5.5,6.5],[5.5,4],[5.5,4],[5.5,4],[5.5,4],[5.5...",8,0.785,p,0,0,0
6,h6,pocskfgb,"p: (3,8);o: (6,8);c: (4,8);s: (2,8);k: (5,8);f...","[8.5,6.5],[5.5,4],[5.5,4],[5.5,4],[5.5,4],[5.5...",8,0.785,p,0,0,0
7,h7,psfkbgco,"p: (6,8);s: (7,8);f: (8,8);k: (3,8);b: (9,8);g...","[5.5,6.5],[5.5,4],[5.5,4],[5.5,4],[5.5,4],[5.5...",8,0.785,p,0,0,0
8,h8,pfkscbgo,"p: (8,8);f: (4,8);k: (6,8);s: (7,8);c: (5,8);b...","[5.5,6.5],[5.5,4],[5.5,4],[5.5,4],[5.5,4],[5.5...",8,0.785,p,0,0,0
9,h9,pfskbgco,"p: (9,8);f: (8,8);s: (6,8);k: (4,8);b: (7,8);g...","[5.5,6.5],[5.5,4],[5.5,4],[5.5,4],[5.5,4],[5.5...",8,0.785,p,0,0,0


### Define functions for prediction

In [4]:
def predict_sequence(objects, coordinates, start_coordinates, c, k, dimension=[3,]):
    ''' Predicts sequence based on required objects, object coordinates, start coordinates of subject,
        parameters (c+k) and dimensionality.
        Input: Objects, object coordinates, start coordinates, c, k, dimension
        Output: Sequence of objects as str
    '''
    prediction = []
    possible_items = dict.fromkeys(objects, 0) # generate dict from object list
    coord_index = 0
    start_coords = start_coordinates
    coords = coordinates
    new_coords = {}
    new_start_coords = []
    
    if dimension[0] == 3: # no changes if 3D
        new_coords = coords
        new_start_coords = start_coords
        
    elif dimension[0] == 2: # 2D: remove obsolete coordinate
        if dimension[1] == 'xy':
            new_coords = {key: value[:-1] for key, value in coords.items()}
            new_start_coords = [x[:-1] for x in start_coords]
                
        elif dimension[1] == 'xz':
            new_start_coords = [[x[0], x[-1]] for x in start_coords]
            
            for key, value in coords.items():
                new_value = (value[0], value[-1])
                new_coords[key] = new_value
        
        elif dimension[1] == 'yz':
            new_coords = {key: value[1:] for key, value in coords.items()}
            new_start_coords = [x[1:] for x in start_coords]
                
    elif dimension[0] == 1: # 1D: choose appropriate coordinate
        if dimension[1] == 'x':
            new_coords = {key: value[0] for key, value in coords.items()}
            new_start_coords = [x[0] for x in start_coords]
        
        elif dimension[1] == 'y':
            new_coords = {key: value[1] for key, value in coords.items()}
            new_start_coords = [x[1] for x in start_coords]
        
        elif dimension[1] == 'z':
            new_coords = {key: value[2] for key, value in coords.items()}
            new_start_coords = [x[2] for x in start_coords]
    
    while bool(possible_items) == True: # while dict not empty
        for obj in possible_items.keys():
            possible_items[obj] = ((distance.euclidean(
                                new_start_coords[coord_index], 
                                new_coords[obj])
                                ) ** k[obj]) * c[obj]
        #print(possible_items)                      
        minval = min(possible_items.values())
        minval = [k for k, v in possible_items.items() if v == minval]
        minval = random.choice(minval) # choose prediction randomly if multiple items have same cost
        prediction.append(minval)
        del possible_items[minval]
        coord_index += 1
        
    return prediction

In [5]:
def get_median(objects, coordinates, start_coordinates, c, k, dimension, sequence,n=10):
    ''' Returns average edit distance (Damerau-Levenshtein) for 100 trials of sequence prediction.
    '''
    edit_list = []

    for x in range(0,n):
        #print(objects)
        result = ''.join(predict_sequence(objects, coordinates, start_coordinates, c, k, dimension))
        dl = damerau_levenshtein_distance(sequence,result)
        edit_list.append(dl)
    
    #avg = np.mean(edit_list)
    median = np.median(edit_list)
    return median

In [23]:
def get_avg_editdist(data, dimensions=[[1,'x'],[1,'y'],[1,'z'],[2,'xy'],[2,'xz'],[2,'yz'],[3,'xyz']],n=10,
                    seq='sequence_original', coords='coordinates_original', error='error_original'):
    ''' Calculates average edit distance for all combinations of parameters (c, k, dimension).
        Input: Dataframe with objects, coordinates, start coordinates, object categories
        Output: Dataframe with edit distance results (col name: parameters used)
    '''
    results = pd.DataFrame()
        
    for row in range(0,len(data)):
        objects = list(data.at[row,seq])
        strong_k = list(data.at[row,'strong_k'].split(','))
        #mid_k = list(data.at[row,'mid_k'].split(','))
        mid_k = []
        #food_k = list(data.at[row,'food_k'].split(','))
        food_k = []
        coordinates = {key: ast.literal_eval(value) for key, value in (elem.split(': ') for elem in data.at[row,coords].split(';'))}
        #print(coordinates)
        start_coordinates = list(ast.literal_eval(data.at[row,'start_coordinates']))
        sequence = str(data.at[row,seq])
        #print(sequence)
        c1 = {obj: 1.2 for obj in objects}
        k1 = {obj: 1.0 for obj in objects}
        
        for k2 in np.arange(1.1,2.0,0.1):
            k_food = round(k2,2)
            k1 = {obj: k_food if obj in food_k else 1.0 for obj in objects}
        
            for k in np.arange(0,0.9,0.1):
                k_strong = round(k,2)
                k_mid = round(k + 0.1,2)
                k1 = {obj: k_strong if obj in strong_k else k_mid if obj in mid_k else round(k1[obj],2) for obj in objects}
            
                for c in np.arange(1.0,2.0,0.1):
                    c = round(c, 1)
                    c1 = {obj: 1.2 for obj in objects}
                
                    for dim in dimensions:                
                    # get average edit distance
                        median = get_median(objects, coordinates, start_coordinates, c1, k1, dim, sequence, n)
                        median = median / len(sequence)
                        params = 'c: ' + str(c) + '; k: ' + str(k_strong) + ',' + str(k_mid) + ',' + str(k_food) + '; ' + str(dim[1])
                    #params = 'c: 1.2' + '; k: ' + str(k_strong) + ',' + str(k_mid) + '; ' + str(dim[1])
                        results.at[row,params] = median
                        
        results.at[row,'sequence'] = sequence
        results.at[row,'error'] = data.at[row,error]
        results.at[row,'ID'] = data.at[row,'ID']
                    
    return results

### Calculate edit distances, create df for original sequences

In [7]:
# ~3 min for n=100
%time results_original = get_avg_editdist(df, seq='sequence', coords='coordinates', error='error', n=100, \
                                         dimensions=[[1,'x'],[1,'y'],[2,'xy']])

CPU times: user 1min 37s, sys: 312 ms, total: 1min 37s
Wall time: 1min 37s


In [8]:
results_original
#results_original.T.to_csv('results/results_original_all_n100_2020-11-03.csv', header=True, index=True)

Unnamed: 0,"c: 1.2; k: 0.0,0.1; x","c: 1.2; k: 0.0,0.1; y","c: 1.2; k: 0.0,0.1; xy","c: 1.2; k: 0.1,0.2; x","c: 1.2; k: 0.1,0.2; y","c: 1.2; k: 0.1,0.2; xy","c: 1.2; k: 0.2,0.3; x","c: 1.2; k: 0.2,0.3; y","c: 1.2; k: 0.2,0.3; xy","c: 1.2; k: 0.3,0.4; x",...,"c: 1.2; k: 0.6,0.7; xy","c: 1.2; k: 0.7,0.8; x","c: 1.2; k: 0.7,0.8; y","c: 1.2; k: 0.7,0.8; xy","c: 1.2; k: 0.8,0.9; x","c: 1.2; k: 0.8,0.9; y","c: 1.2; k: 0.8,0.9; xy",sequence,error,ID
0,0.875,0.875,0.875,0.875,0.8125,0.875,0.875,0.75,0.875,0.875,...,0.875,0.875,0.75,0.875,0.875,0.875,0.875,ocsfgkbp,0.785,h0
1,0.625,0.625,0.625,0.625,0.75,0.625,0.625,0.625,0.625,0.625,...,0.625,0.625,0.625,0.625,0.625,0.75,0.625,posfkcgb,0.785,h1
2,0.75,0.625,0.75,0.75,0.625,0.75,0.75,0.625,0.75,0.75,...,0.75,0.75,0.75,0.75,0.75,0.75,0.75,pogbfkcs,0.785,h2
3,0.875,0.75,0.875,0.875,0.75,0.75,0.75,0.75,0.875,0.75,...,0.875,0.8125,0.75,0.875,0.875,0.75,0.75,gpocksfb,0.785,h3
4,0.75,0.625,0.875,0.75,0.625,0.875,0.875,0.75,0.75,0.875,...,0.875,0.875,0.625,0.875,0.75,0.625,0.75,pofkscgb,0.785,h4
5,0.875,0.625,0.875,0.875,0.75,0.875,0.875,0.625,0.875,0.875,...,0.875,0.875,0.625,0.875,0.875,0.625,0.875,posgkbfc,0.785,h5
6,0.875,0.75,0.75,0.75,0.625,0.75,0.75,0.75,0.875,0.75,...,0.75,0.75,0.625,0.75,0.75,0.625,0.75,pocskfgb,0.785,h6
7,0.625,0.625,0.625,0.625,0.625,0.625,0.625,0.75,0.625,0.625,...,0.625,0.625,0.625,0.75,0.625,0.625,0.75,psfkbgco,0.785,h7
8,0.625,0.625,0.625,0.625,0.6875,0.625,0.625,0.6875,0.625,0.75,...,0.625,0.625,0.75,0.625,0.625,0.625,0.625,pfkscbgo,0.785,h8
9,0.75,0.625,0.75,0.75,0.625,0.75,0.75,0.625,0.75,0.75,...,0.75,0.75,0.75,0.75,0.875,0.75,0.875,pfskbgco,0.785,h9


In [None]:
%time test

### Get lowest error, compare edit distances for all variants

In [9]:
def get_lowest_error(results):
    ''' Returns lowest error in dataframe and index of lowest error.
    '''
    for col in list(results):
        if col != 'sequence' and col != 'error' and col != 'ID':
            results.loc['mean',col] = results[col].mean()
            #results.loc['median',col] = results[col].median()
    lowest = min(results.loc['mean'])
    mean = list(results.loc['mean'])
    
    return lowest, results.columns[(results.loc['mean'] == lowest)], mean, results

In [10]:
# original sequences

lowest, lowest_idx, list_mean, results_mean = get_lowest_error(results_original)
lowest, lowest_idx, np.mean(results_original['error'])

(0.6948529411764706,
 Index(['c: 1.2; k: 0.0,0.1; y'], dtype='object'),
 0.7849999999999987)

### Plot best model for all variants

In [21]:
#%matplotlib inline
%matplotlib qt

IDs = df['ID']

error_original = df['error']

seqs_original = results_mean['sequence'][:-1].values

res_original = results_original['c: 1.2; k: 0.0,0.1; y'][:-1].values

x_original = [x for x in range (0,len(seqs_original))]

# plot scatter + lines for all simulations
plt.scatter(x_original, res_original, marker='o', s=20, c='blue', alpha=0.5, label='model-generated (avg. 0.69)')
#axs[0].plot(x_original, res_original, c='blue', alpha=0.5)

# error function + connection for original
plt.scatter(x_original, error_original, c='red', s=20, marker='o', alpha=0.5, label='baseline (avg. 0.78)')
plt.plot(x_original, error_original, c='red', alpha=0.5)
plt.plot((x_original,x_original),(error_original,res_original), '--', c='black', alpha=0.6)

plt.title('h03 data', fontsize=14, y=1)
plt.xticks(x_original, IDs, rotation=90, fontsize=6)
plt.xlabel('sequence')
plt.ylabel('edit distance')

#plt.set_ylabel('original', fontsize=10)
#axs[3].set_xlabel('sequence', fontsize=14)

plt.legend(fontsize=8)

#plt.savefig('plot_median_editdist_individualerrors_diff.png', bbox_inches='tight')
plt.show()

In [21]:
diff_original = error_original - res_original
diff_silverware = error_silverware - res_silverware
diff_drawers = error_drawers - res_drawers
diff_noduplicates = error_noduplicates - res_noduplicates

In [22]:
stat, p = wilcoxon(diff_original, diff_drawers, zero_method='wilcox')
print('Original vs drawers: W = %.3f, p = %.5f' % (stat, p))

stat, p = wilcoxon(diff_original, diff_silverware, zero_method='wilcox')
print('Original vs silverware: W = %.3f, p = %.5f' % (stat, p))

stat, p = wilcoxon(diff_original, diff_noduplicates, zero_method='wilcox')
print('Original vs no duplicates: W = %.3f, p = %.5f' % (stat, p))

stat, p = wilcoxon(diff_noduplicates, diff_silverware, zero_method='wilcox')
print('No duplicates vs silverware: W = %.3f, p = %.5f' % (stat, p))

stat, p = wilcoxon(diff_noduplicates, diff_drawers, zero_method='wilcox')
print('No duplicates vs drawers: W = %.3f, p = %.5f' % (stat, p))

Original vs drawers: W = 886.000, p = 0.00111
Original vs silverware: W = 1237.000, p = 0.06621
Original vs no duplicates: W = 1440.500, p = 0.75677
No duplicates vs silverware: W = 1008.000, p = 0.04091
No duplicates vs drawers: W = 1014.000, p = 0.01331


In [23]:
def calculate_percentual_difference(var1, var2):
    
    return [(((np.abs(x - y)) / ((x + y) / 2)) * 100) for x,y in zip(var1, var2)]

In [24]:
def calculate_percentage_change(var1, var2):
    
    return [(((x - y) / np.abs(x)) * 100) if x != 0 else (((0.01 - y) / 0.01) * 100) for x,y in zip(var1, var2)]
    #return [(((x - y) / np.abs(x)) * 100) for x,y in zip(var1, var2) if x != 0]

In [67]:
percent_original = calculate_percentual_difference(res_original, error_original)

In [25]:
original_change = calculate_percentage_change(res_original, error_original)
noduplicates_change = calculate_percentage_change(res_noduplicates, error_noduplicates)
silverware_change = calculate_percentage_change(res_silverware, error_silverware)
drawers_change = calculate_percentage_change(res_drawers, error_drawers)

In [26]:
np.nanmedian(original_change), np.nanmedian(noduplicates_change),\
np.nanmedian(silverware_change), np.nanmedian(drawers_change)

(-17.923809523809528,
 -17.714285714285733,
 -10.999999999999991,
 -8.450000000000003)

In [27]:
stat, p = wilcoxon(original_change, drawers_change, zero_method='wilcox')
print('Original vs drawers: W = %.3f, p = %.5f' % (stat, p))

stat, p = wilcoxon(original_change, silverware_change, zero_method='wilcox')
print('Original vs silverware: W = %.3f, p = %.5f' % (stat, p))

stat, p = wilcoxon(original_change, noduplicates_change, zero_method='wilcox')
print('Original vs no duplicates: W = %.3f, p = %.5f' % (stat, p))

stat, p = wilcoxon(noduplicates_change, silverware_change, zero_method='wilcox')
print('No duplicates vs silverware: W = %.3f, p = %.5f' % (stat, p))

stat, p = wilcoxon(noduplicates_change, drawers_change, zero_method='wilcox')
print('No duplicates vs drawers: W = %.3f, p = %.5f' % (stat, p))

Original vs drawers: W = 1178.000, p = 0.07099
Original vs silverware: W = 1521.000, p = 0.63491
Original vs no duplicates: W = 1398.500, p = 0.60099
No duplicates vs silverware: W = 1260.000, p = 0.49216
No duplicates vs drawers: W = 1264.000, p = 0.22785
