In [1]:
%reset -f

import pandas as pd
import os
import re
import numpy as np
from six.moves import xrange
import math
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
# from minisom import MiniSom
import pickle

In [2]:
def ability_level_mapper(data, groups=None, col='front', how='naive', n_level=19, 
                         invert=True, parameters=None, divide_points=None, 
                         target_col_name='performance', balanced_scale=True, avg_perf=None):
    """
    balanced_scale: allow to have nagative grades
    """
    
    # the raw data is divided into groups according to its exc_num, ability levels are calculated respectively
    # how: 1 is mapping without any other processing, called 'naive'
    origin = data.copy()
    # if target col is imbalanced, here we seperate it in parts with different scale value
    if parameters is not None:
        col, how, n_level, invert, v_max, v_min, divide_points, balanced_scale, avg_perf = parameters
        
    if divide_points is not None:
        assert n_level==len(divide_points), 'false length of imbalanced_data'
        # elements in inbalaced_data are ordered increasely
        origin[target_col_name] = 0
        for i,v in enumerate(divide_points):
            if invert:
                origin.loc[(origin.performance==0) & (origin[col]<=v), target_col_name] = n_level-i+1
            else:
                origin.loc[(origin.performance==0) & (origin[col]<=v), target_col_name] = i+1
        if invert:
            origin.loc[origin.performance==0, target_col_name] = 1
        else:
            origin.loc[origin.performance==0, target_col_name] = n_level+1
            
        if balanced_scale and avg_perf is not None:
            origin[target_col_name] -= avg_perf

        parameters = (col, how, n_level, invert, None, None, divide_points, balanced_scale, avg_perf)
        return origin, parameters
        
    if how == 'cluster':
        origin = origin.reset_index()
        centroids,_ = kmeans(whiten(origin[col]), k_or_guess=n_level+1)
#         origin = origin.sort_values(by=[col])
        clx,_ = vq(whiten(origin[col]),centroids)
        
        origin[target_col_name] = clx+1
        tmp = origin[[target_col_name, col]]
        
        tmp = tmp[[target_col_name, col]].groupby([target_col_name]).mean()
        tmp['tmp'] = 1
        tmp = tmp.sort_values(by=col)
        tmp = tmp['tmp'].cumsum().reset_index()
        origin = origin.merge(tmp, how='left', on=[target_col_name])
        origin[target_col_name] = origin['tmp']
#         origin = origin.sort_values(by=['index'])
        origin = origin.set_index(['index'])
        divide_points = origin.groupby([target_col_name])[col].max().tolist()
        divide_points = divide_points[:-1]
        
        if balanced_scale:
            if avg_perf is None:
                avg_perf = origin[target_col_name].mean()
                avg_perf = int(avg_perf)
            origin[target_col_name] -= avg_perf
        else:
            avg_perf = None
        parameters = (col, how, n_level, invert, None, None, divide_points, balanced_scale, avg_perf)
        
        return origin, parameters
        
        
    if groups == None:
        v_max = origin[col].max()
        v_min = origin[col].min()
        
        interval = (v_max- v_min)/n_level
        assert interval!=0, 'zero dividend'
        origin[target_col_name] = (origin[col]-v_min)/interval

        origin[target_col_name] = origin[target_col_name].astype(int)
        if invert:
            origin[target_col_name] = n_level-origin[target_col_name]+1
        else:
            origin[target_col_name] = origin[target_col_name]+1

        if balanced_scale:
            if avg_perf is None:
                avg_perf = origin[target_col_name].mean()
                avg_perf = int(avg_perf)
            origin[target_col_name] -= avg_perf
        else:
            avg_perf = None
        parameters = (col, how, n_level, invert, None, None, divide_points, balanced_scale, avg_perf)
        return origin, parameters
    
    if how == 1 or how=='naive':
        tmp = origin[col]
        for index, group in groups:
            
            v_max = group[col].max()
            v_min = group[col].min()
            
            interval = (v_max-v_min)/n_level
            
            assert interval!=0, 'zero dividend'
            
            origin.loc[index, col] = (origin.loc[index, col]-v_min)/interval
        origin[col] = origin[col].astype(int)
        if invert:
            origin[target_col_name] = n_level-origin[col]+1
            
        else:
            origin[target_col_name] = origin[target_col_name]+1
        origin[col] = tmp
        
        if balanced_scale:
            if avg_perf is None:
                avg_perf = origin[target_col_name].mean()
                avg_perf = int(avg_perf)
            origin[target_col_name] -= avg_perf
        else:
            avg_perf = None
        parameters = (col, how, n_level, invert, None, None, divide_points, balanced_scale, avg_perf)
        
        return origin, parameters
    

def calc_force(df):
    df['force'] = df['Fx']**2 + df['Fy']**2 + df['Fz']**2
    df['force'] = df['force'].pow(1/2)
    return df

def calc_resultent_force(df):
    # calc resultent force
    df['resultent_force'] = (df['Fx']-df['Fx.1'])**2+(df['Fy']-df['Fy.1'])**2+(df['Fz']-df['Fz.1'])**2
    df['resultent_force'] = df['resultent_force'].pow(1/2)
    return df

def calc_resultent_force_xy(df):
    # calc resultent force on flat xoy
    df['resultent_force_xy'] = (df['Fx']-df['Fx.1'])**2+(df['Fy']-df['Fy.1'])**2
    df['resultent_force_xy'] = df['resultent_force_xy'].pow(1/2)
    return df


def calc_velocity(df, del_low=True):
    df['velocity'] = df['Lx']**2 + df['Ly']**2 + df['Lz']**2
    df['velocity'] = df['velocity'].pow(1/2)
    if del_low:
        df = del_low_velocity(df)
    return df

def calc_velocity_xy(df):
    df['velocity_xy'] = df['Lx']**2 + df['Ly']**2
    df['velocity_xy'] = df['velocity_xy'].pow(1/2)
    return df

def calc_force_velocity_angle_xy(df):
    # calculate angle between walk direction and force direction
    if 'resultent_force_xy' not in df.columns:
        df = calc_resultent_force_xy(df)
        
    if 'velocity_xy' in df.columns: 
        df = calc_velocity_xy(df)
    
    df['angle_fv_xy'] = ((df['Fx']-df['Fx.1'])*df['Lx']+(df['Fy']-df['Fy.1'])*df['Ly'])/  \
        (df['velocity_xy']*df['resultent_force_xy'])
    df['angle_fv_xy'] = df['angle_fv_xy'].apply(np.arccos)
    tmp = df['angle_fv_xy'][df['angle_fv_xy']>np.pi/2]
    tmp -= np.pi
    
    # here is maybe a bug
    df.loc[df.angle_fv_xy>np.pi/2, 'angle_fv_xy'] = tmp
    del tmp
    return df

def calc_torque_xy(df, threshold=0):
    df['torque_xy'] = df['Mz']-df['DMz']
    return df

def calc_torque_xy_avg(df, threshold=0, n=100, dropna=True):
    print('torque')
    print(len(df))
    if 'torque_xy' not in df.columns:
        df = calc_torque_xy(df)
        
    df['torque_xy_avg'] = df.groupby(['uid','day','exc_num', 'exc_times'])['torque_xy'].\
        rolling(n).mean().reset_index()['torque_xy']
        
    if threshold!=0:
        assert threshold>0, 'threshold must be positive'
        df[(df.torque_xy_avg<threshold) & (df.torque_xy_avg>-threshold)] = 1
        df[df.torque_xy_avg<=-threshold] = 0
        df[df.torque_xy_avg>=threshold] = 2
        
    if dropna:
        df = df.dropna()
    print(len(df.dropna()))
    return df

def calc_abs_force_velocity_angle_xy(df):
    # calculate absolute angle between walk direction and force direction
    if 'resultent_force_xy' not in df.columns:
        df = calc_resultent_force_xy(df)
        
    if 'velocity_xy' in df.columns: 
        df = calc_velocity_xy(df)
    
    df['abs_angle_fv_xy'] = ((df['Fx']-df['Fx.1'])*df['Lx']+(df['Fy']-df['Fy.1'])*df['Ly'])/  \
        (df['velocity_xy']*df['resultent_force_xy'])
    df['abs_angle_fv_xy'] = abs(df['abs_angle_fv_xy'])
    df['abs_angle_fv_xy'] = df['abs_angle_fv_xy'].apply(np.arccos)
    return df

def calc_torque_turning_freq(df,interval=100, groups=None, dropna=True):
    # calculate the frequency of the changing of the direction of the torque on flat xoy
    
    print('torque turning freq:')
    print(df.shape)
    result = pd.Series()
    if 'torque_xy' not in df.columns:
        df = calc_torque_xy(df)
    if groups == None:
        groups = df[['uid', 'day', 'exc_num', 'exc_times', 'torque_xy']].   \
            groupby(['uid', 'day', 'exc_num', 'exc_times'])
        
    for index, group in groups:
        # calculate the direction of the changing of the values
        df_tmp = group.reset_index()
        df_tmp['tt_freq'] = df_tmp['torque_xy']
        df_tmp.loc[0, 'tt_freq'] = 0
        tmp1 = df_tmp['torque_xy'][:-1].reset_index(drop=True)
        tmp2 = df_tmp['torque_xy'][1:].reset_index(drop=True)

        tmp3 = tmp2-tmp1
        tmp3.index = tmp3.index+1
        df_tmp.loc[1:, 'tt_freq'] = tmp3

        # delete 0 value, and compare two adjecend value, if the direction changes, then the product is negative.
        tmp3 = df_tmp[df_tmp['tt_freq']!=0][['tt_freq']]
        tmp3 = tmp3.reset_index()
        tmp1 = tmp3['tt_freq'][:-1].reset_index()
        tmp2 = tmp3['tt_freq'][1:].reset_index()
        tmp4 = tmp1*tmp2
        tmp4.index += 1
        tmp3[0, 'tt_freq'] = 0
        tmp3.loc[1:, 'tt_freq'] = tmp4
        tmp3.index = tmp3['index']
        df_tmp.loc[df_tmp.tt_freq!=0, 'tt_freq'] = tmp3['tt_freq']

        # count in interval
        df_tmp.loc[df_tmp.tt_freq>=0, 'tt_freq'] = np.nan
#         df_tmp['tt_freq'] = df_tmp['tt_freq'].rolling(interval).count()
        df_tmp = df_tmp.set_index(['index'])
        df_tmp = group_rolling_count(df_tmp, 'tt_freq', 'tt_freq', by=['uid', 'day', 'exc_num', 'exc_times'], n=interval)
#         df_tmp.index = df_tmp['index']
        
        result = pd.concat([result, df_tmp['tt_freq']])
        del tmp1
        del tmp2
        del tmp3
        del tmp4
#     print(result)
    # index aligning
    df['tt_freq'] = result
#     print(all_data[['torque_xy','tt_freq']])

    if dropna:
        df = df.dropna()
    print(df.shape)        
    del result
    return df   

def calc_abs_angle_rotation_velocity_xy(df, groups=None):
    # calculate relative absolute angle rotation of velocity on flat xoy
    
    if 'velocity_xy' not in df.columns:
        df = calc_velocity_xy(df)
        
    result = pd.Series()
    if groups == None:
        groups = df[['uid', 'day', 'exc_num', 'exc_times', 'Lx', 'Ly', 'velocity_xy']].   \
            groupby(['uid', 'day', 'exc_num', 'exc_times'])
        
    for index, group in groups:
        # calculate angle rotation from two adjecend time point
        df_tmp = group.reset_index()
        df_tmp['v_rotation'] = 0
        tmp1 = df_tmp.loc[0:len(df_tmp)-2, ['Lx', 'Ly', 'velocity_xy']].reset_index(drop=True)
        tmp2 = df_tmp.loc[1:, ['Lx', 'Ly', 'velocity_xy']].reset_index(drop=True)
    
        tmp3 = (tmp1['Lx']*tmp2['Lx']+tmp1['Ly']*tmp2['Ly'])/  \
        (tmp1['velocity_xy']*tmp2['velocity_xy'])
#         tmp3 = tmp3.fillna(0)
#         print(tmp3)
        tmp3 = tmp3.apply(np.arccos)
#         print(tmp3)
        tmp3.index += 1
        df_tmp.loc[1:, 'v_rotation'] = tmp3
        df_tmp.loc[df_tmp.v_rotation>np.pi/2, 'v_rotation'] -= np.pi
        df_tmp.loc[df_tmp.v_rotation<0.0001, 'v_rotation'] = 0
        df_tmp['v_rotation'] = df_tmp['v_rotation'].fillna(0)
#         print(df_tmp.loc[:, ['v_rotation', 'Lx', 'Ly']])
        
        df_tmp.index = df_tmp['index']
        result = pd.concat([result, df_tmp['v_rotation']])
        del tmp1
        del tmp2
        del tmp3
        del df_tmp
    df['v_abs_rotation'] = result
    del result
    return df

def calc_angle_rotation_velocity_xy(df, groups=None):
    # calculate relative angle rotation of velocity on flat xoy
    
    if 'velocity_xy' not in df.columns:
        df = calc_velocity_xy(df)
        
    result = pd.Series()
    if groups == None:
        groups = df[['uid', 'day', 'exc_num', 'exc_times', 'Lx', 'Ly', 'velocity_xy']].   \
            groupby(['uid', 'day', 'exc_num', 'exc_times'])
        
    for index, group in groups:
        # calculate angle rotation from two adjecend time point
        df_tmp = group.reset_index()
        df_tmp['v_rotation'] = 0
#         tmp1 = df_tmp['Ly']/df_tmp['Lx']
        tmp1 = np.arctan2(df_tmp['Ly'], df_tmp['Lx'])
        tmp2 = tmp1[1:].reset_index(drop=True) - tmp1[:-1].reset_index(drop=True)

        tmp2.index += 1
        df_tmp.loc[1:, 'v_rotation'] = tmp2
        df_tmp['v_rotation'] = df_tmp['v_rotation'].fillna(0)
#         print(df_tmp.loc[:, ['v_rotation', 'Lx', 'Ly']])
        
        df_tmp.index = df_tmp['index']
        result = pd.concat([result, df_tmp['v_rotation']])
        del tmp1
        del tmp2
        del df_tmp
    df['v_rotation'] = result
    df.loc[df.v_rotation<-np.pi, 'v_rotation'] += 2*np.pi
    df.loc[df.v_rotation>np.pi, 'v_rotation'] -= 2*np.pi
    del result
    return df

def calc_velocity_angle(df):
#     tan = df['Ly']/df['Lx']
    df['v_angle'] = np.arctan2(df['Ly'], df['Lx'])
#     del tan
    return df

def calc_velocity_angle_without_backward(df):
    # here we calculate velocity angle regardless of move direction
    if 'v_angle' not in df.columns:
        df = calc_velocity_angle(df)
        
    df['va_wo'] = df['v_angle']
    df.loc[(df.v_angle>np.pi/2) , 'va_wo'] = np.pi - df.loc[(df.v_angle>np.pi/2) , 'va_wo']
    df.loc[(df.v_angle<-np.pi/2), 'va_wo'] += np.pi
    return df

def calc_sd_velocity(df, by=['day', 'uid', 'exc_num', 'exc_times'], n=100, dropna=True):
    print('sd')
    print(len(df))
    if 'velocity' not in df.columns:
        df = calc_velocity(df)
#     df['v_sd'] = df.groupby(['uid','day','exc_num', 'exc_times'])['velocity'].\
#         rolling(n).std().reset_index()['velocity']
    df = group_rolling_std(df, 'velocity', 'v_sd', by, n)
    if dropna:
        df = df.dropna()
    print(len(df.dropna()))
    return df

def calc_velocity_skew(df, by=['day', 'uid', 'exc_num', 'exc_times'], n=100, dropna=True):
    print('skew')
    print(len(df))
    if 'velocity' not in df.columns:
        df = calc_velocity(df)
#     df['v_skew'] = df.groupby(['uid','day','exc_num', 'exc_times'])['velocity'].\
#         rolling(n).skew().reset_index()['velocity']
    df = group_rolling_skew(df, 'velocity', 'v_skew', by, n)
    if dropna:
        df = df.dropna()
    print(len(df.dropna()))
    return df

def calc_velocity_kurtosis(df, by=['day', 'uid', 'exc_num', 'exc_times'], n=100, dropna=True):
    print('kurtosis')
    print(len(df))
    if 'velocity' not in df.columns:
        df = calc_velocity(df)
#     df['v_kurt'] = df.groupby(['uid','day','exc_num', 'exc_times'])['velocity'].\
#         rolling(n).skew().reset_index()['velocity']

    df = group_rolling_kurt(df, 'velocity', 'v_kurt', by, n)
    if dropna:
        df = df.dropna()
    print(len(df.dropna()))
    return df

def calc_avg_velocity(df, by=['day', 'uid', 'exc_num', 'exc_times'], n=100, dropna=True):
    if 'velocity' not in df.columns:
        df = calc_velocity(df)
        
#     df['avg_velocity'] = df.groupby(['uid','day','exc_num', 'exc_times']).\
#         rolling(n).mean().reset_index(drop=True)['velocity']
    
    df = group_rolling_mean(df, 'velocity', 'avg_velocity', by, n)
    
    if dropna:
        df = df.dropna()
    print(len(df.dropna()))

    return df

def calc_velocity_deviation_score(df, a=10**-8, avg=False):
    if avg:
        if 'avg_velocity' not in df.columns:
            df = calc_avg_velocity(df)
        
        df['vd_score'] = df['avg_velocity']/(a+df['deviation'])
    else:
        df = stardard_scaler(df, target_col='velocity')
        df = stardard_scaler(df, target_col='deviation')
        df['vd_score'] = df['velocity']/(a+df['deviation'])
    return df

def calc_reciprocal_deviation(df, bias=0.1):
    if 'deviation' not in df.columns:
        df = get_small_deviation(df)
    
    df['re_dev'] = 1/(df['deviation']+bias)
    return df

def calc_inverted_tt_freq(df, bias=0):
    if 'tt_freq' not in df.columns:
        df = calc_torque_turning_freq(df)
    df['inv_tt_freq'] = df['tt_freq'].max()-df['tt_freq']+bias
    return df

def calc_inverted_force(df, bias=0):
    if 'force' not in df.columns:
        df = calc_force(df)
    df['inv_force'] = df['force'].max()-df['force']+bias
    return df

def calc_score(df, avg=False):
    if avg:
        if 'avg_velocity' not in df.columns:
            df = calc_avg_velocity(df)

        df['score'] = df['avg_velocity']*df['deviation']
    else:
        df['score'] = df['velocity']*df['deviation']
    return df

def calc_assess_feature(data, c=0.1):
    print('assess:', data.shape)
    if 'velocity' not in data.columns:
        data = calc_velocity(data)
    if 'force' not in data.columns:
        data = calc_force(data)
    if 're_dev' not in data.columns:
        data = calc_reciprocal_deviation(data)
        
    data = stardard_scaler(data, target_col='velocity')
    data = stardard_scaler(data, target_col='re_dev')    
    data = stardard_scaler(data, target_col='force')
    data['assess'] = data['velocity']*data['re_dev'] / ((data['force']+c))
    data = data.replace([-np.inf, np.inf], [np.nan, np.nan])
    data = data.dropna()
    print(data.loc[data.assess==data['assess'].max(),])
    print('assess(drop_na):', data.shape)
    return data

def calc_angle_velocity_xy(df):
    df['angle_velocity_xy'] = df['Az']
    return df

def calc_avg_angle_velocity_xy(df, by=['day', 'uid', 'exc_num', 'exc_times'], n=100, dropna=True):
    print('angle_velocity')
    print(len(df))
    if 'angle_velocity_xy' not in df.columns:
        df = calc_angle_velocity_xy(df)
        
    df = group_rolling_mean(df, 'Az', 'avg_av_xy', by, n)
    if dropna:
        df = df.dropna()
    print(len(df.dropna()))
    
    return df

def calc_va_sum(df, by=['day', 'uid', 'exc_num', 'exc_times'], n=100, dropna=True):
    print('velocity angle cumsum')
    print(len(df))
    if 'va_wo' not in df.columns:
        df = calc_velocity_angle_without_backward(df)
        
    df = group_rolling_sum(df, 'va_wo', 'sum_va', by, n)
    if dropna:
        df = df.dropna()
    print(len(df.dropna()))
    
    return df

def calc_confidence(df):
    mx = df['front'].max()
    mi = df['front'].min()
    df['conf'] = 1-(df['front']-mi)/(mx-mi)
    return df

def get_small_deviation(df):
    assert 'left' in df.columns, 'lack necessary columns'
    assert 'right' in df.columns, 'lack necessary columns'
    assert 'front' in df.columns, 'lack necessary columns'
    print('get small deviation')
    print(df.shape)
    df['deviation'] = df[['front', 'left', 'right']].min(axis=1)
    df = df.dropna(subset=['deviation'])
    print(df.shape)
    return df

def del_outlier(df, by, test_col, threshold=10):
    
    # assume that the distribution of test column in each group are normal, 
    # the data outside 2 standard deviation are considered as outlier.
    # the data with too few samples will be deleted
    print('delete outlier')
    print(len(df))
    
    if by is None:
        mean = df[test_col].mean()
        std = df[test_col].std()
        
        left = mean-2*std
        right = mean+2*std
        
        df = df[(df[test_col]>left) & (df[test_col]<right)]
        df = df.reset_index(drop=True)
        print(len(df))
        return df
    
    cols = df.columns
    # delete data with a few samples
    count = df[by+[test_col]].groupby(by).count()
    count = count[count[test_col]>threshold].reset_index()
    count = count[by]
    df = df.merge(count, on=by)
    df = df.dropna().reset_index(drop=True)
    
    # delete outlier
    mean = df[by+[test_col]].groupby(by).mean()
    std = df[by+[test_col]].groupby(by).std()
    left = mean-2*std
    right = mean+2*std
    left = left.reset_index()
    right = right.reset_index()
    left.columns = by+['left']
    right.columns = by+['right']
    
    df = df.merge(left, on=by)
    df = df.merge(right, on=by)
    df['left'] = df[test_col]-df['left']
    df['right'] = df[test_col] - df['right']
    df = df[(df['left'])>0 & (df['right']<0)]
    df = df[cols]
    df = df.reset_index(drop=True)
    print(len(df))
    return df
    
    
def set_inverted_task(df, tasks=[2.4, 3.4, 4.1, 4.2, 4.3]):
    df['inverted'] = 0
    for task in tasks:
        df.loc[df.exc_num==task, 'inverted'] = 1
        
    return df


def set_sidewards_task(df, tasks=[4.1, 4.2]):
    df['sidewards'] = 0
    for task in tasks:
        df.loc[df.exc_num==task, 'sidewards'] = 1
        
    return df

def is_disturbed(df):
    df['disturbed'] = 0
    df.loc[(df.DFx>0) | (df.DFy>0) | (df.DFz>0), 'disturbed'] = 1
    
    return df

def is_backward(df):
    if 'v_angle' not in df.columns:
        df = calc_velocity_angle(df)
    df['backward'] = 0
    df.loc[(df.v_angle>np.pi/2) | (df.v_angle<-np.pi/2), 'backward'] = 1
    return df
    


def set_difficulty(df, def_cols=['v_angle', 'v_sd', 'torque_xy_avg'], levels=[20, 20, 20]):
    print('set difficulty')
    print(len(df))
    df['diff_description'] = ''
    for col in def_cols:
        df['diff_description'] = df['diff_description'] + '_' +col+df[col].astype(str)
        
    if levels is not None:
        for col, level in zip(def_cols, levels):
            df,_ = ability_level_mapper(df, col=col, n_level=level, target_col_name=col)
    df['difficulty'] = 1
    cnt = 1
    tmp = df.groupby(def_cols)[['difficulty']].mean().cumsum().reset_index()
    df = df.drop(['difficulty'], axis=1)
    df = df.merge(tmp, on=def_cols)
#     print(df.head())
#     print(len(df.groupby(def_cols).mean()))
#     print(len(df.groupby(def_cols+['difficulty']).mean()))

    tmp = df[['difficulty', 'diff_description']].drop_duplicates()
    print(tmp)
    tmp.to_csv('../data/diff_def.csv', index=False)
    del tmp
    print('difficulty description:')
    print(df[['difficulty', 'diff_description']].drop_duplicates())
    print('# of diff: ' +str(len(df['difficulty'].unique())))
    print(len(df))
    
    return df

def difficulty_mapper(df, cols, levels, invert=True):
    print('difficulty mapper')
    for col, level in zip(cols, levels):
        df,_ = ability_level_mapper(df, col= col, n_level=level, target_col_name=col, invert=invert)
        
    return df

def del_low_velocity(df, thres=0.05):
    df = df[df['velocity']>thres]
    return df

def stardard_scaler(df, target_col):
    mx = df[target_col].max()
    mi = df[target_col].min()
    df[target_col] = (df[target_col]-mi)/(mx-mi)
    return df

def group_rolling_mean(df, target_col, new_col, by, n):
    assert 'index' not in df.columns, 'check you data columns, and ensure no index column in it'
    df = df.reset_index()
    tmp = df.groupby(by).rolling(n).mean()[target_col]
    tmp = tmp.reset_index().sort_values(by).reset_index(drop=True)
    df = df.sort_values(by).reset_index(drop=True)
    df[new_col] = tmp[target_col]
    del tmp
    df = df.set_index('index')
    return df

def group_rolling_kurt(df, target_col, new_col, by, n):
    assert 'index' not in df.columns, 'check you data columns, and ensure no index column in it'
    df = df.reset_index()
    tmp = df.groupby(by).rolling(n).kurt()[target_col]
    tmp = tmp.reset_index().sort_values(by).reset_index(drop=True)
    df = df.sort_values(by).reset_index(drop=True)
    df[new_col] = tmp[target_col]
    del tmp
    df = df.set_index('index')
    return df

def group_rolling_skew(df, target_col, new_col, by, n):
    assert 'index' not in df.columns, 'check you data columns, and ensure no index column in it'
    df = df.reset_index()
    tmp = df.groupby(by).rolling(n).skew()[target_col]
    tmp = tmp.reset_index().sort_values(by).reset_index(drop=True)
    df = df.sort_values(by).reset_index(drop=True)
    df[new_col] = tmp[target_col]
    del tmp
    df = df.set_index('index')
    return df

def group_rolling_std(df, target_col, new_col, by, n):
    assert 'index' not in df.columns, 'check you data columns, and ensure no index column in it'
    df = df.reset_index()
    tmp = df.groupby(by).rolling(n).std()[target_col]
    tmp = tmp.reset_index().sort_values(by).reset_index(drop=True)
    df = df.sort_values(by).reset_index(drop=True)
    df[new_col] = tmp[target_col]
    del tmp
    df = df.set_index('index')
    return df

def group_rolling_count(df, target_col, new_col, by, n):
    assert 'index' not in df.columns, 'check you data columns, and ensure no index column in it'
    df = df.reset_index()
    tmp = df.groupby(by).rolling(n).count()[target_col]
    tmp = tmp.reset_index().sort_values(by).reset_index(drop=True)
    df = df.sort_values(by).reset_index(drop=True)
    df[new_col] = tmp[target_col]
    del tmp
    df = df.set_index('index')
    return df

def group_rolling_sum(df, target_col, new_col, by, n):
    assert 'index' not in df.columns, 'check you data columns, and ensure no index column in it'
    df = df.reset_index()
    tmp = df.groupby(by).rolling(n).sum()[target_col]
    tmp = tmp.reset_index().sort_values(by).reset_index(drop=True)
    df = df.sort_values(by).reset_index(drop=True)
    df[new_col] = tmp[target_col]
    del tmp
    df = df.set_index('index')
    return df

def def_env(test_col, diff_def, n_class, group, if_del_outlier):
    
    tmp = [test_col, diff_def, n_class, group, if_del_outlier]
    with open('../data/parameter/def_env.p', 'wb') as f:
        pickle.dump(tmp, f)
        
    del tmp
    return
    
def add_necessary_cols(all_cols, cols):
    for col in cols:
        if col not in all_cols:
            all_cols.append(col)
            
    return all_cols

the initial data is already cleared according to their length and correlation

### clear outlier that recorded in file

In [3]:
# all_data = pd.read_csv('../data/new_all_data_original.csv')

# print(len(all_data))
# outliers = []

# with open('../data/deviation_curves_outlier.csv') as f:
#     for line in f.readlines():
#         tmp = line.split('\n')[0].split(' ')
       
#         outliers.append([float(x) for x in tmp])
        
# groups = all_data.set_index(['day', 'exc_num', 'exc_times', 'uid'])
# groups = groups.groupby(by=groups.index)

# new_data = pd.DataFrame()

# for index, group in groups:
#     if len(group)<10:
#         print('short length')
#         print(index)
#         continue
#     day, exc_num, exc_times, uid = index
#     exc_num = round(exc_num, 1)
#     if [day, exc_num, exc_times, uid] in outliers:
#         print([day, exc_num, exc_times, uid])
#         print('in the outlier list')
#         pass
# #         tmp = group.reset_index()
# #         sns.set_style('whitegrid')
# #         f, ax= plt.subplots(figsize = (14, 10))
# #         ax = sns.lineplot(x=tmp.index, y="front", data=tmp)
# #         ax.set_title(index) 
#     else:
#         curr = group.reset_index()
#         new_data = pd.concat([new_data, curr], axis=0)
# all_data = new_data
# del new_data
# len(all_data)
# all_data.to_csv('../data/new_data.csv', index=False)

### postprocessing

In [4]:
# df = pd.read_csv('../data/step1_clear_data.csv')

In [5]:
# def calc_avg_velocity(df, interval=20):
#     if 'velocity' not in df.columns:
#         df = calc_velocity(df)
        
#     df['avg_velocity'] = df['velocity'].rolling(interval, min_periods=1).mean()
#     return df

# def find_forwards_segment(df, exc=1.1, rotation_threshold=1, new_num=0.1):
#     assert exc is not None, 'rewrite function find_forwards_segment first'
#     tmp = df[df['exc_num']==exc]
#     tmp = tmp.loc[(tmp.Lx>0) & ((tmp.Az<rotation_threshold) | (tmp.Az>-rotation_threshold))]
#     tmp['exc_num'] = new_num
#     df = pd.concat([df, tmp])
#     df = df.reset_index(drop=True)
#     del tmp
#     return df

In [6]:
# df = calc_avg_velocity(df)
# # df = find_forwards_segment(df)
# df

In [7]:
# df.to_csv('../data/step1_clear_data.csv', index=False)

### load precleared data

In [8]:
all_data = pd.read_csv('../data/data2_path.csv')
print(all_data.columns)
# print(all_data[(all_data['day']==1) & 
#                (all_data['uid']==7) & 
#                (all_data['exc_num']==1.1) & 
#                (all_data['exc_times']==1)][['Az']])
print(all_data['exc_num'].unique())

Index(['Ax', 'Ay', 'Az', 'DFx', 'DFy', 'DFz', 'DMx', 'DMy', 'DMz', 'Fx', 'Fy',
       'Fz', 'Lx', 'Ly', 'Lz', 'Mx', 'My', 'Mz', 'day', 'exc_num', 'exc_times',
       'front', 'left', 'path.x', 'path.x1', 'path.x2', 'path.y', 'path.y1',
       'path.y2', 'pose.theta', 'pose.x', 'pose.y', 'record.t', 'right', 't',
       'uid', 'datetime', 'pre_dist', 'path_diff', 'pre_x', 'pre_y', 'curr_x',
       'curr_y', 'curr_dist'],
      dtype='object')
[ 1.1  1.5  2.1  2.2  3.1  3.2  4.1  4.2]


In [9]:
# print(all_data[(all_data['day']==1) & 
#                (all_data['uid']==7) & 
#                (all_data['exc_num']==1.1) & 
#                (all_data['exc_times']==2)][['Az', 'angle_velocity_xy', 'avg_av_xy']])

In [10]:
# test = all_data[(all_data['day']==1) & 
#                ((all_data['uid']==7) | (all_data['uid']==4)) & 
#                (all_data['exc_num']==1.1) & 
#                (all_data['exc_times']==1)]
# test = test.groupby(['day', 'uid', 'exc_num', 'exc_times']).head().reset_index()
# print(test.head())
# test['test'] = test.groupby(['day', 'uid', 'exc_num', 'exc_times']).rolling(100).mean().reset_index(drop=True)['Az']
# # test['test'] = test['Az'].rolling(2).mean()
# test.groupby(['day', 'uid', 'exc_num', 'exc_times']).head()\
#     [['day', 'uid', 'exc_num', 'exc_times','Az', 'test']]



In [11]:
# define parameters
test_col = 'inv_force'
diff_def = [#'v_angle', #'v_skew',  
            #'v_kurt',
            #'sum_va',
            #'avg_av_xy', 
            'inverted', 
            'sidewards',
            'disturbed', 'backward', 'path_diff']
mapper_cols = [#'v_angle', #'v_skew', #'v_kurt', 
               #'sum_va',
               #'avg_av_xy'
              ]

mapper_levels = [#5, #10, 
                 #5,
                 #5
                ]
n_class = mapper_levels+[2, 2, 2, 2, 3]
group_unit = ['day', 'uid', 'exc_num', 'exc_times']
group_cols = group_unit + [test_col]
diff_cols = group_cols+diff_def

selected_cols = group_cols+['difficulty']
necessary_cols = ['deviation', 'velocity' #, 'conf'
                 ]
selected_cols1 = add_necessary_cols(diff_cols, necessary_cols)
selected_cols = add_necessary_cols(selected_cols, necessary_cols)
b_del_outlier = False
def_env(test_col, diff_def, n_class, group_cols, b_del_outlier)

In [12]:
all_data = get_small_deviation(all_data)
all_data = calc_reciprocal_deviation(all_data, bias=0.2)
print(all_data.shape)
all_data = all_data.drop(['left', 'right'], axis=1)
all_data = calc_force(all_data)
print('main')
print(len(all_data))
print(len(all_data.dropna()))
all_data = calc_velocity(all_data)
# all_data = calc_resultent_force(all_data)
# all_data = calc_resultent_force_xy(all_data)
# all_data = calc_velocity_xy(all_data)
# all_data = calc_force_velocity_angle_xy(all_data)
# all_data = calc_abs_force_velocity_angle_xy(all_data)
# all_data = calc_torque_xy(all_data)
all_data = calc_torque_turning_freq(all_data)
all_data = calc_inverted_tt_freq(all_data, bias=1)
all_data = calc_inverted_force(all_data, bias=1)
# all_data = calc_assess_feature(all_data, c=0.3)
# all_data = calc_velocity_angle(all_data)
# all_data = calc_velocity_angle_without_backward(all_data)
# stop()
# all_data = calc_sd_velocity(all_data, by=group_unit, dropna=False)
# all_data = calc_velocity_skew(all_data, by=group_unit, dropna=False)
# all_data = calc_velocity_kurtosis(all_data, by=group_unit, dropna=False)
# all_data = calc_torque_xy_avg(all_data, dropna=False)
# all_data = calc_avg_velocity(all_data, by=group_unit, dropna=False)
# all_data = calc_avg_angle_velocity_xy(all_data, by=group_unit, dropna=False)
# all_data = calc_va_sum(all_data, by=group_unit, dropna=False)
all_data = set_inverted_task(all_data)
all_data = set_sidewards_task(all_data)
all_data = is_disturbed(all_data)
all_data = is_backward(all_data)
# all_data = calc_confidence(all_data)
# all_data = calc_velocity_deviation_score(all_data, a=1)
# all_data = calc_score(all_data)
# all_data = all_data.dropna()
print('main')
print(len(all_data))
# print(all_data['sum_va'].describe())
# sns.set_style('whitegrid')
# f, ax= plt.subplots(figsize = (14, 10))
# ax = sns.violinplot(y='sum_va', data=all_data)
# ax.set_title(index)
# plt.show()
# all_data = del_outlier(all_data, None, 'v_skew')
# all_data = del_outlier(all_data, None, 'v_kurt')
# all_data = all_data[selected_cols1]
# all_data = difficulty_mapper(all_data, cols=mapper_cols, levels=mapper_levels)
all_data = set_difficulty(all_data, def_cols=diff_def, levels=None)

if b_del_outlier:
    all_data = del_outlier(all_data, 
                ['difficulty', 'uid', 'day', 'exc_num', 'exc_times'], 
                test_col=test_col)
all_data = all_data.reset_index(drop=True)
all_data = all_data[selected_cols]
# groups = all_data[all_data['exc_num'] == 1.1].groupby(['day', 'uid', 'exc_num', 'exc_times'])
print(all_data.head())

print(all_data.shape)

print('exc and difficulty')
for index, group in all_data.groupby(['exc_num']):
    print(index)
    print(group['difficulty'].unique())

get small deviation
(347972, 44)
(317966, 45)
(317966, 46)
main
317966
308324
torque turning freq:
(306505, 46)
(297707, 48)
main
297707
set difficulty
297707
        difficulty                                   diff_description
0                1  _inverted0_sidewards0_disturbed0_backward0_pat...
94295            4  _inverted0_sidewards0_disturbed0_backward1_pat...
128049           7  _inverted0_sidewards0_disturbed1_backward0_pat...
134190          10  _inverted0_sidewards0_disturbed1_backward1_pat...
136609           2  _inverted0_sidewards0_disturbed0_backward0_pat...
194712           5  _inverted0_sidewards0_disturbed0_backward1_pat...
200172           6  _inverted0_sidewards0_disturbed0_backward1_pat...
209537           3  _inverted0_sidewards0_disturbed0_backward0_pat...
220988           8  _inverted0_sidewards0_disturbed1_backward0_pat...
222624          13  _inverted1_sidewards1_disturbed0_backward0_pat...
236517          15  _inverted1_sidewards1_disturbed0_backward1_pat...
2

In [13]:
all_data[all_data['difficulty']==10]

Unnamed: 0,day,uid,exc_num,exc_times,inv_force,difficulty,deviation,velocity
134190,1.0,1.0,1.5,1.0,466.849795,10,0.028111,0.339451
134191,1.0,1.0,1.5,1.0,457.763586,10,0.028111,0.339451
134192,1.0,1.0,1.5,1.0,454.741457,10,0.028111,0.339451
134193,1.0,1.0,1.5,1.0,452.293833,10,0.028111,0.339451
134194,1.0,1.0,1.5,1.0,456.637459,10,0.028111,0.339451
134195,1.0,1.0,1.5,1.0,461.555557,10,0.028111,0.359820
134196,1.0,1.0,1.5,1.0,468.070467,10,0.028111,0.359820
134197,1.0,1.0,1.5,1.0,487.784050,10,0.028111,0.359820
134198,1.0,1.0,1.5,1.0,484.602347,10,0.028111,0.359820
134199,1.0,1.0,1.5,1.0,469.010489,10,0.028111,0.422794


In [14]:
# tmp_col = 'v_sd'
# # tmp = all_data[(all_data['day']==1) & 
# #                (all_data['uid']==7) & 
# #                (all_data['exc_num']==1.1) & 
# #                (all_data['exc_times']==2)][tmp_col]
# tmp = all_data.groupby(group_unit).head()
# print(tmp)
# del tmp
# # print(tmp['Az'].rolling(100).mean().head())


### delete outlier 

In [15]:

# print(len(all_data))
# all_data = all_data.dropna()
# print(len(all_data))
# # 
# l1 = len(all_data[['uid', 'day', 'exc_num', 'exc_times']].drop_duplicates())
# print(l1)
# upper = all_data[test_col].quantile(0.999)
# print(upper)
# outlier = all_data[all_data[test_col]>upper]
# outlier = outlier[['uid', 'day', 'exc_num', 'exc_times']].drop_duplicates()
# all_data = all_data[all_data[test_col]<=upper]
# all_data,_ = ability_level_mapper(all_data, col=test_col, n_level=19, target_col_name='perf', invert=True)
# print('outlier')
# print(len(outlier))
# print(len(outlier)/l1)
# print(all_data[[test_col, 'perf']].describe())
# # del all_data
# del outlier

In [16]:
print(all_data.head())
all_data.to_csv('../data/step1_clear_data.csv', index=False)

   day  uid  exc_num  exc_times   inv_force  difficulty  deviation  velocity
0  1.0  1.0      1.1        1.0  484.201507           1   0.005096  0.113868
1  1.0  1.0      1.1        1.0  486.018444           1   0.005096  0.113868
2  1.0  1.0      1.1        1.0  488.478646           1   0.005096  0.113868
3  1.0  1.0      1.1        1.0  489.455159           1   0.005096  0.113868
4  1.0  1.0      1.1        1.0  494.271205           1   0.005096  0.113868


### test area

In [17]:
print(all_data.avg_av_xy.describe())
list1 = all_data[all_data['exc_num']==1.1]['difficulty'].unique()
list2 = all_data[all_data['exc_num']==2.1]['difficulty'].unique()
l1 = len(list1)
l3 = len(list2)
print('diff in 1.1: '+str(l1))
print('diff in 2.1: '+str(l3))
common = list(set(list1).intersection(list2))
print('the common diff: ' +str(len(common)))
print('percentage: ' +str(len(common)/l1),str(len(common)/l3))

AttributeError: 'DataFrame' object has no attribute 'avg_av_xy'

In [None]:
# tmp = all_data[all_data.v_angle>3.1]
# col = 'avg_av_xy'
# tmp = all_data.loc[#((all_data.uid==1) | (all_data.uid==3)) & 
#                    ((all_data.exc_num==1.3) | #(all_data.exc_num==2.3) | 
#                                                               (all_data.exc_num==1.1))]

# # tmp = all_data.loc[((all_data.uid==1) | (all_data.uid==3))]
# # tmp = tmp.loc[tmp.exc_num!=4.2]
# sns.set_style('whitegrid')
# f, ax= plt.subplots(figsize = (14, 10))
# ax = sns.violinplot(x="exc_num", y=col, hue='uid', data=tmp)

# tmp = all_data.loc[((all_data.uid==1) | (all_data.uid==11)) & ((all_data.exc_num==1.3) | #(all_data.exc_num==2.3) | 
#                                                              (all_data.exc_num==1.1))]
# sns.set_style('whitegrid')
# f, ax= plt.subplots(figsize = (14, 10))
# ax = sns.violinplot(x="exc_num", y=col, hue='uid', data=tmp)


In [None]:
# # del tmp
# # tmp = all_data.copy()
# tmp = all_data.loc[#((all_data.uid==2) | (all_data.uid==6)) 
#                     (all_data.day==2) 
# #                    & (all_data.exc_num==1.3) 
# #                    & (all_data.exc_times==1)
#                   ]
# cnt = 0
# by_cols = ['v_angle', 'v_sd',
#            'torque_xy_avg', 'inverted', 'day', 'exc_num', 'exc_times'
#           ]
# for index,group in tmp.groupby(by_cols):
    
#     if len(group)<100 or len(group['uid'].unique())<8:
#         continue
#     print(len(group))
#     cnt += 1
#     if cnt>10: continue
#     group = group.reset_index()
#     sns.set_style('whitegrid')
#     f, ax= plt.subplots(figsize = (14, 10))
#     ax = sns.violinplot(x="v_angle", y='velocity', hue='uid', data=group)
#     ax.set_title(index)
#     plt.show()

In [None]:
stop()

In [None]:
col = 'sum_va'
tmp = all_data.loc[(all_data.uid==3) & (all_data.day==1) & (all_data.exc_num==1.3) & (all_data.exc_times==1)]
sns.set_style('whitegrid')
f, ax= plt.subplots(figsize = (14, 10))
ax = sns.lineplot(x=tmp.index, y=col, data=tmp)

tmp = all_data.loc[((all_data.uid==4)) 
                   & (all_data.day==1) & (all_data.exc_num==1.1) & (all_data.exc_times==1)]
sns.set_style('whitegrid')
f, ax= plt.subplots(figsize = (14, 10))
ax = sns.lineplot(x=tmp.index, y=col, data=tmp)

In [None]:
# # parameters
# target_col = 'velocity'
# target_exc = None
# cnt = 0


# if target_exc is None:
#     groups = all_data[['day', 'uid', 'exc_num', 'exc_times', target_col]].groupby(['day', 'uid', 'exc_num', 'exc_times'])
# else:
#     tmp = all_data[all_data['exc_num']==target_exc]
#     assert len(tmp)>10, 'empty data'
#     groups = tmp[['day', 'uid', 'exc_num', 'exc_times', target_col]].groupby(['day', 'uid', 'exc_num', 'exc_times'])
#     del tmp



# for index, group in groups:
#     if cnt>10:
#         break
#     cnt += 1
    
# #     print(index)
#     tmp = group.reset_index(drop=True)
#     tmp = tmp.reset_index()
# #     print(group)
#     sns.set_style('whitegrid')
#     f, ax= plt.subplots(figsize = (14, 10))
#     ax = sns.lineplot(x="index", y=target_col, data=tmp)
#     ax.set_title(index)

### reduce initial deviation

In [None]:
def reduce_header(data, head=500, has_tail=False):
    
    groups = data.set_index(['day', 'exc_num', 'exc_times', 'uid'])
    groups = groups.groupby(groups.index)
    result = pd.DataFrame()
    
    for index, group in groups:
        
        if has_tail:
            assert len(group)>(head*2), 'outlier'
        else:
            assert len(group)>head, 'outlier'
            
        header = [i+1 for i in xrange(head)]
        if has_tail:
            tailer = header[::-1]
            middle = [head for i in xrange(len(group)-head*2)]
            weights = header+middle+tailer

        else:
            middle = [head for i in xrange(len(group)-head)]

            weights = header+middle
            
        weights = pd.Series(weights)    
        weights = weights/head
        assert len(group)==len(weights), 'different length between group and weights'
        curr = group.reset_index()
        curr['front'] =curr['front']*weights
        result = pd.concat([result, curr])
        del curr
        
    return result.reset_index(drop=True)
                            

# new_data = reduce_header(new_data,  head=300, has_tail=True)


In [None]:

print(len(new_data))
new_data.head()

In [None]:
new_data.to_csv('../data/step1_clear_data.csv', index=False)