In [None]:
# functions for grooming analyses 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import os
import seaborn as sns
import umap
from collections import OrderedDict
from scipy import signal, stats, interpolate

In [None]:
# for interpolation
def medfilt_data(values, size=15):
    """Median-filter a 1-D signal, padding by reflection before filtering.

    Parameters
    ----------
    values : array-like
        Samples to smooth.
    size : int
        Kernel size handed to ``scipy.signal.medfilt``.

    Returns
    -------
    numpy.ndarray
        Filtered samples with the padding removed again.
    """
    # Pad a little more than the kernel width on both sides.
    margin = size + 5
    padded = np.pad(values, (margin, margin), mode='reflect')
    filtered = signal.medfilt(padded, kernel_size=size)
    # Drop the padded margins so only the original span is returned.
    return filtered[margin:-margin]

def nan_helper(y):
    """Return a NaN mask for ``y`` and a function mapping a boolean mask to indices.

    Returns
    -------
    (numpy.ndarray, callable)
        ``np.isnan(y)`` and a callable ``f(mask)`` that returns the first
        element of ``mask.nonzero()`` (the indices of the True entries).
    """
    def mask_to_indices(mask):
        return mask.nonzero()[0]

    return np.isnan(y), mask_to_indices

def interpolate_data(vals):
    """Fill NaN samples of ``vals`` by linear interpolation over the non-NaN ones.

    Parameters
    ----------
    vals : numpy.ndarray
        Samples, possibly containing NaN. The input is not modified.

    Returns
    -------
    numpy.ndarray
        A copy of ``vals`` with NaNs replaced by ``np.interp`` values. If
        ``np.interp`` raises ``ValueError`` the whole output is set to 0.
    """
    missing, to_index = nan_helper(vals)
    present = ~missing
    filled = np.copy(vals)
    try:
        filled[missing] = np.interp(to_index(missing), to_index(present), vals[present])
    except ValueError:
        # Best-effort fallback (e.g. no usable samples to interpolate from).
        filled[:] = 0
    return filled

In [1]:
def exclude_erroneous_points(df, bodyparts, max_error=15):
    """Blank out coordinates whose error column exceeds a threshold.

    For every name in ``bodyparts`` the rows where ``<bodypart>_error`` is
    greater than ``max_error`` get ``<bodypart>_x``, ``_y`` and ``_z`` set
    to NaN. Rows whose error is NaN are left untouched (the comparison is
    False for NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``<bodypart>_error`` and ``<bodypart>_{x,y,z}`` columns.
        Modified in place.
    bodyparts : iterable of str
        Column-name prefixes to process.
    max_error : float, optional
        Error threshold; defaults to 15, the value previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for bodypart in bodyparts:
        # Boolean mask aligned on the index, so this works for any index
        # (the old per-row ``set_value`` loop needed pandas < 1.0 and a
        # 0..N-1 index).
        too_large = df[bodypart + '_error'] > max_error
        coord_cols = [bodypart + '_' + axis for axis in 'xyz']
        df.loc[too_large, coord_cols] = np.nan
    return df

def normalize_data(df, expected_length=0.4556788699978114):
    """Rescale all leg coordinates so the mean L1C-L1D distance matches a reference.

    The mean Euclidean distance between the ``L1C`` and ``L1D`` points is
    measured over all rows (rows with NaN coordinates are ignored), and every
    ``<bodypart>_{x,y,z}`` column for legs L1-L3 / R1-R3, joints A-E, is
    multiplied by ``expected_length / measured_length``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``_x``, ``_y``, ``_z`` columns for all 30 body parts.
        Modified in place.
    expected_length : float, optional
        Target L1C-L1D length. The default is the constant previously
        hard-coded here (its provenance is not recorded in this file).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, rescaled.
    """
    bodyparts = [side + str(pair) + joint
                 for side in 'LR' for pair in (1, 2, 3) for joint in 'ABCDE']

    l1c = df[['L1C_' + axis for axis in 'xyz']].to_numpy(dtype=float)
    l1d = df[['L1D_' + axis for axis in 'xyz']].to_numpy(dtype=float)
    # nanmean: a single NaN frame must not turn the ratio (and thus the
    # whole dataset) into NaN.
    measured_length = np.nanmean(np.linalg.norm(l1c - l1d, axis=1))
    length_ratio = expected_length / measured_length

    # Assign through the frame itself; ``df.iloc[0:][col] = ...`` is chained
    # assignment and is not guaranteed to write back into ``df``.
    coord_cols = [bodypart + '_' + axis for bodypart in bodyparts for axis in 'xyz']
    df[coord_cols] = df[coord_cols] * length_ratio

    return df

# parse fly summary spreadsheet
def clean_summary(prefix, f_in, f_out):
    """Copy a CSV file row by row, dropping bytes that are not valid UTF-8.

    Parameters
    ----------
    prefix : str
        Directory that both file names are joined onto.
    f_in : str
        Name of the CSV file to read (decoded as UTF-8, undecodable bytes
        are ignored).
    f_out : str
        Name of the CSV file to write.

    Returns
    -------
    None
    """
    # Local import: ``csv`` is not part of the notebook's import cell.
    import csv

    fname_in = os.path.join(prefix, f_in)
    fname_out = os.path.join(prefix, f_out)
    # newline='' is what the csv module expects for both readers and writers;
    # without it the writer can emit blank lines on some platforms.
    with open(fname_in, 'r', encoding='utf-8', errors='ignore', newline='') as infile, \
            open(fname_out, 'w', encoding='utf-8', newline='') as outfile:
        csv.writer(outfile).writerows(csv.reader(infile))

In [2]:
def some_contains(v, L):
    """Return True if at least one element of ``L`` is contained in ``v``."""
    return any(name in v for name in L)

def get_kde_vals(arr, xvals=None):
    """Evaluate a Gaussian KDE (Scott bandwidth) of the non-NaN entries of ``arr``.

    Parameters
    ----------
    arr : numpy.ndarray
        Samples; NaN entries are dropped before fitting.
    xvals : array-like, optional
        Points at which to evaluate the density. When omitted, an
        ``np.linspace`` grid from ``min - 1`` to ``max + 1`` of the samples
        is used.

    Returns
    -------
    (xvals, yvals)
        Evaluation points and the density at those points.
    """
    samples = arr[~np.isnan(arr)]
    density = stats.gaussian_kde(samples, bw_method='scott')

    grid = np.linspace(np.min(samples) - 1, np.max(samples) + 1) if xvals is None else xvals

    return grid, density.evaluate(grid)

# returns dictionary that maps bout number to fly id
def get_fly_id(data, bout_numbers):
    """Map each bout number to the ``flyid`` of the first row of that bout.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``behavior_bout`` and ``flyid`` columns.
    bout_numbers : sequence
        Bout numbers to look up; each must occur in ``data``.

    Returns
    -------
    dict
        ``{bout_number: flyid}``.
    """
    bout_to_fly = {}
    for bout in bout_numbers:
        bout_rows = data[data.behavior_bout == bout]
        bout_to_fly[bout] = bout_rows.iloc[0].flyid
    return bout_to_fly

# determine what video corresponds to each bout
def get_videos(bout_numbers, labels_df):
    """Map each bout number to the ``filename`` of the first row of that bout.

    Parameters
    ----------
    bout_numbers : sequence
        Bout numbers to look up; each must occur in ``labels_df``.
    labels_df : pandas.DataFrame
        Needs ``behavior_bout`` and ``filename`` columns.

    Returns
    -------
    dict
        ``{bout_number: filename}``.
    """
    bout_to_video = {}
    for bout in bout_numbers:
        bout_rows = labels_df[labels_df.behavior_bout == bout]
        bout_to_video[bout] = bout_rows.iloc[0].filename
    return bout_to_video

# lists fly number (and day)
def get_flies(fly_ids):
    """Assign a consecutive integer (from 0) to every distinct fly id.

    Parameters
    ----------
    fly_ids : dict
        Mapping whose values are fly ids (e.g. the output of a
        bout -> fly lookup).

    Returns
    -------
    dict
        ``{fly_id: position}`` where position is the rank of the id in the
        sorted ``np.unique`` of the values.
    """
    distinct = np.unique(list(fly_ids.values()))
    return {fly: rank for rank, fly in enumerate(distinct)}

# get the angle names to analyze (use _BC instead of _abduct for now)
def get_angle_names(angles, angle_types, only_t1=False):
    """Collect the column names of ``angles`` that contain any of ``angle_types``.

    Parameters
    ----------
    angles : pandas.DataFrame
        Frame whose column names are searched.
    angle_types : iterable of str
        Substrings to look for; matches are gathered type by type.
    only_t1 : bool
        When True, keep only names that also contain the character ``'1'``.

    Returns
    -------
    numpy.ndarray
        Matching names, grouped in the order of ``angle_types``.
    """
    columns = list(angles.columns)
    selected = np.array([])
    for kind in angle_types:
        matches = [col for col in columns if (not only_t1 or '1' in col) and kind in col]
        selected = np.append(selected, matches)
    # angle_names = angle_names + ['fictrac_speed', 'fictrac_rot']
    return selected

# adjust _rot angles so there are no discontinuities
#def adjust_rot_angles(angles, angle_names):
#    rot_angs = [r for r in angle_names if '_rot' in r or '_abduct' in r]
#    for ang in rot_angs:
#        r = np.array(angles[ang])
#       r[r > 50] = r[r > 50] - 360
#        angles[ang] = r
#    return angles

def adjust_rot_angles(angles, angle_names):
    """Shift selected angle columns down by 360 where they exceed a cutoff.

    ``_rot`` columns are matched against a list of name fragments, each with
    its own cutoff; values above the cutoff have 360 subtracted. Columns
    containing ``_abduct`` or ``A_flex`` use a fixed cutoff of 50.

    Parameters
    ----------
    angles : pandas.DataFrame
        Angle table; the affected columns are overwritten in place.
    angle_names : sequence of str
        Column names to consider.

    Returns
    -------
    pandas.DataFrame
        The same ``angles`` object.
    """
    # (name fragment, cutoff) pairs; a column matching several fragments is
    # processed once per matching fragment, in this order.
    fragments = ['2', '3', 'L1A', 'L1B', 'L1C', 'R1A', 'R1B', 'R1C']
    cutoffs = np.array([-50, -20, 20, -70, 10, 20, 70, -30])
    for fragment, cutoff in zip(fragments, cutoffs):
        matching = [name for name in angle_names if '_rot' in name and fragment in name]
        for name in matching:
            values = np.array(angles[name])
            above = values > cutoff
            values[above] = values[above] - 360
            angles[name] = values

    for name in [n for n in angle_names if '_abduct' in n or 'A_flex' in n]:
        values = np.array(angles[name])
        above = values > 50
        values[above] = values[above] - 360
        angles[name] = values

    return angles
        
def correct_angles(data, angle_names):
    """Make ``C_flex`` angles non-negative and swap ``A_abduct`` with ``A_flex``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``<leg>A_abduct`` and ``<leg>A_flex`` for the six legs
        L1-L3 and R1-R3. Modified in place.
    angle_names : sequence of str
        Names searched for ``C_flex`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    for name in angle_names:
        if 'C_flex' in name:
            # Multiplying by the sign flips negative values to positive.
            data[name] *= np.sign(data[name])

    for leg in ('L1', 'L2', 'L3', 'R1', 'R2', 'R3'):
        abduct_col = '{}A_abduct'.format(leg)
        flex_col = '{}A_flex'.format(leg)
        # Copy both columns out first, then write them back crossed over.
        abduct_vals = np.array(data[abduct_col])
        flex_vals = np.array(data[flex_col])
        data[abduct_col] = flex_vals
        data[flex_col] = abduct_vals

    return data

# removes grooming bouts from dataset that are less than a specified 
# number of frames (too short to analyze)
def remove_short_bouts(data, min_frames):
    """Keep only the rows of bouts that have at least ``min_frames`` rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a ``behavior_bout`` column.
    min_frames : int
        Minimum number of rows a bout must have to be kept.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` belonging to sufficiently long bouts.
    """
    bouts = np.unique(data.behavior_bout)
    frame_counts = np.array([len(data[data.behavior_bout == bout]) for bout in bouts],
                            dtype=float)
    long_enough = bouts[frame_counts >= min_frames]
    return data[data.behavior_bout.isin(long_enough)]

# return dict mapping bout number to bout lengths
def get_bout_lengths(data):
    """Return ``{bout_number: number_of_rows}`` for every bout in ``data``.

    Bout numbers are the integer-cast unique values of ``data.behavior_bout``.
    """
    bouts = np.unique(data.behavior_bout).astype(int)
    return {bout: len(data[data.behavior_bout == bout]) for bout in bouts}

# assign a unique bout number to each bout (previously had duplicates due
# to running experiments on different days)
def adjust_bout_numbers(data):
    """Renumber bouts so every (date, bout) pair gets a unique number from 1.

    Dates are processed in sorted order; within a date the bouts are
    numbered in sorted order of their old numbers, continuing from where the
    previous date stopped.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``date`` and ``behavior_bout`` columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows grouped by date (original index labels
        kept) and ``behavior_bout`` replaced by the new numbers.
    """
    if len(data) == 0:
        return data.copy()

    renumbered = []
    next_bout = 1
    for date in np.unique(data.date):
        # Explicit copy: we are about to overwrite a column of this slice.
        subset = data[data['date'] == date].copy()
        old_values = subset['behavior_bout'].to_numpy()

        # ``inverse`` gives, for every row, the rank of its old bout number.
        # Mapping all rows in one step avoids the collisions that chained
        # replace() calls produce when new numbers overlap old ones
        # (e.g. 1->2 followed by 2->3 would merge two bouts).
        old_numbers, inverse = np.unique(old_values, return_inverse=True)
        new_values = next_bout + np.ravel(inverse)
        if np.issubdtype(old_values.dtype, np.floating):
            # Keep a float column float, as replace() used to.
            new_values = new_values.astype(old_values.dtype)
        subset['behavior_bout'] = new_values

        next_bout += len(old_numbers)
        renumbered.append(subset)

    # Single concat instead of growing a frame inside the loop.
    return pd.concat(renumbered)

# determine which flies we have the most data for, then sort by flies with the most data
def data_per_fly(data):
    """Total the number of rows per fly and rank flies by that total.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``behavior_bout`` and ``flyid`` columns.

    Returns
    -------
    (dict, list)
        ``{fly: total_rows}`` and the fly names sorted from most to fewest
        rows.
    """
    bouts = np.unique(data.behavior_bout)
    bout_to_fly = get_fly_id(data, bouts)
    frames_per_fly = dict()
    for bout in bouts:
        fly = bout_to_fly[bout]
        n_frames = len(data[data.behavior_bout == bout])
        frames_per_fly[fly] = frames_per_fly.get(fly, 0) + n_frames
    ranked_flies = sorted(frames_per_fly, key=frames_per_fly.get, reverse=True)
    return frames_per_fly, ranked_flies

# get all videos of t1 grooming from a given fly
def fly_to_video(data):
    """Return ``{flyid: unique filenames}`` for every fly in ``data``.

    ``data`` needs ``flyid`` and ``filename`` columns; each value is the
    ``np.unique`` array of filenames recorded for that fly.
    """
    return {fly: np.unique(data[data.flyid == fly].filename)
            for fly in np.unique(data.flyid)}

# assign a different color to each fly
def get_fly_colors(fly_ids, colors):
    """Return one color per bout, chosen by the fly the bout belongs to.

    Parameters
    ----------
    fly_ids : dict
        ``{bout_number: fly_id}``.
    colors : sequence
        Indexed by the integer that ``get_flies`` assigns to each fly.

    Returns
    -------
    list
        Colors ordered by ascending bout number.
    """
    # ``get_flies`` lives in this file; the previous ``fxn.`` prefix referred
    # to a name that is never defined here.
    flies = get_flies(fly_ids)
    # Iterate the actual bout numbers rather than assuming keys 1..N, which
    # breaks as soon as bouts have been filtered or renumbered.
    return [colors[flies[fly_ids[bout]]] for bout in sorted(fly_ids)]

def ymax_coords_corr(df, legs, joints, coords):
    """Find the extreme cross-correlation values over all joint/coordinate pairs.

    For every coordinate suffix and joint, the columns
    ``legs[0] + joint + coord`` and ``legs[1] + joint + coord`` are
    normalised and cross-correlated.

    Returns
    -------
    (ymin, ymax)
        Smallest and largest correlation value seen; both start at 0.
    """
    lowest = 0
    highest = 0
    for coord in coords:
        for joint in joints:
            first = normalize_corr(df.iloc[0:][legs[0] + joint + coord])
            second = normalize_corr(df.iloc[0:][legs[1] + joint + coord])
            xcorr = signal.correlate(first, second)
            highest = max(highest, np.amax(xcorr))
            lowest = min(lowest, np.amin(xcorr))
    return lowest, highest

def ymax_angles_corr(legs, joints, df):
    """Find the extreme cross-correlation values over all joints.

    For every joint, the columns ``legs[0] + joint`` and ``legs[1] + joint``
    are normalised and cross-correlated.

    Returns
    -------
    (ymin, ymax)
        Smallest and largest correlation value seen; both start at 0.
    """
    lowest = 0
    highest = 0
    for joint in joints:
        first = normalize_corr(df.iloc[0:][legs[0] + joint])
        second = normalize_corr(df.iloc[0:][legs[1] + joint])
        xcorr = signal.correlate(first, second)
        highest = max(highest, np.amax(xcorr))
        lowest = min(lowest, np.amin(xcorr))
    return lowest, highest
    
# normalizing data for cross correlation
def normalize_corr(data):
    """Return ``data`` as an array with its mean subtracted, divided by its std."""
    values = np.array(data)
    centered = values - np.mean(values)
    return centered / np.std(values)

# given a leg and a joint angle, computes the mean angle for each bout
# mean_angles = mean_joint_angles('L1', 'CF', bout_numbers, df_t1_angles, x)
def mean_joint_angles(leg, joint, bout_numbers, df_t1_angles, labels_df):
    """Compute the mean of column ``leg + '_' + joint`` for each bout.

    Rows of ``df_t1_angles`` are selected with the mask
    ``labels_df.behavior_bout == bout``.

    Returns
    -------
    dict
        Keyed by the 1-based position of the bout in ``bout_numbers`` (not
        by the bout number itself).
    """
    column = leg + '_' + joint
    means = dict()
    for position, bout in enumerate(bout_numbers, start=1):
        bout_rows = df_t1_angles[labels_df.behavior_bout == bout]
        means[position] = np.mean(bout_rows.iloc[0:][column])
    return means
      
# given a leg, joint, and coordinate, computes the mean position for each bout 
# mean_coords = mean_joint_coords('L1', 'A', 'x', bout_numbers, df_t1, x)
def mean_joint_coords(leg, joint, coord, bout_numbers, df_t1, labels_df):
    """Compute the mean of column ``leg + joint + '_' + coord`` for each bout.

    Rows of ``df_t1`` are selected with the mask
    ``labels_df.behavior_bout == bout``.

    Returns
    -------
    dict
        Keyed by the 1-based position of the bout in ``bout_numbers`` (not
        by the bout number itself).
    """
    column = leg + joint + '_' + coord
    means = dict()
    for position, bout in enumerate(bout_numbers, start=1):
        bout_rows = df_t1[labels_df.behavior_bout == bout]
        means[position] = np.mean(bout_rows.iloc[0:][column])
    return means

# offset from origin (0)
def origin_offset(leg, joint, coord, bout_numbers, df_t1, labels_df):
    """Return the first non-NaN value of ``leg + joint + '_' + coord`` per bout.

    Rows of ``df_t1`` are selected with the mask
    ``labels_df.behavior_bout == bout``.

    Returns
    -------
    dict
        Keyed by the 1-based position of the bout in ``bout_numbers`` (not
        by the bout number itself).
    """
    column = leg + joint + '_' + coord
    offsets = dict()
    for position, bout in enumerate(bout_numbers, start=1):
        bout_rows = df_t1[labels_df.behavior_bout == bout]
        values = np.array(bout_rows.iloc[0:][column])
        finite = values[~np.isnan(values)]
        offsets[position] = finite[0]
    return offsets

# function to find peaks of time series data and calculate mean time between peaks
# (can find mean interval for troughs if the negative of the data is passed in)
def mean_peak_interval(data, fps, thresh = None, dist = None):
    """Measure the time between successive peaks of a signal.

    Parameters
    ----------
    data : array-like
        1-D samples; non-finite entries are dropped before peak detection.
    fps : float
        Sampling rate, used to convert index differences to seconds.
    thresh : float, optional
        Passed to ``scipy.signal.find_peaks`` as ``height``.
    dist : int, optional
        Passed to ``scipy.signal.find_peaks`` as ``distance``.

    Returns
    -------
    (mean_interval, stderr_interval, intervals)
        Mean and standard error of the inter-peak intervals in seconds, and
        the intervals themselves. Mean and standard error are NaN when fewer
        than two peaks are found.
    """
    # Work on a plain array: boolean-filtering a pandas Series keeps its
    # labels, which made later integer indexing label-based.
    values = np.asarray(data, dtype=float)
    values = values[np.isfinite(values)]
    peak_idxs, _ = signal.find_peaks(values, height = thresh, distance = dist)
    intervals = np.diff(peak_idxs) / fps # in seconds
    if intervals.size == 0:
        # Same result as before (NaN, NaN) but without numpy warnings.
        return np.nan, np.nan, intervals
    mean_interval = np.nanmean(intervals)
    stderr_interval = np.nanstd(intervals) / np.sqrt(len(intervals))
    return mean_interval, stderr_interval, intervals

def get_envelope(data, dist = None, upper = True):
    """Build a cubic-interpolated envelope through the peaks of ``data``.

    Parameters
    ----------
    data : numpy.ndarray
        1-D signal.
    dist : int, optional
        Passed to ``scipy.signal.find_peaks`` as ``distance``.
    upper : bool
        If falsy, peaks of ``-data`` (i.e. troughs) are used instead.

    Returns
    -------
    (env, first_idx, last_idx)
        The envelope evaluated at every sample index, plus the indices of
        the first and last detected peak.
    """
    # A falsy ``upper`` flips the signal so troughs are detected as peaks.
    sign = upper if upper else -1

    envelope = np.zeros(data.shape)
    extrema = signal.find_peaks(sign*data, distance = dist)[0]

    # Anchor the interpolation at both ends of the signal.
    knot_x = np.concatenate(([0], extrema, [len(data)-1]))
    knot_y = np.concatenate(([data[0]], data[extrema], [data[-1]]))
    spline = interpolate.interp1d(knot_x, knot_y, kind = 'cubic', bounds_error = False, fill_value=0.0)

    for sample in range(len(data)):
        envelope[sample] = spline(sample)

    return envelope, extrema[0], extrema[-1]

def get_envelope_alt(data, dist = None, upper = True):
    """Build a cubic-interpolated envelope, tolerating signals with few peaks.

    Works like a peak envelope, but when three or fewer peaks are found the
    envelope is left as all zeros instead of being interpolated.

    Parameters
    ----------
    data : numpy.ndarray
        1-D signal.
    dist : int, optional
        Passed to ``scipy.signal.find_peaks`` as ``distance``.
    upper : bool
        If falsy, peaks of ``-data`` (i.e. troughs) are used instead.

    Returns
    -------
    (env, indices)
        The envelope at every sample index and the detected peak indices.
    """
    # A falsy ``upper`` flips the signal so troughs are detected as peaks.
    sign = upper if upper else -1

    envelope = np.zeros(data.shape)
    extrema = signal.find_peaks(sign*data, distance = dist)[0]
    if len(extrema) <= 3:
        return envelope, extrema

    # Anchor the interpolation at both ends of the signal.
    knot_x = np.concatenate(([0], extrema, [len(data)-1]))
    knot_y = np.concatenate(([data[0]], data[extrema], [data[-1]]))
    spline = interpolate.interp1d(knot_x, knot_y, kind = 'cubic', bounds_error = False, fill_value=0.0)

    for sample in range(len(data)):
        envelope[sample] = spline(sample)

    return envelope, extrema

def get_leg_joints(legs, joints):
    """Return every ``leg + joint`` combination, legs varying slowest."""
    return [leg + joint for leg in legs for joint in joints]


In [1]:
def get_range(data, angle_vars):
    """Add ``<var>_range`` columns holding each bout's max-minus-min per variable.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a ``behavior_bout`` column (castable to int) and the columns
        named in ``angle_vars``. Modified in place.
    angle_vars : sequence of str
        Columns to summarise.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object; every row of a bout carries that bout's
        range. A NaN inside a bout makes that bout's range NaN.
    """
    bout_ids = data.behavior_bout.astype(int).to_numpy()
    bout_numbers = np.unique(bout_ids)
    for var in angle_vars:
        values = np.asarray(data[var])
        bout_range = np.full(len(data), np.nan)
        for bout in bout_numbers:
            # Write through the bout's own row mask so the result lines up
            # with the rows even when bouts are interleaved or unsorted.
            in_bout = bout_ids == bout
            bout_values = values[in_bout]
            bout_range[in_bout] = np.abs(np.max(bout_values) - np.min(bout_values))
        data[var + '_range'] = bout_range

    return data
            
# detect peaks and troughs to find the average amplitude
def get_average_range(data, ang_vars, dist = 20, height = None):
    """Add ``<var>_avg_range`` columns: |mean peak - mean trough| per bout.

    Peaks are found with ``scipy.signal.find_peaks`` on the bout's values,
    troughs with the same call on the negated values.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a ``behavior_bout`` column (castable to int) and the columns
        named in ``ang_vars``. Modified in place.
    ang_vars : sequence of str
        Columns to summarise.
    dist : int
        ``distance`` argument for both peak and trough detection.
    height : float, optional
        ``height`` argument for both peak and trough detection.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object; every row of a bout carries that bout's
        value, which is NaN when no peak or no trough was found.
    """
    bout_ids = data.behavior_bout.astype(int).to_numpy()
    bout_numbers = np.unique(bout_ids)
    for var in ang_vars:
        values = np.asarray(data[var])
        avg_range = np.full(len(data), np.nan)
        for bout in bout_numbers:
            # Write through the bout's own row mask so the result lines up
            # with the rows even when bouts are interleaved or unsorted.
            in_bout = bout_ids == bout
            bout_angles = values[in_bout]
            peak_idxs, _ = signal.find_peaks(bout_angles, distance = dist, height = height)
            trough_idxs, _ = signal.find_peaks(-1*bout_angles, distance = dist, height = height)
            peaks = bout_angles[peak_idxs]
            troughs = bout_angles[trough_idxs]
            avg_range[in_bout] = np.abs(np.nanmean(peaks) - np.nanmean(troughs))
        data[var + '_avg_range'] = avg_range

    return data
            
def _min_max_scale(features):
    """Scale each column of a 2-D float array to [0, 1], ignoring NaNs."""
    if features.size == 0:
        return features
    col_min = np.nanmin(features, axis=0)
    col_span = np.nanmax(features, axis=0) - col_min
    # A constant column would divide by zero; leave it at 0 after shifting.
    col_span[col_span == 0] = 1.0
    return (features - col_min) / col_span


def get_bout_features(data, feature_names, flip, normalize = True):
    """Build a (bouts x features) matrix of per-bout feature means.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a ``behavior_bout`` column (castable to int) and the columns
        named in ``feature_names``. Other columns, including non-numeric
        ones, are ignored.
    feature_names : sequence of str
        Columns to average within each bout.
    flip : sequence of bool
        ``flip[j]`` true means feature ``j`` is multiplied by -1.
    normalize : bool
        If True, min-max scale every feature column to [0, 1] across bouts.

    Returns
    -------
    (bout_numbers, bout_features)
        Sorted unique integer bout numbers and the matching feature matrix.
    """
    feature_names = list(feature_names)
    bout_ids = data.behavior_bout.astype(int)
    bout_numbers = np.unique(bout_ids)

    # Average only the requested columns, so unrelated text columns
    # (file names, fly ids, ...) cannot break the mean.
    bout_means = data.groupby(bout_ids)[feature_names].mean().reindex(bout_numbers)
    bout_features = bout_means.to_numpy(dtype=float).reshape(len(bout_numbers), len(feature_names))

    signs = np.array([-1.0 if flip[j] else 1.0 for j in range(len(feature_names))])
    bout_features = bout_features * signs

    if normalize:
        # numpy min-max scaling; the MinMaxScaler used before was never
        # imported in this file.
        bout_features = _min_max_scale(bout_features)

    return bout_numbers, bout_features

def compute_thresh(lower_scores, higher_scores):
    """Return the max of ``lower_scores`` plus half its absolute gap to the min of ``higher_scores``."""
    lowest_high = np.min(higher_scores)
    highest_low = np.max(lower_scores)
    half_gap = abs(lowest_high - highest_low) / 2
    return half_gap + highest_low

# runs all the steps
def compute_grooming_scores(data, angle_vars, features, flip, dist=20, norm=False):
    """Run the range, average-range and bout-feature steps and score each bout.

    The score of a bout is the NaN-ignoring mean of its feature row; it is
    written to a ``grooming_score`` column on every row of that bout.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table with a ``behavior_bout`` column.
    angle_vars : sequence of str
        Columns passed to ``get_range`` and ``get_average_range``.
    features, flip :
        Passed to ``get_bout_features``.
    dist : int
        Peak distance passed to ``get_average_range``.
    norm : bool
        Passed to ``get_bout_features`` as ``normalize``.

    Returns
    -------
    pandas.DataFrame
        ``data`` with the added columns.
    """
    data = get_range(data, angle_vars)
    data = get_average_range(data, angle_vars, dist = dist, height = None)
    bouts, feature_matrix = get_bout_features(data, features, flip, normalize = norm)
    scores = np.nanmean(feature_matrix, axis = 1)
    data['grooming_score'] = np.nan
    for bout, score in zip(bouts, scores):
        data.loc[(data.behavior_bout == bout), 'grooming_score'] = score
    return data

In [3]:
def x_axes_lim(df_angles, angle_names):
    """Return axis limits spanning the 5th-95th percentiles of the given columns.

    The lower limit starts at 180 and the upper at 0; each column can only
    widen the interval.

    Returns
    -------
    numpy.ndarray
        ``[xmin, xmax]``.
    """
    lower = 180
    upper = 0
    for name in angle_names:
        column = df_angles.iloc[0:][name]
        lower = min(lower, np.percentile(column, 5))
        upper = max(upper, np.percentile(column, 95))
    return np.array([lower, upper])

def y_lim_dif(df, x, coords, bout_num, joints_1, joints_2):
    """Return the min and max of pairwise column differences within one bout.

    Rows of ``df`` are selected with ``x.behavior_bout == bout_num``. For
    each position ``j`` in ``coords`` the difference
    ``df[joints_1[j]] - df[joints_2[j]]`` is examined.

    Returns
    -------
    (ymin, ymax)
        Extremes over all differences; both start at 0.
    """
    highest = 0
    lowest = 0
    bout_rows = df[x.behavior_bout == bout_num]
    for j, _ in enumerate(coords):
        first = np.array(bout_rows.iloc[0:][joints_1[j]])
        second = np.array(bout_rows.iloc[0:][joints_2[j]])
        difference = first - second

        largest = max(difference)
        if largest > highest:
            highest = largest
        smallest = min(difference)
        if smallest < lowest:
            lowest = smallest

    return lowest, highest

def adjust_color(color, amount=0.5):
    """Lighten or darken a color by rescaling its HLS lightness.

    The new lightness is ``1 - amount * (1 - lightness)``, clamped to
    [0, 1]: ``amount`` below 1 lightens, above 1 darkens, and 1 leaves the
    color unchanged.

    Parameters
    ----------
    color : str or RGB tuple
        A matplotlib color name, hex string, or anything
        ``matplotlib.colors.to_rgb`` accepts.
    amount : float
        Scaling factor described above.

    Returns
    -------
    tuple
        ``(r, g, b)`` floats.
    """
    import matplotlib.colors as mc
    import colorsys
    try:
        base = mc.cnames[color]
    except (KeyError, TypeError):
        # Not a named color (or not hashable, e.g. an RGB list): hand the
        # value to to_rgb as-is.
        base = color
    hue, lightness, saturation = colorsys.rgb_to_hls(*mc.to_rgb(base))
    # Clamp so a large ``amount`` cannot produce out-of-range RGB values.
    new_lightness = max(0.0, min(1.0, 1 - amount * (1 - lightness)))
    return colorsys.hls_to_rgb(hue, new_lightness, saturation)

def custom_cmap(n_colors):
    """Build a list of ``n_colors`` colors from 'Spectral' with fixed overrides.

    Entries 4-7 and the last entry are replaced by hand-picked colors, so
    ``n_colors`` must be at least 8 for the index assignments to succeed.

    Returns
    -------
    list
        Colors (RGBA tuples, color names, or RGB tuples).
    """
    spectral = plt.get_cmap('Spectral')
    palette = []
    for idx in range(n_colors):
        palette.append(spectral(idx/(n_colors-1.9999)))
    # Hand-picked replacements: yellow / dark yellow, green / dark green.
    palette[4] = 'y'
    palette[5] = adjust_color('y', 1.7)
    palette[6] = 'g'
    palette[7] = adjust_color('g', 1.3)
    palette[-1] = adjust_color('#5636a7', 1.3)
    return palette

def traj_style(ax):
    """Style the grid of the x, y and z axes: thin, black, dotted lines.

    Writes ``linewidth``, ``color`` and ``linestyle`` into each axis's
    ``_axinfo["grid"]`` dict and returns ``ax``.
    """
    grid_style = (('linewidth', 0.8), ('color', 'k'), ('linestyle', ':'))
    for axis_name in ('yaxis', 'xaxis', 'zaxis'):
        for key, value in grid_style:
            getattr(ax, axis_name)._axinfo["grid"][key] = value
    return ax