# View the HTML output of this notebook here:
https://htmlpreview.github.io/?https://github.com/zagoodman/swallow/blob/main/jupyter/analyze.html

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#Import-and-explore-data" data-toc-modified-id="Import-and-explore-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import and explore data</a></span><ul class="toc-item"><li><span><a href="#Compare-features-by-gender" data-toc-modified-id="Compare-features-by-gender-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Compare features by gender</a></span></li></ul></li><li><span><a href="#Demeaned-data" data-toc-modified-id="Demeaned-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Demeaned data</a></span><ul class="toc-item"><li><span><a href="#Gender-differences?" data-toc-modified-id="Gender-differences?-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Gender differences?</a></span></li></ul></li><li><span><a href="#T-SNE" data-toc-modified-id="T-SNE-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>T-SNE</a></span></li></ul></div>

This notebook takes the calculated feature data and visualizes the relationships between each feature and swallow volume.

Description of features:
<img src="../images/features.jpg" width=1000>

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# Let pandas render up to 100 columns and 100 rows before truncating output.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, 100)

In [None]:
# Custom matplotlib defaults, applied globally through rcParams.

size = 20                  # base font size: axis labels and titles
small_size = size * 0.75   # tick labels and legend titles

params = {
    # figure layout
    'figure.figsize': (8,4),
    'figure.subplot.hspace': 0.9,
    # axes text
    'axes.labelsize': size,
    'axes.titlesize': size,
    'axes.titlepad': 10,
    'xtick.labelsize': small_size,
    'ytick.labelsize': small_size,
    # legend
    'legend.fontsize': 'large',
    'legend.title_fontsize': small_size,
    'legend.frameon': False,
    # typeface
    'font.sans-serif': 'Arial',
}
# matplotlib.rcParams.keys() lists every editable option
plt.rcParams.update(params)

## Import and explore data

In [None]:
# import data
df = pd.read_excel('../data/gen/collapsed_data.xlsx')

# create amplitude ratio feature
df['amp_ratio'] = df.amplitude_1 / df.amplitude_3

# create 'area under curve' feature (not really area, but correlated)
df['auc_1'] = df.amplitude_1 * df.duration_2
df['auc_2'] = df.amplitude_3 * (df.duration_3 - df.duration_2)

# add gender to df: male_list[i] is the flag for person_id == i + 1
male_list = [True, True, True, False, True, True, False, True, True, False]
male_by_person = dict(enumerate(male_list, start=1))

# Fail loudly on ids without an entry. Plain list indexing would silently
# wrap around for ids <= 0 and raise an opaque IndexError for ids > 10.
unknown_ids = sorted(set(df.person_id) - set(male_by_person))
if unknown_ids:
    raise ValueError(f'person_id values without an entry in male_list: {unknown_ids}')
df['male'] = df.person_id.map(male_by_person).astype(bool)

# A bare expression in the middle of a cell is not rendered, so display the
# person-by-gender check explicitly.
display(pd.crosstab(df.person_id, df.male))

# check df: missing values per column, summary statistics, first rows
print(df.isnull().sum())
display(df.describe())
df.head()

In [None]:
# Average every column within each swallow volume, then scatter each
# feature's mean against volume (one figure per feature).

dfmean = df.groupby('swallow_volume').mean().reset_index()

non_features = ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']
for col in dfmean.columns:
    if col in non_features:
        continue
    plt.scatter(dfmean['swallow_volume'], dfmean[col])
    plt.title(col)
    plt.show()

In [None]:
# One box plot per feature, with observations grouped by swallow volume.
skip_cols = ('swallow_volume', 'observation', 'person_id', 'swallow_id', 'male')
feature_cols = [name for name in df.columns if name not in skip_cols]
for feature in feature_cols:
    axes = df.boxplot(column=[feature], by=['swallow_volume'])
    display(axes)

### Compare features by gender

In [None]:
# One box plot per feature, with observations grouped by the `male` flag.
id_and_label_cols = {'swallow_volume', 'observation', 'person_id', 'swallow_id', 'male'}
for feature in filter(lambda name: name not in id_and_label_cols, df.columns):
    axes = df.boxplot(column=[feature], by=['male'])
    display(axes)
    plt.title('')   # clear the per-axes title
    plt.show()

## Demeaned data

In [None]:
# demean function 

def demean(df, var = ['person_id'],
           exclude = ('swallow_volume', 'observation', 'person_id', 'swallow_id', 'male')):
    """Standardize feature columns within groups (z-score per group).

    For every feature column, subtract the mean of the row's group and divide
    by the group's sample standard deviation (ddof=1). Groups are the unique
    combinations of the ``var`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    var : str or list of str
        Column name(s) to group by.
    exclude : iterable of str
        Columns that are left untouched (identifiers and labels). The
        grouping columns and non-numeric columns are always left untouched.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and row order as ``df`` and a fresh
        0..n-1 index. Rows whose grouping key is missing are dropped. Groups
        with a single row or zero variance yield NaN/inf because their
        standard deviation is NaN or 0.
    """
    keys = [var] if isinstance(var, str) else list(var)
    skip = set(exclude) | set(keys)

    # Rows without a complete group key cannot be assigned to a group.
    base = df.dropna(subset=keys).reset_index(drop=True)
    grouped = base.groupby(keys)

    out = base.copy()
    for x in base.columns:
        if x in skip or not pd.api.types.is_numeric_dtype(base[x]):
            continue
        # transform() broadcasts each group statistic back onto its rows, so
        # no merge or temporary columns are needed.
        out[x] = (base[x] - grouped[x].transform('mean')) / grouped[x].transform('std')

    return out

# Standardize every feature within each person, then preview the first rows.
dfdemean = demean(df, var=['person_id'])
dfdemean.head(n=5)

In [None]:
# plot demeaned boxplots: one figure per feature, grouped by swallow volume

not_a_feature = {'swallow_volume', 'observation', 'person_id', 'swallow_id', 'male'}
for feature in filter(lambda name: name not in not_a_feature, dfdemean.columns):
    dfdemean.boxplot(column=[feature], by=['swallow_volume'])

In [None]:
import seaborn as sns

# get subset: rows with person_id <= 5 or person_id == 7
dfsub = dfdemean.loc[(dfdemean.person_id <= 5) | (dfdemean.person_id == 7), :]

# The seaborn theme is global state, so set it once rather than per figure.
sns.set(font_scale=1.5)
sns.set_style("ticks")

for y in [v for v in dfsub.columns if v not in ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']]:
    print(y)

    # One figure per feature: individual swallows coloured by person, with the
    # per-volume mean and standard deviation overlaid in black.
    fig, ax1 = plt.subplots(figsize=(8,5))
    sns.stripplot(ax=ax1, x='swallow_volume', y=y, hue='person_id',
                  data=dfsub, dodge=True, jitter=True, alpha=.40,
                  zorder=1, size=8)
    sns.pointplot(ax=ax1, x='swallow_volume', y=y, ci='sd',
                  data=dfsub, join=False, scale=1.5,
                  zorder=100, color='black')
    ax1.set_xlabel('volume (mL)')
    ax1.set_ylabel(y)
    ax1.legend(title='person id', bbox_to_anchor=(1.01, 0.5), loc='center left', fontsize='xx-small')

    # Render now so each figure appears directly below its printed feature
    # name. Without this the inline backend shows every figure only after the
    # cell finishes, and open figures pile up inside the loop.
    plt.show()

In [None]:
# subplot with amp1, amp2, ampr, auc1

# get data subset: person 4, swallow volumes up to 15
dfsub = dfdemean.loc[(dfdemean.person_id == 4) & (dfdemean.swallow_volume <= 15)]

# (feature column, panel title), listed in row-major panel order
# NOTE(review): amp_ratio and auc_2 are built from amplitude_3, while the
# second panel labels amplitude_2 as the "second peak" - confirm which column
# is the second peak.
panels = [
    ('amplitude_1', 'Amplitude of First Peak'),
    ('amplitude_2', 'Amplitude of Second Peak'),
    ('amp_ratio', 'Ratio of Peak Amplitudes'),
    ('auc_1', 'Area Under Curve, First Peak'),
]

# init plot
sns.set_style("ticks")
fig, ax = plt.subplots(2, 2, figsize=(12,8))

# Draw every panel the same way: raw points plus mean and standard deviation.
for panel, (y, title) in zip(ax.flat, panels):
    sns.stripplot(ax=panel, x='swallow_volume', y=y, hue='person_id',
                  data=dfsub, dodge=True, jitter=True, alpha=.40,
                  zorder=1)
    sns.pointplot(ax=panel, x='swallow_volume', y=y, ci='sd',
                  data=dfsub, join=False,
                  zorder=100, color='black')
    panel.title.set_text(title)

    # Only one person is plotted, so the hue legend is dropped. get_legend()
    # returns None when no legend was created.
    legend = panel.get_legend()
    if legend is not None:
        legend.remove()

# all units are 'normalized' to be mean zero, stddev = 1
# y label only on the left column, x label only on the bottom row
for i in range(2):
    for j in range(2):
        ax[i, j].set_ylabel('Normalized Units' if j == 0 else '')
        ax[i, j].set_xlabel('Swallow Volume (ml)' if i == 1 else '')

fig.tight_layout()

In [None]:
# subplot with amp1, amp2, ampr, auc1

# get data subset: person 4, every swallow volume above 0
dfsub = dfdemean.loc[(dfdemean.person_id == 4) & (dfdemean.swallow_volume > 0)]

# (feature column, panel title), listed in row-major panel order
# NOTE(review): amp_ratio and auc_2 are built from amplitude_3, while the
# second panel labels amplitude_2 as the "second peak" - confirm which column
# is the second peak.
panels = [
    ('amplitude_1', 'Amplitude of First Peak'),
    ('amplitude_2', 'Amplitude of Second Peak'),
    ('amp_ratio', 'Ratio of Peak Amplitudes'),
    ('auc_1', 'Area Under Curve, First Peak'),
]

# init plot
sns.set_style("ticks")
fig, ax = plt.subplots(2, 2, figsize=(12,8))

# Draw every panel the same way: raw points plus mean and standard deviation.
for panel, (y, title) in zip(ax.flat, panels):
    sns.stripplot(ax=panel, x='swallow_volume', y=y, hue='person_id',
                  data=dfsub, dodge=True, jitter=True, alpha=.40,
                  zorder=1)
    sns.pointplot(ax=panel, x='swallow_volume', y=y, ci='sd',
                  data=dfsub, join=False,
                  zorder=100, color='black')
    panel.title.set_text(title)

    # Only one person is plotted, so the hue legend is dropped. get_legend()
    # returns None when no legend was created.
    legend = panel.get_legend()
    if legend is not None:
        legend.remove()

# all units are 'normalized' to be mean zero, stddev = 1
# y label only on the left column, x label only on the bottom row
for i in range(2):
    for j in range(2):
        ax[i, j].set_ylabel('Normalized Units' if j == 0 else '')
        ax[i, j].set_xlabel('Swallow Volume (ml)' if i == 1 else '')

fig.tight_layout()

In [None]:
# get subset: rows with person_id <= 7, excluding person 6
dfsub = dfdemean.loc[(dfdemean.person_id <= 7) & (dfdemean.person_id != 6), :]

# The seaborn theme is global state, so set it once rather than per figure.
sns.set(font_scale=1.5)
sns.set_style("ticks")

for y in [v for v in dfsub.columns if v not in ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']]:
    print(y)

    # One figure per feature: individual swallows coloured by person, with the
    # per-volume mean and standard deviation overlaid in black.
    fig, ax1 = plt.subplots(figsize=(8,5))
    sns.stripplot(ax=ax1, x='swallow_volume', y=y, hue='person_id',
                  data=dfsub, dodge=True, jitter=True, alpha=.40,
                  zorder=1, size=8)
    sns.pointplot(ax=ax1, x='swallow_volume', y=y, ci='sd',
                  data=dfsub, join=False, scale=1.5,
                  zorder=100, color='black')
    ax1.set_xlabel('volume (mL)')
    ax1.set_ylabel(y)
    ax1.legend(title='person id', bbox_to_anchor=(1.01, 0.5), loc='center left', fontsize='xx-small')

    # Render now so each figure appears directly below its printed feature
    # name. Without this the inline backend shows every figure only after the
    # cell finishes, and open figures pile up inside the loop.
    plt.show()

In [None]:
# get subset: rows with person_id >= 8
dfsub = dfdemean.loc[dfdemean.person_id >= 8, :]

# The seaborn theme is global state, so set it once rather than per figure.
sns.set(font_scale=1.5)
sns.set_style("ticks")

for y in [v for v in dfsub.columns if v not in ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']]:
    print(y)

    # One figure per feature: individual swallows coloured by person, with the
    # per-volume mean and standard deviation overlaid in black.
    fig, ax1 = plt.subplots(figsize=(8,5))
    sns.stripplot(ax=ax1, x='swallow_volume', y=y, hue='person_id',
                  data=dfsub, dodge=True, jitter=True, alpha=.40,
                  zorder=1, size=8)
    sns.pointplot(ax=ax1, x='swallow_volume', y=y, ci='sd',
                  data=dfsub, join=False, scale=1.5,
                  zorder=100, color='black')
    ax1.set_xlabel('volume (mL)')
    ax1.set_ylabel(y)
    ax1.legend(title='person id', bbox_to_anchor=(1.01, 0.5), loc='center left', fontsize='xx-small')

    # Render now so each figure appears directly below its printed feature
    # name. Without this the inline backend shows every figure only after the
    # cell finishes, and open figures pile up inside the loop.
    plt.show()

### Gender differences?

In [None]:
# get subset: rows of the raw (not demeaned) data with person_id >= 0
dfsub = df.loc[df.person_id >= 0, :]

# The seaborn theme is global state, so set it once rather than per figure.
sns.set(font_scale=1.5)
sns.set_style("ticks")

for y in [v for v in dfsub.columns if v not in ['swallow_volume', 'observation', 'person_id', 'swallow_id', 'male']]:
    print(y)

    # One figure per feature: individual swallows split by the `male` flag and
    # coloured by person, with the mean and standard deviation overlaid in black.
    fig, ax1 = plt.subplots(figsize=(8,5))
    sns.stripplot(ax=ax1, x='male', y=y, hue='person_id',
                  data=dfsub, dodge=True, jitter=True, alpha=.40,
                  zorder=1, size=8)
    sns.pointplot(ax=ax1, x='male', y=y, ci='sd',
                  data=dfsub, join=False, scale=1.5,
                  zorder=100, color='black')
    ax1.set_xlabel('Male (Boolean)')
    ax1.set_ylabel(y)
    ax1.legend(title='person id', bbox_to_anchor=(1.01, 0.5), loc='center left', fontsize='xx-small')

    # Render now so each figure appears directly below its printed feature
    # name. Without this the inline backend shows every figure only after the
    # cell finishes, and open figures pile up inside the loop.
    plt.show()

## T-SNE

In [None]:
# fit T-SNE and plot

import inspect

from sklearn.manifold import TSNE

# get subset of data
# .copy() makes dfsub an independent frame, so the rescaling below never
# writes into a slice of df (avoids SettingWithCopyWarning).
dfsub = df.loc[df.person_id >= 0,
               [c for c in df.columns if
                c not in ['observation', 'swallow_id', 'emg']]].copy()

# rescale data
def rescale(vec):
    """Return vec shifted to mean 0 and scaled by its sample standard deviation."""
    return (vec - vec.mean()) / vec.std()

# Feature columns are chosen by name. 'male' is a label that is constant
# within a person, so z-scoring it per person would give 0/0 = NaN.
# NOTE(review): this matches the previous positional slice iloc[:, 2:-1] only
# if person_id and swallow_volume are the first two columns and male is the
# last - confirm against the spreadsheet layout.
feature_cols = [c for c in dfsub.columns if c not in ['person_id', 'swallow_volume', 'male']]
for v in feature_cols:
    # transform() keeps the original row index, so the result aligns on assignment
    dfsub[v] = dfsub.groupby('person_id')[v].transform(rescale)

# define X and Y vecs
X = dfsub[feature_cols]
Y = dfsub.loc[:, 'swallow_volume']

# fit model
# random_state makes the embedding reproducible across runs.
tsne_kwargs = dict(n_components=2, perplexity=50, learning_rate=100, random_state=0)
# Newer scikit-learn releases renamed `n_iter` to `max_iter`; pass whichever
# keyword the installed version accepts.
iter_kw = 'max_iter' if 'max_iter' in inspect.signature(TSNE.__init__).parameters else 'n_iter'
tsne_kwargs[iter_kw] = 5000
X_embedded = TSNE(**tsne_kwargs).fit_transform(X)  # t-SNE is unsupervised; Y is not needed to fit
print(X_embedded.shape)

# merge results: both frames carry a 0..n-1 index, so rows pair up by position
dfm = pd.concat([dfsub.reset_index(drop=True),
                 pd.DataFrame(X_embedded, columns=['tsne0', 'tsne1'])], axis=1)
display(dfm.head())



# plot t-sne, color by person

# color map
cmap = plt.cm.jet

fig, ax = plt.subplots()

scatter = ax.scatter(dfm.tsne0, dfm.tsne1, c=dfm.person_id, cmap=cmap)
ax.legend(*scatter.legend_elements(num=10), ncol=2,
            loc="center left", bbox_to_anchor=(1, 0.5), title="person id")
ax.set_title('Colored by person id')
plt.show()


# plot t-sne, color by swallow volume

# sample 10 shades of Blues, drop the two lightest and the alpha channel
cmap = matplotlib.cm.Blues(np.linspace(0,1,10))
cmap = matplotlib.colors.ListedColormap(cmap[2:,:-1])

fig, ax = plt.subplots()

scatter = ax.scatter(dfm.tsne0, dfm.tsne1, c=dfm.swallow_volume, cmap=cmap)
ax.legend(*scatter.legend_elements(num=6),
            loc="center left", bbox_to_anchor=(1, 0.5), title="swallow volume")
ax.set_title('Colored by swallow volume')
plt.show()


# plot t-sne, color by gender

fig, ax = plt.subplots()

# Colour from dfm (not df) so the flags are row-aligned with the embedding.
scatter = ax.scatter(dfm.tsne0, dfm.tsne1, c=dfm.male.astype(int), cmap=cmap)
ax.legend(*scatter.legend_elements(num=2),
            loc="center left", bbox_to_anchor=(1, 0.5), title="Male (Boolean)")
ax.set_title('Colored by gender')
plt.show()