In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from selfregulation.utils.utils import get_behav_data
from selfregulation.utils.plot_utils import format_num
from statsmodels.stats.stattools import medcouple

In [None]:
# Called without arguments the loader picks its own default file -- presumably
# the untransformed "meaningful variables" table; confirm in selfregulation.utils.
data = get_behav_data()
# The same loader, pointed explicitly at the cleaned and the imputed exports.
clean = get_behav_data(file='meaningful_variables_clean.csv')
imputed = get_behav_data(file='meaningful_variables_imputed.csv')

In [None]:
# Columns that exist in only one of the two frames:
#   old_cols   -> in the raw data but absent from the clean file
#   clean_cols -> in the clean file but absent from the raw data
old_cols = set(data.columns) - set(clean.columns)
clean_cols = set(clean.columns) - set(data.columns)

# Long-format view of the raw-only columns.
# `.loc` no longer accepts a set as an indexer (TypeError in pandas >= 2.0),
# so pass a sorted list; sorting also makes the row order reproducible.
before_transform = (
    data.loc[:, sorted(old_cols)]
    .melt()
    .assign(stage='null')
)

# Long-format view of the clean-only columns. The last dot-separated component
# of each name is dropped so that the label can line up with its raw
# counterpart (names without a dot are kept as they are).
# The result gets its own name: overwriting `clean` here would make this cell
# fail or produce garbage when run a second time.
after_transform = (
    clean.loc[:, sorted(clean_cols)]
    .rename(columns=lambda name: name.rsplit('.', 1)[0])
    .melt()
    .assign(stage='clean')
)

# Stack both stages, then treat +/-inf as missing and drop every incomplete row.
final = (
    pd.concat([before_transform, after_transform])
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

In [None]:
# look at transformed and dropped data
%matplotlib inline
f, axes = plt.subplots(len(old_cols), 2, figsize=(10, len(old_cols)*5))
for row, name in enumerate(sorted(final.variable.unique())):
    for col, stage in enumerate(['null','clean']):
        subset = final.query('variable == "%s" and stage == "%s"' % (name, stage))
        if len(subset) > 0:
            axes[row][col].hist(subset['value'], bins=20)
            axes[row][0].set_ylabel(('\n').join(name.split('.')), fontsize=20)
            axes[row][col].set_title(str(stage))
            axes[row][col].text(.75,.6,format_num(subset.value.skew()),
                                fontsize=20,
                                color='black',
                               transform = axes[row][col].transAxes)
            axes[row][col].text(.75,.8,format_num(float(medcouple(subset.value))),
                                fontsize=20,
                                color='red',
                               transform = axes[row][col].transAxes)

In [None]:
# Compare the clean data with the imputed data.
# Both files are read again here so this cell starts from untouched frames.
clean = get_behav_data(file='meaningful_variables_clean.csv')
imputed = get_behav_data(file='meaningful_variables_imputed.csv')

# One long-format frame per stage, tagged with the stage it came from.
clean_melted = clean.melt().assign(stage='clean')
impute_melted = imputed.melt().assign(stage='imputed')

# Stack the two stages, turn +/-inf into NaN, and drop rows with missing values.
final = (
    pd.concat([clean_melted, impute_melted])
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

In [None]:
%matplotlib inline
n_rows=clean.shape[1]
f, axes = plt.subplots(n_rows, 2, figsize=(7, n_rows*3))
for row, name in enumerate(sorted(final.variable.unique())):
    for col, stage in enumerate(['clean','imputed']):
        subset = final.query('variable == "%s" and stage == "%s"' % (name, stage))
        if len(subset) > 0:
            axes[row][col].hist(subset['value'], bins=20)
            axes[row][0].set_ylabel(('\n').join(name.split('.')), fontsize=15,
                                   rotation=0, labelpad=100)
            axes[row][col].set_title(str(stage))
            axes[row][col].text(.75,.6,format_num(subset.value.skew()),
                                fontsize=15,
                                color='black',
                               transform = axes[row][col].transAxes)
            axes[row][col].text(.75,.8,format_num(float(medcouple(subset.value))),
                                fontsize=15,
                                color='red',
                               transform = axes[row][col].transAxes)