# Seasonal Repeat Analysis
Assess the variability and consistency of activity phenotypes across seasonal repeat measurements, to determine whether they are practical for use as diagnostics.

In [2]:
%matplotlib notebook
import scipy
import numpy
from IPython.display import display, HTML
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
import re
import matplotlib.patches as mpatches
from scipy.cluster import hierarchy
import pylab
import pandas

In [3]:
# Number of resampling iterations; in the visible part of this notebook it is
# only referenced by the commented-out bootstrap of the variance ratio.
N_ITER = 100
# Directory prefix (relative to the notebook) that figures are saved under.
OUTDIR = "../phewas/seasonal/"

In [4]:
# Original (non-seasonal) activity tables: tab-separated, first column is the index
_tsv_options = {"index_col": 0, "sep": "\t"}
full_activity = pandas.read_csv("../processed/activity_features_aggregate.txt", **_tsv_options)
full_activity_summary = pandas.read_csv("../processed/activity_summary_aggregate.txt", **_tsv_options)

# Main UK Biobank data table (HDF5)
ukbb = pandas.read_hdf("../processed/ukbb_data_table.h5")

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Load seasonal repeat data
def load_seasonal_table(path):
    """Read a tab-separated seasonal table and derive ``id`` / ``instance`` columns.

    The index labels are converted to strings and split on "."; the first piece
    is stored in a new ``id`` column and the second in a new ``instance`` column
    (both kept as strings).
    NOTE(review): presumably the labels look like "<participant>.<repeat>" -- confirm.

    Parameters
    ----------
    path : str
        Location of the tab-separated file whose first column is the index.

    Returns
    -------
    pandas.DataFrame
        The table as read, with the two extra columns appended.
    """
    table = pandas.read_csv(path, index_col=0, sep="\t")
    id_instance = pandas.Series(table.index).astype(str).str.split(".", expand=True)
    # .values: assign positionally, because id_instance carries a fresh 0..n-1
    # index rather than the table's own index labels
    table["id"] = id_instance[0].values
    table["instance"] = id_instance[1].values
    return table


full_activity_seasonal = load_seasonal_table("../processed/activity_features_aggregate_seasonal.txt")
full_activity_summary_seasonal = load_seasonal_table("../processed/activity_summary_aggregate_seasonal.txt")

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Summary statistics of every numeric column, transposed so each feature is one row
full_activity_seasonal.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
main_sleep_onset_mean,109973.0,23.468223,1.181336,12.059372,22.819861,23.396900,24.037151,35.775857
main_sleep_offset_mean,109973.0,31.618350,1.482982,15.331099,31.023102,31.607928,32.178239,177.993543
num_wakings_mean,109973.0,1.261570,1.088273,0.000000,0.571429,1.000000,1.714286,26.571429
WASO_mean,109973.0,0.211077,0.197600,0.000000,0.069048,0.156667,0.293333,1.983333
acceleration_during_main_sleep_mean,109973.0,3.065951,6.468282,0.000000,2.749012,2.995456,3.283363,2124.902312
...,...,...,...,...,...,...,...,...
VPA_within_day_SD_L5,111989.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
VPA_between_day_SD_L5,112021.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
VPA_RA,0.0,,,,,,,
VPA_IS,0.0,,,,,,,


In [7]:
# drop activity for people who fail basic QC
def passes_basic_qc(summary):
    """Return a boolean Series flagging recordings that pass the basic quality checks.

    A recording passes when it has good calibration, good wear time and does
    not cross a daylight-savings change, according to the ``quality-*`` columns
    of ``summary``. The result is indexed like ``summary``.
    """
    return (summary['quality-goodCalibration'].astype(bool)
            & (~summary['quality-daylightSavingsCrossover'].astype(bool))
            & (summary['quality-goodWearTime'].astype(bool))
           )


okay = passes_basic_qc(full_activity_summary)
# Select feature rows by label membership (as is done for the seasonal data below)
# rather than indexing full_activity with a mask built on a different frame,
# which makes pandas reindex the mask with a warning or fail if the indexes differ.
activity = full_activity[full_activity.index.isin(okay.index[okay])].copy()
activity_summary = full_activity_summary[okay].copy()
activity.columns = activity.columns.str.replace("-","_") # Can't use special characters easily
print(f"Dropping {(~okay).sum()} entries out of {len(okay)} due to bad quality or wear-time in their activity")

okay = passes_basic_qc(full_activity_summary_seasonal)
activity_seasonal = full_activity_seasonal[full_activity_seasonal.index.isin(okay.index[okay])].copy()
activity_summary_seasonal = full_activity_summary_seasonal[full_activity_summary_seasonal.index.isin(okay.index[okay])].copy()
activity_seasonal.columns = activity_seasonal.columns.str.replace("-","_") # Can't use special characters easily
print(f"Dropping {(~okay).sum()} entries out of {len(okay)} due to bad quality or wear-time in their SEASONAL activity")


  import sys


Dropping 11363 entries out of 103688 due to bad quality or wear-time in their activity
Dropping 1675 entries out of 11773 due to bad quality or wear-time in their SEASONAL activity


In [8]:
### Drop columns that are meaningless
# Only the numeric feature columns are screened: the frame also holds the
# string "id" / "instance" columns, and pandas >= 2.0 raises on std/median of
# non-numeric columns instead of silently skipping them.
numeric_features = activity_seasonal.select_dtypes(["number"])

# if standard deviation is 0 or NaN, then no variation and we drop it
# or if the standard deviation is less than 1% of the mean value
std = numeric_features.std(axis=0)
mean = numeric_features.mean(axis=0).abs()
bad_std_columns = (std == 0) | std.isna() | (std < 0.01 * mean)

# check columns with too many duplicates: if min = median or max = median, then too many people have the same value
# and we just drop the column
median = numeric_features.median(axis=0)
too_many_duplicates_columns = (numeric_features.min(axis=0) == median) | (numeric_features.max(axis=0) == median)

columns_to_drop = bad_std_columns[bad_std_columns].index.union(too_many_duplicates_columns[too_many_duplicates_columns].index)
activity_seasonal = activity_seasonal.drop(columns=columns_to_drop)
print(f"Dropping {len(columns_to_drop)} due to extremely low variation between individuals")
print(f"Have {len(activity_seasonal.columns)} activity variables remaining")

Dropping 42 due to extremely low variation between individuals
Have 206 activity variables remaining


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [9]:
# First check for outliers in each column
# By the ratio of the standard deviation to the mean absolute deviation.
# (This is what the former DataFrame.mad() computed; that method was removed in
# pandas 2.0, so it is spelled out here.) In a normally distributed variable
# this ratio is sqrt(pi/2), about 1.25 -- the value 1.48 applies to the *median*
# absolute deviation, which is not what is used here. Heavy tails inflate it.
def std_to_mean_abs_dev_ratio(data):
    """Per-column standard deviation divided by mean absolute deviation (numeric columns only)."""
    numeric = data.select_dtypes(["number"])
    mean_abs_dev = (numeric - numeric.mean()).abs().mean()
    return numeric.std() / mean_abs_dev


def exceeds_std_cutoff(data, cutoff):
    """Boolean frame: True where a value lies more than ``cutoff`` standard deviations ABOVE its column mean.

    The check is one-sided: values far below the mean are not flagged.
    """
    return ((data - data.mean()) / data.std()) > cutoff


outlier_ratio = numpy.maximum(std_to_mean_abs_dev_ratio(activity), std_to_mean_abs_dev_ratio(activity_seasonal))
#outlier_ratio.sort_values(ascending=False).head(25)


# Take the 'bad' columns, those beyond the cutoff
OUTLIER_RATIO_CUTOFF = 4
STD_OUTLIER_CUTOFF = 10
# A column is only flagged when its ratio is defined in both data sets (NaN
# never exceeds the cutoff), so the flagged labels exist in both frames.
outlier_columns = outlier_ratio.index[outlier_ratio > OUTLIER_RATIO_CUTOFF]
outlier_data = activity.loc[:, outlier_columns]
is_outlier = exceeds_std_cutoff(outlier_data, STD_OUTLIER_CUTOFF)
num_outliers = is_outlier.sum()
total_outliers = is_outlier.any(axis=1).sum()
to_drop = activity.index[is_outlier.any(axis=1)]
activity = activity.drop(index=to_drop)
activity_summary = activity_summary.drop(index=to_drop)
print(f"Found {len(outlier_data.columns)} columns with evidence of some outlier problems")
print(f"Dropping {total_outliers} subjects with one of those columns beyond {STD_OUTLIER_CUTOFF} standard deviations from mean")

# Again for seasonal data (same columns, standardized against the seasonal data's own mean/std)
outlier_data = activity_seasonal.loc[:, outlier_columns]
is_outlier = exceeds_std_cutoff(outlier_data, STD_OUTLIER_CUTOFF)
num_outliers = is_outlier.sum()
total_outliers = is_outlier.any(axis=1).sum()
to_drop = activity_seasonal.index[is_outlier.any(axis=1)]
activity_seasonal = activity_seasonal.drop(index=to_drop)
activity_summary_seasonal = activity_summary_seasonal.drop(index=to_drop)
print(f"Dropping {total_outliers} subjects with one of those columns beyond {STD_OUTLIER_CUTOFF} standard deviations from mean from SEASONAL data")


Found 3 columns with evidence of some outlier problems
Dropping 288 subjects with one of those columns beyond 10 standard deviations from mean
Dropping 35 subjects with one of those columns beyond 10 standard deviations from mean from SEASONAL data


In [10]:
# Report how many recordings remain and how many distinct ids they cover
n_entries = len(activity_seasonal.index)
n_individuals = len(activity_seasonal['id'].unique())
print(f"Remaining {n_entries} entries across {n_individuals} individuals")

Remaining 10061 entries across 3151 individuals


In [11]:
# Generate processed columns
# Wrap phase into the 0-24 hour range so that its distribution is unimodal
activity_seasonal['phase'] = activity_seasonal['phase'].mod(24)


In [12]:
# Histogram of the wrapped phase values (21 bins)
fig, ax = pylab.subplots()
phase_values = activity_seasonal['phase']
ax.hist(phase_values, bins=21)

<IPython.core.display.Javascript object>

(array([3.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        2.000e+00, 5.000e+00, 8.000e+00, 7.900e+01, 6.290e+02, 3.474e+03,
        4.379e+03, 1.233e+03, 1.800e+02, 3.800e+01, 8.000e+00, 8.000e+00,
        5.000e+00, 4.000e+00, 5.000e+00]),
 array([ 1.56017595,  2.62697259,  3.69376922,  4.76056586,  5.82736249,
         6.89415913,  7.96095577,  9.0277524 , 10.09454904, 11.16134568,
        12.22814231, 13.29493895, 14.36173558, 15.42853222, 16.49532886,
        17.56212549, 18.62892213, 19.69571876, 20.7625154 , 21.82931204,
        22.89610867, 23.96290531]),
 <BarContainer object of 21 artists>)

## Intra- versus inter-personal variance

In [13]:
# Determine intra-personal variability relative to inter-personal variability
def SSE(data):
    """Sum of squared deviations of ``data`` from its own mean.

    For a DataFrame the mean and the sum are taken per column, so the result
    is a Series with one value per column.
    """
    centered = data - data.mean()
    squared = centered ** 2
    return squared.sum()
#inter_personal = SSE(activity_seasonal.groupby("id").mean())
#intra_personal = activity_seasonal.select_dtypes("number").groupby(activity_seasonal.id).apply(SSE).sum()
#total_var = SSE(activity_seasonal.select_dtypes("number"))

# Variance components are computed on the numeric feature columns only, grouped
# by the participant id. Selecting the numeric columns explicitly avoids relying
# on pandas silently skipping the string "id" / "instance" columns, which
# pandas >= 2.0 no longer does (it raises instead).
numeric_features = activity_seasonal.select_dtypes("number")
per_person = numeric_features.groupby(activity_seasonal["id"])
inter_personal = per_person.mean().var()   # variance of the per-person means
intra_personal = per_person.var().mean()   # mean of the per-person variances
total_var = numeric_features.var()
# Ratio below 1: a feature varies less within a person than between people
intra_personal_normalized = intra_personal / inter_personal

# Variability in variance proportion
#intra_personal_normalized_samples = []
#for i in range(N_ITER):
#    sample = activity_seasonal.sample(len(activity_seasonal), replace=True)
#    intra_personal_normalized_samples.append(sample.groupby("id").var().mean() / sample.var())
#intra_personal_normalized_samples = pandas.concat(intra_personal_normalized_samples, axis=1)

In [14]:
# The 20 features with the smallest within-/between-person variance ratio
intra_personal_normalized.sort_values().head(20)

acceleration_hourly_SD            0.255674
acceleration_hourly_SD_M10        0.273849
acceleration_peak_value_mean      0.286818
acceleration_overall              0.290332
acceleration_within_day_SD        0.293552
mesor                             0.301375
sleep_peak_quality_mean           0.319078
acceleration_within_day_SD_M10    0.325643
main_sleep_onset_mean             0.350165
acceleration_peak_value_SD        0.352712
moderate_overall                  0.353342
acceleration_overall_M10          0.358172
MET_hourly_SD                     0.369396
inactivity_peak_value_mean        0.374460
sedentary_hourly_SD               0.401789
moderate_hourly_SD                0.404964
sleep_hourly_SD                   0.408442
amplitude                         0.414891
moderate_within_day_SD            0.425298
moderate_peak_value_mean          0.426312
dtype: float64

In [15]:
#intra_personal_normalized_errors = intra_personal_normalized_samples.std(axis=1)

In [16]:
fig, ax = pylab.subplots()

# x is the between-person variance, as the axis label and title state (the
# previous version plotted the total variance under an "inter-personal" label).
# reindex() guarantees both coordinates refer to the same feature, since
# scatter pairs the values by position.
ax.scatter(numpy.log10(inter_personal.reindex(intra_personal.index)), numpy.log10(intra_personal))
ax.set_xlabel("inter-personal variance (log10)")
ax.set_ylabel("intra-personal variance (log10)")
ax.set_title("Inter- versus Intra -personal variance")

<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Inter- versus Intra -personal variance')

In [17]:
# Intra- versus inter-personal variance normalized by mean value
fig, ax = pylab.subplots()

# Means of the numeric features only, aligned to the features that have a
# variance estimate. This replaces taking the mean over every column and then
# dropping "id" / "instance" from the result, which fails whenever pandas does
# not produce a mean for those string columns.
feature_means = activity_seasonal.mean(numeric_only=True).reindex(intra_personal.index)

ax.set_xlabel("intra-personal variance / mean (log10)")
ax.set_ylabel("inter-personal variance / mean (log10)")
# Features whose mean is zero or negative give a non-positive ratio; log10 then
# yields NaN/-inf (with a runtime warning) and those points are not drawn.
ax.scatter(numpy.log10(intra_personal / feature_means),
           numpy.log10(inter_personal.reindex(intra_personal.index) / feature_means)
          )

<IPython.core.display.Javascript object>

  result = getattr(ufunc, method)(*inputs, **kwargs)


<matplotlib.collections.PathCollection at 0x28efc594ba8>

In [18]:
# Horizontal bar chart of the within-/between-person variance ratio, one bar per feature
fig, ax = pylab.subplots(figsize=(6,15))
feature_names = intra_personal_normalized.index
ax.barh(feature_names, intra_personal_normalized, align="center")
ax.set_ylim(-1, len(feature_names))
ax.set_title("Within-subject variance to between-subject variance ratio")
fig.tight_layout()
# Reference line at ratio 1 (within-person variance equals between-person variance)
ax.axvline(1.0, c="k")

fig.savefig(OUTDIR+"activity_vars.fraction_within_variance.png")

<IPython.core.display.Javascript object>

In [19]:
# Number of features for which a variance ratio was computed
intra_personal_normalized.shape

(204,)

### Summary Data

In [20]:
# Same but for the "summary" data
# Restrict to numeric/boolean columns and group by the participant id, instead
# of relying on pandas to silently skip text columns (e.g. "instance",
# "file-startTime") during groupby reductions -- pandas >= 2.0 raises on those.
# NOTE(review): boolean columns are kept on the assumption that the previous
# implicit behaviour included them -- confirm against the saved table.
summary_numeric = activity_summary_seasonal.select_dtypes(["number", "bool"])
summary_per_person = summary_numeric.groupby(activity_summary_seasonal["id"])
summary_intra_personal = summary_per_person.var().mean()   # mean of the per-person variances
summary_inter_personal = summary_per_person.mean().var()   # variance of the per-person means
summary_intra_personal_normalized = summary_intra_personal / summary_inter_personal

#summary_inter_personal = SSE(activity_summary_seasonal.groupby("id").mean())
#summary_intra_personal = activity_summary_seasonal.select_dtypes("number").groupby(activity_seasonal.id).apply(SSE)
#summary_total_var = SSE(activity_summary_seasonal.select_dtypes("number"))

#summary_intra_personal_normalized = summary_intra_personal / summary_total_var

In [21]:
# Side-by-side variance components for the summary variables whose name contains "overall"
summary_table = pandas.DataFrame({
    "intra": summary_intra_personal,
    "inter": summary_inter_personal,
    "norm": summary_intra_personal_normalized,
})
is_overall = summary_inter_personal.index.str.contains("overall")
summary_table[is_overall].head(30)

Unnamed: 0,intra,inter,norm
wearTime-overall(days),0.499493,0.236517,2.111871
nonWearTime-overall(days),0.367185,0.194819,1.884746
acc-overall-avg,15.863329,69.160662,0.229369
acc-overall-sd,107.212927,482.243584,0.222321
MVPA-overall-avg,0.000262,0.001177,0.22256
MVPA-overall-sd,0.000746,0.003735,0.199807
VPA-overall-avg,5e-06,1.3e-05,0.347037
VPA-overall-sd,0.000238,0.000792,0.299987
moderate-overall-avg,0.000395,0.001267,0.311819
moderate-overall-sd,0.00111,0.003442,0.322455


In [22]:
# Earlier exploratory views, kept for reference:
#summary_intra_personal_normalized[[c for c in summary_intra_personal_normalized.index if 'VPA-hourOfDay' in c]]#.sort_values().head(35)
#summary_intra_personal_normalized['acceleration-'].sort_values().head(150)
#fig, ax = pylab.subplots()
#ax.hist(summary_intra_personal_normalized)
# The 50 summary variables with the smallest within-/between-person variance ratio
summary_intra_personal_normalized.sort_values().head(50)

MVPA-overall-sd                   0.199807
acc-overall-sd                    0.222321
MVPA-overall-avg                  0.222560
acc-overall-avg                   0.229369
MVPA-weekday-avg                  0.248654
acc-weekday-avg                   0.258489
sleep-hourOfDay-23-avg            0.275309
MVPA-hourOfWeekday-6-avg          0.299777
VPA-overall-sd                    0.299987
sleep-hourOfWeekday-23-avg        0.301384
sedentary-hourOfDay-23-avg        0.302584
MVPA-hourOfDay-6-avg              0.308773
moderate-overall-avg              0.311819
sleep-hourOfDay-0-avg             0.316399
sedentary-overall-avg             0.321117
MVPA-hourOfDay-5-avg              0.321250
moderate-overall-sd               0.322455
MVPA-hourOfWeekday-5-avg          0.329124
sedentary-hourOfWeekday-23-avg    0.333370
sedentary-hourOfDay-0-avg         0.334258
moderate-weekday-avg              0.334476
MET-overall-sd                    0.336282
sedentary-weekday-avg             0.337026
MET-hourOfD

In [23]:
# Bar chart of the variance ratio for the "overall" summary variables
fig, ax = pylab.subplots(figsize=(6,8))
overall_mask = summary_intra_personal.index.str.contains("overall")
summary_vals_to_use = summary_intra_personal[overall_mask].index
overall_ratios = summary_intra_personal_normalized.loc[summary_vals_to_use]
ax.barh(overall_ratios.index, overall_ratios, align="center")
ax.set_ylim(-1, len(summary_vals_to_use))
# Reference line at ratio 1
ax.axvline(1.0, c="k")
ax.set_title("Within-subject variance to between-subject variance ratio")
fig.tight_layout()

fig.savefig(OUTDIR+"summary_vars.fraction_within_variance.png")

<IPython.core.display.Javascript object>

In [24]:
# Two-panel bar chart: the variance ratios are split in half across the two axes
fig, axes = pylab.subplots(figsize=(12,25), ncols=2)
tests_all = pandas.concat([intra_personal_normalized,
                           #summary_intra_personal_normalized[summary_vals_to_use]
                          ], axis=0)
midpoint = len(tests_all)//2
halves = [tests_all[:midpoint], tests_all[midpoint:]]
for ax, tests in zip(axes, halves):
    ax.barh(tests.index, tests, align="center")
    ax.set_ylim(-1, len(tests))
    # Reference line at ratio 1
    ax.axvline(1.0, c="k")

fig.suptitle("Within-person variance to between-person variance ratio")
fig.tight_layout(rect=[0,0.02, 1, 0.97])
fig.savefig(OUTDIR+"all_activity_vars.fraction_within_variance.png")
n_passing = (tests_all < 1).sum()
print(f"{n_passing} out of {len(tests_all)} variables pass variance test")

<IPython.core.display.Javascript object>

94 out of 204 variables pass variance test


In [25]:
# Write the variance components of the feature and summary variables to one table
activity_variance = pandas.DataFrame({
    "inter": inter_personal,
    "intra": intra_personal,
    "normalized": intra_personal_normalized
})
summary_variance = pandas.DataFrame({
    "inter": summary_inter_personal,
    "intra": summary_intra_personal,
    "normalized": summary_intra_personal_normalized
})
# Feature rows first, summary rows appended below them
inter_intra_variance = pandas.concat([activity_variance, summary_variance])
inter_intra_variance.to_csv("../processed/inter_intra_personal_variance.txt", sep="\t")

In [26]:
# Distribution summary of the sleep_overall feature in the seasonal data
activity_seasonal.sleep_overall.describe()

count    10061.000000
mean         0.388204
std          0.065610
min          0.114907
25%          0.347154
50%          0.382385
75%          0.420058
max          0.734708
Name: sleep_overall, dtype: float64

In [27]:
## Assess the within-person variance to try to see if there is in-fact a constant "measurement error" rate between people
var = "acc-overall-sd"
grouped = activity_summary_seasonal.groupby("id")[var]
# Per-person variance divided by the per-person mean. Computed for inspection
# only: the histogram below shows the raw values, not these ratios.
variances = grouped.var() / grouped.mean()


fig, ax = pylab.subplots()
ax.set_xlabel(f"{var} (log10)")
ax.set_ylabel("count")
#ax.hist(numpy.log10(variances))
ax.hist(numpy.log10(activity_summary_seasonal[var]))

<IPython.core.display.Javascript object>

(array([   7.,   63.,  586., 2659., 4182., 1797.,  517.,  174.,   62.,
          16.]),
 array([0.95760729, 1.11335973, 1.26911218, 1.42486462, 1.58061706,
        1.73636951, 1.89212195, 2.0478744 , 2.20362684, 2.35937929,
        2.51513173]),
 <BarContainer object of 10 artists>)

In [28]:
## Check for normality
# Runs scipy's normaltest on one column of the non-seasonal data; NaN values are
# left out of the test (nan_policy="omit").
scipy.stats.normaltest((activity["main_sleep_duration_std"]), nan_policy="omit")

NormaltestResult(statistic=153810.78354569673, pvalue=0.0)

In [29]:
# Kernel density estimate of main_sleep_duration_avg, drawn on the new axes
fig, ax = pylab.subplots()
sleep_duration = activity["main_sleep_duration_avg"]
sns.kdeplot(sleep_duration, ax=ax)

<IPython.core.display.Javascript object>

<AxesSubplot:xlabel='main_sleep_duration_avg', ylabel='Density'>

### Seasonality

In [30]:
# Position of each recording within its calendar year, plus the cosine/sine
# terms used as regressors for a once-per-year cycle.
activity_seasonal['start_date'] = pandas.to_datetime(activity_summary_seasonal['file-startTime'])
year_start = pandas.to_datetime(activity_seasonal.start_date.dt.year.astype(str) + "-01-01")
# One mean Gregorian year. This is the duration pandas used for the "1Y"
# string; it is written out because the ambiguous "Y" unit is rejected by
# current pandas versions.
one_year = pandas.Timedelta(days=365.2425)
activity_seasonal['year_fraction'] = (activity_seasonal.start_date - year_start) / one_year
activity_seasonal['cos_year_fraction'] = numpy.cos(activity_seasonal.year_fraction*2*numpy.pi)
activity_seasonal['sin_year_fraction'] = numpy.sin(activity_seasonal.year_fraction*2*numpy.pi)

In [31]:
def cyclic_lowess(y, x, **kwargs):
    """LOWESS smoothing of ``y`` against ``x`` on data tiled over three periods.

    The observations are repeated with ``x`` shifted by -1, 0 and +1 before
    smoothing, so the fit near the ends of one unit-length period also sees
    the points from the other end. Extra keyword arguments are forwarded
    unchanged to ``sm.nonparametric.lowess``, whose result is returned.
    """
    x_before, x_after = x - 1, x + 1
    stacked_x = numpy.concatenate((x_before, x, x_after))
    stacked_y = numpy.concatenate((y,) * 3)
    return sm.nonparametric.lowess(stacked_y, stacked_x, **kwargs)

In [32]:
# For a few features: scatter against time of year, with the cosinor fit
# (black) and the cyclic LOWESS curve (red) overlaid
for activity_var in ["acceleration_RA", "phase", "acceleration_overall", "mesor"]:
    results = smf.ols(f"{activity_var} ~ cos_year_fraction + sin_year_fraction", data = activity_seasonal).fit()
    fig, ax = pylab.subplots()
    observed = activity_seasonal[activity_var]
    ax.scatter(activity_seasonal.year_fraction, observed)
    t = numpy.linspace(0, 1, 201)
    lowess = cyclic_lowess(observed, activity_seasonal.year_fraction, it = 0, frac=0.2, xvals=t)
    coefs = results.params
    cosinor_curve = (coefs['Intercept']
                     + coefs['cos_year_fraction'] * numpy.cos(t * numpy.pi * 2)
                     + coefs['sin_year_fraction'] * numpy.sin(t * numpy.pi * 2))
    ax.plot(t, cosinor_curve, c='k')
    ax.plot(t, lowess, c='r')
    ax.set_title(activity_var)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
# Run the cosinor fits for all the features throughout the year
# and "correct" for the seasonal component
# The time-of-year columns are the regressors themselves (or the variable they
# are derived from); they are excluded from the outcomes so that the results
# table does not contain fits of the covariates against themselves.
seasonal_covariates = ["year_fraction", "cos_year_fraction", "sin_year_fraction"]
outcome_columns = [column for column in activity_seasonal.select_dtypes(['number']).columns
                   if column not in seasonal_covariates]
participant_id = activity_seasonal["id"]

seasonal_tests_data = []
for activity_var in outcome_columns:
    results = smf.ols(f"{activity_var} ~ cos_year_fraction + sin_year_fraction", data=activity_seasonal).fit()
    # Residual after removing the fitted seasonal curve (intercept included)
    corrected = activity_seasonal[activity_var] - results.predict(activity_seasonal)
    per_person = corrected.groupby(participant_id)
    corrected_inter_personal = per_person.mean().var()   # variance of the per-person means
    corrected_intra_personal = per_person.var().mean()   # mean of the per-person variances
    corrected_intra_normalized = corrected_intra_personal / corrected_inter_personal

    seasonal_tests_data.append({
        "activity_var": activity_var,
        "p": results.f_pvalue,
        "cos": results.params['cos_year_fraction'],
        "sin": results.params['sin_year_fraction'],
        "corrected_inter_personal": corrected_inter_personal,
        "corrected_intra_personal": corrected_intra_personal,
        "corrected_intra_personal_normalized": corrected_intra_normalized,
    })
seasonal_tests = pandas.DataFrame(seasonal_tests_data)

In [34]:
# Amplitude of the fitted yearly cycle, in raw units and relative to the
# between-person standard deviation; also attach the uncorrected variance ratio
cos_coef = seasonal_tests['cos']
sin_coef = seasonal_tests['sin']
tested_vars = seasonal_tests['activity_var']
seasonal_tests['amplitude'] = numpy.sqrt(cos_coef**2 + sin_coef**2)
seasonal_tests['inter_personal_std'] = numpy.sqrt(tested_vars.map(inter_personal))
seasonal_tests['std_amplitude'] = seasonal_tests['amplitude'] / seasonal_tests['inter_personal_std']
seasonal_tests['intra_personal_normalized'] = tested_vars.map(intra_personal_normalized)

In [35]:
# The 50 variables with the largest seasonal amplitude relative to the between-person standard deviation
seasonal_tests.sort_values(by="std_amplitude", ascending=False).head(50)

Unnamed: 0,activity_var,p,cos,sin,corrected_inter_personal,corrected_intra_personal,corrected_intra_personal_normalized,amplitude,inter_personal_std,std_amplitude,intra_personal_normalized
192,light_RA,0.0,-0.236109,0.009904,0.015031,0.022879,1.522099,0.236317,0.133852,1.765505,3.291265
181,light_hourly_SD,0.0,-29.746112,2.141347,264.867729,449.587928,1.697405,29.823088,17.511559,1.703052,3.358896
185,light_hourly_SD_M10,0.0,-32.133472,2.300388,312.094675,528.679312,1.693971,32.215708,19.071916,1.68917,3.306772
186,light_within_day_SD_M10,0.0,-24.581593,0.448664,182.285072,300.552279,1.648804,24.585687,14.57911,1.686364,3.258965
182,light_within_day_SD,0.0,-24.788309,1.258187,191.848301,328.731278,1.713496,24.82022,14.802245,1.676788,3.339186
34,light_peak_value_mean,0.0,-68.368416,3.343606,1580.544456,2691.303728,1.70277,68.450127,42.484313,1.611186,3.182205
12,light_90th_mean,0.0,-79.090682,6.186843,2341.514966,4181.174262,1.785671,79.332294,51.163009,1.550579,3.185737
184,light_overall_M10,0.0,-40.960549,2.423316,661.591587,1201.643546,1.816292,41.032171,27.241426,1.506242,3.099778
49,light_90th_SD,0.0,-37.094328,4.812004,550.504423,1229.283854,2.233014,37.405141,25.005226,1.495893,3.425805
187,light_between_day_SD_M10,0.0,-19.742188,2.870412,160.86439,307.176026,1.909534,19.949768,13.399058,1.488893,3.16743


In [43]:
# Variance ratio before (x) versus after (y) removing the fitted seasonal component
fig, ax = pylab.subplots()
original_ratio = seasonal_tests['intra_personal_normalized']
corrected_ratio = seasonal_tests['corrected_intra_personal_normalized']
ax.scatter(original_ratio, corrected_ratio, marker="+")
# Dashed guides at ratio 1 on both axes
guide_style = {"c": "k", "linestyle": "--"}
ax.axhline(1, **guide_style)
ax.axvline(1, **guide_style)
ax.set_xlabel("Original Within-/Between-person variance ratio")
ax.set_ylabel("Seasonally corrected Within-/Between-person variance ratio")
fig.savefig(OUTDIR+"seasonal_correction.png")

<IPython.core.display.Javascript object>

In [37]:
# Variables whose ratio is above 1 before the seasonal correction and below 1 after it
print("Variables which pass controls after correcting for seasonality:")
passes_after = seasonal_tests['corrected_intra_personal_normalized'] < 1
failed_before = seasonal_tests['intra_personal_normalized'] > 1
seasonal_tests[passes_after & failed_before]

Variables which pass controls after correcting for seasonality:


Unnamed: 0,activity_var,p,cos,sin,corrected_inter_personal,corrected_intra_personal,corrected_intra_personal_normalized,amplitude,inter_personal_std,std_amplitude,intra_personal_normalized
168,temp_within_day_SD,0.0,0.202846,0.063706,0.062517,0.050945,0.814897,0.212615,0.256228,0.829789,1.232251
178,temp_RA,5.389627e-312,-0.012965,-0.002928,0.000362,0.000319,0.880397,0.013291,0.019423,0.684311,1.148876


In [None]:
# Write the per-variable seasonal fit results as a tab-separated file, without the row index
seasonal_tests.to_csv("../processed/inter_intra_personal_variance.seasonal_correction.txt", sep="\t", index=False)

In [44]:
# Rank variables by how much the seasonal correction changed their variance ratio
print("Biggest changes in variables due to seasonal corrections:")
ratio_shift = seasonal_tests['corrected_intra_personal_normalized'] - seasonal_tests['intra_personal_normalized']
seasonal_tests['difference'] = ratio_shift.abs()
seasonal_tests.sort_values('difference', ascending=False).head(40)

Biggest changes in variables due to seasonal corrections:


Unnamed: 0,activity_var,p,cos,sin,corrected_inter_personal,corrected_intra_personal,corrected_intra_personal_normalized,amplitude,inter_personal_std,std_amplitude,intra_personal_normalized,difference
192,light_RA,0.0,-0.236109,0.009904,0.015031,0.022879,1.522099,0.236317,0.133852,1.765505,3.291265,1.769166
181,light_hourly_SD,0.0,-29.746112,2.141347,264.867729,449.587928,1.697405,29.823088,17.511559,1.703052,3.358896,1.661491
182,light_within_day_SD,0.0,-24.788309,1.258187,191.848301,328.731278,1.713496,24.82022,14.802245,1.676788,3.339186,1.62569
185,light_hourly_SD_M10,0.0,-32.133472,2.300388,312.094675,528.679312,1.693971,32.215708,19.071916,1.68917,3.306772,1.612801
186,light_within_day_SD_M10,0.0,-24.581593,0.448664,182.285072,300.552279,1.648804,24.585687,14.57911,1.686364,3.258965,1.610162
34,light_peak_value_mean,0.0,-68.368416,3.343606,1580.544456,2691.303728,1.70277,68.450127,42.484313,1.611186,3.182205,1.479435
12,light_90th_mean,0.0,-79.090682,6.186843,2341.514966,4181.174262,1.785671,79.332294,51.163009,1.550579,3.185737,1.400066
184,light_overall_M10,0.0,-40.960549,2.423316,661.591587,1201.643546,1.816292,41.032171,27.241426,1.506242,3.099778,1.283486
187,light_between_day_SD_M10,0.0,-19.742188,2.870412,160.86439,307.176026,1.909534,19.949768,13.399058,1.488893,3.16743,1.257896
49,light_90th_SD,0.0,-37.094328,4.812004,550.504423,1229.283854,2.233014,37.405141,25.005226,1.495893,3.425805,1.192792
