In [2]:
import os
import pandas as pd
import numpy as np
import csv
import sys
import math
import matplotlib.pyplot as plt
import matplotlib.dates as dates

from tqdm import tqdm
from datetime import datetime, timedelta, time, date
from sklearn.linear_model import LinearRegression
from scipy import stats
from statistics import median
import statsmodels.api as sm
import warnings
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
from matplotlib.transforms import offset_copy

warnings.simplefilter("ignore")

The LifeAtCMU_Phase1/ and LifeAtCMU_Phase2/ folders contain the underlying Fitbit data. The first step is to combine each participant's raw sleep and raw steps data and save the result as a single file per participant.

Here, we go from sleep_raw_data/ and steps_raw_data/ to sleep_steps_data/.

In [3]:
def getPath(phase):
    """Return the data folder that belongs to a study phase.

    Parameters
    ----------
    phase : str
        'lac1' or 'lac2'.

    Returns
    -------
    str
        'LifeAtCMU_Phase1/' for 'lac1', 'LifeAtCMU_Phase2/' for 'lac2'.

    Raises
    ------
    ValueError
        If ``phase`` is neither 'lac1' nor 'lac2'.
    """
    if phase == 'lac1':
        return 'LifeAtCMU_Phase1/'
    if phase == 'lac2':
        return 'LifeAtCMU_Phase2/'
    # Raising a bare string is itself a TypeError in Python 3; raise a real
    # exception so the message actually reaches the caller.
    raise ValueError('Invalid phase argument')

In [4]:
# combines all raw sleep and raw steps
# input: "lac1" or "lac2"
def saveAllCombined(phase):
    """Merge each participant's raw sleep and steps files onto one per-minute grid.

    For every filename present in both ``sleep_raw_data/`` and
    ``steps_raw_data/`` under the phase folder, writes
    ``sleep_steps_data/<filename>``: a CSV indexed by ``time`` (one row per
    minute of the phase's date range) with columns ``sleep_value`` and
    ``steps``. Outputs that already exist are skipped, so the function can be
    re-run to resume an interrupted pass.

    Parameters
    ----------
    phase : str
        'lac1' or 'lac2'.

    Raises
    ------
    ValueError
        If ``phase`` is neither 'lac1' nor 'lac2'.

    Notes
    -----
    A file that fails to process is reported (name and error) and skipped;
    it does not abort the remaining files.
    """
    semester_bounds = {
        'lac1': ('1/16/2017', '5/15/2017'),
        'lac2': ('1/16/2018', '5/15/2018'),
    }
    if phase not in semester_bounds:
        # Raising a bare string is a TypeError in Python 3.
        raise ValueError('Invalid phase argument')

    range_start, range_end = semester_bounds[phase]
    TIME_INDEX = pd.date_range(start=range_start, end=range_end, freq='min')

    path = getPath(phase)

    sleep_dir = os.path.join(path, 'sleep_raw_data/')
    steps_dir = os.path.join(path, 'steps_raw_data/')
    combined_dir = os.path.join(path, 'sleep_steps_data/')

    # Without the output folder every to_csv call would fail and be reported
    # as a per-file error.
    os.makedirs(combined_dir, exist_ok=True)

    # Only participants that have both a sleep file and a steps file.
    both = sorted(set(os.listdir(sleep_dir)) & set(os.listdir(steps_dir)))

    DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

    def to_minute(stamp):
        # Parse a raw timestamp string and truncate it to the minute.
        return datetime.strptime(stamp, DATE_FORMAT).replace(second=0)

    for filename in tqdm(both):

        dest = os.path.join(combined_dir, filename)
        if os.path.exists(dest):
            continue

        try:
            df_sleep = pd.read_csv(os.path.join(sleep_dir, filename))
            df_steps = pd.read_csv(os.path.join(steps_dir, filename))

            # Convert times to datetime objects
            # (the sleep file uses 'dateTime', the steps file 'datetime')
            df_sleep['dt_typed'] = df_sleep['dateTime'].apply(to_minute)
            df_steps['dt_typed'] = df_steps['datetime'].apply(to_minute)
            df_sleep = df_sleep.rename(columns={'value' : 'sleep_value'})

            df = pd.DataFrame(index=TIME_INDEX)
            df = df.merge(df_sleep[['sleep_value','dt_typed']],how='left',left_index=True,right_on='dt_typed')
            df = df.merge(df_steps[['steps','dt_typed']],how='left',left_on='dt_typed',right_on='dt_typed')
            # Several raw rows can fall in the same minute; keep the first.
            df = df.drop_duplicates(subset='dt_typed', keep='first')
            df = df.set_index('dt_typed')

            # lac1 only gives steps every 5 minutes, so we interpolate
            if phase == 'lac1':
                df['steps'] = df['steps'].interpolate()

            df.index.names = ['time']
            df.sort_index(inplace=True)
            df.to_csv(dest)
        except Exception as err:
            # Best effort per file, but say why it failed and let
            # KeyboardInterrupt / SystemExit propagate.
            print(filename, repr(err))

In [5]:
# saveAllCombined('lac1')
# saveAllCombined('lac2')

Once we have all the combined sleep-steps files, we then want to extract the sleep episodes. To do this, we have two parameters: MIN_CONSEC_NON_AWAKE and MAX_CONSEC_AWAKE.

MIN_CONSEC_NON_AWAKE is the number of consecutive minutes that a student has to be asleep or restless for an episode to be recognized, and MAX_CONSEC_AWAKE is the number of consecutive awake minutes required to terminate an episode. Any timepoint where steps are positive is considered awake.

In [6]:
# calculates the number of minutes that separates two datetimes
def diff_min(dt1,dt2):
    """Return the whole minutes elapsed from ``dt2`` to ``dt1``.

    Both arguments are strings in '%Y-%m-%d %H:%M:%S' format. The result is
    ``dt1 - dt2`` in minutes, truncated toward zero, so it is negative when
    ``dt1`` is earlier than ``dt2``.
    """
    fmt = '%Y-%m-%d %H:%M:%S'
    first, second = (datetime.strptime(stamp, fmt) for stamp in (dt1, dt2))
    elapsed_seconds = (first - second).total_seconds()
    return int(elapsed_seconds / 60)

In [7]:
# Default thresholds for getSubjectSleepEpisodes, counted in rows of the
# combined per-minute data.
# An episode starts once this many consecutive non-awake rows have been seen.
MIN_CONSEC_NON_AWAKE = 20
# An open episode ends once this many consecutive awake rows have been seen.
MAX_CONSEC_AWAKE = 5

In [8]:
def getSubjectSleepEpisodes(phase, 
                            filename,
                            min_consec_non_awake = MIN_CONSEC_NON_AWAKE,
                            max_consec_awake = MAX_CONSEC_AWAKE):
    """Extract sleep episodes from one participant's combined sleep/steps file.

    The file ``<phase folder>/sleep_steps_data/<filename>`` is scanned row by
    row. A row counts as *awake* when ``sleep_value`` is NaN, 3 or 0, and as
    *non-awake* when it is 1 (asleep) or 2 (restless). An episode opens once
    ``min_consec_non_awake`` consecutive non-awake rows have been seen and
    closes once ``max_consec_awake`` consecutive awake rows have been seen.

    Parameters
    ----------
    phase : str
        Phase identifier passed to ``getPath``.
    filename : str
        Name of the participant file inside ``sleep_steps_data/``.
    min_consec_non_awake : int
        Consecutive non-awake rows needed to open an episode.
    max_consec_awake : int
        Consecutive awake rows needed to close an episode.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``start_time`` with columns ``end_time``, ``length``,
        ``time_asleep``, ``time_restless`` and ``time_awake``.

        * ``start_time`` is the last awake row before the non-awake run
          (or the first row of the file if no awake row preceded it).
        * ``end_time`` is the first row of the awake run that closed the
          episode.
        * ``length`` is ``end_time - start_time`` in minutes.
        * ``time_restless`` / ``time_awake`` count rows with value 2 / 3
          between the first non-awake row and ``end_time`` (exclusive).
        * ``time_asleep`` is ``length - time_restless - time_awake``, so rows
          that are NaN/0 inside the episode are attributed to it.

    Raises
    ------
    Exception
        If a ``sleep_value`` is not NaN, 0, 1, 2 or 3.

    Notes
    -----
    An episode still open when the data ends is not reported.
    """
    path = getPath(phase)

    COMBINED_DIR = os.path.join(path, 'sleep_steps_data/')

    # Read combined data
    filename = os.path.join(COMBINED_DIR, filename)
    combined_df = pd.read_csv(filename, index_col=0)

    # Define fields
    columns = ['start_time', 'end_time', 'length', 'time_asleep', 'time_restless', 'time_awake']

    # Make dictionary
    episode_dict = {} # start_time -> ['end_time', 'length', 'time_asleep', 'time_restless', 'time_awake']

    timestamps = combined_df.index
    sleep_values = combined_df['sleep_value']

    # tracks the last time point the subject was awake
    last_awake = timestamps[0] if len(timestamps) else None

    # used for tracking when in or not in an episode
    consec_non_awake = 0
    consec_awake = 0
    on_episode = False
    start_index = None
    first_non_awake_loc = None

    # iterate through the subject's combined data; enumerate gives the
    # positional location directly instead of an index lookup per row
    for loc, (index, sleep) in enumerate(zip(timestamps, sleep_values)):

        # update counts
        if math.isnan(sleep) or sleep == 3 or sleep == 0: # 3 = awake, 0 = missing in lac1
            consec_awake += 1
            consec_non_awake = 0
            last_awake = index
        elif sleep == 1 or sleep == 2: # 1 = asleep, 2 = restless
            consec_awake = 0
            consec_non_awake += 1
        else:
            raise Exception('sleep value is not NaN, 0, 1, 2, or 3')

        # update episode status
        # starting new episode
        if not on_episode and consec_non_awake >= min_consec_non_awake:
            on_episode = True
            start_index = last_awake
            # The episode is only recognised at the end of the qualifying
            # run, so step back to the row where that run began; otherwise
            # restless minutes in the first rows would never be counted.
            first_non_awake_loc = loc - consec_non_awake + 1
        # ending episode
        elif on_episode and consec_awake >= max_consec_awake:
            # first row of the awake run that terminates the episode
            end_loc = loc - max_consec_awake + 1
            end_index = timestamps[end_loc]
            length = diff_min(end_index, start_index)
            # Count only rows inside the episode: stop at end_loc so the
            # terminating awake run is not included in time_awake.
            episode_values = sleep_values.iloc[first_non_awake_loc:end_loc]
            time_awake = int((episode_values == 3).sum())
            time_restless = int((episode_values == 2).sum())
            time_asleep = length - time_restless - time_awake
            episode_dict[start_index] = [end_index, length, time_asleep, time_restless, time_awake]
            on_episode = False

    # Make dataframe from dictionary
    episode_df = pd.DataFrame.from_dict(episode_dict, orient='index', columns=columns[1:])
    episode_df.index.name = columns[0]

    return episode_df

In [9]:
# returns the main episode date for given time
def getWindow(time):
    """Map a timestamp to the date of the night it belongs to.

    Times before noon are assigned to the previous calendar day; times from
    noon onward are assigned to their own calendar day.
    """
    shift = timedelta(days=1) if time.hour < 12 else timedelta(0)
    return (time - shift).date()

In [10]:
def computeAllEpisodesForParams(phase,
                                min_consec_non_awake, 
                                max_consec_awake):
    """Extract and save sleep episodes for every participant of a phase.

    For each file in ``<phase folder>/sleep_steps_data/`` the episodes are
    computed with ``getSubjectSleepEpisodes`` and written to
    ``<phase folder>/sleep_episodes/<min>_<max>/EPI_<id>_<min>_<max>.csv``,
    where ``<id>`` is the source filename up to its first '.'.
    Existing output files are overwritten.

    Parameters
    ----------
    phase : str
        Phase identifier passed to ``getPath``.
    min_consec_non_awake : int
        Consecutive non-awake minutes needed to open an episode.
    max_consec_awake : int
        Consecutive awake minutes needed to close an episode.
    """
    path = getPath(phase)

    COMBINED_DIR = os.path.join(path, 'sleep_steps_data/')
    EPISODES_DIR = os.path.join(path, 'sleep_episodes/')

    param_tag = str(min_consec_non_awake) + '_' + str(max_consec_awake)
    directory = os.path.join(EPISODES_DIR, param_tag + '/')

    # makedirs also creates sleep_episodes/ itself when it is missing;
    # os.mkdir would fail with FileNotFoundError in that case.
    os.makedirs(directory, exist_ok=True)

    for src in tqdm(os.listdir(COMBINED_DIR)):

        dest = 'EPI_' + src.split('.')[0] + '_' + param_tag + '.csv'
        dest = os.path.join(directory, dest)

        df = getSubjectSleepEpisodes(
                 phase,
                 src,
                 min_consec_non_awake,
                 max_consec_awake)
        df.to_csv(dest)

In [11]:
# computeAllEpisodesForParams('lac1',
#                             MIN_CONSEC_NON_AWAKE,
#                             MAX_CONSEC_AWAKE)

In [12]:
# computeAllEpisodesForParams('lac2',
#                             MIN_CONSEC_NON_AWAKE,
#                             MAX_CONSEC_AWAKE)

Once we've extracted the sleep episodes (which are in the sleep_episodes/ folder under a subdirectory titled "MIN_CONSEC_NON_AWAKE"_"MAX_CONSEC_AWAKE"), we then have to compute the sleep features of interest.

To do so, we need to convert the bedtime and waketime into minutes after a given zero (e.g. minutes after 6 pm) since time is modulo 24 hr. For bedtime, students generally go to bed after 6 pm, and for waketime, students generally wake up after 4 am (these numbers were not chosen formally). Thus, we set BED_ZERO = 18 (i.e. 6 pm), and WAKE_ZERO = 4 (i.e. 4 am), and convert bedtime and waketime to the number of minutes after these respective zeros.

Additionally, we identified the main sleep episode of Day n as the longest sleep episode which started between noon of Day n and noon of Day (n+1).

In [13]:
# Reference hours (0-23) passed to timeToMin as zero_hour.
# Bedtime is expressed as minutes after 18:00.
BED_ZERO = 18
# Waketime is expressed as minutes after 04:00.
WAKE_ZERO = 4

In [14]:
def timeToMin(time, zero_hour):
    """Return the minutes elapsed since ``zero_hour`` o'clock.

    Hours earlier than ``zero_hour`` are treated as belonging to the next
    day, i.e. 24 hours are added before subtracting. Seconds are ignored.
    """
    wrap = 24 if time.hour < zero_hour else 0
    hours_after_zero = time.hour + wrap - zero_hour
    return hours_after_zero * 60 + time.minute

In [15]:
def getEpisodeData(phase,
                   filename, # ex: 'PID001.csv'
                   min_consec_non_awake=20,
                   max_consec_awake=5):
    """Load one participant's saved sleep episodes and tag each with its night.

    Reads ``<phase folder>/sleep_episodes/<min>_<max>/EPI_<id>_<min>_<max>.csv``
    where ``<id>`` is ``filename`` up to its first '.'.

    Parameters
    ----------
    phase : str
        Phase identifier passed to ``getPath``.
    filename : str
        Participant file name or bare id, e.g. 'PID001.csv' or 'PID001'.
    min_consec_non_awake, max_consec_awake : int
        Parameters identifying which episode extraction to read; the
        defaults select the ``20_5`` folder.

    Returns
    -------
    pandas.DataFrame
        The episode rows with ``start_time`` / ``end_time`` parsed to
        datetimes, plus ``subject_id`` and ``main_episode_of`` (the date
        returned by ``getWindow`` for the start time). Rows are ordered by
        ``main_episode_of`` and, within a night, from longest to shortest.
    """
    DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

    path = getPath(phase)
    subject_id = filename.split('.')[0]
    param_tag = str(min_consec_non_awake) + '_' + str(max_consec_awake)

    EPISODES_DIR = os.path.join(path, 'sleep_episodes/' + param_tag + '/')
    file = os.path.join(EPISODES_DIR, 'EPI_' + subject_id + '_' + param_tag + '.csv')

    df = pd.read_csv(file, header=0)

    df['start_time'] = df['start_time'].apply(lambda x: datetime.strptime(x,DATE_FORMAT))
    df['end_time'] = df['end_time'].apply(lambda x: datetime.strptime(x,DATE_FORMAT))

    # define a main sleep episode as the longest sleep episode
    # in sleep window
    df['subject_id'] = subject_id
    df['main_episode_of'] = df['start_time'].apply(lambda x: getWindow(x))

    # One multi-key sort: two successive single-key sorts with the default
    # (unstable) algorithm do not preserve the length ordering within a night.
    df = df.sort_values(by=['main_episode_of', 'length'], ascending=[True, False])

    return df

In [16]:
def getMainEpisodeData(phase, filename):
    """Return the longest sleep episode of each night for one participant.

    Episodes come from ``getEpisodeData``; for every ``main_episode_of``
    value only the row with the greatest ``length`` is kept. The result is
    ordered by ``main_episode_of``.
    """
    episodes = getEpisodeData(phase, filename)
    longest_first = episodes.sort_values(by='length', ascending=False)
    one_per_night = longest_first.drop_duplicates(subset='main_episode_of',
                                                  keep='first')
    return one_per_night.sort_values(by='main_episode_of')

Here, we compute various sleep features of interest. 

MSSD refers to mean successive squared difference. This is a measure of variability that also takes into account the temporal nature of the data. For instance, the MSSD of [3,4,10] is ((4-3)^2 + (10-4)^2)/2 = 37/2

Not all of these features are used in this SLEEP poster, but they are included to support further analyses.

In [17]:
def getMainEpisodeMSSD(main_episode_df):
    """Compute successive-difference variability of bedtime, waketime and midpoint.

    Rows are ordered by ``main_episode_of``; for each pair of consecutive
    rows the squared difference of bedtime, waketime and midpoint is taken.
    Bedtime is minutes after ``BED_ZERO`` of ``start_time``, waketime is
    minutes after ``WAKE_ZERO`` of ``end_time`` and midpoint is the average
    of the two.

    Parameters
    ----------
    main_episode_df : pandas.DataFrame
        Needs columns ``main_episode_of``, ``start_time`` and ``end_time``.
        The frame passed in is not modified.

    Returns
    -------
    tuple
        ``(bt_mssd, wt_mssd, mp_mssd, bt_median, wt_median, mp_median)``.
        The first three are the mean squared successive differences divided
        by 3600 (minutes squared -> hours squared). The last three are the
        medians of the squared differences, left in minutes squared.
        All six are NaN when there are fewer than two rows.
    """
    main_episode_df = main_episode_df.sort_values(by='main_episode_of')

    # number of consecutive pairs
    count = len(main_episode_df) - 1
    if count <= 0:
        return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan

    bedtime = main_episode_df['start_time'].apply(lambda x: timeToMin(x,BED_ZERO))
    waketime = main_episode_df['end_time'].apply(lambda x: timeToMin(x,WAKE_ZERO))
    midpoint_sleep = (waketime + bedtime) / 2.0

    # Positional differences between neighbouring rows; this does not depend
    # on the index labels being unique.
    bt_total = np.diff(bedtime.to_numpy(dtype=float)) ** 2
    wt_total = np.diff(waketime.to_numpy(dtype=float)) ** 2
    mp_total = np.diff(midpoint_sleep.to_numpy(dtype=float)) ** 2

    # .mean() already divides by the number of pairs; dividing by count a
    # second time would shrink the MSSD by that factor.
    bt_mssd = bt_total.mean() / 3600.0
    wt_mssd = wt_total.mean() / 3600.0
    mp_mssd = mp_total.mean() / 3600.0

    bt_mssd_median = median(bt_total.tolist())
    wt_mssd_median = median(wt_total.tolist())
    mp_mssd_median = median(mp_total.tolist())

    return bt_mssd, wt_mssd, mp_mssd, bt_mssd_median, wt_mssd_median, mp_mssd_median

In [18]:
# start_window and end_window are date objects
# extracts start to end inclusive
def computeSummaryStats(phase, 
                        filename, # ex: 'PID001.csv'
                        start_window, 
                        end_window):
    """Compute sleep summary features for one participant over a date window.

    Parameters
    ----------
    phase : str
        Phase identifier passed on to ``getEpisodeData``.
    filename : str
        Participant file name, e.g. 'PID001.csv'; the part before the first
        '.' is used as ``subject_id``.
    start_window, end_window : datetime.date
        First and last night (inclusive) of the window, matched against
        ``main_episode_of``.

    Returns
    -------
    dict or None
        ``None`` when the participant has no main episode in the window.
        Otherwise a dict with ``subject_id`` and the nap, bedtime, waketime,
        midpoint, weekday/weekend, MSSD and TST features. Time-of-day values
        are minutes after ``BED_ZERO`` / ``WAKE_ZERO``; weekend nights are
        Friday and Saturday nights.
    """
    summary = dict()

    subject_id = filename.split('.')[0]
    summary['subject_id'] = subject_id

    all_df = getEpisodeData(phase, subject_id)
    main_df = getMainEpisodeData(phase, subject_id)

    # 'main_episode_of' holds datetime.date objects, so the bounds must be
    # dates too: ordering a date against a pandas Timestamp raises TypeError
    # on pandas 2.
    start = pd.Timestamp(start_window).date()
    end = pd.Timestamp(end_window).date()

    # .copy() because derived columns are added to main_df below
    if len(main_df) > 0:
        main_df = main_df[main_df['main_episode_of'].between(start, end)].copy()
    if len(all_df) > 0:
        all_df = all_df[all_df['main_episode_of'].between(start, end)]

    # every episode that is not the main episode of its night is a nap
    nap_df = all_df[~all_df.start_time.isin(main_df.start_time)]

    summary['num_naps'] = len(nap_df)
    summary['avg_nap_length'] = nap_df['length'].mean()

    n_episodes = len(all_df)
    if n_episodes > 0:
        total_sleep = all_df['length'].sum()
        summary['frac_napped_of_total_sleep'] = nap_df['length'].sum() / total_sleep
        summary['avg_24_hr_sleep'] = total_sleep / len(all_df['main_episode_of'].unique())
        summary['frac_sleep_episodes_as_naps'] = len(nap_df) / n_episodes
    else: # all_df is empty
        summary['frac_napped_of_total_sleep'] = np.nan
        summary['avg_24_hr_sleep'] = np.nan
        summary['frac_sleep_episodes_as_naps'] = np.nan

    summary['frac_nights_with_data'] = len(main_df) / ((end_window-start_window).days + 1)

    if len(main_df) == 0:
        return None

    main_df['bedtime'] = main_df['start_time'].apply(lambda x: timeToMin(x,BED_ZERO))
    main_df['waketime'] = main_df['end_time'].apply(lambda x: timeToMin(x,WAKE_ZERO))
    main_df['midpoint_sleep'] = (main_df['waketime'] + main_df['bedtime']) / 2.0

    main_df['proportion_awake'] = main_df['time_awake'] / main_df['length']
    main_df['proportion_restless'] = main_df['time_restless'] / main_df['length']
    main_df['weekday'] = main_df['main_episode_of'].apply(lambda x: x.weekday())

    # 4 = friday night, 5 = saturday night
    main_df['is_weekend'] = main_df['weekday'].isin([4,5])

    # bedtime measures
    summary['bedtime'] = main_df['bedtime'].mean()
    summary['bedtime_std'] = main_df['bedtime'].std()

    # waketime measures
    summary['waketime'] = main_df['waketime'].mean()

    # midpoint sleep measures
    summary['midpoint_sleep'] = main_df['midpoint_sleep'].mean()

    # weekday and weekend measures; the mean of an empty selection is NaN,
    # which covers windows without any weekday or weekend night
    weekend_mask = main_df['is_weekend']
    for mask, suffix in ((~weekend_mask, '_weekday'), (weekend_mask, '_weekend')):
        for col in ('bedtime', 'waketime', 'midpoint_sleep'):
            summary[col + suffix] = main_df.loc[mask, col].mean()

    # NaN propagates when either side is missing
    summary['social_jetlag'] = summary['bedtime_weekend'] - summary['bedtime_weekday']

    summary['time_in_bed'] = main_df['length'].mean()

    bt_mssd, wt_mssd, mp_mssd, bt_mssd_median, wt_mssd_median, mp_mssd_median = getMainEpisodeMSSD(main_df)

    summary['bedtime_mssd'] = bt_mssd
    summary['waketime_mssd'] = wt_mssd
    summary['midpoint_sleep_mssd'] = mp_mssd

    summary['bedtime_mssd_median'] = bt_mssd_median
    summary['waketime_mssd_median'] = wt_mssd_median
    summary['midpoint_sleep_mssd_median'] = mp_mssd_median

    summary['WASO_fraction'] = main_df['proportion_awake'].mean()
    summary['restless_fraction'] = main_df['proportion_restless'].mean()
    summary['TST'] = main_df['time_asleep'].mean()
    summary['TST_std'] = main_df['time_asleep'].std()

    return summary

In [19]:
# start_window and end_window are date objects
def computeAllSummaryStats(phase,
                           start_window, 
                           end_window,
                           overwrite=False):
    """Write the per-participant summary features of a window to one CSV.

    The output is
    ``<phase folder>/computed_features/<start_window>_<end_window>.csv`` with
    one row per participant. Participants for which ``computeSummaryStats``
    returns nothing get a row of NaN with ``frac_nights_with_data`` set to 0.

    Parameters
    ----------
    phase : str
        Phase identifier passed to ``getPath``.
    start_window, end_window : datetime.date
        First and last night (inclusive) of the window.
    overwrite : bool
        When False and the output file already exists, nothing is computed.

    Returns
    -------
    str or None
        'Already written' when the existing file is kept, otherwise None.
    """
    path = getPath(phase)

    FEATURES_FOLDER = os.path.join(path, 'computed_features/')
    feature_file = str(start_window) + '_' + str(end_window) + '.csv'
    feature_file = os.path.join(FEATURES_FOLDER,feature_file)

    if os.path.exists(feature_file) and not overwrite:
        return 'Already written'

    # NOTE(review): getPath in this notebook only accepts 'lac1'/'lac2', so
    # the 'uw*_sum' branch looks unreachable from here - confirm before
    # relying on it.
    if phase not in ['uw1_sum', 'uw2_sum']:
        ids = os.listdir(os.path.join(path, 'sleep_steps_data/'))
    else:
        dir_ids = os.listdir(os.path.join(path, 'sleep_episodes/'))
        ids = [x.split('_')[1] for x in dir_ids]

    header = ['subject_id',
              'num_naps',
              'avg_nap_length',
              'frac_napped_of_total_sleep',
              'avg_24_hr_sleep',
              'frac_sleep_episodes_as_naps',
              'frac_nights_with_data',
              'bedtime',
              'waketime',
              'midpoint_sleep',
              'time_in_bed',
              'bedtime_mssd',
              'bedtime_mssd_median',
              'bedtime_std',
              'bedtime_weekend',
              'bedtime_weekday',
              'waketime_mssd',
              'waketime_mssd_median',
              'waketime_weekend',
              'waketime_weekday',
              'midpoint_sleep_mssd',
              'midpoint_sleep_mssd_median',
              'midpoint_sleep_weekend',
              'midpoint_sleep_weekday',
              'WASO_fraction',
              'restless_fraction',
              'social_jetlag',
              'TST',
              'TST_std']

    # the features folder may not exist on a fresh checkout
    os.makedirs(FEATURES_FOLDER, exist_ok=True)

    # newline='' is what the csv module expects; without it extra blank
    # lines appear between rows on platforms that translate '\n'.
    with open(feature_file, 'w', newline='') as f:

        w = csv.DictWriter(f, header)
        w.writeheader()

        for filename in ids:

            summary = computeSummaryStats(phase, 
                                          filename,
                                          start_window, 
                                          end_window)

            if not summary:
                # no main episode in the window: placeholder row
                summary = {val: np.nan for val in header}
                summary['subject_id'] = filename.split('.')[0]
                summary['frac_nights_with_data'] = 0

            w.writerow(summary)

In [20]:
# lac1
# computeAllSummaryStats('lac1', date(2017,1,16), date(2017,5,15))

# lac2
# computeAllSummaryStats('lac2', date(2018,1,16), date(2018,5,15))

In [21]:
# given the start and end of the semester, get the week of semester that contains date
def getWeek(date, start, end):
    """Return the semester-week number that contains ``date``.

    Week boundaries are the dates produced by
    ``pd.date_range(start, end, freq='W')``. Dates from ``start`` up to (not
    including) the first boundary are week 0, dates from the first boundary
    up to the second are week 1, and so on; dates from the last boundary
    through ``end`` get the number of boundaries.

    Parameters
    ----------
    date : datetime.date, datetime.datetime or pandas.Timestamp
        Day to look up; any time-of-day part is ignored.
    start, end : str
        Semester bounds in '%m/%d/%Y' format.

    Returns
    -------
    int
        The week number, or -1 when ``date`` lies outside ``[start, end]``.
    """
    start_date = datetime.strptime(start, '%m/%d/%Y').date()
    end_date = datetime.strptime(end, '%m/%d/%Y').date()

    # Compare plain dates throughout: ordering a date against a pandas
    # Timestamp raises TypeError on pandas 2.
    day = pd.Timestamp(date).date()

    if not (start_date <= day <= end_date):
        return -1

    boundaries = [stamp.date() for stamp in pd.date_range(start=start, end=end, freq='W')]

    # The week number equals the number of boundaries at or before the day;
    # this is 0 when the range contains no boundary at all.
    return sum(1 for boundary in boundaries if boundary <= day)

In [22]:
def getAllMainEpisodeData(phase):
    """Stack the main-episode tables of every participant in a phase.

    Participants are the entries of ``<phase folder>/sleep_steps_data/``;
    each is loaded with ``getMainEpisodeData`` and the frames are
    concatenated in listing order.
    """
    data_dir = os.path.join(getPath(phase), 'sleep_steps_data/')

    frames = []
    for filename in os.listdir(data_dir):
        frames.append(getMainEpisodeData(phase, filename))

    return pd.concat(frames)

In [23]:
# brings column values to front in order listed for dataframe
def bringToFront(df, col_names):
    """Return ``df`` with ``col_names`` moved to the front, in the given order.

    The remaining columns keep their relative order. A name that is not a
    column of ``df`` raises ValueError.
    """
    ordered = list(df.columns)
    # Walk the names back to front so the first name ends up leftmost.
    for name in reversed(col_names):
        ordered.remove(name)
        ordered.insert(0, name)

    return df.reindex(columns=ordered)

In [24]:
def getSummaryStats(phase,
                    start,
                    end):
    """Load the saved summary-feature table of a window.

    Reads ``<phase folder>/computed_features/<start>_<end>.csv`` and returns
    it indexed by ``subject_id``.
    """
    # get entire semester participant data
    csv_name = '_'.join([str(start), str(end)]) + '.csv'
    csv_path = os.path.join(getPath(phase), 'computed_features/', csv_name)

    return pd.read_csv(csv_path).set_index('subject_id')

In [25]:
def getCESD(phase):
    """Load pre/post CES-D sums for a phase, keeping only usable rows.

    Reads ``EMA_data/<phase>_pre_post_ema.csv`` indexed by ``ID`` (renamed to
    ``subject_id`` and converted to numeric). Rows whose post or pre score is
    the single-space placeholder, or negative, are dropped.

    Returns a DataFrame with columns ``preCESD_sum`` and ``postCESD_sum``.
    """
    csv_path = os.path.join('EMA_data/', phase + '_pre_post_ema.csv')
    df = pd.read_csv(csv_path, index_col='ID', low_memory=False)
    df.index.names = ['subject_id']
    df.index = pd.to_numeric(df.index)

    # Same cleaning for both scores: post first, then pre.
    for score in ('postCESD_sum', 'preCESD_sum'):
        df = df[df[score] != " "]
        df[score] = pd.to_numeric(df[score])
        df = df[df[score] >= 0]

    return df[['preCESD_sum', 'postCESD_sum']]

In [26]:
# Input: X = [pre, predictor], y = [post]
# Output: coefficient, p-value for predictor
def getOLSResult(X,y,predictor):
    """Fit an OLS model of ``y`` on ``X`` plus an intercept.

    Returns the fitted coefficient and the p-value of the column named
    ``predictor``.
    """
    design = sm.add_constant(X)
    fit = sm.OLS(y, design).fit()

    coefficient = fit.params[predictor]
    p_value = fit.pvalues[predictor]
    return coefficient, p_value

In [27]:
# all predictors of interest, GPA features, no nan rows,
# and participants with >=20% fraction of nights data 
def getRegDF(phase, start, end, predictors, control=None, thresh=0.2):
    """Build the regression table for one phase and date window.

    Recomputes the summary features for the window (overwriting any saved
    file), keeps participants whose ``frac_nights_with_data`` is at least
    ``thresh``, joins the CES-D scores on ``subject_id`` and returns only the
    requested columns with incomplete rows removed.

    The returned columns are ``predictors``, then ``preCESD_sum``,
    ``postCESD_sum`` and ``frac_nights_with_data``, then ``control`` if given.
    """
    # compute summary statistics
    computeAllSummaryStats(phase, start, end, overwrite=True)
    stats_df = getSummaryStats(phase, start, end)

    # filter by completeness threshold
    has_enough_nights = stats_df['frac_nights_with_data'] >= thresh
    stats_df = stats_df[has_enough_nights]

    # combine summary statistics with cesd data
    combined_df = stats_df.merge(getCESD(phase), on='subject_id', how='outer')

    # choose only columns of variables of interest
    columns = predictors + ['preCESD_sum', 'postCESD_sum'] + ['frac_nights_with_data']
    if control:
        columns = columns + [control]

    complete_rows = combined_df[columns].dropna()
    return complete_rows[columns]

In [28]:
# Feb 7-28, 2018, thresh=0.2, 160 participants, p = .034
def generateBins(num_weeks):
    """Write one regression table per sliding window of ``num_weeks`` weeks.

    For each starting week ``w`` in ``range(1, 18 - num_weeks)`` the window
    begins ``w`` weeks after the cohort's semester start and ends
    ``num_weeks * 7`` days later. ``getRegDF`` is evaluated for 'lac1' and
    'lac2', a ``cohort`` column is added, and the two tables are stacked and
    saved to ``<num_weeks>_week_bins/<w>_<w + num_weeks>.csv``.

    Parameters
    ----------
    num_weeks : int
        Width of each window in weeks.
    """
    # predictors to test
    predictors = ['TST',
                  'midpoint_sleep',
                  'midpoint_sleep_mssd']

    threshold = 0.2

    semester_starts = {
        'lac1': date(2017,1,18), # LAC1 Spring Break: Mar 13-17, 2017
        'lac2': date(2018,1,17), # LAC2 Spring Break: Mar 12-16, 2018
    }

    out_dir = str(num_weeks)+'_week_bins/'
    os.makedirs(out_dir, exist_ok=True)

    for week in tqdm(range(1,18-num_weeks)):
        days_from_start = week*7

        dfs = []
        for phase in ['lac1', 'lac2']:
            start = semester_starts[phase] + timedelta(days=days_from_start)
            end = start + timedelta(days=num_weeks*7)

            df = getRegDF(phase, start, end, predictors, thresh=threshold)
            df['cohort'] = phase
            dfs.append(df)

        filepath = out_dir+str(week)+'_'+str(week+num_weeks)+'.csv'
        # concatenate once; to_csv returns None, so there is nothing to keep
        pd.concat(dfs).to_csv(filepath)


In [29]:
# generateBins(3)

In [32]:
# Scratch check: the date that lies `week` weeks after 2018-01-17, the lac2
# semester start used in generateBins. The trailing bare `start` displays it.
week = 7
days_from_start = week*7
start = date(2018,1,17) + timedelta(days=days_from_start)
start

# SPRING BREAK IS BETWEEN WEEKS 7-8

datetime.date(2018, 3, 7)