In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.colors as colors
import pickle
import matplotlib.dates as mdates
import matplotlib

In [None]:
# Window half-widths, in rows, for the rolling median in add_rolling_dem.
# The window for row i is rows i-hMinus .. i+hPlus with both ends included,
# i.e. hMinus + hPlus + 1 = 96 rows when it is not clipped by the frame edges.
hMinus = 24*2
hPlus = 24*2-1
#hMinus = 24
#hPlus = 24-1
# Same convention for the rolling IQR windows (add_demand_minus_rolling_dem_iqr
# and add_rolling_delta_iqr): 240 rows when not clipped.
hMinusIQR = 24*5
hPlusIQR = 24*5-1
# Number of 24-row steps taken on each side of a row by add_rolling_dem_long
# and add_hourly_median_dem_deviations (presumably days of hourly data).
nDays = 10
# The "-1" in the hPlus values is based on pandas label slicing, which
# includes the end point of the slice.

def add_rolling_dem(df, h_minus=None, h_plus=None):
    """Add a ``rollingDem`` column: a NaN-aware rolling median of demand.

    For row ``i`` the median is taken over rows ``i - h_minus`` through
    ``i + h_plus`` (both included), clipped to the extent of the frame.
    NaNs inside a window are skipped by ``np.nanmedian``; a window that
    holds nothing but NaNs yields NaN (NumPy emits a RuntimeWarning then).

    Windows are positional. This gives the same rows as label slicing with
    ``.loc`` whenever ``df`` carries the default 0..n-1 index.

    Args:
        df: DataFrame with a ``'demand (MW)'`` column.
        h_minus: Rows to include before the current row. Defaults to the
            module-level ``hMinus``.
        h_plus: Rows to include after the current row. Defaults to the
            module-level ``hPlus``.

    Returns:
        A new DataFrame with the extra ``rollingDem`` column; ``df`` itself
        is not modified.
    """
    h_minus = hMinus if h_minus is None else h_minus
    h_plus = hPlus if h_plus is None else h_plus

    # Pull the column out once; slicing a plain array per row is far cheaper
    # than a .loc lookup per row.
    demand = np.asarray(df['demand (MW)'], dtype=float)
    n_rows = len(demand)

    # Preallocate instead of np.append, which copies the whole array on
    # every iteration.
    rolling = np.full(n_rows, np.nan)
    for i in range(n_rows):
        window = demand[max(0, i - h_minus):min(n_rows, i + h_plus + 1)]
        rolling[i] = np.nanmedian(window)

    return df.assign(rollingDem=rolling)


def add_rolling_dem_long(df, n_days=None):
    """Add a ``rollingDemLong`` column: a long-window NaN-aware rolling median.

    For row ``i`` the median is taken over rows ``i - 24*n_days`` through
    ``i + 24*n_days`` (both included), clipped to the extent of the frame.
    NaNs inside a window are skipped by ``np.nanmedian``; a window that
    holds nothing but NaNs yields NaN (NumPy emits a RuntimeWarning then).

    Windows are positional. This gives the same rows as label slicing with
    ``.loc`` whenever ``df`` carries the default 0..n-1 index.

    Args:
        df: DataFrame with a ``'demand (MW)'`` column.
        n_days: Number of 24-row steps on each side of the current row.
            Defaults to the module-level ``nDays``.

    Returns:
        A new DataFrame with the extra ``rollingDemLong`` column; ``df``
        itself is not modified.
    """
    n_days = nDays if n_days is None else n_days
    half_width = n_days * 24

    demand = np.asarray(df['demand (MW)'], dtype=float)
    n_rows = len(demand)

    # Preallocate instead of np.append, which copies the whole array on
    # every iteration.
    rolling = np.full(n_rows, np.nan)
    for i in range(n_rows):
        window = demand[max(0, i - half_width):min(n_rows, i + half_width + 1)]
        rolling[i] = np.nanmedian(window)

    return df.assign(rollingDemLong=rolling)

def add_demand_minus_rolling_dem(df):
    """Return ``df`` plus a ``dem_minus_rolling`` column.

    The new column is ``'demand (MW)'`` minus ``'rollingDem'``, row by row.
    The input frame is left untouched; a new frame is returned.
    """
    residual = df['demand (MW)'].sub(df['rollingDem'])
    return df.assign(dem_minus_rolling=residual)



def add_demand_minus_rolling_dem_iqr(df, h_minus=None, h_plus=None):
    """Add ``dem_minus_rolling_IQR``: a rolling IQR of ``dem_minus_rolling``.

    For row ``i`` the interquartile range (75th minus 25th percentile,
    NaNs ignored) is computed over rows ``i - h_minus`` through
    ``i + h_plus`` (both included), clipped to the extent of the frame.
    A window holding only NaNs yields NaN (with a NumPy RuntimeWarning).

    Windows are positional. This gives the same rows as label slicing with
    ``.loc`` whenever ``df`` carries the default 0..n-1 index.

    Args:
        df: DataFrame with a ``'dem_minus_rolling'`` column.
        h_minus: Rows to include before the current row. Defaults to the
            module-level ``hMinusIQR``.
        h_plus: Rows to include after the current row. Defaults to the
            module-level ``hPlusIQR``.

    Returns:
        A new DataFrame with the extra column; ``df`` is not modified.
    """
    h_minus = hMinusIQR if h_minus is None else h_minus
    h_plus = hPlusIQR if h_plus is None else h_plus

    residual = np.asarray(df['dem_minus_rolling'], dtype=float)
    n_rows = len(residual)

    # Preallocate instead of np.append, which copies the whole array on
    # every iteration.
    rolling_iqr = np.full(n_rows, np.nan)
    for i in range(n_rows):
        window = residual[max(0, i - h_minus):min(n_rows, i + h_plus + 1)]
        rolling_iqr[i] = np.nanpercentile(window, 75) - np.nanpercentile(window, 25)

    return df.assign(dem_minus_rolling_IQR=rolling_iqr)





def add_hourly_median_dem_deviations(df, n_days=None):
    """Add the typical same-position deviation from the rolling median.

    For every row, ``dem_minus_rolling`` is collected from the row itself
    and from the rows 24, 48, ..., ``24 * n_days`` positions before and
    after it. The NaN-skipping median of those values is stored as
    ``vals_dem_minus_rolling``. ``hourly_median_dem_dev`` is then
    ``1 + vals_dem_minus_rolling / rollingDemLong``, i.e. the deviation
    expressed as a scale factor.

    Args:
        df: DataFrame with ``'dem_minus_rolling'`` and ``'rollingDemLong'``
            columns.
        n_days: Number of 24-row steps to look at on each side. Defaults to
            the module-level ``nDays``.

    Returns:
        A new DataFrame with both columns added. Unlike the previous
        implementation the input frame is not modified, matching the other
        ``add_*`` helpers.
    """
    n_days = nDays if n_days is None else n_days
    residual = df['dem_minus_rolling']

    # Shift only the column that is needed (not the whole frame) and
    # concatenate once; the median does not depend on column order.
    same_position = pd.concat(
        [residual.shift(periods=offset * 24)
         for offset in range(-n_days, n_days + 1)],
        axis=1)
    typical_dev = same_position.median(axis=1, skipna=True)

    # Two separate assigns keep the column order of the original code.
    out = df.assign(vals_dem_minus_rolling=typical_dev)
    # 1 + ratio turns the deviation into a multiplicative scale factor.
    return out.assign(hourly_median_dem_dev=1. + typical_dev / df['rollingDemLong'])

                
                
def add_deltas(df):
    """Add step-to-step demand differences.

    ``delta_pre`` is each row's demand minus the previous row's demand and
    ``delta_post`` is each row's demand minus the following row's demand.
    A new frame is returned; the input is not modified.
    """
    demand = df['demand (MW)']
    with_pre = df.assign(delta_pre=demand.diff())
    return with_pre.assign(delta_post=demand.diff(periods=-1))



def add_rolling_delta_iqr(df, h_minus=None, h_plus=None):
    """Add ``delta_rolling_IQR``: a rolling IQR of ``delta_pre``.

    For row ``i`` the interquartile range (75th minus 25th percentile,
    NaNs ignored) is computed over rows ``i - h_minus`` through
    ``i + h_plus`` (both included), clipped to the extent of the frame.
    A window holding only NaNs yields NaN (with a NumPy RuntimeWarning).

    Windows are positional. This gives the same rows as label slicing with
    ``.loc`` whenever ``df`` carries the default 0..n-1 index.

    Args:
        df: DataFrame with a ``'delta_pre'`` column.
        h_minus: Rows to include before the current row. Defaults to the
            module-level ``hMinusIQR``.
        h_plus: Rows to include after the current row. Defaults to the
            module-level ``hPlusIQR``.

    Returns:
        A new DataFrame with the extra column; ``df`` is not modified.
    """
    h_minus = hMinusIQR if h_minus is None else h_minus
    h_plus = hPlusIQR if h_plus is None else h_plus

    deltas = np.asarray(df['delta_pre'], dtype=float)
    n_rows = len(deltas)

    # Preallocate instead of np.append, which copies the whole array on
    # every iteration.
    rolling_iqr = np.full(n_rows, np.nan)
    for i in range(n_rows):
        window = deltas[max(0, i - h_minus):min(n_rows, i + h_plus + 1)]
        rolling_iqr[i] = np.nanpercentile(window, 75) - np.nanpercentile(window, 25)

    return df.assign(delta_rolling_IQR=rolling_iqr)


def add_categories(df):
    """Label every row 'MISSING' or 'OKAY' in a new ``category`` column.

    A row is 'MISSING' when its ``'demand (MW)'`` value is NaN. The column
    is written into ``df`` in place and the same frame is returned.
    """
    has_value = df['demand (MW)'].notna()
    df['category'] = np.where(has_value, 'OKAY', 'MISSING')
    return df


def filter_neg_and_zeros(df):
    """Blank out non-positive demand values and tag them 'NEG_OR_ZERO'.

    Rows whose ``'demand (MW)'`` is <= 0 get their category replaced and
    their demand set to NaN. ``df`` is modified in place and returned.
    """
    non_positive = df['demand (MW)'] <= 0.
    df['category'] = np.where(non_positive, 'NEG_OR_ZERO', df['category'])
    # NaN demand compares False above, so it passes through unchanged here.
    df['demand (MW)'] = df['demand (MW)'].where(~non_positive)
    return df

    
def filter_extrem_demand(df, multiplier):
    """Remove demand values at or above ``multiplier`` times the median.

    The median is taken over the whole ``'demand (MW)'`` column, ignoring
    NaNs. Removed values are copied to ``globalDemandFiltered`` (NaN
    elsewhere), their category becomes 'GLOBAL_DEM', and their demand is
    set to NaN. ``df`` is modified in place and returned.
    """
    demand = df['demand (MW)']
    ceiling = np.nanmedian(demand) * multiplier
    kept = demand.where(demand < ceiling)

    # True where the value was dropped; also True where demand was already
    # NaN (NaN != NaN), which is why notna() is required for the category.
    changed = demand != kept
    df['globalDemandFiltered'] = np.where(changed, demand, np.nan)
    df['category'] = df['category'].mask(changed & demand.notna(), other='GLOBAL_DEM')
    df['demand (MW)'] = kept
    return df
    

def filter_local_demand(df, multiplier_up, multiplier_down):
    """Remove demand values outside a band around the local estimate.

    The local estimate is ``rollingDem * hourly_median_dem_dev``. A value is
    kept only when it lies strictly between
    ``estimate - multiplier_down * dem_minus_rolling_IQR`` and
    ``estimate + multiplier_up * dem_minus_rolling_IQR``. Removed values are
    copied to ``localDemandFiltered`` (NaN elsewhere), tagged 'LOCAL_DEM',
    and set to NaN in ``'demand (MW)'``. ``df`` is modified in place and
    returned.
    """
    demand = df['demand (MW)']
    estimate = df['rollingDem'] * df['hourly_median_dem_dev']
    spread = df['dem_minus_rolling_IQR']

    below_upper = demand < estimate + multiplier_up * spread
    above_lower = demand > estimate - multiplier_down * spread
    kept = demand.where(below_upper & above_lower)

    # True where the value was dropped; also True where demand was already
    # NaN (NaN != NaN), which is why notna() is required for the category.
    changed = demand != kept
    df['localDemandFiltered'] = np.where(changed, demand, np.nan)
    df['category'] = df['category'].mask(changed & demand.notna(), other='LOCAL_DEM')
    df['demand (MW)'] = kept
    return df


# Remove isolated one-step spikes: a value is dropped when it differs from
# BOTH neighbours, in the same direction, by more than
# multiplier * delta_rolling_IQR.
def filter_deltas(df, multiplier):
    """Set two-sided demand spikes to NaN and tag them 'DELTA'.

    A row is removed when ``delta_pre`` and ``delta_post`` are both above
    ``multiplier * delta_rolling_IQR`` or both below the negative of it.
    Removed values are copied to ``deltaFiltered`` (NaN elsewhere).
    ``df`` is modified in place and returned.
    """
    demand = df['demand (MW)']
    pre, post = df['delta_pre'], df['delta_post']
    threshold = df['delta_rolling_IQR'] * multiplier

    spike_up = (pre > threshold) & (post > threshold)
    spike_down = (pre < -1. * threshold) & (post < -1. * threshold)
    kept = demand.mask(spike_up | spike_down)

    # True where the value was dropped; also True where demand was already
    # NaN (NaN != NaN), which is why notna() is required for the category.
    changed = demand != kept
    df['deltaFiltered'] = np.where(changed, demand, np.nan)
    df['category'] = df['category'].mask(changed & demand.notna(), other='DELTA')
    df['demand (MW)'] = kept
    return df


def filter_runs(df):
    """Blank out the third and later values of runs of identical demand.

    A row is removed when it equals both the previous row and the row two
    steps back. Removed values are copied to ``runFiltered`` (NaN
    elsewhere), tagged 'IDENTICAL_RUN', and set to NaN in
    ``'demand (MW)'``. ``df`` is modified in place and returned.
    """
    demand = df['demand (MW)']
    same_as_prev = demand.diff(periods=1) == 0
    same_as_prev2 = demand.diff(periods=2) == 0

    kept = demand.mask(same_as_prev & same_as_prev2)
    df['runFiltered'] = np.where(demand != kept, demand, np.nan)
    df['demand (MW)'] = kept
    # runFiltered is non-NaN exactly on the rows that were just removed.
    in_run = df['runFiltered'].notna()
    df['category'] = np.where(in_run, 'IDENTICAL_RUN', df['category'])
    return df
    
    

def mark_missing_and_empty(df, col):
    """Print the boolean NaN mask of column ``col``; returns nothing."""
    nan_mask = df[col].isna()
    print(nan_mask)

def show_structure(df):
    """Show which cells of ``df`` hold a value as a grayscale image.

    Columns run along the x axis ("variables") and rows along the y axis
    ("cases"); the image is drawn from the boolean not-NaN mask.
    """
    present = df.notna()
    plt.imshow(present, aspect='auto')
    plt.ylabel("cases")
    plt.xlabel("variables")
    plt.gray()
    plt.show()


def _set_log_scale(set_scale, legacy_kwarg):
    """Put an axis on a log scale, clipping non-positive values.

    ``set_scale`` is ``plt.xscale`` or ``plt.yscale``. Current Matplotlib
    spells the option ``nonpositive``; ``legacy_kwarg`` ('nonposx' or
    'nonposy') is the older spelling, tried only if the new one is rejected.
    """
    try:
        set_scale('log', nonpositive='clip')
    except (TypeError, ValueError):
        # Older Matplotlib releases that do not accept `nonpositive`.
        set_scale('log', **{legacy_kwarg: 'clip'})


def simple_hist(col, df, iq2, iq3, factor, save, x_log=False):
    """Save a 100-bin, log-y histogram of ``df[col]`` with IQR guide lines.

    Values are multiplied by the not-NaN mask of ``'demand (MW)'`` before
    binning, so rows whose demand is NaN contribute 0 (or NaN). Four
    vertical lines are drawn at ``+/-iqr`` and ``+/-iqr * factor`` where
    ``iqr = iq3 - iq2``.

    Args:
        col: Column to histogram; also selects the x-axis label.
        df: DataFrame holding ``col`` and ``'demand (MW)'``.
        iq2: Lower quartile used for the guide lines.
        iq3: Upper quartile used for the guide lines.
        factor: Multiplier for the outer pair of guide lines.
        save: Path handed to ``plt.savefig``.
        x_log: Also put the x axis on a log scale.

    Returns:
        None. If the column contains +inf or -inf, the offending extreme is
        printed and nothing is plotted.
    """
    x_labels = {
        # 'Demand (MW)' is kept for backward compatibility; the column this
        # file actually produces is 'demand (MW)'.
        'Demand (MW)': 'Demand (MW)',
        'demand (MW)': 'Demand (MW)',
        'dem_diff_norm_rolling': r'$\Delta$(Demand, Rolling Avg)/Rolling IQR',
        'dem_minus_rolling': r'$\Delta$(Demand, Rolling Avg) (MW)',
        'delta_pre': r'$\Delta$(Demand ti, Demand ti-1) (MW)',
        'delta_pre_norm': 'Normalized Demand Difference (diff/Rolling IQR)',
        'diff_norm_diffIQR_D': r'Normalized Demand Difference ($\Delta$(t-1, t+1)/Rolling IQR)',
    }

    plt.close()

    # Bail out before creating a figure so no empty figure is left open.
    # np.Inf / np.NINF no longer exist in NumPy 2.0; np.inf works everywhere.
    col_max = df[col].max()
    if col_max == np.inf:
        print(save, col_max)
        return
    col_min = df[col].min()
    if col_min == -np.inf:
        print(save, col_min)
        return

    fig, ax = plt.subplots()
    ax.hist(df[col] * (~df['demand (MW)'].isna()), 100,
            facecolor='red', alpha=0.2, label='pre')
    if col in x_labels:
        ax.set_xlabel(x_labels[col])
    ax.set_ylabel('Counts')

    # Guide lines at +/-IQR and +/-factor*IQR. The y-limits are re-read for
    # every line, as before, because adding a line may rescale the axis.
    iqr = iq3 - iq2
    for x_pos in (-iqr, -iqr*factor, iqr, iqr*factor):
        ax.add_line(mlines.Line2D([x_pos, x_pos], ax.get_ylim()))

    if x_log:
        _set_log_scale(plt.xscale, 'nonposx')
    plt.tight_layout()
    _set_log_scale(plt.yscale, 'nonposy')
    plt.savefig(save)
    
    



# Create many demand plots so we can actually see the values
def scrolling_demand(width, region, df, title, save, dem_up, dem_down, delta_up, delta_down):
    """Plot ``df`` in consecutive windows of ``width`` index labels.

    Every window gets a demand plot and a delta plot, saved under ``save``
    with '.png' replaced by '_<n>cnt' / '_<n>cnt_diff'. Windows in which any
    of the four *Filtered columns holds a value are plotted a second time
    with ``region`` replaced by 'z2_' + region in the file name.

    Windows are label slices (``df.loc[start:end]``), which include the end
    label, so neighbouring windows share one row.
    """
    filtered_cols = ('globalDemandFiltered', 'localDemandFiltered',
                     'runFiltered', 'deltaFiltered')
    n_rows = len(df.index)
    lo, hi = 0, width
    chunk = 0
    while True:
        out_name = save.replace('.png', '_{}cnt'.format(chunk))
        chunk_title = title+': cnt {}'.format(chunk)
        window = df.loc[lo:hi]

        print("scrolling: start {} - end {}".format(lo, hi))
        comparison_demand_plot(window, chunk_title, out_name, dem_up, dem_down)
        comparison_diff_plot(window, chunk_title, out_name.replace('cnt', 'cnt_diff'), delta_up, delta_down)

        # A second copy is written for windows where some filter removed data.
        if any(df[name].loc[lo:hi].notna().any() for name in filtered_cols):
            out_name = out_name.replace(region, 'z2_'+region)
            print(lo, hi, out_name)
            comparison_demand_plot(window, chunk_title, out_name, dem_up, dem_down)
            comparison_diff_plot(window, chunk_title, out_name.replace('cnt', 'cnt_diff'), delta_up, delta_down)

        if hi == n_rows:
            break
        chunk += 1
        lo += width
        # The last window is cut off at the number of rows.
        hi = min(hi + width, n_rows)

            

            
def comparison_demand_plot(df, title, save, multiplier_up, multiplier_down):
    """Save a plot of demand, its rolling median, the filter band and removed points.

    The band is ``rollingDem * hourly_median_dem_dev`` plus
    ``multiplier_up`` / minus ``multiplier_down`` times
    ``dem_minus_rolling_IQR``. Values held in the four *Filtered columns
    are overlaid as coloured markers. The figure is written to ``save``.
    """
    plt.close()
    fig, ax = plt.subplots(figsize=(15,5))
    ax.set_xlabel('Hour')
    ax.set_ylabel('Demand')
    plt.title(title)

    times = df['date_time']
    estimate = df['rollingDem']*df['hourly_median_dem_dev']
    spread = df['dem_minus_rolling_IQR']

    ax.plot(times, df['demand (MW)'], 'k-', label='demand')
    ax.plot(times, df['rollingDem'], 'b-', label='rolling dem+/-')
    ax.plot(times, estimate+multiplier_up*spread, 'b-.')
    ax.plot(times, estimate-multiplier_down*spread, 'b-.')

    # One marker series per filter; the legend label is the column name.
    removed_styles = (('localDemandFiltered', 'mo'),
                      ('globalDemandFiltered', 'co'),
                      ('deltaFiltered', 'go'),
                      ('runFiltered', 'yo'))
    for column, style in removed_styles:
        ax.plot(times, df[column], style, label=column)

    plt.legend()
    # Major ticks on Mondays, minor ticks every day.
    ax.xaxis.set_major_locator(mdates.WeekdayLocator(byweekday=mdates.MO))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax.xaxis.set_minor_locator(mdates.DayLocator())
    plt.tight_layout()
    plt.grid()
    plt.savefig(save)
    
def comparison_diff_plot(df, title, save, multiplier_up, multiplier_down):
    """Save a plot of ``delta_pre`` with the delta-filter thresholds.

    The thresholds are drawn at ``multiplier_up * delta_rolling_IQR`` and
    ``-multiplier_down * delta_rolling_IQR``.

    Args:
        df: DataFrame with ``date_time``, ``delta_pre`` and
            ``delta_rolling_IQR`` columns.
        title: Figure title.
        save: Path handed to ``plt.savefig``.
        multiplier_up: IQR multiple for the upper threshold line.
        multiplier_down: IQR multiple for the lower threshold line.
    """
    plt.close()
    fig, ax = plt.subplots(figsize=(15,5))
    ax.set_xlabel('Hour')
    # Raw string: '\D' is not a valid escape sequence in a normal literal.
    ax.set_ylabel(r'$\Delta$(Demand ti, ti-1) (MW)')
    plt.title(title)
    ax.plot(df['date_time'], df['delta_pre'], 'k-', label='delta_pre')
    ax.plot(df['date_time'], multiplier_up*df['delta_rolling_IQR'], 'b-.')
    ax.plot(df['date_time'], -multiplier_down*df['delta_rolling_IQR'], 'b-.')
    plt.legend()
    # Major ticks on Mondays, minor ticks every day.
    ax.xaxis.set_major_locator(mdates.WeekdayLocator(byweekday=mdates.MO))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax.xaxis.set_minor_locator(mdates.DayLocator())
    plt.tight_layout()
    plt.grid()
    plt.savefig(save)

    
    
def get_iqrs(vals):
    """Return ``(iqr, iq2, iq3)`` for ``vals``, ignoring NaNs.

    ``iq2`` is the 25th percentile, ``iq3`` the 75th percentile and
    ``iqr`` their difference.
    """
    lower, upper = (np.nanpercentile(vals, q) for q in (25, 75))
    return upper - lower, lower, upper


def return_all_regions():
    """Return a fresh list of all 56 region codes, in their original order.

    A new list is built on every call, so callers may sort or edit it.
    The codes are used to build the per-region CSV and pickle file names.
    """
    codes = """
        AEC AECI CPLE CPLW DUK FMPP FPC FPL GVL HST ISNE JEA LGEE MISO
        NSB NYIS OVEC PJM SC SCEG SEC SOCO SPA SWPP TAL TEC TVA ERCO
        AVA AZPS BANC BPAT CHPD CISO DOPD EPE GCPD IID IPCO LDWP NEVP
        NWMT PACE PACW PGE PNM PSCO PSEI SCL SRP TEPC TIDC TPWR WACM
        WALC WAUW
    """
    return codes.split()

def plot_var_by_time(df, region, var, include_dem=True):
    """Save a time-series plot of column ``var`` to 'plt/<region>_<var>.png'.

    When ``include_dem`` is true the ``'demand (MW)'`` column is drawn in
    black underneath the requested variable (drawn in red).
    """
    print("Plotting reg {} var {}".format(region, var))
    plt.close()
    fig, ax = plt.subplots(figsize=(15,5))

    times = df['date_time']
    if include_dem:
        ax.plot(times, df['demand (MW)'], 'k-', label='Demand')
    ax.plot(times, df[var], 'r-', label=var)

    plt.tight_layout()
    plt.legend()
    # Major ticks on Mondays, minor ticks at month starts.
    x_axis = ax.xaxis
    x_axis.set_major_locator(mdates.WeekdayLocator(byweekday=mdates.MO))
    x_axis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    x_axis.set_minor_locator(mdates.MonthLocator())

    plt.title("{} {}".format(region, var))
    plt.savefig('plt/{}_{}.png'.format(region, var))

In [None]:
global_dem_cut = 10
local_dem_cut_up = 3
local_dem_cut_down = 2
delta_multiplier = 2

dem_map = {}
regions = ['TIDC', 'CISO', 'LDWP', 'BANC']
#regions = ['LDWP',]# 'CISO', 'LDWP']
regions = ['SWPP',]# 'FPC','TIDC', 'LDWP', 'BANC']
#regions = ['CISO',]

dump_to_pickle = True
load_from_pickle = False
do_timing = False
version = '_v5'


regions = ['SRP', 'BANC', 'SPA', 'SEC']

regions = return_all_regions()
#regions = ['CISO','ERCO','MISO','PJM','SOCO','SWPP','SEC','SRP',]
#regions = ['AEC','AECI','CPLE','DUK']



regions.sort()
for region in regions:
    print(region)

    dem_map = {}
    if dump_to_pickle:
        file_path = '../get_eia_demand_data/data/{}.csv'.format(region)
        dem_map[region] = pd.read_csv(file_path,
                           dtype={'demand (MW)':np.float64},
                          parse_dates=True, na_values=['MISSING', 'EMPTY'])
        
        # Convert date/time
        dem_map[region]['time'] = pd.to_datetime(dem_map[region]['time'])
        dem_map[region]['date_time'] = dem_map[region]['time']
        
        # Drop unused columns
        dem_map[region] = dem_map[region].drop(['series_id','time','year','month','day',
                                                'hour','forecast demand (MW)'], axis=1)

        # Add categories to track filtering
        dem_map[region] = add_categories(dem_map[region])
        
        # Missing and empty values are marked
        dem_map[region] = dem_map[region].assign(missing=dem_map[region]['demand (MW)'].isna())

        # Set all negative and zero values to NAN
        dem_map[region] = filter_neg_and_zeros(dem_map[region])
        
        # Set last demand values in runs of 3+ to NAN
        dem_map[region] = filter_runs(dem_map[region])

        # Global demand filter on 10x the median value
        dem_map[region] = filter_extrem_demand(dem_map[region], global_dem_cut)
        
        if not do_timing:
            # Add rolling dem average
            dem_map[region] = add_rolling_dem(dem_map[region])
            #plot_var_by_time(dem_map[region], region, 'rollingDem')
            dem_map[region] = add_rolling_dem_long(dem_map[region])
            #plot_var_by_time(dem_map[region], region, 'rollingDemLong')
            dem_map[region] = add_demand_minus_rolling_dem(dem_map[region])
            #plot_var_by_time(dem_map[region], region, 'dem_minus_rolling')
            dem_map[region] = add_hourly_median_dem_deviations(dem_map[region])
            #plot_var_by_time(dem_map[region], region, 'hourly_median_dem_dev', False)
            #plot_var_by_time(dem_map[region], region, 'vals_dem_minus_rolling')
            #plot_var_by_time(dem_map[region], region, 'vals_dem_minus_rolling2')
            dem_map[region] = add_demand_minus_rolling_dem_iqr(dem_map[region])
            #plot_var_by_time(dem_map[region], region, 'dem_minus_rolling_IQR')
        

            # Add deltas
            dem_map[region] = add_deltas(dem_map[region])
            dem_map[region] = add_rolling_delta_iqr(dem_map[region])
   
    
    
        if do_timing:
            # Add rolling dem average
            print("add_rolling_dem")
            %time dem_map[region] = add_rolling_dem(dem_map[region])
            plot_var_by_time(dem_map[region], region, 'rollingDem')
            print("add_rolling_dem_long")
            %time dem_map[region] = add_rolling_dem_long(dem_map[region])
            plot_var_by_time(dem_map[region], region, 'rollingDemLong')
            print("add_demand_minus_rolling_dem")
            %time dem_map[region] = add_demand_minus_rolling_dem(dem_map[region])
            plot_var_by_time(dem_map[region], region, 'dem_minus_rolling')
            print("add_hourly_median_dem_deviations")
            %time dem_map[region] = add_hourly_median_dem_deviations(dem_map[region])
            plot_var_by_time(dem_map[region], region, 'hourly_median_dem_dev', False)
            plot_var_by_time(dem_map[region], region, 'vals_dem_minus_rolling')
            plot_var_by_time(dem_map[region], region, 'vals_dem_minus_rolling2')
            print("add_demand_minus_rolling_dem_iqr")
            %time dem_map[region] = add_demand_minus_rolling_dem_iqr(dem_map[region])
            plot_var_by_time(dem_map[region], region, 'dem_minus_rolling_IQR')
        
    
        
            # Add deltas
            print("add_deltas")
            %time dem_map[region] = add_deltas(dem_map[region])
            print("add_rolling_delta_iqr")
            %time dem_map[region] = add_rolling_delta_iqr(dem_map[region])

    
        dem_map[region] = filter_local_demand(dem_map[region], local_dem_cut_up, local_dem_cut_down)
        dem_map[region] = filter_deltas(dem_map[region], delta_multiplier)
        
        print('Saving pickle /Users/truggles/tmp_data/pickle_{}{}.pkl'.format(region, version))
        pickle_file = open('/Users/truggles/tmp_data/pickle_{}{}.pkl'.format(region, version), 'wb') 
        pickle.dump(dem_map[region], pickle_file)
        pickle_file.close()
        #dem_map[region].to_csv('/Users/truggles/tmp_data/csv_{}.csv'.format(region))
        continue

    
    if load_from_pickle:
        print('Loading from pickle /Users/truggles/tmp_data/pickle_{}{}.pkl'.format(region, version))
        pickle_in = open('/Users/truggles/tmp_data/pickle_{}{}.pkl'.format(region, version),'rb')
        dem_map[region] = pickle.load(pickle_in)
        dem_map[region]['date_time'] = pd.to_datetime(dem_map[region]['date_time'])
        
    
    
    
    
    plt.close()
    fig, ax = plt.subplots(figsize=(15,5))
    ax.plot(dem_map[region]['demand (MW)'], 'k-', label='demand')
    ax.plot(dem_map[region]['globalDemandFiltered'], 'g-', label='globalDemandFiltered')
    ax.plot(dem_map[region]['localDemandFiltered'], 'r-', label='localDemandFiltered')
    ax.plot(dem_map[region]['deltaFiltered'], 'b-', label='demandFiltered')
    ax.plot(dem_map[region]['runFiltered'], 'y-', label='runFiltered')
    plt.tight_layout()
    plt.legend()
    plt.title("{} Cleaned Demand".format(region))
    plt.savefig('plt/{}_demand_show_filters.png'.format(region))
    
    width = 500
    title = '{} Demand Showing Filters'.format(region)
    save = '/Users/truggles/tmp_plots/{}_demand_show_filters.png'.format(region)

    scrolling_demand(width, region, dem_map[region], title, save, local_dem_cut_up, local_dem_cut_down,
                    delta_multiplier, delta_multiplier)



In [None]:
# Notebook cell: merge the per-region pickles into one wide CSV holding, for
# every region, a '<region>' demand column and a '<region>_category' column.
prep_final_output = True
version = '_v5'
print("prep_final_output {}".format(prep_final_output))

regions = return_all_regions()
#regions = ['CISO','ERCO','MISO','PJM','SOCO','SWPP']

regions.sort()
print(regions)
for i, region in enumerate(regions):
    if not prep_final_output:
        break
    pickle_path = '/Users/truggles/tmp_data/pickle_{}{}.pkl'.format(region, version)
    print('Loading from pickle {}'.format(pickle_path))
    # `with` closes each pickle file as soon as it is read. Only load
    # pickles written by this notebook; unpickling untrusted files can
    # execute arbitrary code.
    with open(pickle_path, 'rb') as pickle_in:
        df = pickle.load(pickle_in)

    if i == 0: # The first region's frame becomes the master table
        master = df
        master['date_time'] = pd.to_datetime(master['date_time'])
        master[region] = master['demand (MW)']
        master[region+'_category'] = master['category']
        # NOTE(review): 'localDemandFiltered' and 'deltaFiltered' are not in
        # this list, so the first region's copies stay in the output --
        # confirm whether that is intended.
        master = master.drop(['demand (MW)', 'category', 'missing', 'runFiltered',
                              'globalDemandFiltered', 'rollingDem',
                              'rollingDemLong', 'dem_minus_rolling',
                              'vals_dem_minus_rolling',
                              'hourly_median_dem_dev', 'dem_minus_rolling_IQR',
                              'delta_pre', 'delta_post', 'delta_rolling_IQR'], axis=1)
        continue

    # Later regions are attached by index alignment with the master table.
    master[region] = df['demand (MW)']
    master[region+'_category'] = df['category']

if prep_final_output:
    print(master.head(5))
    master.to_csv('/Users/truggles/tmp_data/csv_MASTER{}.csv'.format(version), index=False, na_rep='NA')
        