In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.colors as colors

In [None]:
# Extent of the rolling window used by the add_rolling_dem* helpers: row i is
# summarised over rows i - hPlus .. i + hMinus inclusive, i.e.
# hPlus + hMinus + 1 rows in total.
# A narrower window (hPlus = 24, hMinus = 23) used to be assigned here and was
# immediately overwritten; it is kept only as this note.
hPlus = 48
hMinus = 47

def add_rolling_dem(df, h_plus=None, h_minus=None):
    """Return a copy of ``df`` with a NaN-aware rolling mean in ``rollingDem``.

    For row position ``i`` the mean of ``'demand (MW)'`` is taken over
    positions ``i - h_plus`` .. ``i + h_minus`` inclusive; the window is
    simply truncated at either end of the frame.

    Args:
        df: frame containing a ``'demand (MW)'`` column.
        h_plus: rows to include before position ``i``; defaults to the
            module-level ``hPlus``.  Expected to be non-negative.
        h_minus: rows to include after position ``i``; defaults to the
            module-level ``hMinus``.  Expected to be non-negative.

    Returns:
        A new frame (``df`` is not modified) with the extra column.  A window
        that holds only NaNs yields NaN (numpy emits a RuntimeWarning).
    """
    back = hPlus if h_plus is None else h_plus
    ahead = hMinus if h_minus is None else h_minus

    # Work on a plain array so the window is positional: the previous
    # label-based .loc slice was only correct for a default 0..n-1 index.
    demand = df['demand (MW)'].to_numpy(dtype=float)

    # np.roll is no use here because it has no notion of NaN; np.nanmean
    # leaves NaNs out of both the sum and the count.  Building the result in
    # one go also avoids re-allocating the array on every row.
    rolling = np.array(
        [np.nanmean(demand[max(0, i - back):i + ahead + 1])
         for i in range(len(demand))],
        dtype=float,
    )
    return df.assign(rollingDem=rolling)

def add_rolling_dem_iqr(df, h_plus=None, h_minus=None):
    """Return a copy of ``df`` with a rolling inter-quartile range column.

    For row position ``i`` the 75th minus the 25th percentile of
    ``'demand (MW)'`` is computed over positions ``i - h_plus`` ..
    ``i + h_minus`` inclusive (truncated at the ends of the frame), ignoring
    NaNs, and stored in ``rollingDemIQR``.

    Args:
        df: frame containing a ``'demand (MW)'`` column.
        h_plus: rows to include before position ``i``; defaults to the
            module-level ``hPlus``.  Expected to be non-negative.
        h_minus: rows to include after position ``i``; defaults to the
            module-level ``hMinus``.  Expected to be non-negative.

    Returns:
        A new frame (``df`` is not modified) with the extra column.  A window
        that holds only NaNs yields NaN (numpy emits a RuntimeWarning).
    """
    back = hPlus if h_plus is None else h_plus
    ahead = hMinus if h_minus is None else h_minus

    # Positional window on a plain array: the previous label-based .loc slice
    # was only correct for a default 0..n-1 index.
    demand = df['demand (MW)'].to_numpy(dtype=float)

    # Pre-sized output instead of np.append, which copies the array each row.
    rolling = np.empty(len(demand), dtype=float)
    for i in range(len(demand)):
        window = demand[max(0, i - back):i + ahead + 1]
        q25, q75 = np.nanpercentile(window, [25, 75])
        rolling[i] = q75 - q25

    return df.assign(rollingDemIQR=rolling)

def add_rolling_dem_median(df, h_plus=None, h_minus=None):
    """Return a copy of ``df`` with a NaN-aware rolling median column.

    For row position ``i`` the median of ``'demand (MW)'`` is taken over
    positions ``i - h_plus`` .. ``i + h_minus`` inclusive (truncated at the
    ends of the frame) and stored in ``rollingDemMed``.

    Args:
        df: frame containing a ``'demand (MW)'`` column.
        h_plus: rows to include before position ``i``; defaults to the
            module-level ``hPlus``.  Expected to be non-negative.
        h_minus: rows to include after position ``i``; defaults to the
            module-level ``hMinus``.  Expected to be non-negative.

    Returns:
        A new frame (``df`` is not modified) with the extra column.  A window
        that holds only NaNs yields NaN (numpy emits a RuntimeWarning).
    """
    back = hPlus if h_plus is None else h_plus
    ahead = hMinus if h_minus is None else h_minus

    # Positional window on a plain array: the previous label-based .loc slice
    # was only correct for a default 0..n-1 index.
    demand = df['demand (MW)'].to_numpy(dtype=float)

    # Built in one pass rather than with np.append, which copies every row.
    rolling = np.array(
        [np.nanmedian(demand[max(0, i - back):i + ahead + 1])
         for i in range(len(demand))],
        dtype=float,
    )
    return df.assign(rollingDemMed=rolling)

def add_demand_delta_rolling(df):
    """Return a copy of ``df`` with ``dem_diff_rolling`` added.

    The new column is ``'demand (MW)'`` minus ``'rollingDem'``, so
    add_rolling_dem must already have been applied to ``df``.
    """
    residual = df['demand (MW)'].sub(df['rollingDem'])
    return df.assign(dem_diff_rolling=residual)

def add_norm_demand_delta_rolling(df):
    """Return a copy of ``df`` with ``dem_diff_norm_rolling`` added.

    The new column is ``'dem_diff_rolling'`` divided by ``'rollingDemIQR'``,
    so both of those columns must already be present on ``df``.
    """
    normalised = df['dem_diff_rolling'].div(df['rollingDemIQR'])
    return df.assign(dem_diff_norm_rolling=normalised)

# delta with previous and following time steps
def add_deltas(df):
    """Return a copy of ``df`` with step changes in ``'demand (MW)'``.

    ``diff_pre`` is each value minus the value in the previous row and
    ``diff_post`` is each value minus the value in the following row; the
    first ``diff_pre`` and the last ``diff_post`` are therefore NaN.
    """
    demand = df['demand (MW)']
    return df.assign(
        diff_pre=demand.diff(periods=1),
        diff_post=demand.diff(periods=-1),
    )

def add_deltas_norm(df):
    """Return a copy of ``df`` with IQR-normalised step changes.

    ``diff_pre_norm`` and ``diff_post_norm`` are ``diff_pre`` and
    ``diff_post`` each divided by ``rollingDemIQR``; all three source columns
    must already exist on ``df``.
    """
    scale = df['rollingDemIQR']
    normalised = {
        'diff_pre_norm': df['diff_pre'] / scale,
        'diff_post_norm': df['diff_post'] / scale,
    }
    return df.assign(**normalised)



def filtering(df, del_iq2, del_iq3, del_iqr, del_m, dem_iq2, dem_iq3, dem_iqr, dem_m):
    """Blank out demand values flagged by either outlier criterion.

    A row is flagged when

    * ``dem_diff_norm_rolling`` lies outside
      ``[dem_iq2 - dem_iqr * dem_m, dem_iq3 + dem_iqr * dem_m]`` (extreme
      demand), or
    * both ``diff_pre_norm`` and ``diff_post_norm`` lie outside
      ``[del_iq2 - del_iqr * del_m, del_iq3 + del_iqr * del_m]`` (a jump on
      both sides of the point).

    NaN scores never flag a row because comparisons with NaN are False.

    The previous implementation appended two entries per row to its result
    array, so the final ``assign`` failed with a length mismatch on any
    non-empty frame; exactly one entry per row is produced now.

    Args:
        df: frame with ``'demand (MW)'``, ``'dem_diff_norm_rolling'``,
            ``'diff_pre_norm'`` and ``'diff_post_norm'`` columns.  Its
            ``'demand (MW)'`` column is overwritten in place, as the other
            filter_* helpers do.
        del_iq2, del_iq3, del_iqr, del_m: lower bound, upper bound, spread and
            multiplier for the step-change criterion.
        dem_iq2, dem_iq3, dem_iqr, dem_m: the same for the demand criterion.

    Returns:
        The frame with flagged demand set to NaN plus a ``deltaFiltered``
        column holding the removed values (NaN where nothing was removed).
    """
    dem_low = dem_iq2 - dem_iqr * dem_m
    dem_high = dem_iq3 + dem_iqr * dem_m
    del_low = del_iq2 - del_iqr * del_m
    del_high = del_iq3 + del_iqr * del_m

    def _outside(values, low, high):
        # True where a value falls beyond either fence.
        return (values > high) | (values < low)

    extreme_demand = _outside(df['dem_diff_norm_rolling'], dem_low, dem_high)
    double_delta = (_outside(df['diff_pre_norm'], del_low, del_high)
                    & _outside(df['diff_post_norm'], del_low, del_high))
    flagged = (extreme_demand | double_delta).to_numpy()

    demand = df['demand (MW)'].to_numpy(dtype=float)
    removed = np.where(flagged, demand, np.nan)
    df['demand (MW)'] = np.where(flagged, np.nan, demand)
    return df.assign(deltaFiltered=removed)


    
def filter_extrem_demand(df, multiplier):
    """Blank out demand values far from the overall median.

    A value is kept only if it lies strictly within ``multiplier`` times the
    inter-quartile range of ``'demand (MW)'`` on either side of the column's
    median (both computed ignoring NaNs).  ``df`` is modified in place:
    ``'demand (MW)'`` keeps the surviving values (NaN elsewhere) and
    ``'demandExtFiltered'`` receives the values that were removed.

    Returns:
        The same ``df`` object.
    """
    demand = df['demand (MW)']
    spread = get_iqrs(demand)[0]
    centre = np.nanmedian(demand)
    upper = centre + spread * multiplier
    lower = centre - spread * multiplier

    kept = demand.where((demand < upper) & (demand > lower))
    # Rows where the kept series differs from the raw one are the removed
    # rows; NaN != NaN also holds, which just copies an existing NaN across.
    df['demandExtFiltered'] = np.where(demand != kept, demand, np.nan)
    df['demand (MW)'] = kept
    return df
    
    
    
def filter_local_demand(df, iq2, iq3, multiplier):
    """Blank out demand values that sit far from the local rolling mean.

    A row is flagged when its ``dem_diff_norm_rolling`` is greater than
    ``(iq3 - iq2) * multiplier`` or less than the negative of that limit.
    NaN scores never flag a row.

    Args:
        df: frame with ``'demand (MW)'`` and ``'dem_diff_norm_rolling'``
            columns.  Its ``'demand (MW)'`` column is overwritten in place.
        iq2: lower quartile of the score; only ``iq3 - iq2`` is used.
        iq3: upper quartile of the score; only ``iq3 - iq2`` is used.
        multiplier: number of inter-quartile ranges tolerated either side of 0.

    Returns:
        The frame with flagged demand set to NaN plus a ``demandFiltered``
        column holding the removed values (NaN where nothing was removed).
    """
    dem_iqr = iq3 - iq2
    score = df['dem_diff_norm_rolling']
    flagged = ((score > dem_iqr * multiplier)
               | (score < -1. * dem_iqr * multiplier)).to_numpy()

    # Whole-column masking replaces the row-by-row iterrows/np.append loop,
    # which was quadratic and edited the frame while iterating over it.
    demand = df['demand (MW)'].to_numpy(dtype=float)
    removed = np.where(flagged, demand, np.nan)
    df['demand (MW)'] = np.where(flagged, np.nan, demand)
    return df.assign(demandFiltered=removed)


# Filter on a multiple of the IQR and set the associated 'demand (MW)'
# value to NAN.  Only "double deltas" are filtered: points whose jump from
# the previous row AND jump to the following row are both out of range.
def filter_deltas(df, iq2, iq3, iqr, m_double):
    """Blank out demand values that jump away from both neighbours.

    A row is flagged when ``diff_pre_norm`` and ``diff_post_norm`` are each
    greater than ``iqr * m_double`` or less than ``-iqr * m_double``.  NaN
    scores never flag a row.

    Args:
        df: frame with ``'demand (MW)'``, ``'diff_pre_norm'`` and
            ``'diff_post_norm'`` columns.  Its ``'demand (MW)'`` column is
            overwritten in place.
        iq2: unused; kept so existing callers keep working.
        iq3: unused; kept so existing callers keep working.
        iqr: inter-quartile range of the normalised step change.
        m_double: number of IQRs tolerated either side of 0.

    Returns:
        The frame with flagged demand set to NaN plus a ``deltaFiltered``
        column holding the removed values (NaN where nothing was removed).
    """
    def _out_of_range(values):
        # True where a normalised step exceeds the symmetric limit.
        return (values > iqr * m_double) | (values < -1. * iqr * m_double)

    flagged = (_out_of_range(df['diff_pre_norm'])
               & _out_of_range(df['diff_post_norm'])).to_numpy()

    # Whole-column masking replaces the row-by-row iterrows/np.append loop,
    # which was quadratic and edited the frame while iterating over it.
    demand = df['demand (MW)'].to_numpy(dtype=float)
    removed = np.where(flagged, demand, np.nan)
    df['demand (MW)'] = np.where(flagged, np.nan, demand)
    return df.assign(deltaFiltered=removed)


def mark_missing_and_empty(df, col):
    """Print the boolean mask of missing (NaN) entries in ``df[col]``.

    Nothing is marked or returned despite the name; this is a debugging aid.
    """
    missing_mask = df[col].isna()
    print(missing_mask)

def show_structure(df):
    """Display a grayscale image of which cells of ``df`` hold data.

    Each cell of the image is True where the corresponding entry of ``df``
    is not NaN; columns run along x ("variables"), rows along y ("cases").
    """
    present = ~df.isna()
    plt.imshow(present, aspect='auto')
    plt.xlabel("variables")
    plt.ylabel("cases")
    plt.gray()
    plt.show()


def simple_hist(col, df, iq2, iq3, factor, save, x_log=False):
    """Save a 100-bin, log-count histogram of ``df[col]`` with IQR guides.

    Vertical guide lines are drawn symmetrically about zero at ``+/-iqr`` and
    ``+/-iqr * factor`` where ``iqr = iq3 - iq2`` (not at the quartiles
    themselves).

    Args:
        col: name of the column to histogram.
        df: frame holding ``col`` and ``'demand (MW)'``.
        iq2: lower quartile; only ``iq3 - iq2`` is used.
        iq3: upper quartile; only ``iq3 - iq2`` is used.
        factor: multiple of the IQR at which the outer guide lines are drawn.
        save: path handed to ``plt.savefig``.
        x_log: when true, use a logarithmic x axis.

    Returns:
        None.  If the column's maximum is +inf or its minimum is -inf the
        offending value is printed next to ``save`` and nothing is plotted.
    """
    plt.close()

    # Infinite extremes are reported instead of plotted.  np.inf / -np.inf
    # are used because the np.Inf / np.NINF aliases were removed in NumPy 2.0.
    # The check runs before a figure is created so an early return does not
    # leave an empty figure open.
    col_max = df[col].max()
    if col_max == np.inf:
        print(save, col_max)
        return
    col_min = df[col].min()
    if col_min == -np.inf:
        print(save, col_min)
        return

    fig, ax = plt.subplots()

    # NOTE(review): multiplying by the boolean mask turns entries whose demand
    # is NaN into 0 rather than dropping them - confirm that is intended.
    still_valid = ~df['demand (MW)'].isna()
    ax.hist(df[col] * still_valid, 100, facecolor='red', alpha=0.2, label='pre')

    # The demand column is spelled 'demand (MW)' by every caller in this file;
    # the capitalised key is kept so any caller using it still gets a label.
    # Raw strings keep the LaTeX backslashes from being read as escapes.
    x_labels = {
        'Demand (MW)': 'Demand (MW)',
        'demand (MW)': 'Demand (MW)',
        'dem_diff_norm_rolling': r'$\Delta$(Demand, Rolling Avg)/Rolling IQR',
        'dem_diff_rolling': r'$\Delta$(Demand, Rolling Avg) (MW)',
        'diff_pre': r'$\Delta$(Demand ti, Demand ti-1) (MW)',
        'diff_pre_norm': 'Normalized Demand Difference (diff/Rolling IQR)',
    }
    if col in x_labels:
        ax.set_xlabel(x_labels[col])
    ax.set_ylabel('Counts')

    # Guide lines; the y-limits are re-read for each line, as before.
    iqr = iq3 - iq2
    for x_pos in (-iqr, -iqr * factor, iqr, iqr * factor):
        ax.add_line(mlines.Line2D([x_pos, x_pos], ax.get_ylim()))

    # 'nonpositive' replaces the per-axis 'nonposx'/'nonposy' keywords, which
    # Matplotlib deprecated in 3.3 and later removed.
    if x_log:
        plt.xscale('log', nonpositive='clip')
    plt.tight_layout()
    plt.yscale('log', nonpositive='clip')
    plt.savefig(save)
    
    
def plot_2D_diff_and_demand_norm(df, x_factor, y_factor, save):
    """Save a 2-D log-colour histogram of the two normalised outlier scores.

    x is ``diff_pre_norm`` and y is ``dem_diff_norm_rolling``; only rows whose
    ``'demand (MW)'`` is not NaN and where both scores are present are binned
    (100 x 100 bins).  Rows that have exactly one of the two scores are
    collected, sorted and printed for inspection.  Guide lines are drawn
    symmetrically about zero at ``+/-iqr`` and ``+/-iqr * factor`` of each
    score, using the IQR of the full column.

    Args:
        df: frame with ``'demand (MW)'``, ``'diff_pre_norm'`` and
            ``'dem_diff_norm_rolling'`` columns.
        x_factor: IQR multiple for the outer vertical guide lines.
        y_factor: IQR multiple for the outer horizontal guide lines.
        save: path handed to ``plt.savefig``.
    """
    plt.close()
    x = []
    y = []
    only_diff_pre_norm = []
    only_dem_diff_norm_rolling = []
    for _, row in df.iterrows():
        if np.isnan(row['demand (MW)']):
            continue
        diff_pre_norm = row['diff_pre_norm']
        dem_diff_norm_rolling = row['dem_diff_norm_rolling']
        pre_missing = np.isnan(diff_pre_norm)
        dem_missing = np.isnan(dem_diff_norm_rolling)
        if pre_missing and dem_missing:
            continue
        if pre_missing:
            only_dem_diff_norm_rolling.append(dem_diff_norm_rolling)
        elif dem_missing:
            print("dem_diff_norm_rolling = np.nan")
            only_diff_pre_norm.append(diff_pre_norm)
        else:
            x.append(diff_pre_norm)
            y.append(dem_diff_norm_rolling)

    only_diff_pre_norm.sort()
    only_dem_diff_norm_rolling.sort()
    print("only_diff_pre_norm", len(only_diff_pre_norm))
    print(only_diff_pre_norm)
    print("only_dem_diff_norm_rolling", len(only_dem_diff_norm_rolling))
    print(only_dem_diff_norm_rolling)

    heatmap, xedges, yedges = np.histogram2d(x, y, bins=100)
    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]

    fig, ax = plt.subplots()
    im = ax.imshow(heatmap.T, extent=extent, origin='lower',
                   norm=colors.LogNorm(vmin=1e-5, vmax=heatmap.max()),
                   cmap='RdBu_r')
    ax.figure.colorbar(im)
    plt.xlabel('Normalized Demand Difference (diff/Rolling IQR)')
    # Raw string so the LaTeX backslash is not read as an escape sequence.
    plt.ylabel(r'$\Delta$(Demand, Rolling Avg)/Rolling IQR')

    x_iqr = get_iqrs(df['diff_pre_norm'])[0]
    y_iqr = get_iqrs(df['dem_diff_norm_rolling'])[0]

    # Vertical guides first, then horizontal; the axis limits are re-read for
    # every line, as before.
    for x_pos in (-x_iqr, -x_iqr * x_factor, x_iqr, x_iqr * x_factor):
        ax.add_line(mlines.Line2D([x_pos, x_pos], ax.get_ylim()))
    for y_pos in (-y_iqr, -y_iqr * y_factor, y_iqr, y_iqr * y_factor):
        ax.add_line(mlines.Line2D(ax.get_xlim(), [y_pos, y_pos]))

    plt.savefig(save)



# Create many demand plots so we can actually see the values
def scrolling_demand(width, df, title, save):
    """Plot ``df`` in consecutive windows, skipping windows with no filtering.

    The frame is sliced by label with ``df.loc[start:end]`` for
    ``start = 0, width, 2 * width, ...``; pandas includes the end label, so
    neighbouring windows share one row.  A window is plotted with
    comparison_demand_plot only if its ``demandFiltered`` or ``deltaFiltered``
    column holds at least one non-NaN value.

    Args:
        width: window length in rows; must be a positive integer.
        df: frame with the columns comparison_demand_plot needs.  The label
            slicing presumably relies on a default 0..n-1 index - confirm
            against callers.
        title: base title; ``': cnt <i>'`` is appended for window ``i``.
        save: output path; ``'.png'`` is replaced by ``'_<i>cnt'``.

    Raises:
        ValueError: if ``width`` is not positive (this used to loop forever).
    """
    if width <= 0:
        raise ValueError('width must be positive, got {}'.format(width))

    tot_l = len(df.index)
    for i, start in enumerate(range(0, tot_l, width)):
        end = min(start + width, tot_l)
        window = df.loc[start:end]

        nothing_filtered = (window['demandFiltered'].isna().all()
                            and window['deltaFiltered'].isna().all())
        if nothing_filtered:
            continue

        s = save.replace('.png', '_{}cnt'.format(i))
        t = title + ': cnt {}'.format(i)
        comparison_demand_plot(window, t, s)


def comparison_demand_plot(df, title, save):
    """Save a 15x5 figure comparing demand with its rolling band and filters.

    Draws ``'demand (MW)'`` in red, ``rollingDem`` and ``rollingDem`` plus or
    minus ``rollingDemIQR`` in blue, and the removed values in
    ``demandFiltered`` / ``deltaFiltered`` as magenta / green dots, then
    writes the figure to ``save``.
    """
    plt.close()
    fig, ax = plt.subplots(figsize=(15,5))
    ax.set_xlabel('Hour')
    ax.set_ylabel('Demand')
    plt.title(title)

    centre = df['rollingDem']
    spread = df['rollingDemIQR']
    # (data, format string, extra keyword arguments), in drawing order.
    traces = (
        (df['demand (MW)'], 'r-', {'label': 'demand'}),
        (centre, 'b-', {'label': 'rolling dem+/-'}),
        (centre + spread, 'b-', {}),
        (centre - spread, 'b-', {}),
        (df['demandFiltered'], 'mo', {'label': 'demandFiltered'}),
        (df['deltaFiltered'], 'go', {'label': 'deltaFiltered'}),
    )
    for series, fmt, extra in traces:
        ax.plot(series, fmt, **extra)

    plt.legend()
    plt.tight_layout()
    plt.savefig(save)
    
    
def get_iqrs(vals):
    """Return ``(iqr, q25, q75)`` of ``vals``, ignoring NaNs.

    ``q25`` and ``q75`` are the 25th and 75th percentiles and ``iqr`` is
    their difference.
    """
    lower, upper = (np.nanpercentile(vals, q) for q in (25, 75))
    return upper - lower, lower, upper


def return_all_regions():
    """Return a fresh list of every region code known to this notebook."""
    codes = (
        'AEC', 'AECI', 'CPLE', 'CPLW', 'DUK', 'FMPP', 'FPC', 'FPL',
        'GVL', 'HST', 'ISNE', 'JEA', 'LGEE', 'MISO', 'NSB', 'NYIS',
        'OVEC', 'PJM', 'SC', 'SCEG', 'SEC', 'SOCO', 'SPA', 'SWPP',
        'TAL', 'TEC', 'TVA', 'ERCO', 'AVA', 'AZPS', 'BANC', 'BPAT',
        'CHPD', 'CISO', 'DOPD', 'EPE', 'GCPD', 'IID', 'IPCO', 'LDWP',
        'NEVP', 'NWMT', 'PACE', 'PACW', 'PGE', 'PNM', 'PSCO', 'PSEI',
        'SCL', 'SRP', 'TEPC', 'TIDC', 'TPWR', 'WACM', 'WALC', 'WAUW',
    )
    return list(codes)

In [None]:
dem_map = {}
# Regions to process.  ['TIDC', 'CISO', 'LDWP', 'BANC'] was used previously;
# iterate over return_all_regions() instead to process every region.
regions = ['LDWP']
for region in regions:
    print(region)
    # Reset on every pass, so only the region currently being processed is
    # held in dem_map.
    dem_map = {}
    file_path = '../get_eia_demand_data/data/{}.csv'.format(region)
    dem = pd.read_csv(file_path,  # index_col='time',
                      dtype={'demand (MW)': np.float64},
                      parse_dates=True, na_values=['MISSING', 'EMPTY'])

    # Convert date/time
    dem['time'] = pd.to_datetime(dem['time'])

    plt.close()
    fig, ax = plt.subplots(figsize=(15,5))
    ax.plot(dem['demand (MW)'], 'r-', label='demand')
    plt.tight_layout()
    plt.title("{} Raw Demand".format(region))
    plt.savefig('plt/{}_demand_initial.png'.format(region))

    # Mark rows whose demand was MISSING/EMPTY in the raw file
    dem = dem.assign(missing=dem['demand (MW)'].isna())

    # Set all negative and zero values to NAN
    dem['demand (MW)'] = dem['demand (MW)'].mask(dem['demand (MW)'] <= 0.)

    # Only BANC's demand histograms use a logarithmic x axis.
    logX = region == 'BANC'
    dem_multiplier = 5
    dem_iqr, dem_iq2, dem_iq3 = get_iqrs(dem['demand (MW)'])
    simple_hist('demand (MW)', dem, dem_iq2, dem_iq3, dem_multiplier,
                'plt/{}_demand_0.png'.format(region), logX)
    dem = filter_extrem_demand(dem, dem_multiplier)
    simple_hist('demand (MW)', dem, dem_iq2, dem_iq3, dem_multiplier,
                'plt/{}_demand_1.png'.format(region), logX)

    # Add rolling median and IQR, then raw and IQR-normalised step changes
    dem = add_rolling_dem_median(dem)
    dem = add_rolling_dem_iqr(dem)
    dem = add_deltas(dem)
    dem = add_deltas_norm(dem)

    # Add rolling dem average and the (normalised) distance from it
    dem = add_rolling_dem(dem)
    dem = add_demand_delta_rolling(dem)
    dem = add_norm_demand_delta_rolling(dem)

    # Calculate IQRs
    iqr, iq2, iq3 = get_iqrs(dem['diff_pre'])
    n_iqr, n_iq2, n_iq3 = get_iqrs(dem['diff_pre_norm'])
    dem_iqr, dem_iq2, dem_iq3 = get_iqrs(dem['demand (MW)'])
    demD_iqr, demD_iq2, demD_iq3 = get_iqrs(dem['dem_diff_rolling'])
    demDN_iqr, demDN_iq2, demDN_iq3 = get_iqrs(dem['dem_diff_norm_rolling'])

    # Plots of the distributions before the delta / local-demand filters
    multiplier_double = 4
    dem_M = 4.
    plot_2D_diff_and_demand_norm(dem, multiplier_double, dem_M,
                                 'plt/{}_2D_diff_and_demand_norm_original.png'.format(region))

    simple_hist('diff_pre_norm', dem, n_iq2, n_iq3, multiplier_double,
                'plt/{}_diff_pre_norm_original.png'.format(region))
    simple_hist('diff_pre', dem, iq2, iq3, multiplier_double,
                'plt/{}_diff_pre_original.png'.format(region))
    simple_hist('demand (MW)', dem, dem_iq2, dem_iq3, dem_multiplier,
                'plt/{}_demand_original.png'.format(region), logX)
    simple_hist('dem_diff_rolling', dem, demD_iq2, demD_iq3, dem_M,
                'plt/{}_dem_diff_rolling_original.png'.format(region))
    simple_hist('dem_diff_norm_rolling', dem, demDN_iq2, demDN_iq3, dem_M,
                'plt/{}_dem_diff_norm_rolling_original.png'.format(region))

    # NOTE(review): the factor 5 below is hard-coded and does not follow
    # multiplier_double (4) - confirm which is meant.
    print("iqr pre {:.2f} x 5 = {:.2f}".format(iqr, iqr*5))
    print('diff_pre 25% {:.2f}  75% {:.2f}'.format(np.percentile(dem['diff_pre'].dropna(), 25),
                                                  np.percentile(dem['diff_pre'].dropna(), 75)))

    # Filter points that jump away from both neighbours
    dem = filter_deltas(dem, n_iq2, n_iq3, n_iqr, multiplier_double)
    # Plot results for demand hist
    simple_hist('demand (MW)', dem, dem_iq2, dem_iq3, dem_multiplier,
                'plt/{}_dem_post-delta-filter.png'.format(region), logX)
    simple_hist('dem_diff_rolling', dem, demD_iq2, demD_iq3, dem_M,
                'plt/{}_dem_diff_rolling_post_delta.png'.format(region))
    simple_hist('dem_diff_norm_rolling', dem, demDN_iq2, demDN_iq3, dem_M,
                'plt/{}_dem_diff_norm_rolling_post_delta.png'.format(region))

    # Filter demand values that sit far from the local rolling average
    dem = filter_local_demand(dem, demDN_iq2, demDN_iq3, dem_M)

    # logX is passed here too, matching every other demand histogram.
    simple_hist('demand (MW)', dem, dem_iq2, dem_iq3, dem_multiplier,
                'plt/{}_dem_post_ext_dem.png'.format(region), logX)
    simple_hist('diff_pre_norm', dem, n_iq2, n_iq3, multiplier_double,
                'plt/{}_diff_pre_norm_post_ext_dem.png'.format(region))
    simple_hist('diff_pre', dem, iq2, iq3, multiplier_double,
                'plt/{}_diff_pre_post_ext_dem.png'.format(region))
    simple_hist('dem_diff_rolling', dem, demD_iq2, demD_iq3, dem_M,
                'plt/{}_dem_diff_rolling_post_ext_dem.png'.format(region))
    simple_hist('dem_diff_norm_rolling', dem, demDN_iq2, demDN_iq3, dem_M,
                'plt/{}_dem_diff_norm_rolling_post_ext_dem.png'.format(region))

    plot_2D_diff_and_demand_norm(dem, multiplier_double, dem_M,
                                 'plt/{}_2D_diff_and_demand_norm_final.png'.format(region))

    plt.close()
    fig, ax = plt.subplots(figsize=(15,5))
    ax.plot(dem['demand (MW)'], 'r-', label='demand')
    ax.plot(dem['rollingDem'], 'b-', label='rolling dem')
    plt.tight_layout()
    plt.legend()
    plt.title("{} Cleaned Demand".format(region))
    plt.savefig('plt/{}_demand.png'.format(region))

    plt.close()
    fig, ax = plt.subplots(figsize=(15,5))
    ax.plot(dem['demand (MW)'], 'r-', label='demand')
    ax.plot(dem['rollingDem'], 'b-', label='rolling dem')
    ax.plot(dem['demandFiltered'], 'g-', label='demandFiltered')
    # This trace used to be labelled 'demandFiltered' as well, so the legend
    # showed two identical entries.
    ax.plot(dem['deltaFiltered'], 'y-', label='deltaFiltered')
    plt.tight_layout()
    plt.legend()
    plt.title("{} Cleaned Demand".format(region))
    plt.savefig('plt/{}_demand_show_filters.png'.format(region))

    # Keep the fully processed frame available for inspection after the loop.
    dem_map[region] = dem

    width = 500
    title = '{} Demand Showing Filters'.format(region)
    # NOTE(review): absolute, user-specific output directory - it must exist
    # on the machine running this cell.
    save = '/Users/truggles/tmp_plots/{}_demand_show_filters.png'.format(region)
    scrolling_demand(width, dem_map[region], title, save)