In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.imputation.mice as smi
import copy
import datetime
import matplotlib.dates as mdates
import matplotlib

In [None]:
# Read a CSV into a DataFrame, with optional printed/plotted summaries
def load_csv(file_path, columns, na_vals=['NA',], parse_d=True, verbose=0):
    """Load a CSV file, reading the listed columns as float64.

    Parameters
    ----------
    file_path : path or file-like object passed to ``pd.read_csv``.
    columns : iterable of column names to read with dtype ``np.float64``.
    na_vals : values passed to ``pd.read_csv`` as ``na_values``.
    parse_d : passed to ``pd.read_csv`` as ``parse_dates``.
    verbose : 0 prints nothing; >= 1 prints ``describe()`` and the per-column
        NaN counts; >= 2 additionally shows an image of the ``~isna()`` mask.

    Returns
    -------
    The loaded DataFrame. When it contains a 'date_time' column, that column
    is converted with ``pd.to_datetime`` before returning.
    """
    float_columns = {name: np.float64 for name in columns}
    df = pd.read_csv(file_path,
                     na_values=na_vals,
                     parse_dates=parse_d,
                     dtype=float_columns)

    if verbose >= 1:
        print("\nDescribe")
        print(df.describe().round(2))
        print("\ndf.isna().sum(axis=0)")
        print(df.isna().sum(axis=0))

    if verbose >= 2:
        # One row per case, one column per variable; the image is the negated NaN mask
        plt.imshow(~df.isna(), aspect='auto')
        plt.xlabel("variables")
        plt.ylabel("cases")
        plt.gray()
        plt.show()

    if 'date_time' in df.columns:
        df['date_time'] = pd.to_datetime(df['date_time'])
    return df


def add_lags(df, regions):
    """Return ``df`` with a '<region>_Lag1' column added for every region.

    Each lag column is the region column rolled forward by one row with
    ``np.roll``, so the first row of the lag column holds the region's LAST
    value (the roll wraps around rather than inserting a missing value).
    The input frame is not modified; ``DataFrame.assign`` returns a new frame.
    """
    for region in regions:
        print("Adding lag for region {}".format(region))
        lag_column = '{}_Lag1'.format(region)
        shifted = np.roll(df[region], 1)
        df = df.assign(**{lag_column: shifted})
    return df


def drop_col(df, col):
    """Return a new DataFrame without the column(s) named by ``col``."""
    return df.drop(columns=col)



In [None]:
def return_imputed_indices(raw, name):
    """Return the positional row numbers at which column ``name`` is null.

    Parameters
    ----------
    raw : DataFrame to inspect.
    name : column label.

    Returns
    -------
    1-D integer ``np.ndarray`` of 0-based row positions (not index labels),
    in ascending order; empty when the column has no null values.
    """
    # Series.nonzero() was removed in pandas 1.0; np.flatnonzero on the
    # boolean mask yields the same positional indices on every pandas version.
    return np.flatnonzero(pd.isnull(raw[name]).values)

def get_overimpute_index(raw, imp, col):
    """Return the row positions that are null in ``imp`` but not in ``raw``.

    Parameters
    ----------
    raw : DataFrame holding the original data.
    imp : DataFrame in which additional values of ``col`` were set to null.
    col : column label to compare.

    Returns
    -------
    Sorted 1-D integer ``np.ndarray`` of the positions reported by
    ``return_imputed_indices`` for ``imp`` that are not reported for ``raw``.
    The array is empty (and still integer-typed) when there are none.
    """
    init_nan = return_imputed_indices(raw, col)
    over_nan = return_imputed_indices(imp, col)
    # setdiff1d returns the unique values of the first array that are absent
    # from the second, already sorted, so the result order is deterministic.
    return np.setdiff1d(over_nan, init_nan)

def split_index_into_sort_and_long_gaps(index, min_long=3):
    """Split row positions into members of short gaps and members of long gaps.

    A gap is a run of consecutive integers in ``index``. Every member of a
    run with at least ``min_long`` elements is reported as "long"; every
    member of a shorter run is reported as "short".

    Parameters
    ----------
    index : list or 1-D array of integer positions. It is sorted IN PLACE,
        so the caller sees the sorted order afterwards.
    min_long : smallest run length counted as a long gap (default 3, i.e.
        three consecutive positions).

    Returns
    -------
    (short, long) : two lists of positions, each in ascending order. Both
    are empty for an empty ``index``.
    """
    short_gaps = []
    long_gaps = []
    index.sort()

    def _flush(run):
        # File a finished run under the category matching its length
        if len(run) >= min_long:
            long_gaps.extend(run)
        else:
            short_gaps.extend(run)

    run = []
    for position in index:
        # A jump of anything other than +1 ends the current run
        if run and position != run[-1] + 1:
            _flush(run)
            run = []
        run.append(position)
    if run:
        _flush(run)
    return short_gaps, long_gaps
            
    

    
def return_values_by_index(imp, indices, name, replace_nan_with_zero=True):
    """Return column ``name`` of ``imp`` at the index labels ``indices``.

    The selection uses ``imp.loc``, so ``indices`` are labels and the result
    keeps their order. With ``replace_nan_with_zero`` (the default) missing
    values in the selection are replaced by 0.
    """
    selected = imp.loc[indices, name]
    return selected.fillna(0) if replace_nan_with_zero else selected

def comparison_demand_plot(region, original, imp, imp_up, imp_down, imp_name, title, save, o_max):
    """Plot observed demand, mean imputed demand and the imputed range, then save.

    Parameters
    ----------
    region : column of ``original`` holding the observed demand.
    original : mapping/DataFrame with 'date_time' and ``region`` entries.
    imp : mean imputed demand, aligned with ``original['date_time']``.
    imp_up, imp_down : the two edges of the band drawn with ``fill_between``.
    imp_name : not used by this function.
    title : figure title; replaced by the region's full name when ``region``
        is one of CISO, TIDC, BANC or LDWP.
    save : path handed to ``plt.savefig``.
    o_max : not used by this function.
    """
    full_names = {
        'CISO': 'California Independent System Operator',
        'TIDC': 'Turlock Irrigation District',
        'BANC': 'Balancing Authority of Northern California',
        'LDWP': 'Los Angeles Department of Water and Power',
    }
    heading = full_names.get(region, title)

    plt.close()
    scale = 1.0
    fig, ax = plt.subplots(figsize=(15*scale, 7*scale))
    ax.set_ylabel('Demand (MW)')
    ax.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter('{x:,.0f}'))
    plt.title(heading)

    times = original['date_time']
    observed = original[region]
    ax.plot(times, observed, 'k-', label='Demand', linewidth=3.0)
    ax.plot(times, imp, 'r-', label='Imputed Mean Demand', linewidth=3.0)
    ax.fill_between(times, imp_down, imp_up, facecolor='orange', alpha=0.5, label='Imputed Range')
    # Draw the observed line again, without a legend entry, so it sits on top
    ax.plot(times, observed, 'k-', label='_nolegend_', linewidth=3.0)

    # Leave 30% headroom above the data
    upper = ax.get_ylim()[1]
    ax.set_ylim(0, upper*1.3)
    plt.legend(prop={'size': 20})

    # One labelled tick per day
    ax.xaxis.set_major_locator(mdates.DayLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%a %Y-%m-%d'))
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)

    low, high = ax.get_ylim()
    ax.set_ylim(min(low, 0), high)
    plt.tight_layout()
    plt.grid()
    plt.savefig(save)

# Create many demand plots so we can actually see the values
def scrolling_demand(width, region, original, imps, imp_mean, imp_name, title, save):
    """Plot observed vs. imputed demand in consecutive windows of ``width`` rows.

    For every window the per-row maximum and minimum of ``region`` across all
    frames in ``imps`` are computed, and ``comparison_demand_plot`` is called
    unless the observed and mean-imputed series are equal in that window
    (nothing was imputed there).

    Parameters
    ----------
    width : number of rows per window; must be >= 1.
    region : column name present in ``original``, ``imp_mean`` and each frame
        of ``imps``.
    original : observed data. Windows are cut with ``.loc[start:end]`` using
        integer bounds, so a 0..n-1 index is assumed.
    imps : list of imputed frames, addressed positionally (``iloc``).
    imp_mean : frame holding the mean imputation, indexed like ``original``.
    imp_name : forwarded to ``comparison_demand_plot``.
    title : base title; ': cnt <k>' is appended per window.
    save : output path; '.png' is replaced by '_<k>cnt' per window.

    Raises
    ------
    ValueError : if ``width`` is smaller than 1 (the window would never advance).
    """
    if width < 1:
        raise ValueError("width must be >= 1, got {}".format(width))
    start = 0
    end = width-1
    k = 0
    tot_l = len(original.index)
    o_max = np.nanmax(original[region])
    while True:
        print(k, start, end)
        s = save.replace('.png', '_{}cnt'.format(k))
        t = title+': cnt {}'.format(k)
        # .loc slices by label and includes `end`
        o = original.loc[start:end]
        imp_avg = imp_mean.loc[start:end]

        # Max and min over the imputed copies for positional rows start..stop-1.
        # `stop` is clamped so a frame shorter than `width` does not index
        # past its last row.
        stop = min(end+1, tot_l)
        n_rows = max(stop-start, 0)
        if imps and n_rows > 0:
            stacked = pd.concat(
                [imp[region].iloc[start:stop].reset_index(drop=True) for imp in imps],
                axis=1)
            # pandas skips NaN here; a row that is NaN in every copy stays NaN
            imp_max = stacked.max(axis=1).tolist()
            imp_min = stacked.min(axis=1).tolist()
        else:
            imp_max = [np.nan] * n_rows
            imp_min = [np.nan] * n_rows

        # Don't waste a plot if no imputation happened
        if not (o[region].equals(imp_avg[region])):
            print(f"Actually plotting for hours: {start}, {end}")
            comparison_demand_plot(region, o, imp_avg[region], imp_max, imp_min, imp_name, t, s, o_max)
        if end == tot_l:
            break
        k += 1
        start += width
        end += width
        # The final window is clamped to the length of the data
        if end >= tot_l:
            end = tot_l








def comparison_scatter_plot(v1s, v2s, labels, t1, t2, title, save, float_y_min=False):
    """Scatter one or more (x, y) series on shared axes and save the figure.

    Parameters
    ----------
    v1s, v2s : sequences of x-series and y-series, paired element by element.
    labels : legend label for each pair.
    t1, t2 : x-axis and y-axis labels.
    title : figure title.
    save : path handed to ``plt.savefig``.
    float_y_min : when True the y-axis starts at the smallest y value instead
        of 0.

    The x-axis runs from 0 to 1.1 times the largest x value and the y-axis
    ends at 1.1 times the largest y value.
    """
    plt.close()
    fig, ax = plt.subplots(figsize=(5,5))
    ax.set_xlabel(t1)
    ax.set_ylabel(t2)

    x_top = np.max([np.max(series) for series in v1s])
    y_top = np.max([np.max(series) for series in v2s])
    y_bottom = np.min([np.min(series) for series in v2s]) if float_y_min else 0
    ax.set_xlim(0, x_top*1.1)
    ax.set_ylim(y_bottom, y_top*1.1)

    plt.title(title)
    for xs, ys, name in zip(v1s, v2s, labels):
        ax.plot(xs, ys, '.', label=name, alpha=0.2)
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.grid()
    plt.savefig(save)


def simple_resolution(df_true, df_imp, title, save, n_bins=20):
    """Histogram the relative error (imputed - observed) / observed and save it.

    Parameters
    ----------
    df_true : iterable of observed values.
    df_imp : iterable of imputed values, paired with ``df_true`` by position.
    title : figure title.
    save : path handed to ``plt.savefig``.
    n_bins : number of histogram bins.

    Pairs whose observed value is 0 are left out of the histogram and a
    message is printed for each of them. The number of histogrammed entries
    is printed as well.
    """
    plt.close()

    residuals = []
    for obs, val in zip(df_true, df_imp):
        if obs == 0:
            # The relative error is undefined here
            print("Value == 0 for simple_resolution {} {}".format(title, save))
            continue
        residuals.append((val-obs)/obs)

    fig, ax = plt.subplots(figsize=(5,5))
    n, bins, patches = ax.hist(residuals, n_bins,
            facecolor='b', alpha=0.5, density=False)
    print("Length simp_res {}".format(np.sum(n)))
    # The figure is resized to 10 x 10 after creation
    fig.set_figheight(10)
    fig.set_figwidth(10)
    plt.title(title)
    plt.xlabel('(Imp. - Obs.)/Obs.')
    plt.ylabel('Counts')
    plt.tight_layout()
    plt.grid()
    plt.savefig(save)


def _boxplot_residual_groups(groups, title, xlabel, save):
    """Draw one box per group (whiskers at the 5th/95th percentile) and save it."""
    fig, ax = plt.subplots(figsize=(15,10))
    plt.grid()
    ax.set_title(title+': whiskers at 5%/95%')
    ax.boxplot(groups, whis=[5, 95])
    ax.set_xlabel(xlabel)
    ax.set_ylabel('(Imp. - Obs.)/Obs.')
    plt.tight_layout()
    plt.savefig(save)


# df_master is used to get the correct UTC time
def resolution_by_time(df_master, idx_vals, df_true, df_imp, title, save):
    """Box-plot the relative imputation error by hour, by month and by weekday.

    Parameters
    ----------
    df_master : frame with a 'date_time' column, addressed by label with
        ``.at``; row label 0 must exist because its hour anchors the
        hour-of-day calculation.
    idx_vals : row labels of ``df_master`` for each observation.
    df_true : observed values, paired with ``idx_vals`` by position.
    df_imp : imputed values, paired with ``idx_vals`` by position.
    title : title of the hourly plot; 'hour' is replaced by 'month' / 'week'
        for the other two plots.
    save : path of the hourly plot; 'hour' is replaced the same way for the
        other two files.

    The hour bin is ``(idx + hour of row 0 - 8) % 24``, i.e. it assumes one
    row per hour and a fixed 8 hour offset (labelled PST on the axis).
    Observations whose observed value is 0 are skipped with a printed
    message, as ``simple_resolution`` does, because the relative error is
    undefined for them.
    """
    plt.close()

    # Hour of the first row, used to turn a row label into an hour of day
    zero_index_hour = df_master.at[0, 'date_time'].hour

    res_grid = [[] for _ in range(24)]
    month_grid = [[] for _ in range(12)]
    week_grid = [[] for _ in range(7)]

    for idx, obs, val in zip(idx_vals, df_true, df_imp):
        if obs == 0:
            print("Value == 0 for resolution_by_time {} {}".format(title, save))
            continue
        res = (val-obs)/obs
        mod = (idx + zero_index_hour - 8)%24 # -8 for PST vs. UTC
        res_grid[mod].append(res)
        stamp = df_master.at[idx, 'date_time']
        month_grid[stamp.month-1].append(res)
        week_grid[stamp.weekday()].append(res)

    _boxplot_residual_groups(res_grid, title, 'Hour (PST)', save)
    _boxplot_residual_groups(month_grid, title.replace('hour', 'month'),
            'Month', save.replace('hour', 'month'))
    _boxplot_residual_groups(week_grid, title.replace('hour', 'week'),
            'Week Days (1 == Monday, 7 == Sunday)', save.replace('hour', 'week'))

    
    
def return_all_regions():
    """Return a new list with all 56 region codes, in a fixed order."""
    codes = (
        'AEC AECI CPLE CPLW DUK FMPP FPC '
        'FPL GVL HST ISNE JEA LGEE MISO NSB '
        'NYIS OVEC PJM SC SCEG SEC SOCO '
        'SPA SWPP TAL TEC TVA ERCO '
        'AVA AZPS BANC BPAT CHPD CISO DOPD '
        'EPE GCPD IID IPCO LDWP NEVP NWMT '
        'PACE PACW PGE PNM PSCO PSEI SCL SRP '
        'TEPC TIDC TPWR WACM WALC WAUW'
    )
    return codes.split()
    
    

    
    




# ---------------------------------------------------------------------------
# Configuration for the overimputation validation plots.
# Everything below runs at cell level and defines notebook-wide names
# (width, n1, base1, base2, imp_map, regions, only_scrolling and, when
# only_scrolling is False, file_path and df).
# ---------------------------------------------------------------------------

# Number of rows shown per scrolling demand plot
width = 240

# Label of the imputation method, used in plot titles and file names
n1 = 'MICE'
## Open a saved csv and check contents
# NOTE(review): these two assignments are overwritten a few lines below
base1 = '/Users/truggles/Downloads/'
base2 = '/Users/truggles/Downloads/'

# Chez Ruggles
# base1: directory of the input files with values removed for overimputation
# base2: directory of the imputed files returned for them
base1 = '/Users/truggles/Downloads/fourty_options_with_plots/'
base2 = '/Users/truggles/Downloads/results_from_forty_options/'

# Maps each overimpute input file name to the name of its imputed result file
imp_map = {} # input impute file: returned imputed file,
for i in range(40):
    imp_map['ca_for_overimpute4_{:d}.csv'.format(i)] = \
            'mean_impute_CA_overimpute4_{:d}_mice.csv'.format(i)

    
    

### NOTE, if you want ALL regions use regions = return_all_regions()
regions = ['BANC', 'CISO', 'LDWP', 'TIDC']

# When True the master file is not loaded here and the region loop that
# follows exits immediately.
only_scrolling = True
if only_scrolling:
    print("\nOnly producing the scrolling demand plots\n")

if not only_scrolling:
    # Master data; verbose=2 prints the summaries and shows the missing-data image
    file_path = '~/Downloads/overimpute_20191004_v0/csv_MASTER_v12_2day.csv'
    df = load_csv(file_path, regions, ['NA',], True, 2)
    #df = add_lags(df, regions)
    #df = drop_col(df, 'date_time')
    
# Overimputation validation: for every region, compare the values that were
# artificially removed (observed in df, missing in the overimpute input file)
# with the values imputed for them, then draw resolution and scatter plots.
#
# NOTE(review): `regs` is assigned here but the loop below iterates `regions`.
regs = ['TIDC',]
# Loop all regions
for r in regions:
    # With only_scrolling set, nothing in this loop runs (df is then not
    # loaded by the configuration code above either).
    if only_scrolling:
        break
    print(r)
    # One list entry per imputed file, for all / short-gap / long-gap points:
    # idx_* hold row positions, obs_* observed values, imp_* imputed values.
    record = {
        'idx_all' : [],
        'idx_short' : [],
        'idx_long' : [],
        
        'obs_all' : [],
        'obs_short' : [],
        'obs_long' : [],

        'imp_all' : [],
        'imp_short' : [],
        'imp_long' : [],
    }
    # Loop all imputed files
    all_mice = []
    for k, v in imp_map.items():
        # k: input file with values removed, v: the imputed result for it
        df_imp = load_csv(base1+k, regions, ['NA',], True, 0)
        df_mice = load_csv(base2+v, regions, ['NA',], True, 0)
        all_mice.append(df_mice)
        
        #indices = return_imputed_indices(df, r)

        # All imputed points comparing algos against eachother
        #v1 = return_values_by_index(df_mice, indices, r)

        #comparison_demand_plot(df, [v1,], [n1,],
        #    'Imputation of {}'.format(r), 'imp_dem_{}_comp.png'.format(r))
        #scrolling_demand(width, r, df, df_mice, 'MICE',
        #    'Imputation of {}'.format(r), '/Users/truggles/tmp_plots_imp/imp_dem_{}_comp.png'.format(r))
        
        
        # Find index ONLY from overimputation
        over_index = get_overimpute_index(df, df_imp, r)

        # Split results by short vs long imputation gaps
        # (the split function sorts over_index in place before it is recorded)
        short_obs, long_obs = split_index_into_sort_and_long_gaps(over_index)
        record['idx_all'].append(over_index)
        record['idx_short'].append(short_obs)
        record['idx_long'].append(long_obs)
        
        # Observed values come from the master frame df
        record['obs_all'].append(return_values_by_index(df, over_index, r))
        record['obs_short'].append(return_values_by_index(df, short_obs, r))
        record['obs_long'].append(return_values_by_index(df, long_obs, r))
        
        # Imputed values come from the imputed result file
        record['imp_all'].append(return_values_by_index(df_mice, over_index, r))
        record['imp_short'].append(return_values_by_index(df_mice, short_obs, r))
        record['imp_long'].append(return_values_by_index(df_mice, long_obs, r))

    
    # Pool the per-file records and plot each gap category
    idxs = ['idx_all', 'idx_short', 'idx_long']
    obss = ['obs_all', 'obs_short', 'obs_long']
    imps = ['imp_all', 'imp_short', 'imp_long']
    names = ['all', 'short', 'long']
    for name, idx, obs, imp in zip(names, idxs, obss, imps):
        idx_vals = np.concatenate(record[idx])
        obs_vals = pd.concat(record[obs])
        imp_vals = pd.concat(record[imp])
        
        simple_resolution(obs_vals, imp_vals, 'overimp resolution {} {} {}'.format(name, n1, r),
            'impOver_resolution_{}_{}_{}.png'.format(r, n1, name), 30)
        resolution_by_time(df, idx_vals, obs_vals, imp_vals, 'overimp hourly resolution {} {} {}'.format(name, n1, r),
            'imp_resolution_hourly_{}_{}_{}.png'.format(r, n1, name))
        # NOTE(review): the title says "region" but is formatted with the gap
        # category `name` ('all'/'short'/'long'), not with `r` - confirm intent.
        comparison_scatter_plot([obs_vals.values,],
            [imp_vals.values,], [n1+':Obs',], 'Observed', 'Imputed',
            'Comparing {} for region: {}'.format(n1, name), 'imp_scatter_overImp_{}_{}_comp.png'.format(r, name))
        # Scatter with resolution on y-axis
        # NOTE(review): there is no guard for obs == 0 here, unlike in
        # simple_resolution.
        res1 = []
        for val, obs in zip(imp_vals, obs_vals):
            res1.append((val-obs)/obs)
        float_y_min = True
        comparison_scatter_plot([obs_vals.values,],
            [res1,], [n1+' Resolution',], 'Observed', '(Imp. - Obs.)/Obs.',
            'Resolution {} for region: {}'.format(n1, name), 'imp_scatter_overImp_res_{}_{}_comp.png'.format(r, name), float_y_min)

#for k1, v1 in ordered.items():
#    for k2, v2 in v1.items():
#        print(k1, k2, v2)

In [None]:
### Overimpute validation plots with scrolling demand
### based on ca_for_overimpute_0.csv
### Dave returned a single LONG csv with 20 version indices with imputed copies
### Current files all link to those found in overimpute_20191004_v0.zip on gDrive

# Set to True to skip loading here and plotting in the next cell
no_scrolling = False
# Number of rows per scrolling demand plot
width = 240

base = '~/Downloads/overimpute_20191004_v0/'

if not no_scrolling:
    regions = ['BANC', 'CISO', 'LDWP', 'TIDC']

    # Master data (verbose=2: printed summaries plus the missing-data image)
    df = load_csv(base+'csv_MASTER_v12_2day.csv', regions, ['NA',], True, 2)
    df['date_time'] = pd.to_datetime(df['date_time'])

    # Mean over the imputed copies
    imp_mean = load_csv(base+'mean_overimpute_new_clim_lag_lead_csv_MASTER_v12_2day_0_mice.csv', regions, ['NA',], True, 0)
    imp_mean['date_time'] = pd.to_datetime(imp_mean['date_time'])

    # Input file that was handed to the imputation
    df_imp = load_csv(base+'csv_MASTER_v12_2day_0.csv', regions, ['NA',], True, 0)
    df_imp['date_time'] = pd.to_datetime(df_imp['date_time'])

    # All imputed copies stacked in one long file
    df_mice_all = load_csv(base+'all_overimpute_new_clim_lag_lead_csv_MASTER_v12_2day_0_mice_INT.csv', regions, ['NA',], True, 0)
    df_mice_all['date_time'] = pd.to_datetime(df_mice_all['date_time'])


In [None]:
# Regions to plot. Earlier selections were ['CISO','BANC','LDWP'] and
# ['LDWP','BANC', 'CISO'].
regs = ['NYIS','PJM']
# Loop all regions
for r in regs:
    if not no_scrolling:
        print(r)

        # Collect the imputed copies, selected by their 'imp_index' value 1..20
        all_mice = []
        for i in range(1, 21):
            df_mice = df_mice_all[df_mice_all['imp_index'] == i]
            all_mice.append(df_mice)

        scrolling_demand(width, r, df, all_mice, imp_mean, 'MICE',
                'Imputation of {}'.format(r), '/Users/truggles/tmp_plots_imp_Oct04/imp_dem_{}_comp.png'.format(r))

In [None]:
def scan_for_missing_structure(df):
    """Record the lengths of the runs of missing values in every data column.

    Columns whose name contains 'Lag1' or 'date_time' are ignored.

    Returns
    -------
    dict mapping column name to ``[open_run, closed_runs]``: ``closed_runs``
    lists, in row order, the length of every NaN run that was followed by a
    non-NaN value; ``open_run`` is the length of a NaN run still open at the
    last row (0 if the column ends on a value). A run that reaches the end of
    the frame is therefore NOT included in ``closed_runs``.
    """
    print("scan_for_missing_structure")
    rec = {
        col: [0, []]  # [length of the currently open NaN run, finished run lengths]
        for col in df.columns
        if not ('Lag1' in col or 'date_time' in col)
    }
    for index, row in df.iterrows():
        if index % 1000 == 0:
            print(" - scanning row {}".format(index))
        for col in rec:
            tally = rec[col]
            if np.isnan(row[col]):
                # Extend the open run
                tally[0] += 1
            elif tally[0] > 0:
                # A value closes the open run
                tally[1].append(tally[0])
                tally[0] = 0
    return rec


def remove_locations(df, requested_gaps, cnt=-1):
    """Insert every requested gap into ``df``, longest gaps first per column.

    Parameters
    ----------
    df : DataFrame modified in place by ``find_and_remove_location2``.
    requested_gaps : dict mapping column name to a two-item sequence whose
        second item is the list of gap lengths to insert. Each list is
        sorted in place into descending order.
    cnt : only used in the printed message.
    """
    print("remove_locations cnt {}".format(cnt))
    for col in requested_gaps:
        gap_lengths = requested_gaps[col][1]
        # Largest gaps are placed first, smaller ones afterwards
        gap_lengths.sort(reverse=True)
        for length in gap_lengths:
            find_and_remove_location2(df, col, length)

            

# Starting from a random row, walk forward to the first non-NaN value in
# `col` and set the `length` rows that follow it to np.nan.
def find_and_remove_location2(df, col, length, verbose=False):
    """Blank out ``length`` consecutive rows of ``col`` after a good value.

    Parameters
    ----------
    df : DataFrame modified in place. Rows are addressed by integer label
        (``df.at`` / ``df.loc``), so a 0..n-1 index is assumed.
    col : column to modify.
    length : number of rows to set to NaN.
    verbose : print the NaN count of the chosen stretch before and after.

    Returns
    -------
    0 when a gap was inserted, 1 when no non-NaN anchor row was found (or
    the frame is empty); in that case ``df`` is left untouched.
    """
    n_rows = len(df.index)
    if n_rows == 0:
        return 1

    # Start at a random index position to not bias the removals
    # all towards the front. max() keeps the start on an existing row when
    # the requested gap is almost as long as the frame.
    loc = max(0, int(np.random.uniform(0, n_rows-length-2)))

    # Anchors run from 0 to n_rows-length-1; with the wrap-around below,
    # this many steps visit each of them at most once, so the search ends.
    n_candidates = max(1, n_rows-length)
    for _ in range(n_candidates):
        # Check that we begin with a "good" data value
        if not np.isnan(df.at[loc, col]):
            if verbose:
                print("Will remove length {} from location with {} NANs".format(
                    length, df.loc[loc:loc+length, col].isna().sum()))
            # .loc includes both ends: rows loc+1 .. loc+length are blanked
            df.loc[loc+1:loc+length, col] = np.nan
            if verbose:
                print("Removed length {} from location with {} NANs".format(
                    length, df.loc[loc:loc+length, col].isna().sum()))
            return 0
        loc += 1

        # Wrap to start of df before reaching the end to
        # ensure that the gap will fit
        if loc + length >= n_rows:
            loc = 0

    print("No good starting value found for col {} and requested length {}".format(
        col, length))
    return 1


# Loop over values in the dataframe and find a continuous region which
# has a reasonably "good" data buffer around the requested length of data to remove.
# Change the values to np.nan
def find_and_remove_location(df, col, length, verbose=False):
    """Blank out ``length`` rows of ``col`` inside a long stretch of good data.

    Starting from a random row, the column is scanned forward (wrapping to
    row 0 at the end) while counting consecutive non-NaN values. As soon as
    the count reaches ``tgt_length - 1`` the rows
    ``start_of_good_data + buffer`` .. ``start_of_good_data + buffer + length - 1``
    are set to NaN.

    Parameters
    ----------
    df : DataFrame modified in place; rows are addressed by integer label.
    col : column to modify.
    length : number of rows to set to NaN.
    verbose : print the chosen stretch before and after the removal.

    Returns
    -------
    0 when a gap was inserted, 1 when the scan wrapped around the frame a
    second time without finding a stretch (a message is printed).

    NOTE(review): the good-data counter is not reset when the scan wraps to
    row 0, so a counted stretch may span the end and the start of the frame -
    confirm whether that is intended.
    """
    # Start at a random index position to not bias the removals
    # all towards the front
    max_good_data = 0
    loc = int(np.random.uniform(0, len(df.index)))
    if loc == len(df.index):
        loc -= 1
    start_of_good_data = loc
    length_of_good_data = 0
    n_loops = 0
    # How much good data on each side of the new gap?
    if length <= 100:
        tgt_length = 5 * length 
        buffer = 2 * length # 2x on each side
    elif length <= 1000:
        tgt_length = 3 * length 
        buffer = 1 * length # 1x on each side
    else:
        tgt_length = int(1.5 * length) 
        buffer = int(0.25 * length) # 0.25x on each side
    while True:
        if np.isnan(df.at[loc, col]):
            start_of_good_data = loc + 1 # This is the following value
            # and will continuously increment if isnan()
            length_of_good_data = 0
        else: # good data
            length_of_good_data += 1
            # Track the longest stretch seen, for the failure message
            if length_of_good_data > max_good_data:
                max_good_data = length_of_good_data

        # Remember pandas DataFrame has different slice notation than normal python
        # where the terminal value is included in the slice
        if length_of_good_data == tgt_length - 1:
            if verbose:
                print("Found a great spot for removal, col {:}, l={:d}, tgt_l={:d}, buffer={:d}, [{:d}:{:d}]".format(
                        col, length, tgt_length, buffer, start_of_good_data, start_of_good_data+length_of_good_data))
            strt = start_of_good_data + buffer # Begin nan after good data buffer 
            end = start_of_good_data + buffer + length - 1
            if verbose:
                print(df.loc[start_of_good_data:start_of_good_data+length_of_good_data, col])
            df.loc[strt:end, col] = np.nan
            if verbose:
                print(df.loc[start_of_good_data:start_of_good_data+length_of_good_data, col])
            return 0
        loc += 1
        
        # Wrap to start of df
        if loc >= len(df.index):
            loc = 0
            n_loops += 1
            # Give up on the second wrap-around
            if n_loops > 1:
                print("Too many loops for col {} and requested length {}, max good data length {}".format(
                    col, length, max_good_data))
                return 1

            
def find_and_remove_difficult_location(df, col, length, to_take=-1, verbose=False):
    """Blank out ``length`` rows of ``col`` in the window with the fewest NaNs.

    Every window of rows ``i`` .. ``int(i + 1.5*length)`` is scored by its
    NaN count. The first window with the lowest count is the "best"
    location. Windows with at most ``0.2*length`` NaNs that lie more than
    ``length`` rows from the best one, and at least ``length`` rows from each
    other, are collected as alternatives ("others").

    Parameters
    ----------
    df : DataFrame modified in place; rows are addressed by integer label,
        so a 0..n-1 index is assumed.
    col : column to modify.
    length : number of rows to set to NaN. The gap starts ``0.25*length``
        rows into the chosen window.
    to_take : if >= 0, use the alternative with that position instead of the
        best window; when there is no such alternative the best window is
        used and a message is printed.
    verbose : not used.

    Returns
    -------
    None. When the frame is too short to hold a single window nothing is
    changed and a message is printed.
    """
    # Scan data and look for highest purity "good" data region
    # to apply np.nan
    print("find_and_remove_difficult_location")
    print(" - Looking for col {} for length {}".format(col, length))
    rec = [(i, df.loc[i:int(i+1.5*length), col].isna().sum())
           for i in range(0, int(len(df.index) - 1.5 * length))]
    if not rec:
        print(" - No window fits for col {} and length {}, nothing removed".format(col, length))
        return

    # First window with the smallest NaN count (counts of 9999 or more never win)
    nan_min = 9999
    best_idx = -1
    for start, n_nan in rec:
        if n_nan < nan_min:
            nan_min = n_nan
            best_idx = start

    # Find other comparable locations for adding a gap.
    # Search for regions with at most 20% NaNs
    # with respect to length requested.
    others = []
    for val in rec:
        if val[1] <= length * 0.2 and abs(val[0]-best_idx) > length:
            new_gap = True
            for other in others:
                if abs(val[0]-other[0]) < length:
                    new_gap = False
            if new_gap and val[0] - length < len(df.index):
                others.append(val)
    print(" - others:")
    print(others)

    print(" - Best idx {} for nan count of {}".format(best_idx, nan_min))
    use_other = to_take >= 0
    if use_other and to_take >= len(others):
        print(" - Requested 'Others' entry {} does not exist ({} available), using best idx".format(
            to_take, len(others)))
        use_other = False

    if use_other:
        chosen = others[to_take][0]
        print(chosen)
        print(" - Will select location based on 'Others' {} {}".format(to_take, others[to_take]))
    else:
        chosen = best_idx

    # .loc includes both ends, so the last blanked row is start + length - 1
    gap_start = int(chosen+0.25*length)
    gap_end = gap_start + length - 1
    df.loc[gap_start:gap_end, col] = np.nan

    total_nans = df.loc[chosen:int(chosen+1.5*length), col].isna().sum()
    if use_other:
        print(" - Difficult NAN insertion resulting in 'other' location {} with {} total np.nans".format(
            chosen, total_nans))
    else:
        print(" - Difficult NAN insertion resulting in defaul location {} with {} total np.nans".format(
            chosen, total_nans))
            
# Drop the category columns
def drop_cols_for_overimpute(df):
    """Return ``df`` without the columns whose name contains '_category'.

    The input frame is not modified. When no column matches, the same object
    is returned.
    """
    category_columns = [name for name in df.columns if '_category' in name]
    for name in category_columns:
        df = df.drop(columns=name)
    return df
    
        
    

# ---------------------------------------------------------------------------
# Build overimputation input files: load the master CSV, drop the category
# columns, save an image of its missing-data mask and, when make_overimpute
# is True, write copies of the data with additional artificial gaps.
# ---------------------------------------------------------------------------
version = 'v7_2day'
path = '/Users/truggles/tmp_data/'
file = 'csv_MASTER'
# -> <path>csv_MASTER_<version>.csv
file_path = path+file+'_{}.csv'.format(version)
regions = return_all_regions()
df = load_csv(file_path, regions, ['NA',], True, 0)
df = drop_cols_for_overimpute(df)

# Fixed seed so the randomly placed gaps are reproducible
np.random.seed(1)
# Image of the ~isna() mask of the loaded data (rows = cases, columns = variables)
plt.imshow(~df.isna(), aspect='auto')
plt.xlabel("variables")
plt.ylabel("cases")
plt.gray()
plt.savefig('{}_{}.png'.format(file, version))

# Set to True to actually generate the overimpute CSV files
make_overimpute = False
if make_overimpute:
    # Lengths of the existing runs of missing values, per column
    results = scan_for_missing_structure(df)
    for i in range(0, 2):
        # Work on copies: remove_locations modifies both of its arguments in place
        df2 = copy.deepcopy(df)
        results2 = copy.deepcopy(results)
        remove_locations(df2, results2, i)
        plt.imshow(~df2.isna(), aspect='auto')
        plt.xlabel("variables")
        plt.ylabel("cases")
        plt.gray()
        plt.savefig('{}_{}_{}.png'.format(file, version, str(i)))
        print("Saving as '{}_{}_{}.csv'".format(file, version, str(i)))
        # Missing values are written as 'NA', the marker load_csv reads back as NaN
        df2.to_csv('{}_{}_{}.csv'.format(file, version, str(i)), index=False, na_rep='NA')

In [None]:
####XXXXXX

def comparison_demand_plot(region, original, imp, imp_up, imp_down, imp_name, title, save, o_max):
    """Plot observed demand, mean imputed demand and the imputed range, then save.

    This definition replaces the one made earlier in the notebook; x-axis
    ticks are labelled with the date only ('%Y-%m-%d').

    Parameters
    ----------
    region : column of ``original`` holding the observed demand.
    original : mapping/DataFrame with 'date_time' and ``region`` entries.
    imp : mean imputed demand, aligned with ``original['date_time']``.
    imp_up, imp_down : the two edges of the band drawn with ``fill_between``.
    imp_name : not used by this function.
    title : figure title; replaced by the region's full name when ``region``
        is one of CISO, TIDC, BANC or LDWP.
    save : path handed to ``plt.savefig``.
    o_max : not used by this function.
    """
    full_names = {
        'CISO': 'California Independent System Operator',
        'TIDC': 'Turlock Irrigation District',
        'BANC': 'Balancing Authority of Northern California',
        'LDWP': 'Los Angeles Department of Water and Power',
    }
    heading = full_names.get(region, title)

    plt.close()
    scale = 1.0
    fig, ax = plt.subplots(figsize=(15*scale, 7*scale))
    ax.set_ylabel('Demand (MW)')
    ax.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter('{x:,.0f}'))
    plt.title(heading)

    times = original['date_time']
    observed = original[region]
    ax.plot(times, observed, 'k-', label='Demand', linewidth=3.0)
    ax.plot(times, imp, 'r-', label='Imputed Mean Demand', linewidth=3.0)
    ax.fill_between(times, imp_down, imp_up, facecolor='orange', alpha=0.5, label='Imputed Range')
    # Draw the observed line again, without a legend entry, so it sits on top
    ax.plot(times, observed, 'k-', label='_nolegend_', linewidth=3.0)

    # Leave 30% headroom above the data
    upper = ax.get_ylim()[1]
    ax.set_ylim(0, upper*1.3)
    plt.legend(prop={'size': 20})

    # One labelled tick per day
    ax.xaxis.set_major_locator(mdates.DayLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)

    low, high = ax.get_ylim()
    ax.set_ylim(min(low, 0), high)
    plt.tight_layout()
    plt.grid()
    plt.savefig(save)

    

    

# Create many demand plots so we can actually see the values
def scrolling_demand(width, region, original, imps, imp_mean, imp_name, title, save):
    """Plot observed vs. imputed demand in consecutive windows of ``width`` rows.

    For every window the per-row maximum and minimum of ``region`` across all
    frames in ``imps`` are computed, and ``comparison_demand_plot`` is called
    unless the observed and mean-imputed series are equal in that window
    (nothing was imputed there).

    Parameters
    ----------
    width : number of rows per window; must be >= 1.
    region : column name present in ``original``, ``imp_mean`` and each frame
        of ``imps``.
    original : observed data. Windows are cut with ``.loc[start:end]`` using
        integer bounds, so a 0..n-1 index is assumed.
    imps : list of imputed frames, addressed positionally (``iloc``).
    imp_mean : frame holding the mean imputation, indexed like ``original``.
    imp_name : forwarded to ``comparison_demand_plot``.
    title : base title; ': cnt <k>' is appended per window.
    save : output path; '.png' is replaced by '_<k>cnt' per window.

    Raises
    ------
    ValueError : if ``width`` is smaller than 1 (the window would never advance).
    """
    if width < 1:
        raise ValueError("width must be >= 1, got {}".format(width))
    start = 0
    end = width-1
    k = 0
    tot_l = len(original.index)
    o_max = np.nanmax(original[region])
    while True:
        print(k, start, end)
        s = save.replace('.png', '_{}cnt'.format(k))
        t = title+': cnt {}'.format(k)
        # .loc slices by label and includes `end`
        o = original.loc[start:end]
        imp_avg = imp_mean.loc[start:end]

        # Max and min over the imputed copies for positional rows start..stop-1.
        # `stop` is clamped so a frame shorter than `width` does not index
        # past its last row.
        stop = min(end+1, tot_l)
        n_rows = max(stop-start, 0)
        if imps and n_rows > 0:
            stacked = pd.concat(
                [imp[region].iloc[start:stop].reset_index(drop=True) for imp in imps],
                axis=1)
            # pandas skips NaN here; a row that is NaN in every copy stays NaN
            imp_max = stacked.max(axis=1).tolist()
            imp_min = stacked.min(axis=1).tolist()
        else:
            imp_max = [np.nan] * n_rows
            imp_min = [np.nan] * n_rows

        # Don't waste a plot if no imputation happened
        if not (o[region].equals(imp_avg[region])):
            print(f"Actually plotting for hours: {start}, {end}")
            comparison_demand_plot(region, o, imp_avg[region], imp_max, imp_min, imp_name, t, s, o_max)
        if end == tot_l:
            break
        k += 1
        start += width
        end += width
        # The final window is clamped to the length of the data
        if end >= tot_l:
            end = tot_l
