In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date, time, datetime
import matplotlib
from helpers import return_good_regions
import pickle

In [None]:
# Directory holding the input CSV.
# NOTE(review): this is an absolute, machine-specific path; anyone else
# running the notebook has to edit it.
base = '/Users/truggles/Downloads/overimpute_Jan_2020_All_Files/'

# Read the master CSV, parse the timestamp column, and discard the SEC / OVEC
# value and category columns. reset_index() is called without drop=True, so
# the previous index is kept as an extra 'index' column.
df = (
    pd.read_csv(base + 'csv_MASTER_v12_2day_dropped.csv')
    .assign(date_time=lambda frame: pd.to_datetime(frame['date_time']))
    .drop(columns=['SEC', 'SEC_category', 'OVEC', 'OVEC_category'])
    .reset_index()
)

In [None]:
def get_max_gaps(df, ba):
    """Find the longest consecutive runs of flagged rows for one BA.

    The ``{ba}_category`` column of ``df`` is scanned in row order and two
    run lengths are tracked:

    * rows whose category is anything other than ``'OKAY'`` (any gap)
    * rows whose category is exactly ``'MISSING'``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``{ba}_category`` column.
    ba : str
        Column-name prefix identifying the BA / region.

    Returns
    -------
    tuple of (int, int)
        ``(max_gap, max_missing)``: the longest run of non-``'OKAY'`` rows,
        then the longest run of ``'MISSING'`` rows. Both are 0 when the frame
        has no rows.
    """
    # Pull the column out once. Looking every row up with df.loc[idx][col]
    # builds a full row Series per iteration, and yields a Series (making the
    # comparison ambiguous) whenever index labels are repeated.
    categories = df[f"{ba}_category"].to_numpy()

    max_missing = 0
    missing_run = 0
    max_gap = 0
    gap_run = 0

    for category in categories:
        # Length of the current run of MISSING rows.
        if category == 'MISSING':
            missing_run += 1
            if missing_run > max_missing:
                max_missing = missing_run
        else:
            missing_run = 0

        # Length of the current run of rows flagged for any reason.
        if category != 'OKAY':
            gap_run += 1
            if gap_run > max_gap:
                max_gap = gap_run
        else:
            gap_run = 0

    return max_gap, max_missing



def _summarize_regions(df, regions):
    """Compute the per-BA statistics that feed the missing/screened plot.

    For every ``ba`` in ``regions`` this records the NaN-ignoring mean of
    ``df[ba]``, how many rows of ``df[f"{ba}_category"]`` equal ``'OKAY'`` and
    ``'MISSING'``, the remaining row count as ``'SCREENED'``, and the two run
    lengths returned by ``get_max_gaps``.
    """
    n_rows = len(df.index)
    ba_map = {}
    for ba in regions:
        print(ba)
        categories = df[f"{ba}_category"]
        stats = {'mean': np.nanmean(df[ba])}
        for group in ['OKAY', 'MISSING']:
            stats[group] = np.where(categories == group, 1, 0).sum()
        # Everything that is neither OKAY nor MISSING was screened out for
        # some other reason.
        stats['SCREENED'] = n_rows - stats['OKAY'] - stats['MISSING']
        stats['MAX_GAP'], stats['MAX_MISSING'] = get_max_gaps(df, ba)
        ba_map[ba] = stats
    return ba_map


def make_2D_missing_and_screened_vs_demand_plot(
        df, regions, import_from_pkl=True,
        pkl_path='missing_and_screened_vs_demand_dict_Jan23.pkl',
        plot_path='missing_and_screened_vs_demand_plot.pdf'):
    """Scatter missing hours, screened hours and maximum gap against mean demand.

    For each BA the plot shows 1) its mean demand on a log x-axis, 2) the
    number of hours marked ``'MISSING'`` and 3) the number of hours screened
    for any other reason on the left y-axis, plus the longest gap on a second
    y-axis. Both y-axes are given the same limits. A summary line per BA and
    the cases where screened hours exceed missing hours are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a value column and a ``{ba}_category`` column per BA. Its
        row count is also used for the per-BA sanity check.
    regions : iterable of str
        BAs to summarize. Only consulted when ``import_from_pkl`` is False.
    import_from_pkl : bool, default True
        When True the statistics are read from ``pkl_path`` and nothing is
        recomputed. When False they are computed from ``df``, written to
        ``pkl_path`` and then read back.
    pkl_path : str
        Pickle file holding the per-BA statistics dict.
    plot_path : str
        File the figure is saved to.

    Raises
    ------
    FileNotFoundError
        If ``import_from_pkl`` is True and ``pkl_path`` does not exist.
    """
    if not import_from_pkl:
        ba_map = _summarize_regions(df, regions)
        with open(pkl_path, 'wb') as pickle_file:
            pickle.dump(ba_map, pickle_file)

    # The statistics are always taken from the pickle, including right after
    # being written. SECURITY: unpickling executes arbitrary code, so only
    # point pkl_path at files produced by this notebook.
    with open(pkl_path, 'rb') as pickle_in:
        ba_map = pickle.load(pickle_in)

    n_rows = len(df.index)
    means = []
    missing = []
    screened = []
    okay = []
    max_missing = []
    max_gap = []
    for ba, vals in ba_map.items():
        means.append(vals['mean'])
        missing.append(vals['MISSING'])
        screened.append(vals['SCREENED'])
        okay.append(vals['OKAY'])
        max_missing.append(vals['MAX_MISSING'])
        max_gap.append(vals['MAX_GAP'])
        print(f"{ba} mean {means[-1]} max_missing {max_missing[-1]} screened {screened[-1]} missing {missing[-1]}")

        # The three counts should account for every row of df; a mismatch
        # suggests the pickle was built from a different frame.
        if missing[-1] + screened[-1] + okay[-1] != n_rows:
            print("Problem...")

    size = 50
    c1 = 'C1'
    c2 = 'C2'
    fig, ax = plt.subplots()
    ax.scatter(means, missing, color=c1, marker='o', s=size, label='Missing Hours')
    ax.scatter(means, screened, color=c1, marker='x', s=size, label='Screened Hours')

    print("Cases where the number of screened hours is > missing hours")
    for Mean, Miss, Screen in zip(means, missing, screened):
        if Screen > Miss:
            print(f"{round(Mean,1)}, {Miss}, {Screen}")

    ax.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter('{x:,.0f}'))
    ax.set_ylabel('Hours Marked for Imputation', color=c1)
    ax.tick_params(axis='y', labelcolor=c1)
    ax.set_xlabel('Mean Annual Demand (MW)')
    ax.legend(loc='upper left')

    # Second y-axis sharing the same x-axis.
    ax2 = ax.twinx()
    ax2.set_ylabel('Length of Gap (hours)', color=c2)
    ax2.scatter(means, max_gap, color=c2, marker='o', s=size, label='Maximum Gap')
    ax2.tick_params(axis='y', labelcolor=c2)
    ax2.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter('{x:,.0f}'))

    # Give both y-axes the same range, with 30% headroom above the data.
    y_max = max(ax.get_ylim()[1], ax2.get_ylim()[1])
    ax.set_ylim(0, y_max*1.3)
    ax2.set_ylim(0, y_max*1.3)

    # Matplotlib 3.3 renamed the per-axis 'nonposx' keyword to 'nonpositive'
    # and later releases reject the old spelling; fall back for old installs.
    try:
        ax2.set_xscale('log', nonpositive='clip')
    except (TypeError, ValueError):
        ax2.set_xscale('log', nonposx='clip')

    fig.tight_layout()
    # NOTE(review): the earlier version first placed this legend at
    # loc='upper right' and then replaced it with a default-placement legend;
    # only the latter survived, so that is what is kept here. Pass
    # loc='upper right' if that placement was the intent.
    ax2.legend()
    fig.savefig(plot_path)




# BAs to plot, as returned by the helpers module.
regions = return_good_regions()

# Alternative region selections; the call below does not use them unless the
# override further down is enabled.
# Entries that were commented out of the seasonal list:
#   'SWPP', 'SOCO', 'CISO', 'NYIS', 'TVA', 'FPL', 'ISNE', 'DUK'
regions_for_seasons = [
    'PJM',
    'MISO',
    'ERCO',
]
region_for_annual = 'TVA'

# Uncomment to plot only the seasonal subset.
#regions = regions_for_seasons

make_2D_missing_and_screened_vs_demand_plot(df=df, regions=regions)