In [None]:
# JBY: Set up env using "ipython --pylab" imports
# Later cells call names such as figsize, rcParams, array, semilogx, savefig and
# display without a module prefix; they rely on this magic having populated the
# global namespace, so it must run first.
%pylab

In [None]:
%autoreload 2

In [None]:
# Backend selection: flip `interactive` to True to use the `osx` backend,
# otherwise figures are rendered inline in the notebook.
interactive = False
if interactive:
    %matplotlib osx
else:
    %matplotlib inline
# Default figure size and font size; individual plot cells below call
# figsize() again to override the size.
figsize(17,6)
rcParams['font.size'] = 16

In [None]:
import pandas as pd
import json
import os

#from pyextra import looser

# Misc functions

## Load some functions from helper.py

In [None]:
from helper import DuckStruct

## Define some functions here

In [None]:
def display_full(df):
    '''Display df with up to 2000 columns and without truncating cell contents.

    The pandas display options are changed only for the duration of the call
    and are restored afterwards.
    '''
    # max_colwidth=None means "no limit". The older spelling -1 was deprecated
    # in pandas 1.0 and is rejected by newer releases.
    with pd.option_context('display.max_columns', 2000, 'display.max_colwidth', None):
        display(df)

In [None]:
def savefigs(name):
    '''Save the current figure twice: as <name>.png and then as <name>.pdf.'''
    for pattern in ('%s.png', '%s.pdf'):
        savefig(pattern % name)

# Load data

In [None]:
# Read the project export from ../data relative to the working directory.
# The encoding is given explicitly so that decoding does not depend on the
# platform's default text encoding.
with open(os.path.join(os.getcwd(), '..', 'data', 'carbonplan_projects.json'), 'r', encoding='utf-8') as ff:
    whole_json = json.load(ff)

In [None]:
# Flatten the 'projects' list of the loaded JSON into a DataFrame; `df` is the
# frame that every later cell extends and plots.
df = pd.json_normalize(whole_json['projects'])

In [None]:
# Quick look at the first rows of the freshly loaded frame.
df.head()

Flatten tags for easier filtering

In [None]:
# Collect every distinct tag used by any project, then keep a sorted list of
# them for stable iteration order in the cells below.
tagset = set()
for tags in df.tags:
    tagset.update(tags)
taglist = sorted(tagset)

In [None]:
# Create boolean field for each tag, e.g. t_dac and t_ocean
for tag in taglist:
    # The current tag is bound as a default argument so the membership test
    # carries its own copy of the value.
    has_tag = df.tags.map(lambda project_tags, wanted=tag: wanted in project_tags)
    df['t_%s' % tag] = has_tag

In [None]:
# n_tags: number of entries in each project's tag list
df['n_tags'] = df.tags.map(len)

In [None]:
#df.head()

Flatten metrics

In [None]:
# `metrics` lists the values matched against the 'name' field of each project's
# nested metrics; `metric_fields` lists the fields read from the matching entry.
# Together they name the flattened columns '<metric>_<field>', e.g. cost_value.
metrics = ['mechanism', 'volume', 'negativity', 'permanence', 'additionality', 'cost', 'specificity']
metric_fields = ['value', 'units', 'rating', 'notes', 'comment']

In [None]:
# Expand each project's nested metrics into its own small DataFrame, in row order.
# Iterating over the column values (rather than looking rows up by the labels
# 0..n-1) keeps this correct even when df does not have a default integer index.
metrics_dfs = [pd.json_normalize(project_metrics) for project_metrics in df.metrics]

In [None]:
# Copy every (metric, field) pair into its own column named '<metric>_<field>'.
for metric in metrics:
    # Rows whose 'name' equals this metric, one filtered frame per project.
    metric_rows = [dfx[dfx['name'] == metric] for dfx in metrics_dfs]
    for metric_field in metric_fields:
        colname = '%s_%s' % (metric, metric_field)
        # Take the first matching row's value for every project.
        df[colname] = [rows[metric_field].iloc[0] for rows in metric_rows]

In [None]:
# Clean up: remove the 'type' and 'metrics' columns when they are present.
# The membership check lets the cell be re-run after the columns are gone.
for col in ('type', 'metrics'):
    if col not in df.columns:
        continue
    del df[col]

# Look at data

In [None]:
# First five rows after the tag and metric columns have been added.
df.head(5)

In [None]:
#for col in df.columns:
#    print(col)

In [None]:
# All column names currently in the frame.
df.columns

In [None]:
# Show a single project with every column and untruncated cell contents.
display_full(df.head(1))

In [None]:
# Print how many projects carry each tag (sum of the boolean t_<tag> column).
for tag in taglist:
    n_projects = df['t_%s' % tag].sum()
    print('%4d: %s' % (n_projects, tag))

In [None]:
# Projects tagged with both 'dac' and 'mineralization'.
display_full(df[df.t_dac & df.t_mineralization])

# Plots

In [None]:
# Colors are those used on https://carbonplan.org/research/cdr-database
# Each triple is divided by 100 below before use as a plot color
# (presumably RGB percentages converted to 0..1 floats; confirm against the site).
_colors = {
    'forests': (49.0, 70.0, 42.0),
    'soil': (92.0, 59.0, 33.0),
    'biomass': (83.0, 75.0, 37.0),
    'ocean': (39.0, 73.0, 77.0),
    'mineralization': (66.0, 71.0, 77.0),
    'dac': (74.0, 52.0, 85.0),
}
# Scaled copy keyed by the same tag names; this is the mapping the plots use.
colors = {k: array(v)/100.0 for k, v in _colors.items()}

In [None]:
# The tags that have a color are the "primary" tags. The list keeps the order of
# the `colors` dict; get_pt returns the first match in this order, and the strip
# plots use the list position as the row of each category.
primary_tags = list(colors.keys())
primary_tag_set = set(primary_tags)

In [None]:
def get_pt(tags):
    '''Return the first entry of primary_tags that is contained in tags, or 'none' when no primary tag is present.'''
    # primary_tags is scanned in its own order, so that order decides which tag
    # wins when a project carries several primary tags.
    return next((candidate for candidate in primary_tags if candidate in tags), 'none')

In [None]:
def get_clr(tags, default_clr=(.7, .7, .7)):
    '''Return the color of the project's primary tag, or default_clr if it has none.

    tags        -- the project's tags, passed on to get_pt
    default_clr -- color returned when get_pt reports 'none'
    '''
    pt = get_pt(tags)
    # Compare the primary tag just computed. The previous code tested an
    # unrelated global loop variable (`tag`), so untagged projects fell through
    # to colors['none'] and raised KeyError.
    return default_clr if pt == 'none' else colors[pt]

In [None]:
# Add pt column: the project's primary tag as returned by get_pt ('none' if it has no primary tag)
df['pt'] = df.tags.map(get_pt)
# Add clr column: the plot color returned by get_clr for the same tags
df['clr'] = df.tags.map(get_clr)

In [None]:
# Does any project not have a primary tag?
# (get_pt marks those projects with the string 'none'.)
print((df.pt == 'none').sum(), 'projects are missing a primary tag')

In [None]:
figsize(18,18)
# One marker per project: x is the volume (log scale), y is the row position.
# Keep one line handle per category that actually occurs, so the legend never
# receives an empty (None) handle for a category with no projects.
clr_handles = {}
for ii, project in enumerate(df.itertuples()):
    clr = project.clr
    volume = project.volume_value
    hh, = semilogx(volume, ii, 'o', mec=clr, mfc=clr, ms=15)
    clr_handles[project.pt] = hh
xlabel('Volume (tons)')
ylabel('Project ID')
# Legend entries follow the primary_tags order, with 'none' (if present) last.
legend_tags = [name for name in primary_tags + ['none'] if name in clr_handles]
legend([clr_handles[name] for name in legend_tags], legend_tags)
savefigs('carbon_plan_type_vol_separate')

In [None]:
figsize(18,4)
# Strip plot of volume: one row per primary tag (in primary_tags order, going
# downwards), with one extra row below for projects that have no primary tag.
for ii, project in enumerate(df.itertuples()):
    if project.pt == 'none':
        y_coord = -len(primary_tags)
    else:
        y_coord = -primary_tags.index(project.pt)
    semilogx(project.volume_value, y_coord, 'o', mec=project.clr, mfc=project.clr, ms=20)
xlabel('Volume (tons)')
yticks([])
tight_layout()
savefigs('carbon_plan_type_vol')

In [None]:
figsize(18,4)
# Strip plot of permanence: one row per primary tag (in primary_tags order,
# going downwards), with one extra row below for projects without a primary tag.
for ii, project in enumerate(df.itertuples()):
    if project.pt == 'none':
        y_coord = -len(primary_tags)
    else:
        y_coord = -primary_tags.index(project.pt)
    semilogx(project.permanence_value, y_coord, 'o', mec=project.clr, mfc=project.clr, ms=20)
xlabel('Permanence (years)')
yticks([])
tight_layout()
savefigs('carbon_plan_type_permanence')

In [None]:
figsize(18,4)
# Strip plot of cost: one row per primary tag (in primary_tags order, going
# downwards), with one extra row below for projects without a primary tag.
for ii, project in enumerate(df.itertuples()):
    if project.pt == 'none':
        y_coord = -len(primary_tags)
    else:
        y_coord = -primary_tags.index(project.pt)
    semilogx(project.cost_value, y_coord, 'o', mec=project.clr, mfc=project.clr, ms=20)
xlabel('Cost ($/ton)')
yticks([])
tight_layout()
savefigs('carbon_plan_type_cost')

**Plot Individual Supply curves**

In [None]:
# Distinct values of the primary-tag column, in order of first appearance.
print('Sequestration types:')
df.pt.unique()

In [None]:
# Rows whose cost_rating equals -9999 are excluded here and in the cells below
# (presumably a "no data" sentinel in the source file; confirm against the dataset).
print('Sequestration types that have some cost data:')
df[(df.cost_rating != -9999)].pt.unique()

In [None]:
# Mineralization projects whose cost_rating is not -9999, cheapest first.
display_full(df[(df.cost_rating != -9999) & (df.pt == 'mineralization')].sort_values(by='cost_value'))

In [None]:
def plot_single_vol_cost_curve(df, save_as=None, plot_legend=True):
    '''Draw a filled step curve of cost against cumulative volume.

    Projects are ordered by cost_value; each one contributes a horizontal
    segment at its cost whose width is its volume_value.

    df          -- frame with cost_value, volume_value, clr and pt columns
    save_as     -- if given, the figure is saved through savefigs(save_as)
    plot_legend -- if true, add a legend labelled with the primary tag

    The fill color and legend label are taken from the most expensive project,
    so the frame is expected to hold a single primary tag. An empty frame draws
    nothing and returns immediately.
    '''
    if len(df) == 0:
        # No project means no color or label to draw with.
        return

    df = df.sort_values(by='cost_value')

    # Right edge of each step is the running total of volume; the left edge is
    # the previous project's right edge (0 for the cheapest project).
    right_edges = df['volume_value'].cumsum()
    left_edges = right_edges.shift(1, fill_value=0)

    # Interleave to (left, right) pairs per project; every cost appears twice.
    cv_vol = array([left_edges, right_edges]).T.ravel()
    cv_cost = array([df['cost_value'], df['cost_value']]).T.ravel()

    last_project = df.iloc[-1]
    clr = last_project['clr']
    pt = last_project['pt']

    fill_between(cv_vol, cv_cost, color=clr)
    xlabel('Volume (tons)')
    ylabel('Cost ($/ton)')
    if plot_legend:
        legend((pt,), loc='upper left')
    tight_layout()
    ylim(bottom=0)
    if save_as:
        savefigs(save_as)

In [None]:
figsize(18,4)
# Supply curve for mineralization projects whose cost_rating is not -9999.
df_filt = df[(df.cost_rating != -9999) & (df.pt == 'mineralization')]
plot_single_vol_cost_curve(df_filt, save_as='vol_cost_mineralization')

In [None]:
# One figure (and one saved file pair) per primary tag, using only the projects
# whose cost_rating is not -9999.
df_with_cost = df[df.cost_rating != -9999]
for pt in df_with_cost.pt.unique():
    df_filt = df_with_cost[df_with_cost.pt == pt]
    figure()
    plot_single_vol_cost_curve(df_filt, save_as='vol_cost_%s' % pt)

**Plot Combined Supply curves**

In [None]:
def plot_vol_cost_curve(df, save_as=None, plot_legend=True):
    '''Draw one combined supply curve, with each primary tag filled in its own color.

    Projects are ordered by cost_value across all tags. Each project occupies
    the volume interval between the running total before it and the running
    total including it, at a height equal to its cost.

    df          -- frame with cost_value, volume_value, clr and pt columns
    save_as     -- if given, the figure is saved through savefigs(save_as)
    plot_legend -- if true, add a legend with one entry per primary tag

    Returns the per-tag data object for 'mineralization' (with vc_list, vc_arr
    and clr attributes), or None when that tag is absent. Earlier versions of
    this function returned the same value, and existing callers keep it.
    '''
    # Creates a sorted copy
    df = df.sort_values(by='cost_value')

    # Running volume totals in sorted order. The left edges are taken by
    # position (shift); the frame keeps its original index labels after
    # sorting, so looking up "label ii-1" would pick an unrelated row.
    right_edges = df['volume_value'].cumsum()
    left_edges = right_edges.shift(1, fill_value=0)

    # Each entry is [vol, cost].
    # Separate curve for each primary tag
    pt_dat = {pt: DuckStruct(vc_list=[[0, 0]], clr=None) for pt in df.pt.unique()}

    for project, left_vol, right_vol in zip(df.itertuples(), left_edges, right_edges):
        duck = pt_dat[project.pt]
        duck.clr = project.clr
        cost = project.cost_value
        # Four points: up from the axis, across at the cost level, and back
        # down, so that gaps between a tag's projects stay at zero height.
        duck.vc_list.extend([
            [left_vol, 0],
            [left_vol, cost],
            [right_vol, cost],
            [right_vol, 0],
        ])

    for pt, duck in pt_dat.items():
        duck.vc_arr = array(duck.vc_list)
        fill_between(duck.vc_arr[:, 0], duck.vc_arr[:, 1], color=duck.clr, label=pt)

    xlabel('Volume (tons)')
    ylabel('Cost ($/ton)')
    if plot_legend and pt_dat:
        # Labels come from the label= given to each filled region above.
        legend(loc='upper left')
    tight_layout()
    ylim(bottom=0)
    if save_as:
        savefigs(save_as)

    return pt_dat.get('mineralization')
        

In [None]:
# Primary tags present among the projects whose cost_rating is not -9999.
df[(df.cost_rating != -9999)].pt.unique()

In [None]:
figsize(18,6)
# Combined supply curve over every project whose cost_rating is not -9999.
df_filt = df[(df.cost_rating != -9999)]
# The function's return value is kept in `duck` for inspection.
duck = plot_vol_cost_curve(df_filt)