In [1]:
""" Apply industry weights on merged table.
-------------------------------------------------------------------------------

This script applies the industry weights to the framework. Overall Water Risk
(OWR) is calculated for every industry. When a score is unavailable (NaN),
its weight is set to NaN as well, so that it is excluded from the weight sum.


Grouped and overall water risk are calculated and stored as a separate
indicator called awr (aggregated water risk).

Author: Rutger Hofste
Date: 20181211
Kernel: python35
Docker: rutgerhofste/gisdocker:ubuntu16.04

"""

SCRIPT_NAME = 'Y2018M12D11_RH_Master_Weights_GPD_V02'
OUTPUT_VERSION = 1

# BigQuery input tables: the merged master table and the industry weights.
BQ_IN = {
    "MASTER": "y2018m12d04_rh_master_merge_rawdata_gpd_v02_v02",
    "WEIGHTS": "y2018m12d06_rh_process_weights_bq_v01_v01",
}

BQ_PROJECT_ID = "aqueduct30"
BQ_OUTPUT_DATASET_NAME = "aqueduct30v01"

# Every output location is derived from the script name and output version.
_name_and_version = (SCRIPT_NAME, OUTPUT_VERSION)

BQ_OUTPUT_TABLE_NAME = "{}_v{:02.0f}".format(*_name_and_version).lower()

ec2_output_path = "/volumes/data/{}/output_V{:02.0f}".format(*_name_and_version)
s3_output_path = "s3://wri-projects/Aqueduct30/processData/{}/output_V{:02.0f}/".format(*_name_and_version)

# Echo the resolved output locations so they are recorded with the run.
print("\nBQ_OUTPUT_DATASET_NAME: ", BQ_OUTPUT_DATASET_NAME,
      "\nBQ_OUTPUT_TABLE_NAME: ", BQ_OUTPUT_TABLE_NAME,
      "\ns3_output_path: ", s3_output_path,
      "\nec2_output_path:" , ec2_output_path)


BQ_OUTPUT_DATASET_NAME:  aqueduct30v01 
BQ_OUTPUT_TABLE_NAME:  y2018m12d11_rh_master_weights_gpd_v02_v01 
s3_output_path:  s3://wri-projects/Aqueduct30/processData/Y2018M12D11_RH_Master_Weights_GPD_V02/output_V01/ 
ec2_output_path: /volumes/data/Y2018M12D11_RH_Master_Weights_GPD_V02/output_V01


In [2]:
BQ_IN

{'MASTER': 'y2018m12d04_rh_master_merge_rawdata_gpd_v02_v02',
 'WEIGHTS': 'y2018m12d06_rh_process_weights_bq_v01_v01'}

In [3]:
import time, datetime, sys

# Record when the run started. time.strftime() without an explicit time tuple
# formats the machine's *local* time, so time.gmtime() is passed explicitly to
# make the "UTC" label correct regardless of the host's timezone setting.
utc_now = time.gmtime()
dateString = time.strftime("Y%YM%mD%d", utc_now)
timeString = time.strftime("UTC %H:%M", utc_now)
# Wall-clock reference used to report the elapsed time at the end of the run.
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2018M12D14 UTC 19:01


'3.5.4 |Anaconda, Inc.| (default, Nov 20 2017, 18:44:38) \n[GCC 7.2.0]'

In [4]:
# Start from an empty local output directory: delete whatever a previous run
# left behind, then recreate the folder (mkdir -p also creates parent folders).
# NOTE(review): `rm -r` without `-f` reports an error when the directory does
# not exist yet; the following mkdir still runs.
!rm -r {ec2_output_path}
!mkdir -p {ec2_output_path}

In [5]:
import os
import pandas as pd
import numpy as np
import scipy.interpolate
from google.cloud import bigquery

# Point the Google client libraries at the service-account key file and the
# default project before any BigQuery call is made.
# NOTE(review): the key-file path and project id are hard-coded here; the
# project id duplicates BQ_PROJECT_ID.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/.google.json"
os.environ["GOOGLE_CLOUD_PROJECT"] = "aqueduct30"
# NOTE(review): `client` is not referenced again in the visible cells (the
# queries go through pd.read_gbq / DataFrame.to_gbq) - confirm it is needed.
client = bigquery.Client(project=BQ_PROJECT_ID)

%matplotlib inline
# Show up to 500 columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 500)


In [6]:
# Query for the merged master table: one row per (string_id, indicator) with
# the raw value, score, category and label columns, ordered by string_id.
# The table is addressed as `project.dataset.table`; the input table is read
# from the same dataset that this script writes to (BQ_OUTPUT_DATASET_NAME).
sql_master = """
SELECT
  string_id,
  indicator,
  raw,
  score,
  cat,
  label
FROM
  `{}.{}.{}`
ORDER BY
  string_id
""".format(BQ_PROJECT_ID,BQ_OUTPUT_DATASET_NAME,BQ_IN["MASTER"])

In [7]:
# Run the master query on BigQuery (standard SQL dialect) and load the result
# into a DataFrame. No project_id is passed here.
# NOTE(review): presumably the project is picked up from the
# GOOGLE_CLOUD_PROJECT environment variable set earlier - confirm.
df_master = pd.read_gbq(query=sql_master,dialect="standard")

In [8]:
df_master.head()

Unnamed: 0,string_id,indicator,raw,score,cat,label
0,111011-EGY.11_1-3365,bws,2.318042,5.0,4.0,Extremely High
1,111011-EGY.11_1-3365,udw,0.011222,0.0,0.0,Low (< 2.5%)
2,111011-EGY.11_1-3365,drr,,,,
3,111011-EGY.11_1-3365,sev,0.962396,2.887187,2.0,Medium - High
4,111011-EGY.11_1-3365,bwd,0.987061,4.948243,4.0,Extremely High


In [9]:
df_master.shape

(822132, 6)

In [10]:
# Working name for the master table. This is a plain assignment, so df_in and
# df_master refer to the same DataFrame object (no copy is made).
df_in = df_master

In [11]:
# Query for the industry weights table. The *_short codes (group, indicator,
# industry) are lower-cased in SQL; they are used further down as join keys
# and filter values against lower-case codes such as "def" and "qan".
sql_weights = """
SELECT
  id,
  group_full,
  LOWER(group_short) AS group_short,
  indicator_full,
  LOWER(indicator_short) AS indicator_short,
  industry_full,
  LOWER(industry_short) AS industry_short,
  weight_abs,
  weight_label,
  weight_interpretation,
  weight_fraction
FROM
  `{}.{}.{}`
""".format(BQ_PROJECT_ID,BQ_OUTPUT_DATASET_NAME,BQ_IN["WEIGHTS"])

In [12]:
# Run the weights query on BigQuery (standard SQL dialect) and load the result
# into a DataFrame.
df_weights = pd.read_gbq(query=sql_weights,dialect="standard")

In [13]:
df_weights.head()

Unnamed: 0,id,group_full,group_short,indicator_full,indicator_short,industry_full,industry_short,weight_abs,weight_label,weight_interpretation,weight_fraction
0,88,Physical Risk Quality,qal,Coastal eutrophication potential,cep,Oil & Gas,ong,0.0,No weight,Not relevant,0.0
1,9,Physical Risk Quality,qal,Untreated collected wastewater,ucw,Default,def,2.0,High,Represents high risk to the industry,0.081633
2,35,Physical Risk Quality,qal,Untreated collected wastewater,ucw,Food & Beverage,fnb,2.0,High,Represents high risk to the industry,0.081633
3,36,Physical Risk Quality,qal,Coastal eutrophication potential,cep,Food & Beverage,fnb,2.0,High,Represents high risk to the industry,0.081633
4,48,Physical Risk Quality,qal,Untreated collected wastewater,ucw,Chemicals,che,2.0,High,Represents high risk to the industry,0.07619


In [14]:
df_weights.shape

(130, 11)

In [15]:
# Indicator -> group lookup, taken from the weight rows of the default ("def")
# industry. Rows and columns are selected in a single .loc call.
is_default_industry = df_weights["industry_short"] == "def"
df_groups = df_weights.loc[is_default_industry, ["indicator_short","group_short"]]

In [16]:
df_groups

Unnamed: 0,indicator_short,group_short
1,ucw,qal
15,cep,qal
20,drr,qan
42,bws,qan
43,bwd,qan
44,gtd,qan
62,iav,qan
63,sev,qan
75,rfr,qan
76,cfr,qan


In [17]:
# Add group to dataframe.
# The merge starts from df_master (which df_in was assigned from) and the
# helper join column is dropped on the merge result instead of in place.
# This makes the cell idempotent: re-running it always rebuilds df_in from the
# untouched master table instead of merging the group column in a second time
# (which would produce group_short_x / group_short_y columns).
df_in = pd.merge(left=df_master,
                 right=df_groups,
                 how="left",
                 left_on="indicator",
                 right_on="indicator_short").drop("indicator_short",axis=1)

In [18]:
# Keep only the columns needed to attach a weight to every indicator:
# the indicator code, the industry code and the weight fraction.
df_industries = df_weights[["indicator_short","industry_short","weight_fraction"]]

In [19]:
# Add industry to each indicator.
# how="right" keeps every row of df_industries, so weight rows whose indicator
# has no match in df_in are retained as well (with empty master columns).
# NOTE(review): confirm the right join (rather than inner/left) is intended.
industry_merged = pd.merge(left=df_in,
                           right=df_industries,
                           how="right",
                           left_on="indicator",
                           right_on="indicator_short")
# The join key from the weights side duplicates "indicator"; remove it.
df_w = industry_merged.drop("indicator_short",axis=1)

In [20]:
# Blank out (NaN) the weight wherever the score is missing, so that missing
# indicators do not contribute to the weight sums computed later.
score_missing = np.isnan(df_w["score"])
df_w["weight_fraction"] = df_w["weight_fraction"].mask(score_missing)


In [21]:
# Per-row contribution to the weighted average: weight * score. Rows whose
# weight was masked to NaN in the previous step yield NaN here.
df_w["weighted_score"] = df_w["weight_fraction"] * df_w["score"]

In [22]:
df_w.head()

Unnamed: 0,string_id,indicator,raw,score,cat,label,group_short,industry_short,weight_fraction,weighted_score
0,111011-EGY.11_1-3365,bws,2.318042,5.0,4.0,Extremely High,qan,che,0.07619,0.380952
1,111011-EGY.15_1-3365,bws,2.318042,5.0,4.0,Extremely High,qan,che,0.07619,0.380952
2,111011-EGY.15_1-None,bws,2.318042,5.0,4.0,Extremely High,qan,che,0.07619,0.380952
3,111011-None-3365,bws,2.318042,5.0,4.0,Extremely High,qan,che,0.07619,0.380952
4,111011-None-None,bws,2.318042,5.0,4.0,Extremely High,qan,che,0.07619,0.380952


In [23]:
def calculate_group_aggregate(df):
    """ Calculates the weighted scores for each industry, group pair. 
    e.g. Quantity risk for Agriculture (qan,agr).
    The dataframe will have an indicator called "awr" that stands for
    aggregated water risk. 
    
    Weights and weighted scores are summed per (string_id, industry_short,
    group_short). The "raw" column holds the weighted average, i.e. the
    summed weighted scores divided by the summed weights. NaN values are
    skipped by the sums; when all weights of a group are NaN the weight sum
    is 0 and the resulting raw value is NaN.
    
    Args:
        df (DataFrame) : Pandas Dataframe with Aqueduct values. Must contain
            the columns string_id, industry_short, group_short,
            weight_fraction and weighted_score.
    
    Returns:
        df_agg (DataFrame) : DataFrame with aggregated scores. 
    
    """
    group_keys = ["string_id","industry_short","group_short"]
    # Select the value columns with a list. Selecting them with a bare tuple
    # (["a","b"] written as ["a","b"] without the inner brackets) is deprecated
    # and rejected by current pandas versions.
    value_columns = ["weight_fraction","weighted_score"]
    df_agg = df.groupby(group_keys)[value_columns].agg("sum").reset_index()
    df_agg["indicator"] = "awr" # Aggregated Water Risk
    df_agg["raw"] = df_agg["weighted_score"] / df_agg["weight_fraction"]
    return df_agg

def calculate_total_aggregate(df_group):
    """ Calculates the weighted scores for each industry.
    e.g. Total risk for Agriculture
    The dataframe will have an indicator called "awr" that stands for
    aggregated water risk and a 'group' tot that stands for total.
    
    The per-group weight sums and weighted-score sums are summed again per
    (string_id, industry_short). The "raw" column holds the summed weighted
    scores divided by the summed weights.
    
    Args:
        df_group (DataFrame) : Pandas Dataframe with Grouped 
            Aqueduct values. Must contain the columns string_id,
            industry_short, weight_fraction and weighted_score.
    
    Returns:
        df_totalagg (DataFrame) : DataFrame with aggregated scores. 
    
    """
    total_keys = ["string_id","industry_short"]
    # Select the value columns with a list. Selecting them with a bare tuple
    # is deprecated and rejected by current pandas versions.
    value_columns = ["weight_fraction","weighted_score"]
    df_totalagg = df_group.groupby(total_keys)[value_columns].agg("sum").reset_index()
    df_totalagg["group_short"] = "tot"
    df_totalagg["indicator"] = "awr" 
    df_totalagg["raw"] = df_totalagg["weighted_score"] / df_totalagg["weight_fraction"]
    return df_totalagg

def quantile_interp_function(s,q,y):
    """ Build an interpolation function from the quantiles of a series.
    
    The quantiles of s at the levels q become the x breakpoints and y the
    values they map to; scipy's interp1d (default settings) interpolates in
    between. q and y should be the same length.
    
    Args:
        s(pandas Series): Data whose quantiles define the breakpoints.
        q(list): Quantile levels, e.g. [0,0.2,0.4,0.6,0.8,1].
        y(list): Output value for each quantile level.
        
    Returns:
        f(interp1d) : Scipy function object mapping values of s to y.
        quantiles(Pandas Series): Quantile values of s, indexed by q.
        
    Example:
    
        s = df["col"]
        q = [0,0.2,0.4,0.6,0.8,1]
        y = [0,1,2,3,4,5]
        f, quantiles = quantile_interp_function(s,q,y)
        y_new = f(x)
    
    """
    breakpoints = s.quantile(q=q)
    interpolator = scipy.interpolate.interp1d(breakpoints,y)
    return interpolator, breakpoints

def calculate_group_remapped_scores(df_group):
    """ Remap raw scores to the 0-5 range based on quantiles and linear
    interpolation. 
    
    See quantile_interp_function for more information.
    
    Quantiles are determined per-group (qan, qal and rrr): the 0, 20, 40, 60,
    80 and 100 % quantiles of the group's raw values map to the scores
    0, 1, 2, 3, 4 and 5. The quantiles used are printed for every group.
    
    Args:
        df_group (DataFrame) : Grouped Aqueduct values with at least the
            columns group_short and raw. A "score" column is added to this
            DataFrame in place.
    
    Returns:
        df_group (DataFrame) : The same DataFrame, including the "score"
            column. Rows outside the three groups get a NaN score.
    
    """
    
    groups = ["qan","qal","rrr"]
    q = [0,0.2,0.4,0.6,0.8,1]
    y = [0,1,2,3,4,5]

    # Collect the per-group results and combine them once at the end.
    # (Series.append, used for this previously, no longer exists in current
    # pandas versions.)
    remapped = []
    for group in groups:
        s = df_group.loc[df_group["group_short"] == group, "raw"]
        if s.dropna().empty:
            # No values to derive quantiles from; these rows keep a NaN score.
            print("no raw values available for group: ", group, " (skipped)")
            continue
        f, quantiles = quantile_interp_function(s,q,y)
        print("quantile values used for group: ",group, "\n", quantiles)
        remapped.append(s.apply(f))

    if remapped:
        # Assignment aligns on the index, so every row receives the score
        # computed for its own group.
        df_group["score"] = pd.concat(remapped)
    else:
        df_group["score"] = np.nan
    return df_group

def calculate_remapped_scores(df):
    """ Remap the raw total scores to the 0-5 range.
    
    The 0, 20, 40, 60, 80 and 100 % quantiles of the raw values of the rows
    with group_short "tot" map to the scores 0, 1, 2, 3, 4 and 5. The
    resulting interpolation function is applied to the raw value of every
    row in df. The quantiles used are printed.
    
    Args:
        df (DataFrame) : Aggregated Aqueduct values with at least the
            columns group_short and raw. A "score" column is added to this
            DataFrame in place.
    
    Returns:
        df (DataFrame) : The same DataFrame, including the "score" column.
    
    """
    quantile_levels = [0,0.2,0.4,0.6,0.8,1]
    score_levels = [0,1,2,3,4,5]
    is_total = df["group_short"] == "tot"
    total_raw = df.loc[is_total, "raw"]
    f, quantiles = quantile_interp_function(total_raw,quantile_levels,score_levels)
    print("quantiles used for aggregate total:",quantiles)
    df["score"] = df["raw"].apply(f)
    return df

def score_to_category(score):
    """ Convert a score to its integer category.
    
    Args:
        score (float) : Score value, may be NaN.
    
    Returns:
        cat (int or NaN) : NaN for a NaN score, 4 for a score of exactly 5
            (so the top score falls in the highest category), otherwise the
            score rounded down to an integer.
    
    """
    if np.isnan(score):
        return np.nan
    if score == 5:
        return 4
    return int(np.floor(score))
    

def category_to_label(cat):
    """ Convert a category number to its text label.
    
    Args:
        cat (int or float) : Category, may be NaN.
    
    Returns:
        label (str) : "NoData" for NaN, the matching label for the
            categories 0 to 4 and "Error" for any other value.
    
    """
    if np.isnan(cat):
        return "NoData"
    # Position in the tuple is the category number.
    labels = ("Low", "Low - Medium", "Medium", "Medium - High", "High")
    for code, label in enumerate(labels):
        if cat == code:
            return label
    return "Error"

In [24]:
# Aggregate the weighted indicator scores per (string_id, industry, group).
df_group = calculate_group_aggregate(df_w)

In [25]:
# Remap the raw group aggregates to scores using per-group quantiles.
# The function adds the "score" column to df_group and returns the same frame.
df_group = calculate_group_remapped_scores(df_group)

quantile values used for group:  qan 
 0.0    0.000000
0.2    0.544842
0.4    0.886513
0.6    1.499137
0.8    2.383602
1.0    5.000000
Name: raw, dtype: float64
quantile values used for group:  qal 
 0.0    0.000000
0.2    1.147518
0.4    2.012350
0.6    2.850746
0.8    3.661864
1.0    5.000000
Name: raw, dtype: float64
quantile values used for group:  rrr 
 0.0    0.000000
0.2    0.304000
0.4    1.450219
0.6    2.871838
0.8    3.950681
1.0    5.000000
Name: raw, dtype: float64


In [26]:
# Combine the group aggregates into one total ("tot") per (string_id, industry).
df_total = calculate_total_aggregate(df_group)

In [27]:
# Remap the raw totals to scores using the quantiles of the "tot" rows.
df_total = calculate_remapped_scores(df_total)

quantiles used for aggregate total: 0.0    0.000000
0.2    0.934748
0.4    1.574611
0.6    2.076583
0.8    2.693158
1.0    5.000000
Name: raw, dtype: float64


In [33]:
# Stack the per-group aggregates and the totals into one table of "awr" rows.
df_agg = pd.concat([df_group, df_total],axis=0)

In [36]:
# Derive the category from the remapped score of every aggregated row, then
# the text label from that category (NaN scores end up as "NoData").
df_agg["cat"] = df_agg["score"].apply(score_to_category)
df_agg["label"] = df_agg["cat"].apply(category_to_label)

In [37]:
# Combine the per-indicator rows (df_w) with the aggregated "awr" rows (df_agg)
# into the final output table. The original row indexes of both frames are
# kept, so index values are not unique in the result.
df_agg_out = pd.concat([df_w,df_agg],axis=0)

In [41]:
# Order the columns alphabetically so the output schema is stable.
df_agg_out = df_agg_out.sort_index(axis=1)

In [42]:
df_agg_out.head()

Unnamed: 0,cat,group_short,indicator,industry_short,label,raw,score,string_id,weight_fraction,weighted_score
0,4.0,qan,bws,che,Extremely High,2.318042,5.0,111011-EGY.11_1-3365,0.07619,0.380952
1,4.0,qan,bws,che,Extremely High,2.318042,5.0,111011-EGY.15_1-3365,0.07619,0.380952
2,4.0,qan,bws,che,Extremely High,2.318042,5.0,111011-EGY.15_1-None,0.07619,0.380952
3,4.0,qan,bws,che,Extremely High,2.318042,5.0,111011-None-3365,0.07619,0.380952
4,4.0,qan,bws,che,Extremely High,2.318042,5.0,111011-None-None,0.07619,0.380952


In [43]:
# Path of the CSV file inside the local output folder (ec2_output_path).
# NOTE(review): despite the "_s3" suffix this is a local path; the folder is
# copied to S3 by the `aws s3 cp` cell further down.
destination_path_s3 = "{}/{}.csv".format(ec2_output_path,SCRIPT_NAME)

In [44]:
# Write the combined table to the local output folder as CSV. With pandas'
# default settings the DataFrame index is written as the first column.
df_agg_out.to_csv(destination_path_s3)

In [46]:
# Copy everything in the local output folder to the S3 output location.
!aws s3 cp {ec2_output_path} {s3_output_path} --recursive

upload: ../../../../data/Y2018M12D11_RH_Master_Weights_GPD_V02/output_V01/Y2018M12D11_RH_Master_Weights_GPD_V02.csv to s3://wri-projects/Aqueduct30/processData/Y2018M12D11_RH_Master_Weights_GPD_V02/output_V01/Y2018M12D11_RH_Master_Weights_GPD_V02.csv


In [47]:
# BigQuery destination in "dataset.table" form; the project is passed
# separately to to_gbq.
destination_table = "{}.{}".format(BQ_OUTPUT_DATASET_NAME,BQ_OUTPUT_TABLE_NAME)

In [49]:
# Upload the combined table to BigQuery in chunks of 100,000 rows, replacing
# any existing table with the same name.
# This can be sped up by using csv files, storing to GCS and ingesting from there.
df_agg_out.to_gbq(destination_table=destination_table,
                         project_id=BQ_PROJECT_ID,
                         chunksize=100000,
                         if_exists="replace")

102it [18:40, 10.98s/it]


In [50]:
# Report the total wall-clock time since `start` was recorded near the top of
# the notebook.
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

0:28:11.269342


Previous runs:   
0:28:11.269342