<a href="https://colab.research.google.com/github/tnc-br/ddf-isoscapes/blob/npr-working/Data_Collection_for_Paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This Colab notebook loads all the data from the DDF harness BigQuery table and stores it in a pandas DataFrame.

First, upgrade the BigQuery client library (you will need to restart the runtime after this step).

In [1]:
# Upgrade the BigQuery client library in the kernel's environment.
# Restart the runtime afterwards so the upgraded package is the one that gets imported.
%pip install --upgrade google-cloud-bigquery

Collecting google-cloud-bigquery
  Downloading google_cloud_bigquery-3.27.0-py2.py3-none-any.whl.metadata (8.6 kB)
Downloading google_cloud_bigquery-3.27.0-py2.py3-none-any.whl (240 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.1/240.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-cloud-bigquery
  Attempting uninstall: google-cloud-bigquery
    Found existing installation: google-cloud-bigquery 3.25.0
    Uninstalling google-cloud-bigquery-3.25.0:
      Successfully uninstalled google-cloud-bigquery-3.25.0
Successfully installed google-cloud-bigquery-3.27.0


### Dependencies

In [1]:
# This stub (ddfimport) allows the Ddf EE API to be imported.
import sys
# Clone the `test` branch of the stub repository, but only when the checkout
# directory does not exist yet, so re-running this cell does not clone again.
!if [ ! -d "/content/ddf_common_stub" ] ; then git clone -b test https://github.com/tnc-br/ddf_common_stub.git; fi
# Make the cloned stub importable from this kernel.
sys.path.append("/content/ddf_common_stub/")
import ddfimport
# Per the recorded output, this checks out ddf_common read-only, after which the
# ddf_common modules (eeddf, bqddf, ...) can be imported.
ddfimport.ddf_import_common()

Cloning into 'ddf_common_stub'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 18 (delta 7), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (18/18), 7.36 KiB | 7.36 MiB/s, done.
Resolving deltas: 100% (7/7), done.
executing checkout_branch ...
b''
main branch checked out as readonly. You may now use ddf_common imports


In [2]:
import importlib
import eeddf
import bqddf
# Re-execute both modules in case they were already imported in this kernel
# session — presumably so a fresh checkout of ddf_common is picked up without
# restarting the runtime.
importlib.reload(eeddf)
importlib.reload(bqddf)

# Use your credentials to access the BQ tables.
# NOTE(review): test_environment=True presumably selects the test project/dataset
# rather than production — confirm against eeddf before using these numbers.
eeddf.initialize_ddf(test_environment = True)

### Load the data

In [119]:
def get_all_training_results():
  """Run `SELECT *` over the flattened results table and return the query result.

  The table is addressed as `<DATASET>.<FLATTENED_TABLE>`, with both parts read
  from `bqddf._CONFIG`. The query runs on the client returned by
  `bqddf._get_big_query_client()`.

  Returns:
    Whatever `client.query_and_wait` returns for the query; the caller in this
    notebook converts it with `.to_dataframe()`.
  """
  bq_client = bqddf._get_big_query_client()

  # Build the fully qualified table reference from the shared config.
  config = bqddf._CONFIG
  sql = f"SELECT * FROM {config['DATASET']}.{config['FLATTENED_TABLE']}"

  # Blocks until the query has finished.
  return bq_client.query_and_wait(sql)

# Run the query; this fetches every row of the flattened results table.
results = get_all_training_results()

# `results` is iterator based, so converting it a second time raises an error.
# Re-run the query above to get a fresh `results` before calling to_dataframe() again.
df = results.to_dataframe()

import json
from typing import Set, Dict, Any
import numpy as np
import pandas as pd

def parse_tags(tags: np.ndarray) -> Set[str]:
  """Convert one experiment's tags into a set.

  Args:
    tags: The tags of a single row. Normally a numpy array, which is converted
      via `tolist()` so the elements become plain Python objects. Sets,
      frozensets, lists and tuples are also accepted, which makes the function
      safe to apply to a column that has already been parsed. `None` is
      treated as "no tags".

  Returns:
    A new set containing the distinct tags.
  """
  if tags is None:
    return set()
  # Already a plain Python collection (e.g. this function was applied before):
  # copy it into a fresh set instead of failing on the missing `tolist`.
  if isinstance(tags, (set, frozenset, list, tuple)):
    return set(tags)
  return set(tags.tolist())
# Replace each row's tags with a set, so find_by_tags can test tag membership per row.
df['tags'] = df['tags'].apply(parse_tags)

def find_by_tags(df: pd.DataFrame, tags: Set[str]) -> Dict[str, Any]:
  """Return the single experiment whose tags include every tag in `tags`.

  Args:
    df: Experiments table with a 'tags' column holding a collection of tags
      per row.
    tags: Tags the experiment must all carry; it may carry additional tags.

  Returns:
    The matching row as a `{column name: value}` dict.

  Raises:
    ValueError: If no row, or more than one row, carries all of `tags`. A
      lookup that is not unique would otherwise yield a nested
      column -> {index -> value} dict that breaks callers expecting one row.
  """
  required = set(tags)
  # astype(bool) keeps the mask a valid boolean indexer even when `df` is empty.
  is_match = df['tags'].apply(required.issubset).astype(bool)
  matches = df[is_match]
  if len(matches) != 1:
    raise ValueError(
        f"Expected exactly one experiment with tags {sorted(required, key=str)}, "
        f"found {len(matches)}.")
  # Select the row explicitly rather than relying on squeeze(), which only
  # yields a row when the frame happens to have one row and several columns.
  return matches.iloc[0].to_dict()

def print_experiment(experiment: Dict[str, Any]) -> None:
  """Print an experiment's two RMSE metrics, one per line, to three decimals.

  Args:
    experiment: Mapping that provides numeric 'mean_rmse' and 'var_rmse' entries.
  """
  # (line prefix, key in `experiment`), in output order.
  metrics = (
      ("RMSE (Means): ", 'mean_rmse'),
      ("RMSE (Variances): ", 'var_rmse'),
  )
  for prefix, key in metrics:
    print(f"{prefix}{experiment[key]:.3f}")



In [113]:
# RMSEs of the experiment tagged ordinary_kriging + linear_variogram (no revision tag is required here).
print_experiment(find_by_tags(df, {'author:npr', 'ordinary_kriging', 'all_standardized', 'linear_variogram'}))

RMSE (Means): 1.273
RMSE (Variances): 1.148


In [114]:
# RMSEs of the rev2 experiment tagged ordinary_kriging + gaussian_variogram.
print_experiment(find_by_tags(df, {'author:npr', 'ordinary_kriging', 'all_standardized', 'gaussian_variogram', 'rev2'}))

RMSE (Means): 1.269
RMSE (Variances): 1.190


In [115]:
# RMSEs of the rev2 experiment tagged universal_kriging + linear_variogram.
print_experiment(find_by_tags(df, {'author:npr', 'universal_kriging', 'all_standardized', 'linear_variogram', 'rev2'}))

RMSE (Means): 1.183
RMSE (Variances): 1.157


In [116]:
# RMSEs of the rev2 experiment tagged universal_kriging + gaussian_variogram.
print_experiment(find_by_tags(df, {'author:npr', 'universal_kriging', 'all_standardized', 'gaussian_variogram', 'rev2'}))

RMSE (Means): 1.246
RMSE (Variances): 1.247


In [117]:
# RMSEs of the rev1 regression_kriging experiment tagged krige_type:universal, variogram:gaussian,
# regression_strategy:gradientboosting, n_estimators:100, max_depth:20.
print_experiment(find_by_tags(df, {'author:npr', "regression_kriging", "all_standardized", "rev1", "krige_type:universal", "variogram:gaussian", "n_estimators:100", "max_depth:20", "regression_strategy:gradientboosting"}))

RMSE (Means): 0.639
RMSE (Variances): 1.253


In [120]:
# Same regression_kriging configuration as the gaussian run, but tagged variogram:linear.
print_experiment(find_by_tags(df, {'author:npr', "regression_kriging", "all_standardized", "rev1", "krige_type:universal", "variogram:linear", "n_estimators:100", "max_depth:20", "regression_strategy:gradientboosting"}))

RMSE (Means): 0.659
RMSE (Variances): 1.253
