Commit

Merge 2c4fff7 into b73022e
khaeru committed Nov 13, 2020
2 parents b73022e + 2c4fff7; commit 32349be
Showing 6 changed files with 91 additions and 40 deletions.
10 changes: 10 additions & 0 deletions doc/conf.py
@@ -38,6 +38,7 @@
"sphinx.ext.autosummary",
"sphinx.ext.coverage",
"sphinx.ext.doctest",
"sphinx.ext.extlinks",
"sphinx.ext.intersphinx",
# 'sphinx.ext.linkcode',
"sphinx.ext.napoleon",
@@ -178,6 +179,15 @@

# -- Extension configuration -------------------------------------------------

# -- Options for sphinx.ext.extlinks ---------------------------------------------------

extlinks = {
    "issue": ("https://github.com/transportenergy/database/issues/%s", "#"),
    "pull": ("https://github.com/transportenergy/database/pull/%s", "PR #"),
    "gh-user": ("https://github.com/%s", "@"),
}
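
With this configuration, a role such as :issue:`32` expands by %-substituting the target into the first tuple element and prefixing the second as the link caption (the prefix-caption form used here predates Sphinx 4's %s-caption form). A rough sketch of the expansion, using only the tuples defined above:

# Conceptual sketch only; the real substitution is performed by
# sphinx.ext.extlinks at build time.
url_pattern, caption_prefix = extlinks["issue"]
link_target = url_pattern % "32"    # https://github.com/transportenergy/database/issues/32
link_text = caption_prefix + "32"   # rendered as "#32"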


# -- Options for intersphinx extension ---------------------------------------

# Example configuration for intersphinx: refer to the Python standard library.
5 changes: 5 additions & 0 deletions doc/whatsnew.rst
@@ -8,6 +8,11 @@ What's new?
   :backlinks: none
   :depth: 1

Next release
============

- Correct an error in the input data for :mod:`.T001` (:issue:`32`, :pull:`40`).


v2020.11.13
===========
7 changes: 6 additions & 1 deletion item/historical/__init__.py
@@ -1,5 +1,6 @@
from copy import copy
from functools import lru_cache
import logging
import os

import pandas as pd
@@ -11,6 +12,8 @@
from .scripts import T000, T001
from .scripts.util.managers.dataframe import ColumnName

log = logging.getLogger(__name__)


#: List of data processing Jupyter/IPython notebooks.
SCRIPTS = [
@@ -185,7 +188,9 @@ def process(id):
print("No pre-processing checks to perform")
except AssertionError as e:
# An 'assert' statement in check() failed
print(f"Input data is invalid: {e}")
msg = "Input data is invalid"
log.error(f"{msg}: {e}")
raise RuntimeError(msg)

# Information about columns. If not defined, use defaults.
columns = dict(country_name="Country")
30 changes: 8 additions & 22 deletions item/historical/scripts/T000.py
@@ -1,9 +1,13 @@
"""Data cleaning code and configuration for T000."""
-from item.utils import convert_units
-from .util.managers.dataframe import ColumnName
+from functools import lru_cache
+
+import pandas as pd
+
+from item.utils import convert_units
+from item.historical.util import dropna_logged
+from .util.managers.dataframe import ColumnName


#: Dimensions and attributes which do not vary across this data set.
COMMON_DIMS = dict(
variable="Passenger Activity",
@@ -47,26 +51,8 @@ def check(df):

def process(df):
    """Process data set T000."""
-    # TODO The code below for identifying missing values is repeated in other
-    # cleaning scripts. We should consider moving this code into the
-    # 'item.historical import process' so that it applies to all scripts.
-
-    # Getting a generic idea of what countries are missing values and dropping
-    # NaN values
-    #
-    # Rule: Erase all value with NaN
-
-    list_of_countries_with_missing_values = list(
-        set(df[df["Value"].isnull()]["Country"])
-    )
-    print(
-        ">> Number of countries missing values: {}".format(
-            len(list_of_countries_with_missing_values)
-        )
-    )
-    print(">> Countries missing values:")
-    print(list_of_countries_with_missing_values)
-    print(">> Number of rows to erase: {}".format(len(df[df["Value"].isnull()])))
+    # Drop rows with nulls in "Value"; log corresponding values in "Country"
+    df = dropna_logged(df, "Value", ["Country"])

    # Assigning mode and vehicle type based on the variable name
    df = pd.concat([df, df["Variable"].apply(mode_and_vehicle_type)], axis=1)
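
The concat/apply idiom above works because Series.apply() on a function that returns a Series assembles a DataFrame whose columns are the returned Series' index; concat(axis=1) then joins those new columns to df. A minimal sketch — the real mode_and_vehicle_type() is defined elsewhere in T000.py, so this stand-in is hypothetical:

import pandas as pd

def mode_and_vehicle_type(variable_name):
    # Hypothetical stand-in: split a variable name into two new dimensions.
    mode, _, vehicle = variable_name.partition(" - ")
    return pd.Series({"Mode": mode, "Vehicle Type": vehicle or "All"})

df = pd.DataFrame({"Variable": ["Passenger Activity - Bus"], "Value": [1.0]})
df = pd.concat([df, df["Variable"].apply(mode_and_vehicle_type)], axis=1)
# df now has its original columns plus "Mode" and "Vehicle Type"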
58 changes: 41 additions & 17 deletions item/historical/scripts/T001.py
@@ -1,5 +1,19 @@
"""Data cleaning code and configuration for T001."""
"""Data cleaning code and configuration for T001.
This module:
- Detects and corrects :issue:`32`, a data error in the upstream source where China
observation values for years 1990 to 2001 inclusive are too low by 2 orders of
magnitude.
"""
import logging

from item.utils import convert_units
from item.historical.util import dropna_logged

log = logging.getLogger(__name__)


#: Dimensions and attributes which do not vary across this data set.
COMMON_DIMS = dict(
@@ -54,30 +68,40 @@ def check(df):
assert df["PowerCode"].unique() == ["Millions"]
assert df["Unit"].unique() == ["Tonnes-kilometres"]

# Detect #32
# Data for CHN, including one year before and after the error
obs = df.query("COUNTRY == 'CHN' and Year >= 1985 and Year <= 2002").set_index(
"Year"
)["Value"]
# Delete the erroneous data
empty = obs.copy()
empty.iloc[1:-1] = None

# Expected values: interpolated between the two correct values
expected = empty.interpolate("index")

# Ratio of interpolated and observed values is about 100 for the years containing
# the error.
# TODO if the data is corrected in the original, this assertion will fail;
# then remove this code and the corresponding correction in process(), below.
assert ((expected / obs).iloc[1:-1] >= 95).all()
log.info("Confirmed 10² magnitude error in China 1990–2001")


def process(df):
    """Process data set T001."""
-    # Getting a generic idea of what countries are missing values and dropping
-    # NaN values
-    #
-    # Rule: Erase all value with NaN
-
-    list_of_countries_with_missing_values = list(
-        set(df[df["Value"].isnull()]["Country"])
-    )
-    print(
-        ">> Number of countries missing values: {}".format(
-            len(list_of_countries_with_missing_values)
-        )
-    )
-    print(">> Countries missing values:")
-    print(list_of_countries_with_missing_values)
-    print(">> Number of rows to erase: {}".format(len(df[df["Value"].isnull()])))
+    # Drop rows with nulls in "Value"; log corresponding values in "Country"
+    df = dropna_logged(df, "Value", ["Country"])

    # 1. Drop null values.
    # 2. Convert to the preferred iTEM units.
    # TODO read the preferred units (here 'Gt km / year') from a common
    # location
    df = df.dropna().pipe(convert_units, "Mt km / year", "Gt km / year")

+    # Correct #32
+    corrected = df.query("Country == 'China' and Year > 1985 and Year < 2002")
+    corrected["Value"] *= 100.0
+    df.update(corrected)

    return df
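
DataFrame.update() aligns on the index, so scaling a filtered selection and updating writes back only those rows. A sketch with toy values — .copy() is added here to avoid pandas' SettingWithCopyWarning on the queried subset, a detail the committed code omits:

import pandas as pd

df = pd.DataFrame(
    {"Country": ["China", "China", "USA"],
     "Year": [1990, 2002, 1990],
     "Value": [0.12, 20.0, 5.0]}
)
corrected = df.query("Country == 'China' and Year > 1985 and Year < 2002").copy()
corrected["Value"] *= 100.0
df.update(corrected)  # only row 0 (China, 1990) changes: 0.12 -> 12.0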
21 changes: 21 additions & 0 deletions item/historical/util.py
@@ -1,11 +1,32 @@
import io
import logging
import os
import subprocess
import sys
from pathlib import Path

import nbformat

log = logging.getLogger(__name__)


def dropna_logged(df, column, log_columns=[]):
    """Drop rows from `df` with NaN values in `column`.

    Counts and unique values for each of `log_columns` are logged.
    """
    # Rows to drop
    to_drop = df[column].isnull()

    log.info(f"{to_drop.sum()} rows with NaN in {repr(column)}")

    for col in log_columns:
        # Sorted unique values in column `col`
        values = sorted(df[to_drop][col].unique())
        log.info(f"… with {len(values)} unique values in {repr(col)}: {values}")

    return df[~to_drop]
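
A usage sketch for dropna_logged() with toy data, showing the log records it emits:

import logging
import pandas as pd
from item.historical.util import dropna_logged

logging.basicConfig(level=logging.INFO)

df = pd.DataFrame({"Country": ["A", "B", "B"], "Value": [1.0, None, None]})
df = dropna_logged(df, "Value", ["Country"])
# INFO: 2 rows with NaN in 'Value'
# INFO: … with 1 unique values in 'Country': ['B']
# df retains only the row for country "A".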


def run_notebook(nb_path, tmp_path, env=os.environ, kernel=None):
"""Execute a Jupyter notebook via nbconvert and collect output.
