Commit

Merge 2c4fff7 into b73022e
khaeru committed Nov 13, 2020
2 parents b73022e + 2c4fff7; commit 32349be
Showing 6 changed files with 91 additions and 40 deletions.
10 changes: 10 additions & 0 deletions doc/conf.py
@@ -38,6 +38,7 @@
"sphinx.ext.autosummary",
"sphinx.ext.coverage",
"sphinx.ext.doctest",
"sphinx.ext.extlinks",
"sphinx.ext.intersphinx",
# 'sphinx.ext.linkcode',
"sphinx.ext.napoleon",
@@ -178,6 +179,15 @@

# -- Extension configuration -------------------------------------------------

# -- Options for sphinx.ext.extlinks ---------------------------------------------------

extlinks = {
    "issue": ("https://github.com/transportenergy/database/issues/%s", "#"),
    "pull": ("https://github.com/transportenergy/database/pull/%s", "PR #"),
    "gh-user": ("https://github.com/%s", "@"),
}
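
With this configuration, a role such as :issue:`32` expands by %-substituting the target into the first tuple element and prefixing the second as the link caption (the prefix-caption form used here predates Sphinx 4's %s-caption form). A rough sketch of the expansion, using only the tuples defined above:

# Conceptual sketch only; the real substitution is performed by
# sphinx.ext.extlinks at build time.
url_pattern, caption_prefix = extlinks["issue"]
link_target = url_pattern % "32"    # https://github.com/transportenergy/database/issues/32
link_text = caption_prefix + "32"   # rendered as "#32"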


# -- Options for intersphinx extension ---------------------------------------

# Example configuration for intersphinx: refer to the Python standard library.
5 changes: 5 additions & 0 deletions doc/whatsnew.rst
@@ -8,6 +8,11 @@ What's new?
   :backlinks: none
   :depth: 1

Next release
============

- Correct an error in the input data for :mod:`.T001` (:issue:`32`, :pull:`40`).


v2020.11.13
===========
7 changes: 6 additions & 1 deletion item/historical/__init__.py
@@ -1,5 +1,6 @@
from copy import copy
from functools import lru_cache
import logging
import os

import pandas as pd
@@ -11,6 +12,8 @@
from .scripts import T000, T001
from .scripts.util.managers.dataframe import ColumnName

log = logging.getLogger(__name__)


#: List of data processing Jupyter/IPython notebooks.
SCRIPTS = [
@@ -185,7 +188,9 @@ def process(id):
print("No pre-processing checks to perform")
except AssertionError as e:
# An 'assert' statement in check() failed
print(f"Input data is invalid: {e}")
msg = "Input data is invalid"
log.error(f"{msg}: {e}")
raise RuntimeError(msg)

# Information about columns. If not defined, use defaults.
columns = dict(country_name="Country")
30 changes: 8 additions & 22 deletions item/historical/scripts/T000.py
@@ -1,9 +1,13 @@
"""Data cleaning code and configuration for T000."""
-from item.utils import convert_units
-from .util.managers.dataframe import ColumnName
+from functools import lru_cache
+
+import pandas as pd
+
+from item.utils import convert_units
+from item.historical.util import dropna_logged
+from .util.managers.dataframe import ColumnName


#: Dimensions and attributes which do not vary across this data set.
COMMON_DIMS = dict(
variable="Passenger Activity",
@@ -47,26 +51,8 @@ def check(df):

def process(df):
    """Process data set T000."""
-    # TODO The code below for identifying missing values is repeated in other
-    # cleaning scripts. We should consider moving this code into the
-    # 'item.historical import process' so that it applies to all scripts.
-
-    # Getting a generic idea of what countries are missing values and dropping
-    # NaN values
-    #
-    # Rule: Erase all value with NaN
-
-    list_of_countries_with_missing_values = list(
-        set(df[df["Value"].isnull()]["Country"])
-    )
-    print(
-        ">> Number of countries missing values: {}".format(
-            len(list_of_countries_with_missing_values)
-        )
-    )
-    print(">> Countries missing values:")
-    print(list_of_countries_with_missing_values)
-    print(">> Number of rows to erase: {}".format(len(df[df["Value"].isnull()])))
+    # Drop rows with nulls in "Value"; log corresponding values in "Country"
+    df = dropna_logged(df, "Value", ["Country"])

    # Assigning mode and vehicle type based on the variable name
    df = pd.concat([df, df["Variable"].apply(mode_and_vehicle_type)], axis=1)
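
The concat/apply idiom above works because Series.apply() on a function that returns a Series assembles a DataFrame whose columns are the returned Series' index; concat(axis=1) then joins those new columns to df. A minimal sketch — the real mode_and_vehicle_type() is defined elsewhere in T000.py, so this stand-in is hypothetical:

import pandas as pd

def mode_and_vehicle_type(variable_name):
    # Hypothetical stand-in: split a variable name into two new dimensions.
    mode, _, vehicle = variable_name.partition(" - ")
    return pd.Series({"Mode": mode, "Vehicle Type": vehicle or "All"})

df = pd.DataFrame({"Variable": ["Passenger Activity - Bus"], "Value": [1.0]})
df = pd.concat([df, df["Variable"].apply(mode_and_vehicle_type)], axis=1)
# df now has its original columns plus "Mode" and "Vehicle Type"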
58 changes: 41 additions & 17 deletions item/historical/scripts/T001.py
@@ -1,5 +1,19 @@
"""Data cleaning code and configuration for T001."""
"""Data cleaning code and configuration for T001.
This module:
- Detects and corrects :issue:`32`, a data error in the upstream source where China
observation values for years 1990 to 2001 inclusive are too low by 2 orders of
magnitude.
"""
import logging

from item.utils import convert_units
from item.historical.util import dropna_logged

log = logging.getLogger(__name__)


#: Dimensions and attributes which do not vary across this data set.
COMMON_DIMS = dict(
@@ -54,30 +68,40 @@ def check(df):
assert df["PowerCode"].unique() == ["Millions"]
assert df["Unit"].unique() == ["Tonnes-kilometres"]

# Detect #32
# Data for CHN, including one year before and after the error
obs = df.query("COUNTRY == 'CHN' and Year >= 1985 and Year <= 2002").set_index(
"Year"
)["Value"]
# Delete the erroneous data
empty = obs.copy()
empty.iloc[1:-1] = None

# Expected values: interpolated between the two correct values
expected = empty.interpolate("index")

# Ratio of interpolated and observed values is about 100 for the years containing
# the error.
# TODO if the data is corrected in the original, this assertion will fail;
# then remove this code and the corresponding correction in process(), below.
assert ((expected / obs).iloc[1:-1] >= 95).all()
log.info("Confirmed 10² magnitude error in China 1990–2001")


def process(df):
    """Process data set T001."""
-    # Getting a generic idea of what countries are missing values and dropping
-    # NaN values
-    #
-    # Rule: Erase all value with NaN
-
-    list_of_countries_with_missing_values = list(
-        set(df[df["Value"].isnull()]["Country"])
-    )
-    print(
-        ">> Number of countries missing values: {}".format(
-            len(list_of_countries_with_missing_values)
-        )
-    )
-    print(">> Countries missing values:")
-    print(list_of_countries_with_missing_values)
-    print(">> Number of rows to erase: {}".format(len(df[df["Value"].isnull()])))
+    # Drop rows with nulls in "Value"; log corresponding values in "Country"
+    df = dropna_logged(df, "Value", ["Country"])

    # 1. Drop null values.
    # 2. Convert to the preferred iTEM units.
    # TODO read the preferred units (here 'Gt km / year') from a common
    # location
    df = df.dropna().pipe(convert_units, "Mt km / year", "Gt km / year")

+    # Correct #32
+    corrected = df.query("Country == 'China' and Year > 1985 and Year < 2002")
+    corrected["Value"] *= 100.0
+    df.update(corrected)

    return df
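
DataFrame.update() aligns on the index, so scaling a filtered selection and updating writes back only those rows. A sketch with toy values — .copy() is added here to avoid pandas' SettingWithCopyWarning on the queried subset, a detail the committed code omits:

import pandas as pd

df = pd.DataFrame(
    {"Country": ["China", "China", "USA"],
     "Year": [1990, 2002, 1990],
     "Value": [0.12, 20.0, 5.0]}
)
corrected = df.query("Country == 'China' and Year > 1985 and Year < 2002").copy()
corrected["Value"] *= 100.0
df.update(corrected)  # only row 0 (China, 1990) changes: 0.12 -> 12.0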
21 changes: 21 additions & 0 deletions item/historical/util.py
@@ -1,11 +1,32 @@
import io
import logging
import os
import subprocess
import sys
from pathlib import Path

import nbformat

log = logging.getLogger(__name__)


def dropna_logged(df, column, log_columns=[]):
    """Drop rows from `df` with NaN values in `column`.

    Counts and unique values for each of `log_columns` are logged.
    """
    # Rows to drop
    to_drop = df[column].isnull()

    log.info(f"{to_drop.sum()} rows with NaN in {repr(column)}")

    for col in log_columns:
        # Sorted unique values in column `col`
        values = sorted(df[to_drop][col].unique())
        log.info(f"… with {len(values)} unique values in {repr(col)}: {values}")

    return df[~to_drop]
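
A usage sketch for dropna_logged() with toy data, showing the log records it emits:

import logging
import pandas as pd
from item.historical.util import dropna_logged

logging.basicConfig(level=logging.INFO)

df = pd.DataFrame({"Country": ["A", "B", "B"], "Value": [1.0, None, None]})
df = dropna_logged(df, "Value", ["Country"])
# INFO: 2 rows with NaN in 'Value'
# INFO: … with 1 unique values in 'Country': ['B']
# df retains only the row for country "A".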


def run_notebook(nb_path, tmp_path, env=os.environ, kernel=None):
"""Execute a Jupyter notebook via nbconvert and collect output.
