diff --git a/.github/workflows/xskillscore_testing.yml b/.github/workflows/xskillscore_testing.yml index d675843b..07a37860 100644 --- a/.github/workflows/xskillscore_testing.yml +++ b/.github/workflows/xskillscore_testing.yml @@ -51,7 +51,7 @@ jobs: run: | pytest -n 4 --cov=xskillscore --cov-report=xml --verbose - name: Upload coverage to codecov - uses: codecov/codecov-action@v1.4.1 + uses: codecov/codecov-action@v1.5.2 with: token: ${{ secrets.CODECOV_TOKEN }} file: ./coverage.xml diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 13c71757..58eec925 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,6 +12,7 @@ Internal Changes Documentation ~~~~~~~~~~~~~ - Added more info in ``quick-start.ipynb`` (:pr:`316`) `Ray Bell`_. +- Created ``tabular-data.ipynb`` (:pr:`330`) `Ray Bell`_. Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/ci/docs_notebooks.yml b/ci/docs_notebooks.yml index 6625a3b8..9a5ec040 100644 --- a/ci/docs_notebooks.yml +++ b/ci/docs_notebooks.yml @@ -16,13 +16,15 @@ dependencies: # see https://github.com/xgcm/xhistogram/issues/48 - xhistogram==0.1.2 - importlib_metadata + - ipykernel - jupyterlab - matplotlib-base - nbsphinx - nbstripout - sphinx - - sphinxcontrib-napoleon + - sphinx-autosummary-accessors - sphinx_rtd_theme + - sphinxcontrib-napoleon - black - doc8 - isort @@ -30,6 +32,5 @@ dependencies: - pre-commit - pip - pip: - - sphinx_autosummary_accessors # Install latest version of xskillscore. - -e .. 
diff --git a/docs/source/geophysical-data.ipynb b/docs/source/geophysical-data.ipynb new file mode 100644 index 00000000..09d93658 --- /dev/null +++ b/docs/source/geophysical-data.ipynb @@ -0,0 +1,39 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "source": [ + "# Geophysical Data\n", + "\n", + "See the [Quick Start](https://xskillscore.readthedocs.io/en/stable/quick-start.html) section.\n", + "\n", + "[climpred](https://climpred.readthedocs.io/en/stable/), which uses `xskillscore`, has examples of evaluating climate predictions." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ] +} diff --git a/docs/source/index.rst b/docs/source/index.rst index 03244e99..7c4cb6b0 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -51,6 +51,8 @@ You can also install the bleeding edge (pre-release versions) by running: :caption: Getting Started quick-start.ipynb + geophysical-data.ipynb + tabular-data.ipynb **Help & Reference** diff --git a/docs/source/tabular-data.ipynb b/docs/source/tabular-data.ipynb new file mode 100644 index 00000000..e4babdca --- /dev/null +++ b/docs/source/tabular-data.ipynb @@ -0,0 +1,507 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.4" + }, + "orig_nbformat": 4, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.9.4 64-bit ('xskillscore-docs-notebooks': conda)" + }, + "interpreter": { + 
"hash": "e5607b67897ceeb4cb8d1a6f5e8f77cf995244d75ab9ff3b133e23bb37c07f75" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "source": [ + "# Tabular Data\n", + "\n", + "`xskillscore` can be used on tabular data such as that stored in a `pandas.DataFrame`.\n", + "\n", + "It can be used most effectively when evaluating predictions over different fields." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import xskillscore as xs\n", + "from sklearn.datasets import load_boston\n", + "from sklearn.metrics import mean_squared_error\n", + "np.random.seed(seed=42)" + ] + }, + { + "source": [ + "## Boston house prices dataset" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "A small example is to take a dataset and evaluate the model according to a field (column).\n", + "\n", + "Load the Boston house prices dataset:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", + "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n", + "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n", + "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n", + "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n", + "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n", + ".. ... ... ... ... ... ... ... ... ... ... 
\n", + "501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0 \n", + "502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 \n", + "503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 \n", + "504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0 \n", + "505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0 \n", + "\n", + " PTRATIO B LSTAT y \n", + "0 15.3 396.90 4.98 24.0 \n", + "1 17.8 396.90 9.14 21.6 \n", + "2 17.8 392.83 4.03 34.7 \n", + "3 18.7 394.63 2.94 33.4 \n", + "4 18.7 396.90 5.33 36.2 \n", + ".. ... ... ... ... \n", + "501 21.0 391.99 9.67 22.4 \n", + "502 21.0 396.90 9.08 20.6 \n", + "503 21.0 396.90 5.64 23.9 \n", + "504 21.0 393.45 6.48 22.0 \n", + "505 21.0 396.90 7.88 11.9 \n", + "\n", + "[506 rows x 14 columns]" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATy
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.9824.0
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.1421.6
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.0334.7
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.9433.4
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.3336.2
.............................................
5010.062630.011.930.00.5736.59369.12.47861.0273.021.0391.999.6722.4
5020.045270.011.930.00.5736.12076.72.28751.0273.021.0396.909.0820.6
5030.060760.011.930.00.5736.97691.02.16751.0273.021.0396.905.6423.9
5040.109590.011.930.00.5736.79489.32.38891.0273.021.0393.456.4822.0
5050.047410.011.930.00.5736.03080.82.50501.0273.021.0396.907.8811.9
\n

506 rows × 14 columns

\n
" + }, + "metadata": {}, + "execution_count": 2 + } + ], + "source": [ + "data = load_boston()\n", + "df = pd.DataFrame(data.data, columns=data.feature_names)\n", + "df['y'] = pd.Series(data.target)\n", + "df" + ] + }, + { + "source": [ + "Create a dummy prediction column by adding noise to `y`:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "noise = np.random.uniform(-1, 1, size=len(df['y']))\n", + "df['yhat'] = (df['y'] + (df['y'] * noise)).clip(lower=df[\"y\"].min())" + ] + }, + { + "source": [ + "Evaluate the model over the field `RAD` using `pandas.groupby.apply` with `mean_squared_error` from `scikit-learn`:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "RAD\n", + "1.0 161.237554\n", + "2.0 313.855750\n", + "3.0 307.220760\n", + "4.0 162.634430\n", + "5.0 221.852969\n", + "6.0 155.612978\n", + "7.0 214.375240\n", + "8.0 278.092560\n", + "24.0 148.840507\n", + "dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "df.groupby('RAD').apply(lambda x: mean_squared_error(x[\"y\"], x[\"yhat\"]))" + ] + }, + { + "source": [ + "You could also do the following using `xskillscore`.\n", + "\n", + "First, structure the `pandas.DataFrame` to keep the core fields when converting to an `xarray` object:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " y yhat\n", + "index RAD \n", + "0 1.0 24.0 17.977926\n", + "1 2.0 21.6 41.070858\n", + "2 2.0 34.7 50.800380\n", + "3 3.0 33.4 39.990387\n", + "4 3.0 36.2 11.295750\n", + "... ... 
...\n", + "501 1.0 22.4 24.017117\n", + "502 1.0 20.6 12.752538\n", + "503 1.0 23.9 38.899402\n", + "504 1.0 22.0 30.128172\n", + "505 1.0 11.9 5.000000\n", + "\n", + "[506 rows x 2 columns]" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
yyhat
indexRAD
01.024.017.977926
12.021.641.070858
22.034.750.800380
33.033.439.990387
43.036.211.295750
............
5011.022.424.017117
5021.020.612.752538
5031.023.938.899402
5041.022.030.128172
5051.011.95.000000
\n

506 rows × 2 columns

\n
" + }, + "metadata": {}, + "execution_count": 5 + } + ], + "source": [ + "min_df = df.reset_index().set_index([\"index\", \"RAD\"])[[\"y\", \"yhat\"]]\n", + "min_df" + ] + }, + { + "source": [ + "Convert it to an `xarray.Dataset` using `pandas.DataFrame.to_xarray`. Note: This will create an array of `index` by `RAD` and pad the values that do not exist with `nan`." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "\n", + "Dimensions: (RAD: 9, index: 506)\n", + "Coordinates:\n", + " * index (index) int64 0 1 2 3 4 5 6 7 8 ... 498 499 500 501 502 503 504 505\n", + " * RAD (RAD) float64 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 24.0\n", + "Data variables:\n", + " y (index, RAD) float64 24.0 nan nan nan nan ... nan nan nan nan nan\n", + " yhat (index, RAD) float64 17.98 nan nan nan nan ... nan nan nan nan nan" + ], + "text/html": "
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
<xarray.Dataset>\nDimensions:  (RAD: 9, index: 506)\nCoordinates:\n  * index    (index) int64 0 1 2 3 4 5 6 7 8 ... 498 499 500 501 502 503 504 505\n  * RAD      (RAD) float64 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 24.0\nData variables:\n    y        (index, RAD) float64 24.0 nan nan nan nan ... nan nan nan nan nan\n    yhat     (index, RAD) float64 17.98 nan nan nan nan ... nan nan nan nan nan
" + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "ds = min_df.to_xarray()\n", + "ds" + ] + }, + { + "source": [ + "You can now apply any metric from `xskillscore` using the accessor method. The input for the `dim` argument is `index` as we want to reduce this dimension and apply the metric over `RAD`. In addition, there are `nan`'s in the `xarray.Dataset` so you should use `skipna=True`:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "\n", + "array([161.23755363, 313.85575025, 307.22076012, 162.63442999,\n", + " 221.85296903, 155.6129776 , 214.37524005, 278.09256049,\n", + " 148.84050691])\n", + "Coordinates:\n", + " * RAD (RAD) float64 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 24.0" + ], + "text/html": "
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
<xarray.DataArray (RAD: 9)>\narray([161.23755363, 313.85575025, 307.22076012, 162.63442999,\n       221.85296903, 155.6129776 , 214.37524005, 278.09256049,\n       148.84050691])\nCoordinates:\n  * RAD      (RAD) float64 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 24.0
" + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "out = ds.xs.mse('y', 'yhat', dim=\"index\", skipna=True)\n", + "out" + ] + }, + { + "source": [ + "It makes sense to return the data in tabular form hence you can call `xarray.DataArray.to_series` to convert it to a `pandas.Series`:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "RAD\n", + "1.0 161.237554\n", + "2.0 313.855750\n", + "3.0 307.220760\n", + "4.0 162.634430\n", + "5.0 221.852969\n", + "6.0 155.612978\n", + "7.0 214.375240\n", + "8.0 278.092560\n", + "24.0 148.840507\n", + "dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "out.to_series()" + ] + }, + { + "source": [ + "## Evaluating predictions over many columns" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "`xskillscore` is built upon `xarray.apply_ufunc` which offers speed-up by vectorizing operations. As a result `xskillscore` can be faster than `pandas.groupby.apply`. This is especially true if there are many samples in the dataset and if the predictions have to be evaluated over many fields.\n", + "\n", + "For this exercise we will create fake data for which the predictions have to be evaluated over three fields:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " DATE STORE SKU y yhat\n", + "0 2020-01-01 0 0 3 4.617306\n", + "1 2020-01-01 0 1 6 1.000000\n", + "2 2020-01-01 0 2 2 3.039347\n", + "3 2020-01-01 0 3 3 5.102145\n", + "4 2020-01-01 0 4 5 3.563087\n", + "... ... ... ... .. 
...\n", + "99995 2020-01-10 99 95 9 15.836256\n", + "99996 2020-01-10 99 96 5 7.515791\n", + "99997 2020-01-10 99 97 1 1.000000\n", + "99998 2020-01-10 99 98 6 6.676512\n", + "99999 2020-01-10 99 99 5 4.600985\n", + "\n", + "[100000 rows x 5 columns]" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
DATESTORESKUyyhat
02020-01-010034.617306
12020-01-010161.000000
22020-01-010223.039347
32020-01-010335.102145
42020-01-010453.563087
..................
999952020-01-109995915.836256
999962020-01-10999657.515791
999972020-01-10999711.000000
999982020-01-10999866.676512
999992020-01-10999954.600985
\n

100000 rows × 5 columns

\n
" + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "stores = np.arange(100)\n", + "skus = np.arange(100)\n", + "dates = pd.date_range(\"1/1/2020\", \"1/10/2020\", freq=\"D\")\n", + "\n", + "rows = []\n", + "for _, date in enumerate(dates):\n", + " for _, store in enumerate(stores):\n", + " for _, sku in enumerate(skus):\n", + " rows.append(\n", + " dict(\n", + " {\n", + " \"DATE\": date,\n", + " \"STORE\": store,\n", + " \"SKU\": sku,\n", + " \"y\": np.random.randint(9) + 1,\n", + " }\n", + " )\n", + " )\n", + "df = pd.DataFrame(rows)\n", + "\n", + "noise = np.random.uniform(-1, 1, size=len(df['y']))\n", + "df['yhat'] = (df['y'] + (df['y'] * noise)).clip(lower=df[\"y\"].min())\n", + "df" + ] + }, + { + "source": [ + "Time the `pandas.groupby.apply` method:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "CPU times: user 2.57 s, sys: 10.3 ms, total: 2.58 s\nWall time: 2.58 s\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "STORE SKU\n", + "0 0 8.384360\n", + " 1 7.071648\n", + " 2 14.677462\n", + " 3 13.391239\n", + " 4 12.131033\n", + " ... 
\n", + "99 95 18.473114\n", + " 96 10.154608\n", + " 97 11.743513\n", + " 98 8.406069\n", + " 99 7.098808\n", + "Length: 10000, dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ], + "source": [ + "%%time\n", + "df.groupby(['STORE', 'SKU']).apply(lambda x: mean_squared_error(x[\"y\"], x[\"yhat\"]))" + ] + }, + { + "source": [ + "Time it using `xskillscore`:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "CPU times: user 21.8 ms, sys: 4.02 ms, total: 25.8 ms\nWall time: 24.4 ms\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "STORE SKU\n", + "0 0 8.384360\n", + " 1 7.071648\n", + " 2 14.677462\n", + " 3 13.391239\n", + " 4 12.131033\n", + " ... \n", + "99 95 18.473114\n", + " 96 10.154608\n", + " 97 11.743513\n", + " 98 8.406069\n", + " 99 7.098808\n", + "Length: 10000, dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ], + "source": [ + "%%time\n", + "df.set_index([\"DATE\", \"STORE\", \"SKU\"]).to_xarray().xs.mse(\n", + " \"y\", \"yhat\", dim=\"DATE\"\n", + ").to_series()" + ] + }, + { + "source": [ + "See [xskillscore-tutorial](https://github.com/raybellwaves/xskillscore-tutorial) for further reading." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ] +}