# Reading csv files with pandas

In [1]:
import numpy as np

In [2]:
import pandas as pd

## Precipitation data from Japan Meteorological Agency

For details of the data, refer to `data/rain_tokyo_2020.md`.

As a first attempt, we read `total_na` and `max60_na` as integers, because they are encoded as integers in the file.

In [3]:
# Load the precipitation CSV, keeping the `*_na` flag columns as integers.
csv = pd.read_csv(
    'data/rain_tokyo_2020.csv',
    encoding='shift_jis',
    skiprows=6,     # skip the first six lines of the file
    header=None,    # column labels come from `names`, not from the file
    names=['date', 'total', 'total_na', 'D', 'E', 'max60', 'max60_na', 'H', 'I'],
    usecols=['date', 'total', 'total_na', 'max60', 'max60_na'],
    index_col='date',
    parse_dates=True,   # parse the index column as dates
    dayfirst=False,
    dtype={
        'total': np.float64,
        'total_na': np.int64,
        'max60': np.float64,
        'max60_na': np.int64,
    },
)

In [4]:
# Display the frame loaded above (the `*_na` columns were read as integers).
csv

Unnamed: 0_level_0,total,total_na,max60,max60_na
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-05-01,0.0,1,0.0,1
2020-05-02,0.0,1,0.0,1
2020-05-03,0.0,1,0.0,1
2020-05-04,2.0,0,1.0,0
2020-05-05,0.0,0,0.0,0
...,...,...,...,...
2020-09-26,6.0,0,1.0,0
2020-09-27,0.5,0,0.5,0
2020-09-28,0.0,1,0.0,1
2020-09-29,0.0,0,0.0,0


Note that `1` in the `*_na` columns means the corresponding value is missing. So this time we read `total_na` and `max60_na` as `bool` rather than as integers.

In [5]:
# Reload the same file, this time reading the `*_na` flag columns as booleans,
# and drop the index name so the frame displays without a `date` header row.
csv = pd.read_csv(
    'data/rain_tokyo_2020.csv',
    encoding='shift_jis',
    skiprows=6,     # skip the first six lines of the file
    header=None,    # column labels come from `names`, not from the file
    names=['date', 'total', 'total_na', 'D', 'E', 'max60', 'max60_na', 'H', 'I'],
    usecols=['date', 'total', 'total_na', 'max60', 'max60_na'],
    index_col='date',
    parse_dates=True,   # parse the index column as dates
    dayfirst=False,
    dtype={
        'total': np.float64,
        'total_na': bool,
        'max60': np.float64,
        'max60_na': bool,
    },
).rename_axis(None)

In [6]:
# Display the reloaded frame (the `*_na` columns were read as `bool`).
csv

Unnamed: 0,total,total_na,max60,max60_na
2020-05-01,0.0,True,0.0,True
2020-05-02,0.0,True,0.0,True
2020-05-03,0.0,True,0.0,True
2020-05-04,2.0,False,1.0,False
2020-05-05,0.0,False,0.0,False
...,...,...,...,...
2020-09-26,6.0,False,1.0,False
2020-09-27,0.5,False,0.5,False
2020-09-28,0.0,True,0.0,True
2020-09-29,0.0,False,0.0,False


Next, we apply the `*_na` flags to the corresponding values: wherever a flag is set, the value is replaced with `NaN`.

In [7]:
def apply_na(value, na_flag):
    """Return `value` unchanged, or NaN when `na_flag` is truthy.

    Parameters
    ----------
    value
        The measurement to pass through.
    na_flag
        Truthy when the measurement should be treated as missing.

    Returns
    -------
    `np.nan` if `na_flag` is truthy, otherwise `value`.
    """
    return np.nan if na_flag else value

In [8]:
# Blank out flagged measurements column by column: `Series.mask` replaces a
# value with NaN wherever its boolean `*_na` flag is True. This is the
# vectorized equivalent of calling `apply_na` on every row through
# `DataFrame.apply(axis = 1)`, without building one Series per row.
# The result keeps the original index and has only the two value columns.
pd.DataFrame(
    {
        'total': csv['total'].mask(csv['total_na']),
        'max60': csv['max60'].mask(csv['max60_na']),
    }
)

Unnamed: 0,total,max60
2020-05-01,,
2020-05-02,,
2020-05-03,,
2020-05-04,2.0,1.0
2020-05-05,0.0,0.0
...,...,...
2020-09-26,6.0,1.0
2020-09-27,0.5,0.5
2020-09-28,,
2020-09-29,0.0,0.0
