# Read CSV into xarray Dataset with quality codes

In [1]:
import xarray as xr
import pandas as pd

  # NOTE(review): this fragment looks out of place in the imports cell. Neither
  # `yaml` nor `f` is defined by the imports above, and the leading indentation
  # does not belong to any enclosing block visible here. Confirm whether these
  # two lines should be removed.
  # NOTE(review): `yaml.load` is called without a `Loader` argument; if the
  # input is not fully trusted, `yaml.safe_load` is presumably the safer
  # choice. Confirm before changing.
  data = yaml.load(f.read()) or {}
  # NOTE(review): `f.read()` above has already consumed the handle, so this
  # second parse of the same `f` presumably sees no content. Verify intent.
  defaults = yaml.load(f)


## Load data

In [2]:
# Load the buoy time series, parsing the timestamp column while reading.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where it has no effect and only triggers a warning.
df = pd.read_csv(
    "data/mooring-buoys-time-series-62450.csv",
    parse_dates=["DATE (yyyy-mm-ddThh:mi:ssZ)"],
)
df.head()

Unnamed: 0,PLATFORM,DATE (yyyy-mm-ddThh:mi:ssZ),LATITUDE (degree_north),LONGITUDE (degree_east),TEMP LEVEL0 (degree_Celsius),DRYT LEVEL0 (degree_Celsius),PSAL LEVEL0 (psu),BATTERY LEVEL0 (volt),CNDC LEVEL0 (S/m),DEPH LEVEL0 (meter),...,PCO2_PUMP_CURR LEVEL1 (TBD),PCO2_THERMISTOR LEVEL1 (TBD),PHPH LEVEL1 (none),PHPH_ADJUSTED LEVEL1 (none),PSAL_ADJUSTED LEVEL1 (psu),START_CYCLE_FLOW LEVEL1 (l/minute),TEMP_ADJUSTED LEVEL1 (degree_Celsius),TEMP_CO2_ADJUSTED LEVEL1 (degree_Celsius),TUR4 LEVEL1 (ntu),QC
0,62450,2000-07-19 15:33:16,48.358,-4.5518,,,,,,,...,,,,,,,,,,0111999999999999991999799991199999999999999999...
1,62450,2000-07-19 15:58:32,48.358,-4.5518,,,,,,,...,,,,,,,,,3.0,0111999999999999999999799991199999999999999999...
2,62450,2000-07-19 16:18:32,48.358,-4.5518,,,,,,,...,,,,,,,,,,0111999999999999991999799991199999999999999999...
3,62450,2000-07-19 16:33:17,48.358,-4.5518,,,,,,,...,,,,,,,,,2.0,0111999999999999991999799991199999999999999999...
4,62450,2000-07-19 16:58:32,48.358,-4.5518,,,,,,,...,,,,,,,,,,0111999999999999991999799991199999999999999999...


## Extract each quality code into its own column

In [3]:
# Splitting on the empty pattern breaks the packed QC string into one column
# per character. The split also leaves an empty column at each end, so the
# flags sit in columns 1 .. n rather than 0 .. n-1.
qc_df = df["QC"].str.split(pat="", expand=True)
qc_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
0,,0,1,1,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,
1,,0,1,1,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,1,
2,,0,1,1,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,
3,,0,1,1,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,1,
4,,0,1,1,1,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,


## Merge quality code columns into the original dataset

In [4]:
# Every column except the packed "QC" string has one flag character. Columns
# already named "QC ..." are skipped so that re-running this cell cannot create
# "QC QC ..." duplicates or index past the end of qc_df.
data_columns = [
    column
    for column in df.columns
    if column != "QC" and not str(column).startswith("QC ")
]

# qc_df column 0 is the empty string left of the first character, so the flag
# for the n-th data column sits in qc_df column n.
qc_flags = qc_df.iloc[:, 1:len(data_columns) + 1].copy()
if qc_flags.shape[1] != len(data_columns):
    raise ValueError(
        f"expected {len(data_columns)} quality flags per row, "
        f"found {qc_flags.shape[1]}"
    )
qc_flags.columns = [f"QC {column}" for column in data_columns]

# Attach all flag columns with a single concat instead of one insertion per
# column; flag columns from an earlier run are dropped first.
df = pd.concat(
    [df.drop(columns=qc_flags.columns, errors="ignore"), qc_flags],
    axis=1,
)
df.head()

Unnamed: 0,PLATFORM,DATE (yyyy-mm-ddThh:mi:ssZ),LATITUDE (degree_north),LONGITUDE (degree_east),TEMP LEVEL0 (degree_Celsius),DRYT LEVEL0 (degree_Celsius),PSAL LEVEL0 (psu),BATTERY LEVEL0 (volt),CNDC LEVEL0 (S/m),DEPH LEVEL0 (meter),...,QC PCO2_LOW_REF LEVEL1 (TBD),QC PCO2_PUMP_CURR LEVEL1 (TBD),QC PCO2_THERMISTOR LEVEL1 (TBD),QC PHPH LEVEL1 (none),QC PHPH_ADJUSTED LEVEL1 (none),QC PSAL_ADJUSTED LEVEL1 (psu),QC START_CYCLE_FLOW LEVEL1 (l/minute),QC TEMP_ADJUSTED LEVEL1 (degree_Celsius),QC TEMP_CO2_ADJUSTED LEVEL1 (degree_Celsius),QC TUR4 LEVEL1 (ntu)
0,62450,2000-07-19 15:33:16,48.358,-4.5518,,,,,,,...,9,9,9,9,9,9,9,9,9,9
1,62450,2000-07-19 15:58:32,48.358,-4.5518,,,,,,,...,9,9,9,9,9,9,9,9,9,1
2,62450,2000-07-19 16:18:32,48.358,-4.5518,,,,,,,...,9,9,9,9,9,9,9,9,9,9
3,62450,2000-07-19 16:33:17,48.358,-4.5518,,,,,,,...,9,9,9,9,9,9,9,9,9,1
4,62450,2000-07-19 16:58:32,48.358,-4.5518,,,,,,,...,9,9,9,9,9,9,9,9,9,9


In [7]:
# Summary statistics per column. The displayed table covers only the numeric
# measurement columns; the date column and the QC flag columns do not appear.
df.describe()

Unnamed: 0,PLATFORM,LATITUDE (degree_north),LONGITUDE (degree_east),TEMP LEVEL0 (degree_Celsius),DRYT LEVEL0 (degree_Celsius),PSAL LEVEL0 (psu),BATTERY LEVEL0 (volt),CNDC LEVEL0 (S/m),DEPH LEVEL0 (meter),DOX1 LEVEL0 (ml/l),...,PCO2_LOW_REF LEVEL1 (TBD),PCO2_PUMP_CURR LEVEL1 (TBD),PCO2_THERMISTOR LEVEL1 (TBD),PHPH LEVEL1 (none),PHPH_ADJUSTED LEVEL1 (none),PSAL_ADJUSTED LEVEL1 (psu),START_CYCLE_FLOW LEVEL1 (l/minute),TEMP_ADJUSTED LEVEL1 (degree_Celsius),TEMP_CO2_ADJUSTED LEVEL1 (degree_Celsius),TUR4 LEVEL1 (ntu)
count,498742.0,498742.0,498742.0,3440.0,218080.0,3440.0,3440.0,3440.0,245219.0,3440.0,...,23936.0,23942.0,23927.0,390278.0,104239.0,279030.0,75779.0,84070.0,86781.0,404233.0
mean,62450.0,48.358,-4.5518,16.592887,-19.749926,34.883959,10.997646,4.436361,0.009234,6.057846,...,4590.360795,109.706248,6505.924771,6.800564,8.36784,34.442693,2.582603,13.762018,13.311671,-31.439539
std,0.0,2.842174e-14,7.105434e-15,0.614004,78.70177,0.909576,0.015452,0.136173,0.079976,0.242817,...,84.980966,204.455178,2092.969455,16.50489,43.766607,0.845652,0.719219,3.296539,2.84523,207.936669
min,62450.0,48.358,-4.5518,12.98,-285.26,0.03,10.81,0.005,0.0,5.59,...,4585.0,92.0,0.0,0.0,7.81,23.53,0.0,7.4,7.51,-3142.0
25%,62450.0,48.358,-4.5518,16.08,0.0,34.63,10.988,4.363,0.0,5.8875,...,4587.0,101.0,6494.0,7.93,8.12,34.08,2.58,10.4,10.74,1.5
50%,62450.0,48.358,-4.5518,16.62,8.47,34.985,10.992,4.465,0.0,6.025,...,4588.0,106.0,6893.0,8.17,8.17,34.6395,2.8,15.5,13.37,3.7
75%,62450.0,48.358,-4.5518,17.03225,13.34,35.2425,11.0,4.508,0.0,6.18,...,4588.0,109.0,7665.0,8.28,8.24,35.07,2.92,16.7,16.06,9.06
max,62450.0,48.358,-4.5518,18.37,310.74,35.42,11.05,4.64,0.9949,7.85,...,7777.0,7777.0,8191.0,9999.99,9999.99,35.8,6.64,19.3,18.74,9999.99


## Index data by date

In [5]:
# Use the observation timestamp as the row index; `df` itself is left unchanged.
timed_df = df.set_index("DATE (yyyy-mm-ddThh:mi:ssZ)")

## Convert to xarray Dataset

In [6]:
# Build the Dataset from the date-indexed frame: the index becomes the
# coordinate and each remaining column becomes a data variable.
ds = timed_df.to_xarray()
ds

<xarray.Dataset>
Dimensions:                                       (DATE (yyyy-mm-ddThh:mi:ssZ): 498742)
Coordinates:
  * DATE (yyyy-mm-ddThh:mi:ssZ)                   (DATE (yyyy-mm-ddThh:mi:ssZ)) datetime64[ns] 2000-07-19T15:33:16 ... 2019-01-29T03:28:49
Data variables:
    PLATFORM                                      (DATE (yyyy-mm-ddThh:mi:ssZ)) int64 62450 ... 62450
    LATITUDE (degree_north)                       (DATE (yyyy-mm-ddThh:mi:ssZ)) float64 48.36 ... 48.36
    LONGITUDE (degree_east)                       (DATE (yyyy-mm-ddThh:mi:ssZ)) float64 -4.552 ... -4.552
    TEMP LEVEL0 (degree_Celsius)                  (DATE (yyyy-mm-ddThh:mi:ssZ)) float64 nan ... nan
    DRYT LEVEL0 (degree_Celsius)                  (DATE (yyyy-mm-ddThh:mi:ssZ)) float64 nan ... nan
    PSAL LEVEL0 (psu)                             (DATE (yyyy-mm-ddThh:mi:ssZ)) float64 nan ... nan
    BATTERY LEVEL0 (volt)                         (DATE (yyyy-mm-ddThh:mi:ssZ)) float64 nan ... nan
    CNDC LEVEL0