# Introduction to Xarray

In [144]:
import xarray as xr
import pandas as pd
import numpy as np

## Structure

### DataArray

In [145]:
# coords : là tập những index dùng để trỏ vào array

d = xr.DataArray(np.random.rand(4,2), coords= {'x': ['a', 'b', 'c', 'd'], 'y' : [ 'e', 'f']}, dims= ['x', 'y'])
d

In [146]:
d.loc['a','e']

In [147]:
print(d.coords)
print(d.dims)
print(d.name)
print(d.attrs)

Coordinates:
  * x        (x) <U1 'a' 'b' 'c' 'd'
  * y        (y) <U1 'e' 'f'
('x', 'y')
None
{}


### DataSet

In [148]:
temp = 15 + 8 * np.random.randn(2, 2, 3)

precip = 10 * np.random.rand(2, 2, 3)

lon = [[-99.83, -99.32], [-99.79, -99.23]]

lat = [[42.25, 42.21], [42.63, 42.59]]

ds = xr.Dataset(
    {
        "temperature": (["x", "y", "time"], temp),
        "precipitation": (["x", "y", "time"], precip),
    },
    coords={
        "lon": (["x", "y"], lon),
        "lat": (["x", "y"], lat),
        "time": pd.date_range("2014-09-06", periods=3),
        "reference_time": pd.Timestamp("2014-09-05"),
    },
)

ds

## Index, Select and Drop

In [149]:
ds['temperature'][0, 0, 1]

In [150]:
ds.sel(x = 0, y = 1)

In [151]:
## drop 
ds.drop_sel(x = [0])


In [152]:
ds

In [153]:
ds.drop_dims("time")

In [154]:
ds.where(ds.x + ds.y > 10)

## Interpolating data

In [155]:
da = xr.DataArray(
    np.sin(0.3 * np.arange(12).reshape(4, 3)),
    [("time", np.arange(4)), ("space", [0.1, 0.2, 0.3])],
)
da

In [156]:
da.sel(time = 3)

In [157]:
da.interp(time = 2.5)

In [158]:
da.sel(time = 2, space = 0.1)

In [159]:
da.interp(time = 2.5, space = 0.15)

## Group by

In [160]:

ds = xr.Dataset(
    {"foo": (("x", "y"), np.random.rand(4, 3))},
    coords={"x": [10, 20, 30, 40], "letters": ("x", list("abba"))},
)
ds

In [161]:
m = ds.groupby('letters')
m

DatasetGroupBy, grouped over 'letters'
2 groups with labels 'a', 'b'.

In [162]:
m.groups

{'a': [0, 3], 'b': [1, 2]}

In [163]:
list(ds.groupby("letters"))

[('a',
  <xarray.Dataset>
  Dimensions:  (x: 2, y: 3)
  Coordinates:
    * x        (x) int32 10 40
      letters  (x) <U1 'a' 'a'
  Dimensions without coordinates: y
  Data variables:
      foo      (x, y) float64 0.1504 0.5431 0.5311 0.4304 0.7997 0.711),
 ('b',
  <xarray.Dataset>
  Dimensions:  (x: 2, y: 3)
  Coordinates:
    * x        (x) int32 20 30
      letters  (x) <U1 'b' 'b'
  Dimensions without coordinates: y
  Data variables:
      foo      (x, y) float64 0.6473 0.7106 0.3453 0.3003 0.216 0.1397)]

In [164]:
# index groupby 
m['b']

## Compare with Pandas 

In [174]:
import random as rd
n_type = 26
n_measure = 300 
n_columns = 1000 

_data =  [[rd.randint(1,20)  for _ in range(n_columns)] for _ in range(n_type * n_measure)]

In [175]:
midx = pd.MultiIndex.from_product([[chr(x) for x in range(ord('A'), ord('A') + n_type) ],
                                [i for i in range(n_measure)]])
df = pd.DataFrame(index = midx,
        data = _data,
        columns = [str(x) + 'A' for x in range(n_columns)])

In [176]:
df

Unnamed: 0,Unnamed: 1,0A,1A,2A,3A,4A,5A,6A,7A,8A,9A,...,990A,991A,992A,993A,994A,995A,996A,997A,998A,999A
A,0,18,5,18,15,3,9,11,3,4,12,...,5,7,2,7,19,10,15,6,1,19
A,1,1,14,13,2,7,7,19,20,3,3,...,12,4,6,20,14,9,4,5,8,15
A,2,1,17,2,13,4,10,15,3,5,4,...,12,20,8,15,18,14,9,17,15,18
A,3,4,18,13,11,3,17,17,10,9,4,...,10,9,10,11,15,7,19,9,18,10
A,4,1,16,18,10,4,12,17,7,20,16,...,10,10,6,13,10,17,10,14,15,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Z,295,5,16,5,5,3,4,7,19,13,4,...,20,7,20,20,12,19,16,3,13,14
Z,296,16,8,14,5,8,6,20,17,18,5,...,6,7,2,4,17,6,12,16,16,13
Z,297,19,9,14,8,2,15,8,2,13,16,...,5,1,9,8,2,8,18,18,5,20
Z,298,4,18,1,20,7,2,13,15,16,13,...,11,9,12,5,2,15,11,3,16,2


In [177]:
df.index.set_names(['type','measure'],inplace=True)
df.columns.set_names(['cost'],inplace=True)
df

Unnamed: 0_level_0,cost,0A,1A,2A,3A,4A,5A,6A,7A,8A,9A,...,990A,991A,992A,993A,994A,995A,996A,997A,998A,999A
type,measure,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A,0,18,5,18,15,3,9,11,3,4,12,...,5,7,2,7,19,10,15,6,1,19
A,1,1,14,13,2,7,7,19,20,3,3,...,12,4,6,20,14,9,4,5,8,15
A,2,1,17,2,13,4,10,15,3,5,4,...,12,20,8,15,18,14,9,17,15,18
A,3,4,18,13,11,3,17,17,10,9,4,...,10,9,10,11,15,7,19,9,18,10
A,4,1,16,18,10,4,12,17,7,20,16,...,10,10,6,13,10,17,10,14,15,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Z,295,5,16,5,5,3,4,7,19,13,4,...,20,7,20,20,12,19,16,3,13,14
Z,296,16,8,14,5,8,6,20,17,18,5,...,6,7,2,4,17,6,12,16,16,13
Z,297,19,9,14,8,2,15,8,2,13,16,...,5,1,9,8,2,8,18,18,5,20
Z,298,4,18,1,20,7,2,13,15,16,13,...,11,9,12,5,2,15,11,3,16,2


In [178]:
# group by test 
df.groupby(level=1,axis=0).sum() 

  df.groupby(level=1,axis=0).sum()


cost,0A,1A,2A,3A,4A,5A,6A,7A,8A,9A,...,990A,991A,992A,993A,994A,995A,996A,997A,998A,999A
measure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,276,328,274,279,281,290,308,235,230,280,...,238,347,244,301,292,275,297,244,286,269
1,271,264,276,243,247,280,306,293,250,240,...,265,241,263,284,315,255,261,314,263,279
2,228,271,212,212,260,287,205,240,278,247,...,321,297,276,338,234,310,260,324,301,245
3,237,218,241,293,276,318,257,294,251,268,...,282,247,287,221,302,242,275,326,253,254
4,236,302,253,316,264,303,323,261,259,283,...,238,274,262,312,254,251,285,304,297,262
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,221,280,244,279,331,302,254,275,273,266,...,265,288,234,299,263,312,264,281,247,274
296,274,286,316,261,258,259,265,255,304,252,...,317,240,306,212,240,249,239,245,310,236
297,266,218,270,331,291,258,255,259,247,289,...,272,262,283,285,280,234,265,314,265,280
298,288,308,274,317,297,223,278,262,251,279,...,271,282,235,277,280,272,272,303,233,228


In [182]:
np_data = np.array(_data)
da = xr.DataArray(np_data.reshape(n_type, n_measure, n_columns), 
                  coords= dict(type = [chr(x) for x in range(ord('A'), ord('A') + n_type) ], 
                               measure = [i for i in range(n_measure)], 
                               cost = [str(x) + 'A' for x in range(n_columns)])  )

In [183]:
da

In [184]:
m = da.groupby("type")
list(m)

[('A',
  <xarray.DataArray (measure: 300, cost: 1000)>
  array([[18,  5, 18, ...,  6,  1, 19],
         [ 1, 14, 13, ...,  5,  8, 15],
         [ 1, 17,  2, ..., 17, 15, 18],
         ...,
         [10,  1,  5, ..., 13,  6, 11],
         [17, 12, 14, ..., 13,  3,  9],
         [ 4, 17, 12, ..., 20,  4,  9]])
  Coordinates:
      type     <U1 'A'
    * measure  (measure) int32 0 1 2 3 4 5 6 7 ... 292 293 294 295 296 297 298 299
    * cost     (cost) <U4 '0A' '1A' '2A' '3A' '4A' ... '996A' '997A' '998A' '999A'),
 ('B',
  <xarray.DataArray (measure: 300, cost: 1000)>
  array([[17,  6, 20, ...,  3, 16,  7],
         [ 7, 18,  2, ..., 11,  1,  5],
         [14,  2,  6, ..., 20,  6,  2],
         ...,
         [ 8,  4,  8, ..., 13, 10,  2],
         [18,  5, 17, ..., 11,  7, 12],
         [ 2, 17,  8, ..., 17, 12, 15]])
  Coordinates:
      type     <U1 'B'
    * measure  (measure) int32 0 1 2 3 4 5 6 7 ... 292 293 294 295 296 297 298 299
    * cost     (cost) <U4 '0A' '1A' '2A' '3A' '4A' ..

In [185]:
da.sum("type")