# Introduction to Xarray

In [37]:
import xarray as xr
import pandas as pd
import numpy as np

## Structure

### DataArray

In [38]:
# coords : là tập những index dùng để trỏ vào array

d = xr.DataArray(np.random.rand(4,2), coords= {'x': ['a', 'b', 'c', 'd'], 'y' : [ 'e', 'f']}, dims= ['x', 'y'])
d

In [39]:
d.loc['a','e']

In [40]:
print(d.coords)
print(d.dims)
print(d.name)
print(d.attrs)

Coordinates:
  * x        (x) <U1 'a' 'b' 'c' 'd'
  * y        (y) <U1 'e' 'f'
('x', 'y')
None
{}


### DataSet

In [41]:
temp = 15 + 8 * np.random.randn(2, 2, 3)

precip = 10 * np.random.rand(2, 2, 3)

lon = [[-99.83, -99.32], [-99.79, -99.23]]

lat = [[42.25, 42.21], [42.63, 42.59]]

ds = xr.Dataset(
     {
        "temperature": (["x", "y", "time"], temp),
        "precipitation": (["x", "y", "time"], precip),
    },
    coords={
        "lon": (["x", "y"], lon),
        "lat": (["x", "y"], lat),
        "time": pd.date_range("2014-09-06", periods=3),
        "reference_time": pd.Timestamp("2014-09-05"),
    },
)

ds

## Index, Select and Drop

In [42]:
ds['temperature'][0,0,1]

In [43]:
ds.sel(x = 0, y = 1)

In [44]:
## drop 
ds.drop_sel(x = [0])


In [45]:
ds

In [46]:
ds.drop_dims("time")

In [47]:
ds.where(ds.x + ds.y > 10)

## Interpolating data

In [48]:
da = xr.DataArray(
    np.sin(0.3 * np.arange(12).reshape(4, 3)),
    [("time", np.arange(4)), ("space", [0.1, 0.2, 0.3])],
)
da

In [49]:
da.sel(time = 3)

In [50]:
da.interp(time = 2.5)

In [51]:
da.sel(time = 2, space = 0.1)

In [52]:
da.interp(time = 2.5, space = 0.15)

## Group by

In [53]:

ds = xr.Dataset(
    {"foo": (("x", "y"), np.random.rand(4, 3))},
    coords={"x": [10, 20, 30, 40], "letters": ("x", list("abba"))},
)
ds

In [54]:
m = ds.groupby('letters')
m

DatasetGroupBy, grouped over 'letters'
2 groups with labels 'a', 'b'.

In [55]:
m.groups

{'a': [0, 3], 'b': [1, 2]}

In [56]:
list(ds.groupby("letters"))

[('a',
  <xarray.Dataset>
  Dimensions:  (x: 2, y: 3)
  Coordinates:
    * x        (x) int32 10 40
      letters  (x) <U1 'a' 'a'
  Dimensions without coordinates: y
  Data variables:
      foo      (x, y) float64 0.3506 0.03097 0.05142 0.1097 0.2321 0.01978),
 ('b',
  <xarray.Dataset>
  Dimensions:  (x: 2, y: 3)
  Coordinates:
    * x        (x) int32 20 30
      letters  (x) <U1 'b' 'b'
  Dimensions without coordinates: y
  Data variables:
      foo      (x, y) float64 0.9183 0.3785 0.0809 0.4583 0.9132 0.7157)]

In [57]:
# index groupby 
m['b']

## Compare with Pandas 

In [58]:
import random as rd
n_type = 26
n_measure = 300
n_columns = 1000 

_data =  [[rd.randint(1,20)  for _ in range(n_columns)] for _ in range(n_type * n_measure)]

In [59]:
midx = pd.MultiIndex.from_product([[chr(x) for x in range(ord('A'), ord('A') + n_type) ],
                                [i for i in range(n_measure)]])
df = pd.DataFrame(index = midx,
        data = _data,
        columns = [str(x) + 'A' for x in range(n_columns)])

In [60]:
df

Unnamed: 0,Unnamed: 1,0A,1A,2A,3A,4A,5A,6A,7A,8A,9A,...,990A,991A,992A,993A,994A,995A,996A,997A,998A,999A
A,0,17,9,17,1,17,4,13,17,2,14,...,14,14,9,18,15,5,14,18,3,1
A,1,3,11,16,17,7,8,20,17,17,3,...,11,2,10,15,17,18,12,20,1,12
A,2,8,11,13,14,18,18,10,5,6,17,...,18,17,1,3,9,12,17,17,15,5
A,3,9,4,16,15,17,2,19,9,12,5,...,19,7,13,9,4,15,1,10,11,1
A,4,6,5,15,13,6,15,13,7,5,16,...,7,10,3,11,11,3,12,9,8,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Z,295,18,15,2,6,17,15,11,16,13,19,...,12,13,14,6,20,17,12,14,19,16
Z,296,15,2,14,19,20,14,8,15,9,12,...,11,18,9,1,1,1,4,8,19,14
Z,297,2,16,20,14,13,1,9,4,13,15,...,5,16,20,1,17,1,10,19,1,3
Z,298,19,11,3,10,1,11,20,4,15,17,...,3,19,17,20,15,12,7,11,18,1


In [61]:
df.index.set_names(['type','measure'],inplace=True)
df.columns.set_names(['cost'],inplace=True)
df

Unnamed: 0_level_0,cost,0A,1A,2A,3A,4A,5A,6A,7A,8A,9A,...,990A,991A,992A,993A,994A,995A,996A,997A,998A,999A
type,measure,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A,0,17,9,17,1,17,4,13,17,2,14,...,14,14,9,18,15,5,14,18,3,1
A,1,3,11,16,17,7,8,20,17,17,3,...,11,2,10,15,17,18,12,20,1,12
A,2,8,11,13,14,18,18,10,5,6,17,...,18,17,1,3,9,12,17,17,15,5
A,3,9,4,16,15,17,2,19,9,12,5,...,19,7,13,9,4,15,1,10,11,1
A,4,6,5,15,13,6,15,13,7,5,16,...,7,10,3,11,11,3,12,9,8,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Z,295,18,15,2,6,17,15,11,16,13,19,...,12,13,14,6,20,17,12,14,19,16
Z,296,15,2,14,19,20,14,8,15,9,12,...,11,18,9,1,1,1,4,8,19,14
Z,297,2,16,20,14,13,1,9,4,13,15,...,5,16,20,1,17,1,10,19,1,3
Z,298,19,11,3,10,1,11,20,4,15,17,...,3,19,17,20,15,12,7,11,18,1


In [62]:
# group by test 
df.groupby(level=1,axis=0).sum() 

  df.groupby(level=1,axis=0).sum()


cost,0A,1A,2A,3A,4A,5A,6A,7A,8A,9A,...,990A,991A,992A,993A,994A,995A,996A,997A,998A,999A
measure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,278,300,266,236,249,299,274,299,269,299,...,231,281,242,288,278,255,277,276,275,266
1,233,267,292,249,275,307,239,282,261,240,...,242,275,271,239,263,332,246,265,294,288
2,282,299,268,325,294,323,297,257,265,241,...,254,304,236,299,258,250,228,269,304,263
3,280,277,257,234,272,260,262,265,260,291,...,279,262,255,304,251,272,212,245,280,211
4,274,326,296,236,302,270,270,223,303,229,...,246,329,297,289,273,288,269,250,243,275
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,355,307,284,233,261,317,306,318,278,277,...,236,317,255,241,262,265,301,266,257,232
296,221,343,237,294,252,312,259,322,242,269,...,324,279,305,197,242,278,261,280,273,254
297,251,265,293,234,267,305,280,237,235,233,...,278,281,231,277,281,300,246,291,204,204
298,309,308,293,320,240,286,303,233,329,312,...,267,297,290,252,244,235,278,258,310,292


In [63]:
np_data = np.array(_data)
da = xr.DataArray(np_data.reshape(n_type, n_measure, n_columns), 
                  coords= dict(type = [chr(x) for x in range(ord('A'), ord('A') + n_type) ], 
                               measure = [i for i in range(n_measure)], 
                               cost = [str(x) + 'A' for x in range(n_columns)])  )

In [64]:
da

In [65]:
m = da.groupby("type")
list(m)

[('A',
  <xarray.DataArray (measure: 300, cost: 1000)>
  array([[17,  9, 17, ..., 18,  3,  1],
         [ 3, 11, 16, ..., 20,  1, 12],
         [ 8, 11, 13, ..., 17, 15,  5],
         ...,
         [18, 19,  8, ...,  1,  9, 12],
         [ 5, 11, 19, ..., 13,  1, 16],
         [15,  3,  7, ..., 14, 15, 18]])
  Coordinates:
      type     <U1 'A'
    * measure  (measure) int32 0 1 2 3 4 5 6 7 ... 292 293 294 295 296 297 298 299
    * cost     (cost) <U4 '0A' '1A' '2A' '3A' '4A' ... '996A' '997A' '998A' '999A'),
 ('B',
  <xarray.DataArray (measure: 300, cost: 1000)>
  array([[17, 13, 17, ..., 19, 20, 13],
         [ 3, 18, 10, ..., 14, 12,  2],
         [16,  7, 16, ..., 12, 14,  9],
         ...,
         [ 3, 20, 11, ..., 19, 17,  6],
         [19, 13, 19, ..., 16, 17, 11],
         [12, 13,  2, ..., 16, 10,  9]])
  Coordinates:
      type     <U1 'B'
    * measure  (measure) int32 0 1 2 3 4 5 6 7 ... 292 293 294 295 296 297 298 299
    * cost     (cost) <U4 '0A' '1A' '2A' '3A' '4A' ..

In [66]:
da.sum("type")