# Representing Data

In [1]:
# from fxy.a import * -- convenience import

## **Creating Representations**

### **Python's Basic Native Data Types**

In [2]:
class Person:
    pass

#### `class`

In [3]:
me = Person()

#### `string`

In [4]:
me.name = 'Alice'

#### `float`

In [5]:
me.height = 158.5 

#### `int`

In [6]:
me.age = 15

#### `bool`

In [7]:
me.teaching = True

#### `list`

In [8]:
me.times = [1, 2, 3]

#### `set`

In [9]:
me.set = set([1, 2, 3])

#### `tuple` (immutable `list`)

In [10]:
me.tuple = (1, 2, 3)

#### `dict`

In [11]:
me.dict = {'x': [1, 2, 3], 'y': (4, 5)}

### **Basic NumPy and Pandas Data Types**

In [12]:
import numpy as np

#### `np.array`

In [13]:
me.array = np.array([184.1, 180.1, 171.4, 177.1])

#### "`matrix`"

In [14]:
me.matrix = np.array([[184.1, 180.1],[171.4, 177.1]])

In [15]:
import pandas as pd

#### `pd.Index`

In [16]:
me.index = pd.Index(['a', 'b', 1])

#### `pd.Series`

In [17]:
me.s = pd.Series([1, 2, 3])

In [18]:
me.xseries = pd.Series([1, 2, 3], me.index)

#### `pd.MultiIndex`

In [19]:
me.mindex = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])

In [20]:
me.mseries = pd.Series([1, 2, 3], me.mindex)

#### `pd.DataFrame`

In [21]:
me.df = pd.DataFrame({'xVar': [1, 2, 3], 'yVar': [4, 5, 6]})

In [22]:
me.df

Unnamed: 0,xVar,yVar
0,1,4
1,2,5
2,3,6


In [23]:
me.yseries = pd.Series([4, 5, 6], ['a', 'b', 2])

In [24]:
me.dataframe = pd.DataFrame({'xVar': me.xseries, 'yVar': me.yseries})

In [25]:
me.dataframe

Unnamed: 0,xVar,yVar
a,1.0,4.0
b,2.0,5.0
1,3.0,
2,,6.0


#### `pd.Categorical`

In [26]:
me.categorical = pd.Categorical(['yes', 'no', 'yes', 'yes'])
me.categorical

['yes', 'no', 'yes', 'yes']
Categories (2, object): ['no', 'yes']

In [27]:
me.dataframe['zVar'] = me.categorical
me.dataframe

Unnamed: 0,xVar,yVar,zVar
a,1.0,4.0,yes
b,2.0,5.0,no
1,3.0,,yes
2,,6.0,yes


### **Variously Indexed DataFrames**

#### ``DataFrame from 1 MultiIndex series``

In [28]:
me.xMseries = pd.Series([1,2,3], pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]))
me.yMseries = pd.Series([4,5,6], pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]))
me.Mdataframe = pd.DataFrame({'xVar': me.xMseries, 'yVar': me.yMseries})

In [29]:
me.Mdataframe

Unnamed: 0,Unnamed: 1,xVar,yVar
a,1,1,4
a,2,2,5
b,1,3,6


#### ``DataFrame with 2 MultiIndex series``

In [30]:
me.Mdataframe = pd.DataFrame(np.arange(12).reshape(4,3),
                         index=[['a','a','b','b'], [1,2,1,2]],
                         columns=[['VAR-A','VAR-A', 'VAR-B'],
                                  ['var-a','var-b', 'var-c']])

In [31]:
me.Mdataframe

Unnamed: 0_level_0,Unnamed: 1_level_0,VAR-A,VAR-A,VAR-B
Unnamed: 0_level_1,Unnamed: 1_level_1,var-a,var-b,var-c
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


#### ``Naming Axes``

In [32]:
me.Mdataframe.index.names = ['/letter/', '/number/']
me.Mdataframe.columns.names = ['/capital/', '/lowercase/']

In [33]:
me.Mdataframe

Unnamed: 0_level_0,/capital/,VAR-A,VAR-A,VAR-B
Unnamed: 0_level_1,/lowercase/,var-a,var-b,var-c
/letter/,/number/,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


#### ``More Levels``

In [34]:
me.Mdataframe = pd.DataFrame(np.arange(12).reshape(4,3),
                         index=[['x','x','x','y'],
                                ['a','a','b','b'],
                                [1,2,1,2,]],
                         columns=[['VAR', 'VAR', 'VAR'],
                                  ['VAR-A','VAR-A', 'VAR-B'],
                                  ['var-a','var-b', 'var-c']])
me.Mdataframe.index.names = ['note1　⇣', 'note2　⇣', 'note3　⇣']
me.Mdataframe.columns.names = ['info1　⇢', 'info2　⇢', 'info3　⇢']

In [35]:
me.Mdataframe

Unnamed: 0_level_0,Unnamed: 1_level_0,info1　⇢,VAR,VAR,VAR
Unnamed: 0_level_1,Unnamed: 1_level_1,info2　⇢,VAR-A,VAR-A,VAR-B
Unnamed: 0_level_2,Unnamed: 1_level_2,info3　⇢,var-a,var-b,var-c
note1　⇣,note2　⇣,note3　⇣,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
x,a,1,0,1,2
x,a,2,3,4,5
x,b,1,6,7,8
y,b,2,9,10,11


#### ``pd.DatetimeIndex``

In [36]:
me.dates = pd.date_range('1/1/2000', periods=8)

In [37]:
me.dates

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

#### ``pd.Panel`` - **DEPRECATED, THOUGH pandas is NAMED AFTER IT :)**

In [38]:
np.random.seed(1) # ⇠ Important for reproducibility of scientific work.

In [39]:
me.df = pd.DataFrame(np.random.randn(8,4), index=me.dates, columns=['A','B','C','D'])

In [40]:
me.df

Unnamed: 0,A,B,C,D
2000-01-01,1.624345,-0.611756,-0.528172,-1.072969
2000-01-02,0.865408,-2.301539,1.744812,-0.761207
2000-01-03,0.319039,-0.24937,1.462108,-2.060141
2000-01-04,-0.322417,-0.384054,1.133769,-1.099891
2000-01-05,-0.172428,-0.877858,0.042214,0.582815
2000-01-06,-1.100619,1.144724,0.901591,0.502494
2000-01-07,0.900856,-0.683728,-0.12289,-0.935769
2000-01-08,-0.267888,0.530355,-0.691661,-0.396754


**THEREFORE BELOW LINE DOESN'T WORK ANYMORE, USE XARRAY INSTEAD**

In [41]:
#me.panel = pd.Panel({'one': me.df[0:4], 'two': (me.df - me.df.mean())[['A', 'C']]})

#### **Anatomy of DataFrame**

In [42]:
me.df.index

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

In [43]:
me.df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [44]:
me.df.values

array([[ 1.62434536, -0.61175641, -0.52817175, -1.07296862],
       [ 0.86540763, -2.3015387 ,  1.74481176, -0.7612069 ],
       [ 0.3190391 , -0.24937038,  1.46210794, -2.06014071],
       [-0.3224172 , -0.38405435,  1.13376944, -1.09989127],
       [-0.17242821, -0.87785842,  0.04221375,  0.58281521],
       [-1.10061918,  1.14472371,  0.90159072,  0.50249434],
       [ 0.90085595, -0.68372786, -0.12289023, -0.93576943],
       [-0.26788808,  0.53035547, -0.69166075, -0.39675353]])

### **Xarray Data Types**

In [45]:
import xarray as xr

#### ``xr.DataArray``

In [46]:
me.arr = xr.DataArray([1, 2, 3])

In [47]:
me.arr

In [48]:
me.data = xr.DataArray(np.random.randn(2, 3), dims=("x", "y"), coords={"x": [10, 20]})

In [49]:
me.data

#### ``xr.Dataset``

In [50]:
me.ds = xr.Dataset(
    {'x': (['space'], [1,2]),
     'y': (['time'], [3,4,5])})
me.ds

#### **NOTE:** Casting: ``pd.DataFrame <= xr.Dataset``

In [51]:
me.ds.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,x,y
space,time,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,1,3
0,1,1,4
0,2,1,5
1,0,2,3
1,1,2,4
1,2,2,5


#### **NOTE:** Casting: ``pd.DataFrame => xr.Dataset``

In [52]:
me.dfs = pd.DataFrame({'x': [1,2,3], 'y': [4,5,6]})
me.dfs.to_xarray()

In [53]:
me.ds = xr.Dataset({"foo": me.data,
                    "bar": ("x", [1, 2]),
                    "baz": np.pi})

In [54]:
me.ds

#### ``Operations across dimensions``

In [55]:
me.ds.mean(dim='x')

**More examples:** [https://gist.github.com/mindey/f06f7c9d2c1aa17950dc16a90f6a9acb](https://gist.github.com/mindey/f06f7c9d2c1aa17950dc16a90f6a9acb)

## **Modifying Representations**

### **Adding/Removing elements to Series**

In [56]:
# So we had Series
me.xseries

a    1
b    2
1    3
dtype: int64

In [57]:
# Adding an element:
me.xseries['c'] = 4

In [58]:
# Removing an element
del me.xseries['c']  # or: me.xseries.drop('c') -- to temporarily remove

In [59]:
# Extending with multiple elements
me.xseries.append(me.yseries)

a    1
b    2
1    3
a    4
b    5
2    6
dtype: int64

In [60]:
# Removing multiple elements
me.xseries.drop(['a', 'b'])

1    3
dtype: int64

### **Adding Data to DataFrames**

In [61]:
# So, we had a DataFrame
me.dataframe

Unnamed: 0,xVar,yVar,zVar
a,1.0,4.0,yes
b,2.0,5.0,no
1,3.0,,yes
2,,6.0,yes


In [62]:
# Adding an element
me.dataframe.at['newRow','newCol'] = 1

In [63]:
me.df

Unnamed: 0,A,B,C,D
2000-01-01,1.624345,-0.611756,-0.528172,-1.072969
2000-01-02,0.865408,-2.301539,1.744812,-0.761207
2000-01-03,0.319039,-0.24937,1.462108,-2.060141
2000-01-04,-0.322417,-0.384054,1.133769,-1.099891
2000-01-05,-0.172428,-0.877858,0.042214,0.582815
2000-01-06,-1.100619,1.144724,0.901591,0.502494
2000-01-07,0.900856,-0.683728,-0.12289,-0.935769
2000-01-08,-0.267888,0.530355,-0.691661,-0.396754


In [64]:
# Adding a variable:
me.dataframe['wVar'] = [1, 2, 3, 4, 5]

In [65]:
me.dataframe

Unnamed: 0,xVar,yVar,zVar,newCol,wVar
a,1.0,4.0,yes,,1
b,2.0,5.0,no,,2
1,3.0,,yes,,3
2,,6.0,yes,,4
newRow,,,,1.0,5


In [66]:
# Adding a new Variable: (conforming to indices)
me.dataframe['wVar'] = pd.Series([1,2,3,4,5], index=['b','a',1,2,'newRow'])

In [67]:
me.dataframe

Unnamed: 0,xVar,yVar,zVar,newCol,wVar
a,1.0,4.0,yes,,2
b,2.0,5.0,no,,1
1,3.0,,yes,,3
2,,6.0,yes,,4
newRow,,,,1.0,5


### **Renaming a Variable**

In [68]:
# Produce a new DataFrame with the columns renamed:
me.dataframe.rename(columns={'wVar' : 'newCol2'}).head(2)

Unnamed: 0,xVar,yVar,zVar,newCol,newCol2
a,1.0,4.0,yes,,2
b,2.0,5.0,no,,1


In [69]:
# Rename the columns inside original DataFrame
me.dataframe.rename(columns={'wVar' : 'newCol2'}, inplace=True)

### **Adding Variables and Observations to Multi-Index**

In [70]:
# We had Multi-Indexed Dataframe:
me.Mdataframe

Unnamed: 0_level_0,Unnamed: 1_level_0,info1　⇢,VAR,VAR,VAR
Unnamed: 0_level_1,Unnamed: 1_level_1,info2　⇢,VAR-A,VAR-A,VAR-B
Unnamed: 0_level_2,Unnamed: 1_level_2,info3　⇢,var-a,var-b,var-c
note1　⇣,note2　⇣,note3　⇣,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
x,a,1,0,1,2
x,a,2,3,4,5
x,b,1,6,7,8
y,b,2,9,10,11


#### **Variable**

In [71]:
# Adding a Variable to it:
me.Mdataframe[('VAR','VAR-B','var-d')] = 1

In [72]:
me.Mdataframe

Unnamed: 0_level_0,Unnamed: 1_level_0,info1　⇢,VAR,VAR,VAR,VAR
Unnamed: 0_level_1,Unnamed: 1_level_1,info2　⇢,VAR-A,VAR-A,VAR-B,VAR-B
Unnamed: 0_level_2,Unnamed: 1_level_2,info3　⇢,var-a,var-b,var-c,var-d
note1　⇣,note2　⇣,note3　⇣,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
x,a,1,0,1,2,1
x,a,2,3,4,5,1
x,b,1,6,7,8,1
y,b,2,9,10,11,1


In [73]:
# Adding self-defined Variable to it:
newVar = pd.Series([1, 2], index=pd.MultiIndex.from_tuples([('x','a',1), ('x','a',2)]))
me.Mdataframe[('VAR','VAR-B','var-d')] = newVar

In [74]:
me.Mdataframe

Unnamed: 0_level_0,Unnamed: 1_level_0,info1　⇢,VAR,VAR,VAR,VAR
Unnamed: 0_level_1,Unnamed: 1_level_1,info2　⇢,VAR-A,VAR-A,VAR-B,VAR-B
Unnamed: 0_level_2,Unnamed: 1_level_2,info3　⇢,var-a,var-b,var-c,var-d
note1　⇣,note2　⇣,note3　⇣,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
x,a,1,0,1,2,1.0
x,a,2,3,4,5,2.0
x,b,1,6,7,8,
y,b,2,9,10,11,


#### **Observation**

In [75]:
# Adding a new Observatio
me.Mdataframe.append(pd.Series([1, 2, 3, 4], name=('y', 'b', 3), index=me.Mdataframe.columns))

Unnamed: 0_level_0,Unnamed: 1_level_0,info1　⇢,VAR,VAR,VAR,VAR
Unnamed: 0_level_1,Unnamed: 1_level_1,info2　⇢,VAR-A,VAR-A,VAR-B,VAR-B
Unnamed: 0_level_2,Unnamed: 1_level_2,info3　⇢,var-a,var-b,var-c,var-d
note1　⇣,note2　⇣,note3　⇣,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
x,a,1,0,1,2,1.0
x,a,2,3,4,5,2.0
x,b,1,6,7,8,
y,b,2,9,10,11,
y,b,3,1,2,3,4.0


### **Removing Data from DataFrame**

In [76]:
# So we had:
me.dataframe

Unnamed: 0,xVar,yVar,zVar,newCol,newCol2
a,1.0,4.0,yes,,2
b,2.0,5.0,no,,1
1,3.0,,yes,,3
2,,6.0,yes,,4
newRow,,,,1.0,5


In [77]:
# Removing Observations:
me.dataframe.drop(['a', 'b'])

Unnamed: 0,xVar,yVar,zVar,newCol,newCol2
1,3.0,,yes,,3
2,,6.0,yes,,4
newRow,,,,1.0,5


In [78]:
me.Mdataframe.drop([('y', 'b'), ('x', 'b')])

Unnamed: 0_level_0,Unnamed: 1_level_0,info1　⇢,VAR,VAR,VAR,VAR
Unnamed: 0_level_1,Unnamed: 1_level_1,info2　⇢,VAR-A,VAR-A,VAR-B,VAR-B
Unnamed: 0_level_2,Unnamed: 1_level_2,info3　⇢,var-a,var-b,var-c,var-d
note1　⇣,note2　⇣,note3　⇣,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
x,a,1,0,1,2,1.0
x,a,2,3,4,5,2.0


In [79]:
# Removing Variables:
me.dataframe.drop(['newCol', 'newCol2'], axis=1)

Unnamed: 0,xVar,yVar,zVar
a,1.0,4.0,yes
b,2.0,5.0,no
1,3.0,,yes
2,,6.0,yes
newRow,,,


In [80]:
me.Mdataframe.drop([('VAR','VAR-B','var-d')], axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,info1　⇢,VAR,VAR,VAR
Unnamed: 0_level_1,Unnamed: 1_level_1,info2　⇢,VAR-A,VAR-A,VAR-B
Unnamed: 0_level_2,Unnamed: 1_level_2,info3　⇢,var-a,var-b,var-c
note1　⇣,note2　⇣,note3　⇣,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
x,a,1,0,1,2
x,a,2,3,4,5
x,b,1,6,7,8
y,b,2,9,10,11


## **Combining Representations**

### Concatenation

In [81]:
me.df

Unnamed: 0,A,B,C,D
2000-01-01,1.624345,-0.611756,-0.528172,-1.072969
2000-01-02,0.865408,-2.301539,1.744812,-0.761207
2000-01-03,0.319039,-0.24937,1.462108,-2.060141
2000-01-04,-0.322417,-0.384054,1.133769,-1.099891
2000-01-05,-0.172428,-0.877858,0.042214,0.582815
2000-01-06,-1.100619,1.144724,0.901591,0.502494
2000-01-07,0.900856,-0.683728,-0.12289,-0.935769
2000-01-08,-0.267888,0.530355,-0.691661,-0.396754


In [82]:
df1 = me.df[['A', 'C']]['2000-01-02':'2000-01-04']
df1

Unnamed: 0,A,C
2000-01-02,0.865408,1.744812
2000-01-03,0.319039,1.462108
2000-01-04,-0.322417,1.133769


In [83]:
df2 = me.df[['B', 'C']]['2000-01-07':'2000-01-08']
df2

Unnamed: 0,B,C
2000-01-07,-0.683728,-0.12289
2000-01-08,0.530355,-0.691661


In [84]:
# Combine vertically (aligning Columns)
pd.concat([df1, df2])

Unnamed: 0,A,C,B
2000-01-02,0.865408,1.744812,
2000-01-03,0.319039,1.462108,
2000-01-04,-0.322417,1.133769,
2000-01-07,,-0.12289,-0.683728
2000-01-08,,-0.691661,0.530355


In [85]:
# Combine horizontally (aligning Indexes)
pd.concat([df1, df2], axis=1)

Unnamed: 0,A,C,B,C.1
2000-01-02,0.865408,1.744812,,
2000-01-03,0.319039,1.462108,,
2000-01-04,-0.322417,1.133769,,
2000-01-07,,,-0.683728,-0.12289
2000-01-08,,,0.530355,-0.691661


In [86]:
# Sub-index with keys:
pd.concat([df1, df2], keys=['group1', 'group2'])

Unnamed: 0,Unnamed: 1,A,C,B
group1,2000-01-02,0.865408,1.744812,
group1,2000-01-03,0.319039,1.462108,
group1,2000-01-04,-0.322417,1.133769,
group2,2000-01-07,,-0.12289,-0.683728
group2,2000-01-08,,-0.691661,0.530355


In [87]:
# Sub-index with keys:
pd.concat([df1, df2], axis=1, keys=['group1', 'group2'])

Unnamed: 0_level_0,group1,group1,group2,group2
Unnamed: 0_level_1,A,C,B,C
2000-01-02,0.865408,1.744812,,
2000-01-03,0.319039,1.462108,,
2000-01-04,-0.322417,1.133769,,
2000-01-07,,,-0.683728,-0.12289
2000-01-08,,,0.530355,-0.691661


### **Joining/Merging DataFrames**

#### Inner Join
`pd.merge(df, df, on='var', how='inner')`
#### Outer Left Join
`pd.merge(df, df, on='var', how='left')`
#### Outer Right Join
`pd.merge(df, df, on='var', how='right')`
#### Outer Join
`pd.merge(df, df, on='var', how='outer')`
#### Cross Join
`pd.merge(df, df, on='var')`

## **Cleaning Representations**
#### Removing Observations/Variables with Missing Data
``.dropna()``
#### Filling Observations/Variables with Missing Data
``fillna()``

In [88]:
me.Mdataframe

Unnamed: 0_level_0,Unnamed: 1_level_0,info1　⇢,VAR,VAR,VAR,VAR
Unnamed: 0_level_1,Unnamed: 1_level_1,info2　⇢,VAR-A,VAR-A,VAR-B,VAR-B
Unnamed: 0_level_2,Unnamed: 1_level_2,info3　⇢,var-a,var-b,var-c,var-d
note1　⇣,note2　⇣,note3　⇣,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
x,a,1,0,1,2,1.0
x,a,2,3,4,5,2.0
x,b,1,6,7,8,
y,b,2,9,10,11,
