# Pint + Pandas =  MyDataFrame


## Background

1.
Manual unit conversion

    df["Z [g/km]"] = 0.001*df["X [g/s]"] / df["Y [km/h]] / 3600 # WRAAAAAAAAAAAA

What if

    df["Z"] = (df["X"] / df["Y]).to("gram per kilometer")
    df["Z"].unit
    >>> g/km

2.
Passing arguments in plotting

    plt.plot(df["X"],df["Y"],xlabel = "X [g/km]", ylabel = "Y [km/h]", xlim = ....)
    
What if

    df.plot("X", "Y") 
    
units, labels, labels taken care of by default.

3.
Error handling

    is_neg = df["X"] < 0
    df.loc[is_neq,"X"] = np.nan
    # repeat for every column

What if

    # defined centrally for all columns
    df["X"].lower_limit
    >>> 0
    df["X"].upper_limit
    >>> 200   
    
    df = df.enforce_limits()

4.
Documentation

What if
    
    df["X"].info
    >>> "X [g/s] measured with sensor AAAA. Call Guus at 0642114412 for more info."


In [54]:
import pint
import pandas as pd
import numpy as np

# Simple and extended functionality

## Simple: Unit - Column association 

In [55]:
_u = pint.UnitRegistry()

units = {}
units["gpsv"] = _u("kilometer per hour")
units["emissions"] = _u("milligram per second")

In [56]:
units

{'emissions': <Quantity(1.0, 'milligram / second')>,
 'gpsv': <Quantity(1.0, 'kilometer / hour')>}

## Extended: MetaData - Column association incl. unit support

### E.g. using a register (e.g. flat YAML or from a DB )

In [57]:
%%writefile myRegister.yaml

emissions :
    repr : CO2 [mg/s]
    unit : milligram per second
    type : number
    upper_limit : 400
    lower_limit : 0    
    description : CO2 emission breathing while riding the bike.
    info : Measured using XXX sensor, etc.
        
gpsv :
    repr : V [km/h]
    unit : kilometer per hour
    type : number
    upper_limit : 30
    lower_limit : 0
    description : GPS Velocity
    docstring : Measured using YYY sensor etc.

Overwriting myRegister.yaml


Human readable, easy to debug. 

Even better: Separate files per property, e.g.:

register2/unit.yaml:
<code>
emissions : milligram per second
gpsv : kilomter per hour
</code>

register2/repr.yaml:
<code>
after_cat_nox_mf : CO2 [mg/s]
gpsv : V [km/h]
</code>

Allows one to add new properties by creating new files, e.g. 

is_emission.yaml:
<code>
emissions : True
gpsv : False
</code>

Loading a register

In [58]:
import yaml

with open("MyRegister.yaml", 'r') as f:
    read_data = f.read()
    myRegister = yaml.load(read_data)
f.closed

units2 = {key: _u(prop["unit"]) for key, prop in myRegister.items()}

Units

In [59]:
units2

{'emissions': <Quantity(1.0, 'milligram / second')>,
 'gpsv': <Quantity(1.0, 'kilometer / hour')>}

String representations

In [60]:
reprs = {key: prop["repr"] for key, prop in myRegister.items()}
reprs

{'emissions': 'CO2 [mg/s]', 'gpsv': 'V [km/h]'}

## The Classes

In [61]:
class MyDataFrame:
    """Unit-ed Data Frame"""
    def __init__(self,df,units):
        union = [col for col in list(df) if col in list(units)]
        if len(union) != len(list(df)):
            print("Warning: Not all columns have an associated unit.")
            
        self.df = df.copy()
        self.units = units
    
    def __getitem__(self, key):
        if key in units.keys():
            return MySeries(self.df[key],self.units[key])
        else:
            print("No unit associated to key.")
        
    def __setitem__(self, key, mySeries):
            self.df[key] = mySeries.series
            self.units[key] = mySeries.unit
            
    def __repr__(self):
        
        header = "\n".join(["{:20s}: {:10s}".format(colname,u.units) for colname,u in self.units.items()])
        return header+"\n\n"+self.df.head().__repr__()
    
    def describe(self, stats = ["mean","std"]):

        real_var_names = list(self.df.select_dtypes(include=['int', 'float']).columns) # real = real numbers
        string_var_names = [col for col in list(df) if col not in real_var_names]

        def padder(input,width,right=True):
            if right:
                return "{:>{width}}".format(input,width=width)
            else:
                return "{:<{width}}".format(input,width=width)
        
        header = []
        
        # REAL VAR NAME COLUMN
        
        
        name_col = []
        width = 20
        
        for name in list(real_var_names):
            padded = "{:<{width}}".format(name,width=width)
            name_col.append(padded)       
        
        header.append("{:<{width}}".format("Variable",width=width))
        # UNIT COLUMN
        unit_col = []
        width = 6
        
        for name in real_var_names:            
            if name in self.units.keys():
                unit = self.units[name].units
                unit_str = "{:~}".format(unit)
                
                # if missing pretty format
                if len(unit_str) == 0:
                    unit_str = str(unit)
            else:
                unit_str = '?'
            padded = "{:>{width}}".format(unit_str,width=width)
            
            unit_col.append(padded)
            
        header.append("{:<{width}}".format("Unit",width=width))
                    
        # STATISTICS COLUMN
        
        stat_cols = []
        width = 12
        for fn in stats:
            stat_col = []
            for name in real_var_names:
                val = getattr(mdf[name].series, fn)()
                padded = "{:>{width}.2f}".format(val,width=width)
                stat_col.append(padded)
            stat_cols.append(stat_col)
            header.append("{:>{width}}".format(fn,width=width))
        
        # ASEMMBLE
        rows = []
        head = "".join(header)
        line = len(head)*"-"
        for i in range(len(real_var_names)):
            row = "".join([name_col[i],unit_col[i]] + [stat_col[i] for stat_col in stat_cols])
            rows.append(row)
        
        print("\n".join([head,line]+rows))


In [62]:
class MySeries:
    """Unit-ed Series"""
    def __init__(self,series,unit):
        self.series = series.copy()
        self.unit = unit
        
    def __repr__(self):
        print("Unit: ", self.unit)
        return self.series.head().__repr__()
    
    def __truediv__(self,other):
        return self.__class__(self.series/other.series,1*(self.unit/other.unit))
    
    def __mul__(self,other):
        return self.__class__(self.series*other.series,1*(self.unit*other.unit))
    
    def __eq__(self, other):
        return self.series.__eq__(self.series,other)
    
    def __getitem__(self,key):
        return self.series[key]
    
    def notnull(self):
        return self.series.notnull()
    
    def to(self, unit):
        original = 1*self.unit
        altered = original.to(unit)
        return MySeries(self.series*altered.magnitude,1*altered.units)
    
    def loc(self,*args,**kwargs):
        return self.series.loc(*args,**kwargs)
    
    def plot(self,*args,**kwargs):
        return self.series.plot(*args,**kwargs)      

# Example

In [70]:
df = pd.DataFrame({"gpsv":[0,1,10,20,30,40,50],"emissions":[10,10,10,15,20,20,30]}).astype(float)

In [71]:
mdf = MyDataFrame(df,units)

In [72]:
mdf

emissions           : milligram / second
emissions gpkm      : gram / kilometer
gpsv                : kilometer / hour

   emissions  gpsv
0       10.0   0.0
1       10.0   1.0
2       10.0  10.0
3       15.0  20.0
4       20.0  30.0

In [73]:
mdf.describe()

Variable            Unit          mean         std
--------------------------------------------------
emissions           mg / s       16.43        7.48
gpsv                km / hr       21.57       19.34


In [74]:
mdf["emissions gpkm"] = (mdf["emissions"]/mdf["gpsv"]).to("gram per kilometer")
mdf["emissions gpkm"]

Unit:  1 gram / kilometer


0          inf
1    36.000000
2     3.600000
3     2.700000
4     2.400000
Name: emissions gpkm, dtype: float64

In [75]:
mdf

emissions           : milligram / second
emissions gpkm      : gram / kilometer
gpsv                : kilometer / hour

   emissions  gpsv  emissions gpkm
0       10.0   0.0             inf
1       10.0   1.0       36.000000
2       10.0  10.0        3.600000
3       15.0  20.0        2.700000
4       20.0  30.0        2.400000

In [76]:
mdf.describe(stats=["mean","std","min","max"])

Variable            Unit          mean         std         min         max
--------------------------------------------------------------------------
emissions           mg / s       16.43        7.48       10.00       30.00
gpsv                km / hr       21.57       19.34        0.00       50.00
emissions gpkm      g / km         inf         nan        1.80         inf


# Plans: 

Build out prototype functionality: error handling, plotting to explore the concept.

Demonstrate the concept via refactor of existing (core) code.