In [1]:
# %load_ext autoreload
# %autoreload on

In [2]:
# import p2d2 as pandas
import pandas
import functools
import typing
import types
from types import SimpleNamespace
import json

import LazyDFDemo.ir2sql as ir2sql
import LazyDFDemo.cost_optimizer as cost_optimizer

def read_sql(sql):
    """Return a LazyDataFrame on the 'sql' backend whose query text is `sql`."""
    lazy_frame = LazyDataFrame('sql', sql)
    return lazy_frame


# Minimal stand-in for the pandas module namespace; read_sql is its only member.
pd = SimpleNamespace(read_sql=read_sql)

class Supported():
    """Callable for a frontend method that has an entry in the frontend-to-IR mapping.

    Attributes:
        func: name of the frontend method being called (e.g. the attribute
            name that was looked up on the lazy frame).
        lazydf: the lazy frame the method was looked up on; its `backend`,
            `declarative`, `df` and `ir2backend` attributes are read on call.
        possible_mappings: dict with a 'mappings' list; every entry carries
            'argtypes' (list of type names) and 'maps' (name of the function
            to call on `lazydf.ir2backend`).
    """

    def __init__(self, func, lazydf, possible_mappings):
        self.func = func
        self.lazydf = lazydf
        self.possible_mappings = possible_mappings

    def __call__(self, *args, **kwargs):
        """Translate the call into the backend's declarative form.

        The overload is selected by comparing the type names of the
        positional arguments with each mapping's 'argtypes'. Keyword
        arguments are accepted but not consulted.

        Returns:
            A new LazyDataFrame on the same backend, carrying the
            declarative text produced by the matching backend function.

        Raises:
            TypeError: if no mapping matches the positional argument types.
        """
        call_argtypes = [type(arg).__name__ for arg in args]
        for mapping in self.possible_mappings['mappings']:
            if mapping['argtypes'] == call_argtypes:
                # e.g. 'PROJECTION': the backend function that rewrites the query
                ir_func = getattr(self.lazydf.ir2backend, mapping['maps'])
                return LazyDataFrame(self.lazydf.backend,
                                     ir_func(*args, self.lazydf.declarative),
                                     self.lazydf.df)
        # Fail loudly instead of returning None, which callers would otherwise
        # rebind their frame variable to.
        raise TypeError(
            f'{self.func}() has no IR mapping for positional argument types '
            f'{call_argtypes}'
        )
        
class Unsupported():
    """Callable stand-in for a frontend method that has no IR mapping.

    Calling it ignores every argument and returns a new 'sql' LazyDataFrame
    with an empty query, built around a fixed five-row placeholder frame
    with the columns 'l_lineitem' and 'l_cost'.
    """

    def __init__(self, df, attr):
        # Only the method name is kept; `df` is accepted but not stored.
        self.attr = attr

    def __call__(self, *args, **kwargs):
        placeholder = pandas.DataFrame({
            column: [1, 2, 3, 4, 5] for column in ('l_lineitem', 'l_cost')
        })
        return LazyDataFrame('sql', '', df=placeholder)
    
class LazyDataFrame(SimpleNamespace):
    """Frame-like object that records operations as declarative backend text.

    Attributes:
        backend: backend identifier; only 'sql' selects a translator module.
        declarative: the declarative text (SQL for the 'sql' backend)
            accumulated so far.
        df: local pandas frame used to answer attributes that have no IR
            mapping.
        ir2backend: translator module for the backend (set for 'sql' only).
        frontend2ir: mapping loaded from 'LazyDFDemo/pandas2ir.json' on every
            construction; keys are frontend attribute names.
    """

    # Names assigned in __init__. __getattr__ only runs when normal lookup
    # fails, so reaching it with one of these means the attribute was never
    # set; resolving it through self.frontend2ir / self.df / self.ir2backend
    # would recurse without bound.
    _INTERNAL_ATTRS = frozenset(
        {'backend', 'declarative', 'df', 'ir2backend', 'frontend2ir'}
    )

    def __init__(self, backend, declarative = '', df = None):
        """Create a lazy frame.

        Args:
            backend: backend identifier, e.g. 'sql'.
            declarative: declarative text this frame starts from.
            df: optional local pandas frame; an empty frame is used when
                omitted.
        """
        self.backend = backend
        self.declarative = declarative
        # Keep the caller's frame instead of always discarding it.
        self.df = pandas.DataFrame() if df is None else df
        if backend == 'sql':
            self.ir2backend = ir2sql
        with open('LazyDFDemo/pandas2ir.json') as fp:
            self.frontend2ir = json.load(fp)
        print('created a lazy data frame')

    def __getattr__(self, attr):
        """Resolve an attribute that normal lookup did not find.

        Returns a Supported callable when `attr` has an IR mapping. Otherwise
        the declarative text is committed through the backend (and printed),
        and either an Unsupported callable (when `attr` is a bound method of
        the local pandas frame) or the frame's own attribute is returned.

        Raises:
            AttributeError: if `attr` is one of this class's own attributes
                that has not been set, or if the local frame lacks `attr`.
        """
        if attr in self._INTERNAL_ATTRS:
            raise AttributeError(
                f'{type(self).__name__!r} object has no attribute {attr!r}'
            )
        if attr in self.frontend2ir:
            return Supported(attr, self, self.frontend2ir[attr])

        is_method = isinstance(getattr(self.df, attr, None), types.MethodType)
        kind = 'operation' if is_method else 'attribute'
        print(f'Unsupported {kind} detected: {attr}. Evaluated declarative backend')
        print(self.ir2backend.commit(self.declarative))
        if is_method:
            return Unsupported(self.df, attr)
        return getattr(self.df, attr)

    def __getitem__(self, getitem):
        """Route `frame[key]` through the same lookup as a '__getitem__' attribute."""
        acallable = self.__getattr__('__getitem__')
        return acallable(getitem)



In [3]:
# `pd` is the SimpleNamespace shim defined above (not pandas): this only wraps
# the query text in a LazyDataFrame.
df = pd.read_sql('SELECT * FROM mytable')

created a lazy data frame


In [4]:
# `head` has an IR mapping: the SQL printed by `df.declarative` further down
# wraps the previous query in `LIMIT 5`.
# NOTE(review): rebinding `df` makes this cell non-idempotent — re-running it
# presumably nests another LIMIT; confirm and consider stage-specific names.
df = df.head(5)

created a lazy data frame


In [5]:
# Indexing goes through LazyDataFrame.__getitem__, which looks up
# '__getitem__' in the IR mapping; the resulting SQL selects the two columns.
df = df[['l_lineitem', 'l_cost']]

created a lazy data frame


In [6]:
# Show the SQL text accumulated by the lazy operations so far.
df.declarative

'SELECT l_lineitem, l_cost FROM (SELECT * FROM (SELECT * FROM mytable) AS e18c25cd-a2aa-11eb-a910-a0afbd9715b3 LIMIT 5) AS e18e70a3-a2aa-11eb-83a0-a0afbd9715b3'

In [7]:
# not in IR mapping: LazyDataFrame.__getattr__ prints the committed SQL and
# then reads `dtypes` from the local pandas frame held in `df.df`
df.dtypes

Unsupported attribute detected: dtypes. Evaluated declarative backend
SELECT l_lineitem, l_cost FROM (SELECT * FROM (SELECT * FROM mytable) AS e18c25cd-a2aa-11eb-a910-a0afbd9715b3 LIMIT 5) AS e18e70a3-a2aa-11eb-83a0-a0afbd9715b3;


Series([], dtype: object)

In [8]:
# not in IR mapping: `fillna` is a pandas method, so __getattr__ prints the
# committed SQL and returns an Unsupported callable; calling it yields a new
# LazyDataFrame
dfunsupp = df.fillna(0)

Unsupported operation detected: fillna. Evaluated declarative backend
SELECT l_lineitem, l_cost FROM (SELECT * FROM (SELECT * FROM mytable) AS e18c25cd-a2aa-11eb-a910-a0afbd9715b3 LIMIT 5) AS e18e70a3-a2aa-11eb-83a0-a0afbd9715b3;
created a lazy data frame


The result is still a lazy data frame after an unsupported operation. This means we can still implement pushing the data back to the RDBMS after an unsupported operation.

# Questions

- What do I say if they ask me why I used static code analysis? I am no longer convinced myself.
- What language __other than SQL__ can I look at in order to build a more generalized IR? Should I create my own __SQL spin-off__ in order to show the flexibility of the IR?



In [10]:

# NOTE(review): leftover scratch cell. The only `mapping` visible in this
# notebook is the loop variable inside Supported.__call__, so this raises
# NameError at top level.
mapping

NameError: name 'mapping' is not defined

In [None]:
# NOTE(review): leftover scratch cell; `mapping` is not defined at top level
# in this notebook, so this fails on a fresh kernel.
'__getitem__' in mapping

In [None]:
# NOTE(review): leftover scratch exploration; nothing else in this notebook
# uses the result.
type(int).__dict__


In [None]:
# Scratch check: Supported.__call__ compares `type(x).__name__` values like
# this one against each mapping's 'argtypes'.
int.__name__