In [99]:
import logging
import os
import re
import shutil
from collections import OrderedDict
from copy import deepcopy
from itertools import chain
from pathlib import Path

import pandas as pd
from IPython.display import HTML, Image
from pandas import DataFrame, Series

import rolling_pin.tools as rpt
from rolling_pin.blob_etl import BlobETL
from rolling_pin.tools import COLOR_SCHEME

In [125]:
# Color scheme used by ConformETL.to_html: a private copy of the rolling_pin
# defaults (so COLOR_SCHEME itself is never mutated) with the font and edge
# colors overridden.
CONFORM_COLOR_SCHEME = deepcopy(COLOR_SCHEME)
CONFORM_COLOR_SCHEME.update(
    node_font='#DE958E',
    node_value_font='#B6ECF3',
    edge='#DE958E',
    edge_value='#B6ECF3',
    node_library_font='#B6ECF3',
    node_module_font='#DE958E',
    edge_library='#B6ECF3',
    edge_module='#DE958E',
)

def copy_lines(source, target, include_regex=None, exclude_regex=None):
    '''
    Copy a text file line by line, keeping only the lines that pass the given
    regex filters.

    The source is read completely before the target is opened for writing.

    Args:
        source (str or Path): File to read.
        target (str or Path): File to write. Its parent directory is created
            if it does not exist.
        include_regex (str, optional): Keep only lines this regex is found in.
            Default: None (keep all lines).
        exclude_regex (str, optional): Drop lines this regex is found in.
            Default: None (drop no lines).
    '''
    with open(source) as f:
        lines = f.read().split('\n')

    # Compile each pattern once, and only when a filter was actually requested.
    if include_regex is not None:
        include_re = re.compile(include_regex)
        lines = [x for x in lines if include_re.search(x)]
    if exclude_regex is not None:
        exclude_re = re.compile(exclude_regex)
        lines = [x for x in lines if not exclude_re.search(x)]

    # Same guarantee copy_file gives: the target directory always exists.
    os.makedirs(Path(target).parent, exist_ok=True)
    with open(target, 'w') as f:
        f.write('\n'.join(lines))
        
def copy_file(source, target):
    '''
    Copy a file to target, creating the target's parent directories first.

    Args:
        source (str or Path): File to copy.
        target (str or Path): Destination filepath.
    '''
    os.makedirs(Path(target).parent, exist_ok=True)
    # The os module has no copy function; shutil.copy is the stdlib file copy.
    shutil.copy(source, target)

class ConformETL:
    '''
    Builds a table of source files, the target paths they are conformed to and
    the groups they belong to, and copies the files of selected groups to
    those targets.
    '''
    def __init__(self, source_rules=None, rename_rules=None, group_rules=None, lines_rules=None):
        '''
        Args:
            source_rules (list[dict], optional): Each rule has a 'path' key
                and optional 'include' / 'exclude' regex keys, which are
                passed to rolling_pin.tools.list_all_files. Default: None.
            rename_rules (list[dict], optional): Each rule has 'regex' and
                'replace' keys. Rules are applied in order, with re.sub, to
                every target path. Default: None.
            group_rules (list[dict], optional): Each rule has 'name' and
                'regex' keys. A file whose source path the regex is found in
                joins that group. Files that join no group are put in 'base'.
                Default: None.
            lines_rules (list[dict], optional): Each rule has a 'group' key
                and optional 'include' / 'exclude' regex keys. Used by conform
                to filter the lines of that group's files. Default: None.
        '''
        # None defaults avoid sharing one mutable list across instances.
        source_rules = source_rules or []
        rename_rules = rename_rules or []
        group_rules = group_rules or []
        lines_rules = lines_rules or []

        files = []
        for rule in source_rules:
            files.extend(rpt.list_all_files(
                rule['path'],
                include_regex=rule.get('include'),
                exclude_regex=rule.get('exclude'),
            ))
        sources = sorted([x.as_posix() for x in files])

        # Targets start out as the source paths and are rewritten rule by rule.
        targets = list(sources)
        for rule in rename_rules:
            targets = [
                re.sub(rule['regex'], rule['replace'], x) for x in targets
            ]

        # Groups are always derived from the source path, never the target.
        groups = []
        for source in sources:
            names = [
                rule['name'] for rule in group_rules
                if re.search(rule['regex'], source)
            ]
            groups.append(names or ['base'])

        data = DataFrame({'source': sources, 'target': targets})
        # Explicit object Series keeps each list as a single cell value.
        data['groups'] = Series(groups, index=data.index, dtype=object)

        self._data = data
        # Store the argument itself rather than a same-named notebook global.
        self._line_rules = lines_rules

    def __repr__(self):
        '''
        Returns:
            str: The file table with upper-cased column headers.
        '''
        return self._data \
            .rename(lambda x: x.upper(), axis=1) \
            .to_string(
                index=False,
                max_colwidth=150,
                col_space=[50, 50, 20]
            )

    @property
    def groups(self):
        '''
        Returns:
            list[str]: Sorted unique group names found in the file table, with
                'base' moved to the front when it is present.
        '''
        names = set(chain.from_iterable(self._data.groups.tolist()))
        output = sorted(names - {'base'})
        # 'base' is absent when every file matched at least one group rule.
        if 'base' in names:
            output.insert(0, 'base')
        return output

    def to_html(self, orient='lr', color_scheme=CONFORM_COLOR_SCHEME):
        '''
        Render the target -> source mapping through BlobETL.

        Args:
            orient (str, optional): Passed to BlobETL.to_html. Default: 'lr'.
            color_scheme (dict, optional): Passed to BlobETL.to_html.
                Default: CONFORM_COLOR_SCHEME.

        Returns:
            The result of BlobETL.to_html.
        '''
        data = self._data
        data = dict(zip(data.target.tolist(), data.source.tolist()))
        return BlobETL(data).to_html(orient=orient, color_scheme=color_scheme)

    def conform(self, groups='all'):
        '''
        Copy the files of the given groups to their target paths, then apply
        the line rules to the copied files of each rule's group.

        Args:
            groups (str or list[str], optional): A group name, a list of group
                names, or 'all' for every group. Default: 'all'.
        '''
        if isinstance(groups, str):
            groups = self.groups if groups == 'all' else [groups]
        selected = set(groups)

        data = self._data
        rows = [
            (source, target, grps)
            for source, target, grps
            in zip(data.source, data.target, data.groups)
            if not selected.isdisjoint(grps)
        ]

        for source, target, _ in rows:
            copy_file(source, target)

        # Line rules re-read the source and overwrite the freshly copied
        # target with only the lines that pass the rule's filters.
        for rule in self._line_rules:
            for source, target, grps in rows:
                if rule['group'] in grps:
                    copy_lines(
                        source,
                        target,
                        include_regex=rule.get('include'),
                        exclude_regex=rule.get('exclude'),
                    )

In [128]:
# Rules that conform the rolling-pin repo into a flattened layout under
# target_dir.
# NOTE(review): the leading '.' in exclude_re is unescaped, so it matches any
# character before git/mypy/pytest rather than only a literal dot -- confirm
# whether r'\.' was intended before changing it.
exclude_re = '.(git|mypy|pytest)|__(pycache|mypy)__'
source_dir = '/home/ubuntu/rolling-pin'
target_dir = '/tmp/repo'
source_rules = [
    dict(
        path=source_dir + '/python/rolling_pin',
        include=None,
        exclude=exclude_re,
    ),
    dict(
        path=source_dir + '/docker',
        # Raw string: '\.' is not a valid escape in a normal string literal.
        include=r'pyproject\.toml|pdm\.lock|pdm\.toml',
        exclude=exclude_re,
    ),
    dict(
        path=source_dir,
        include='README|LICENSE',
        exclude=exclude_re + '|docker',
    ),
]

# Applied in order to every target path.
rename_rules = [
    # Derived from source_dir / target_dir so the paths are defined only once.
    dict(regex=re.escape(source_dir), replace=target_dir),
    dict(regex='/docker', replace=''),
    dict(regex='/python', replace=''),
    dict(regex=r'/pdm\.lock', replace='/.pdm.lock'),
]

# Matched against source paths; files that match no rule land in 'base'.
group_rules = [
    dict(name='init', regex=r'__init__\.py$'),
    dict(name='test', regex=r'_test\.py$'),
    dict(name='resource', regex='/resources'),
]

# Lines containing 'test' are dropped from the files of the init group.
line_rules = [
    dict(
        group='init',
        include=None,
        exclude='test',
    )
]

d = ConformETL(source_rules, rename_rules, group_rules, line_rules)
d.to_html()

In [127]:
def buffer_filepath(data):
    '''
    Render each filepath as a padded text row, so that directory segments,
    separators and filenames line up in columns from one row to the next.

    Args:
        data (Series): Filepaths. The first part of each path is dropped
            before rendering, so paths are presumably absolute -- confirm
            with callers.

    Returns:
        Series: One rendered row per filepath, carrying the index of data.
    '''
    # An empty frame would render as pandas' "Empty DataFrame" message.
    if len(data) == 0:
        return Series([], index=data.index, dtype=object)

    paths = [Path(x) for x in data.tolist()]

    # Each directory segment becomes a '/' cell followed by a name cell.
    rows = []
    for path in paths:
        row = []
        for part in path.parts[1:-1]:
            row.extend(['/', part])
        rows.append(row)

    output = DataFrame(rows)
    output['sep'] = '/'
    # A plain list is assigned positionally; assigning a Series would align
    # on index labels and blank out filenames when data is not 0..n-1 indexed.
    output['filename'] = [path.name for path in paths]
    output = output.fillna('')
    output.columns = list(range(output.shape[1]))
    lines = output.to_string(index=False, header=False).split('\n')

    # Reuse the caller's index so assigning the result back stays aligned.
    return Series(lines, index=data.index)

# Print a column-aligned SOURCE ==> TARGET listing of the conform plan.
data = d._data.copy()
for column in ['source', 'target']:
    data[column] = buffer_filepath(data[column])
data['==>'] = '==>'
data = data[['source', '==>', 'target', 'groups']]
data = data.rename(columns=str.upper)
x = data.to_string(
    index=False,
    max_colwidth=150,
    col_space=[80, 10, 80, 20],
    justify='left',
)
print(x)

SOURCE                                                                           ==>        TARGET                                                                           GROUPS              
/ home / ubuntu / rolling-pin                        /           LICENSE         ==>        / tmp / repo               /           LICENSE                                   [base]              
/ home / ubuntu / rolling-pin                        /         README.md         ==>        / tmp / repo               /         README.md                                   [base]              
/ home / ubuntu / rolling-pin / python / rolling_pin /       __init__.py         ==>        / tmp / repo / rolling_pin /       __init__.py                                   [init]              
/ home / ubuntu / rolling-pin / python / rolling_pin /            app.py         ==>        / tmp / repo / rolling_pin /            app.py                                   [base]              
/ home / ubuntu / rolling-pin 

In [126]:
# Plain-text view of the plan. ConformETL defines only __repr__, so print
# falls back to it and shows the SOURCE / TARGET / GROUPS table.
print(d)

                                                       SOURCE                                             TARGET               GROUPS
                             /home/ubuntu/rolling-pin/LICENSE                                  /tmp/repo/LICENSE               [base]
                           /home/ubuntu/rolling-pin/README.md                                /tmp/repo/README.md               [base]
      /home/ubuntu/rolling-pin/python/rolling_pin/__init__.py                  /tmp/repo/rolling_pin/__init__.py               [init]
           /home/ubuntu/rolling-pin/python/rolling_pin/app.py                       /tmp/repo/rolling_pin/app.py               [base]
      /home/ubuntu/rolling-pin/python/rolling_pin/blob_etl.py                  /tmp/repo/rolling_pin/blob_etl.py               [base]
 /home/ubuntu/rolling-pin/python/rolling_pin/blob_etl_test.py             /tmp/repo/rolling_pin/blob_etl_test.py               [test]
     /home/ubuntu/rolling-pin/python/rolling_pin/radon_etl.py 

In [None]:
# YAML form of the conform rules, here written for the lunchbox repo.
# Regex values are single-quoted so YAML keeps their backslashes literal; a
# double-quoted YAML scalar rejects escapes such as "\.". A missing include
# filter is spelled null, because a bare None is read by YAML as the string
# "None".
config = '''
source_rules:
  - path: "/home/ubuntu/lunchbox"
    include: 'lunchbox/python/lunchbox'
    exclude: '.(git|mypy|pytest)|__(pycache|mypy)__'
  - path: /home/ubuntu/lunchbox/docker
    include: 'pyproject\\.toml|pdm\\.lock|pdm\\.toml'
    exclude: '.(git|mypy|pytest)|__(pycache|mypy)__'
  - path: /home/ubuntu/lunchbox
    include: 'README|LICENSE'
    exclude: '.(git|mypy|pytest)|__(pycache|mypy)__|docker'

rename_rules:
  - regex: '/home/ubuntu/lunchbox'
    replace: "/tmp/repo"
  - regex: '/docker'
    replace: ""
  - regex: '/python'
    replace: ""
  - regex: '/pdm\\.lock'
    replace: "/.pdm.lock"

group_rules:
  - name: init
    regex: '__init__\\.py$'
  - name: test
    regex: '_test\\.py$'
  - name: resource
    regex: '/resources'

line_rules:
  - group: init
    include: null
    exclude: 'test'

'''