In [1]:
import linref as lr
import numpy as np
import pandas as pd
import hashlib

In [2]:
# Sample buffer for the hashing experiments: 100 random floats (unseeded, so values differ per run)
ar1 = np.random.rand(100)

In [3]:
# SHA-256 hash object built directly from the array passed as the data buffer
h11 = hashlib.sha256(ar1)

In [4]:
# NOTE: rebinds `ar1` to a new 1000-element array; any later cell that reads
# `ar1` sees this array, not the 100-element one that `h11` was built from.
ar1 = np.random.rand(1000)
# Second hash object, built from the new array, for comparison against `h11`
h12 = hashlib.sha256(ar1)

In [5]:
%%timeit
# Time building a SHA-256 hash object from whatever `ar1` currently refers to
h11 = hashlib.sha256(ar1)

4.96 µs ± 135 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [6]:
%%timeit
# Time comparing two hash objects by their hexadecimal digest strings
h11.hexdigest() == h12.hexdigest()

416 ns ± 3.75 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [49]:
import linref as lr
import numpy as np
import pandas as pd
import hashlib

class LRS:
    """
    Class for defining and managing linear referencing systems (LRS) in `pandas` DataFrames.

    An LRS is described purely by column labels: zero or more key columns,
    plus a `beg`/`end` pair, a single `loc` column, or both, and optionally a
    `geom` column. No DataFrame is inspected here; labels are only type-checked.

    Parameters
    ----------
    keys : str or iterable of str, optional
        Label(s) of the column(s) identifying a route/group. `None` means no keys.
    beg, end : str, optional
        Labels of the begin and end location columns. Must be given together.
    loc : str, optional
        Label of a single point-location column.
    geom : str, optional
        Label of a geometry column.

    Raises
    ------
    ValueError
        If any label is not a string, if `keys` is not a string or an iterable
        of strings, or if neither a `beg`/`end` pair nor `loc` is provided.
    """

    def __init__(self, keys=None, beg=None, end=None, loc=None, geom=None):
        # Log and validate parameters (each assignment goes through a validating setter)
        self.keys = keys
        self.beg = beg
        self.end = end
        self.loc = loc
        self.geom = geom

        # Ensure adequate information to define LRS
        self._validate_lrs()

    def __repr__(self):
        return f"LRS(keys={self.keys}, beg={self.beg}, end={self.end}, loc={self.loc}, geom={self.geom})"

    def __str__(self):
        # Same text as the repr; kept as an explicit method so `str()` stays stable
        return self.__repr__()

    @property
    def keys(self):
        """List of key column labels (empty list when no keys are defined)."""
        return self._keys

    @keys.setter
    def keys(self, keys):
        # Null input
        if keys is None:
            self._keys = []
        elif isinstance(keys, str):
            # A single label is wrapped so `keys` is always a list
            self._keys = [self._validate_label(keys, name='key')]
        elif isinstance(keys, list):
            # Lists surface the per-label error message unchanged
            self._keys = [self._validate_label(key, name='key') for key in keys]
        else:
            # Any other input must at least be iterable over labels. Only
            # ordinary exceptions are translated, so KeyboardInterrupt and
            # SystemExit are no longer swallowed, and the root cause is chained.
            try:
                self._keys = [self._validate_label(key, name='key') for key in keys]
            except Exception as err:
                raise ValueError(
                    "Input LRS keys must be a single or list-like of valid `pandas` column labels."
                ) from err

    @property
    def beg(self):
        """Label of the begin-location column, or None."""
        return self._beg

    @beg.setter
    def beg(self, beg):
        self._beg = self._validate_label(beg, name='beg', allow_null=True)

    @property
    def end(self):
        """Label of the end-location column, or None."""
        return self._end

    @end.setter
    def end(self, end):
        self._end = self._validate_label(end, name='end', allow_null=True)

    @property
    def loc(self):
        """Label of the point-location column, or None."""
        return self._loc

    @loc.setter
    def loc(self, loc):
        self._loc = self._validate_label(loc, name='loc', allow_null=True)

    @property
    def geom(self):
        """Label of the geometry column, or None."""
        return self._geom

    @geom.setter
    def geom(self, geom):
        self._geom = self._validate_label(geom, name='geom', allow_null=True)

    @property
    def is_linear(self):
        """True when both `beg` and `end` are defined."""
        return (self.beg is not None) and (self.end is not None)

    @property
    def is_point(self):
        """True when only `loc` is defined (no `beg` and no `end`)."""
        return (self.loc is not None) and (self.beg is None) and (self.end is None)

    @property
    def is_locational(self):
        """True when `loc` is defined, regardless of `beg`/`end`."""
        return self.loc is not None

    def _validate_label(self, label, name='label', allow_null=False):
        """
        Return `label` unchanged if it is a string (or None when `allow_null`).

        Raises
        ------
        ValueError
            If `label` is not a string and is not an allowed None.
        """
        if allow_null and label is None:
            return None
        if not isinstance(label, str):
            raise ValueError(f"LRS {name} `{label}` is invalid. Must be a valid `pandas` column label.")
        return label

    def _validate_lrs(self):
        """
        Check that the location labels form a usable LRS.

        The checks use truthiness, so an empty-string label counts as not provided.

        Raises
        ------
        ValueError
            If none of `beg`, `end`, `loc` is given, or only one of `beg`/`end`.
        """
        # Validate LRS location labels
        if (not self.beg) and (not self.end) and (not self.loc):
            raise ValueError("LRS must define at least `beg` and `end`, or `loc`. None provided.")
        if (self.beg) and (not self.end):
            raise ValueError("LRS must define both `beg` and `end` if one is provided. Only `beg` provided.")
        if (not self.beg) and (self.end):
            raise ValueError("LRS must define both `beg` and `end` if one is provided. Only `end` provided.")


def _only_if_hashing(m):
    def wrapper(*args, **kwargs):
        if args[0].hashing:
            return m(*args, **kwargs)
    return wrapper

class LRS_Manager:
    """
    Class for managing linear referencing system data in `pandas` DataFrames.

    Pairs one DataFrame with one LRS definition, caches the positional indices
    of the LRS columns and, when `hashing` is true, keeps SHA-256 digests of the
    column labels and of each LRS column so later changes can be detected.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the LRS columns. Stored by reference, not copied.
    lrs : object
        LRS definition exposing `keys`, `beg`, `end`, `loc`, `geom`,
        `is_linear`, `is_point` and `is_locational`.
    hashing : bool, default True
        When false, the hashing methods do nothing and return None.

    Raises
    ------
    ValueError
        If `df` is not a `pandas.DataFrame`.
    KeyError
        From `columns.get_loc` if an LRS label is not a column of `df`.
    """

    def __init__(self, df, lrs, hashing=True):
        # Log DataFrame and LRS objects
        self.df = df
        self.lrs = lrs
        self.hashing = hashing

        # Create every digest/status attribute up front so they exist even when
        # hashing is disabled now and enabled later. None means "not computed".
        self._columns_hash = None
        self._key_hash = None
        self._beg_hash = None
        self._end_hash = None
        self._loc_hash = None
        self._geom_hash = None
        self._columns_hash_status = None
        self._key_hash_status = None
        self._beg_hash_status = None
        self._end_hash_status = None
        self._loc_hash_status = None
        self._geom_hash_status = None

        # Set column indices for LRS keys and location information to improve performance
        self._set_column_indices()

        # Create data hashes to log current dataframe state
        self._hash_columns(save=True, compare=False)
        self._hash_lrs_data(save=True, compare=False)

    @property
    def df(self):
        """The managed DataFrame."""
        return self._df

    @df.setter
    def df(self, df):
        if not isinstance(df, pd.DataFrame):
            raise ValueError("Input DataFrame must be of type `pandas.DataFrame`.")
        self._df = df

    @staticmethod
    def _digest(obj):
        """
        Return a SHA-256 hex digest of the contents of an Index, Series or DataFrame.

        Values are first reduced to one uint64 per row with
        `pd.util.hash_pandas_object`, which hashes by content for every dtype,
        including object columns. Passing an ndarray straight to hashlib hashes
        its raw buffer instead: for object dtype that buffer holds pointers (or
        the export is refused, depending on the numpy version), and a
        non-contiguous 2-D block is rejected outright.
        """
        # A frame with no columns has nothing to hash; use the empty-input digest
        if isinstance(obj, pd.DataFrame) and obj.shape[1] == 0:
            return hashlib.sha256(b"").hexdigest()
        row_hashes = pd.util.hash_pandas_object(obj, index=False)
        # tobytes() yields a contiguous copy in row order, so the digest is order-sensitive
        return hashlib.sha256(row_hashes.to_numpy().tobytes()).hexdigest()

    def _set_column_indices(self):
        """
        Set the indices of the columns in the DataFrame that correspond to the LRS keys.

        Also records the positions of the `beg`, `end`, `loc` and `geom`
        columns; a position is None when the LRS does not define that label.
        """
        # Get the indices of the LRS keys in the DataFrame
        self._key_indices = [self.df.columns.get_loc(key) for key in self.lrs.keys]
        self._beg_index = self.df.columns.get_loc(self.lrs.beg) if self.lrs.beg else None
        self._end_index = self.df.columns.get_loc(self.lrs.end) if self.lrs.end else None
        self._loc_index = self.df.columns.get_loc(self.lrs.loc) if self.lrs.loc else None
        self._geom_index = self.df.columns.get_loc(self.lrs.geom) if self.lrs.geom else None

    @_only_if_hashing
    def _hash_columns(self, save=False, compare=False):
        """
        Hash the columns of the DataFrame to ensure that the LRS is applied to the correct columns.

        Parameters
        ----------
        save : bool
            Store the new digest as the reference for future comparisons.
        compare : bool
            When true, `_columns_hash_status` records whether the new digest
            equals the stored one; when false the status is set to True.
        """
        # Hash the column labels of the DataFrame
        columns_hash = self._digest(self.df.columns)

        # Save hash information
        if compare:
            # Log comparisons with previous hashes
            self._columns_hash_status = self._columns_hash == columns_hash
        else:
            # Set comparison to successful
            self._columns_hash_status = True
        if save:
            # Save latest hashes
            self._columns_hash = columns_hash

    @_only_if_hashing
    def _hash_lrs_data(self, save=False, compare=False):
        """
        Hash the LRS data to easily check for changes to LRS locational information.

        Digests are computed separately for the key columns, `beg`, `end`,
        `loc` and `geom`; a digest is None when the LRS does not use that column.

        Parameters
        ----------
        save : bool
            Store the new digests as the reference for future comparisons.
            Storing happens after the comparison, so `compare=True, save=True`
            compares against the previous state and then replaces it.
        compare : bool
            When true, each `_<name>_hash_status` records whether the new digest
            equals the stored one; when false every status is set to True.
        """
        # Hash key columns
        key_hash = self._digest(self.df.iloc[:, self._key_indices])
        # Hash location columns
        if self.lrs.is_linear:
            beg_hash = self._digest(self.df.iloc[:, self._beg_index])
            end_hash = self._digest(self.df.iloc[:, self._end_index])
        else:
            beg_hash = None
            end_hash = None
        if self.lrs.is_point or self.lrs.is_locational:
            loc_hash = self._digest(self.df.iloc[:, self._loc_index])
        else:
            loc_hash = None
        # Hash geometry columns
        if self.lrs.geom:
            geom_hash = self._digest(self.df.iloc[:, self._geom_index])
        else:
            geom_hash = None

        # Save hash information
        if compare:
            # Log comparisons with previous hashes
            self._key_hash_status = self._key_hash == key_hash
            self._beg_hash_status = self._beg_hash == beg_hash
            self._end_hash_status = self._end_hash == end_hash
            self._loc_hash_status = self._loc_hash == loc_hash
            self._geom_hash_status = self._geom_hash == geom_hash
        else:
            # Set comparison to successful
            self._key_hash_status = True
            self._beg_hash_status = True
            self._end_hash_status = True
            self._loc_hash_status = True
            self._geom_hash_status = True
        if save:
            # Save latest hashes
            self._key_hash = key_hash
            self._beg_hash = beg_hash
            self._end_hash = end_hash
            self._loc_hash = loc_hash
            self._geom_hash = geom_hash



@pd.api.extensions.register_dataframe_accessor("lrs")
class LRS_Accessor:
    """
    DataFrame accessor, registered as `df.lrs`, holding the linear referencing
    systems (LRS) attached to a DataFrame and one manager per LRS.
    """

    def __init__(self, obj):
        # Log extended DataFrame
        self._obj = obj
        # Set null LRS; managers start empty too so `managers` is usable
        # before any LRS has been set.
        self._lrs = []
        self._lrs_managers = []
        # Kept from the original initializer; no method in this class reads it.
        self._lrs_data = []

    def __repr__(self):
        if self.is_lrs_set:
            lrs_lines = '\n'.join(['- ' + str(o) for o in self._lrs])
        else:
            lrs_lines = "- No LRS set"
        return "LRS_Accessor with linear referencing system (LRS) objects:\n" + lrs_lines

    def __str__(self):
        # Same text as the repr
        return self.__repr__()

    @property
    def lrs(self):
        """List of LRS objects currently set on the DataFrame."""
        return self._lrs

    @lrs.setter
    def lrs(self, lrs):
        # Materialize once so a one-shot iterable is not exhausted by the
        # manager construction below, and so later mutation of the caller's
        # list cannot desynchronize `_lrs` from `_lrs_managers`.
        lrs_list = list(lrs)
        # Build all managers first; if any fails, the current state is left untouched
        managers = [LRS_Manager(self._obj, item) for item in lrs_list]
        # Set LRS objects
        self._lrs = lrs_list
        self._lrs_managers = managers

    @property
    def lrs_managers(self):
        """List of managers, one per LRS, in the same order as `lrs`."""
        return self._lrs_managers

    @property
    def managers(self):
        """Alias of `lrs_managers`."""
        return self._lrs_managers

    @property
    def is_lrs_set(self):
        """True when at least one LRS is set."""
        return len(self._lrs) > 0

    def set_lrs(self, lrs=None, **kwargs):
        """
        Set one or more linear referencing systems (LRS) for the DataFrame. The LRS objects can be provided as a 
        single `LRS` object or a list of `LRS` objects using the `lrs` keyword argument, or as a set of keyword 
        arguments to create a new `LRS` object.

        A tuple of `LRS` objects is accepted as well as a list. When `lrs` is
        given, `kwargs` are not used.

        Raises
        ------
        ValueError
            If `lrs` is neither an `LRS` nor a list/tuple made up only of `LRS` objects.
        """
        # Check for valid LRS objects
        if lrs is not None:
            if isinstance(lrs, LRS):
                lrs = [lrs]
            elif isinstance(lrs, (list, tuple)):
                if not all(isinstance(item, LRS) for item in lrs):
                    raise ValueError("Input LRS objects must be of type `LRS`.")
            else:
                raise ValueError("Input LRS objects must be of type `LRS` or a list of `LRS` objects.")
        else:
            # Create LRS objects from keyword arguments
            lrs = [LRS(**kwargs)]
        # Set LRS objects (the property setter also builds the managers)
        self.lrs = lrs

    def clear_lrs(self):
        """
        Clear all linear referencing systems (LRS) from the DataFrame.

        The associated managers are dropped as well so none refer to a removed LRS.
        """
        self._lrs = []
        self._lrs_managers = []

  class LRS_Accessor:


In [84]:
# Define sample dataset: eight base segments across four routes
df_base = pd.DataFrame({
    'route': ['A', 'A', 'B', 'B', 'B', 'C', 'C', 'D'],
    'bmp': [0, 1, 0, 1, 2, 10, 12, 0],
    'emp': [1, 3, 1, 2, 3, 11, 13, 10],
    'val1': [1, 2, 3, 4, 5, 6, 7, 8],
    'val2': [10, 20, 30, 40, 50, 60, 70, 80],
})

# Number of times the base rows are repeated to get a frame large enough to
# exercise the hashing code
N_REPEATS = 100000

# One positional take over a tiled row index gives the same rows, order,
# dtypes and fresh 0..n-1 index as concatenating N_REPEATS copies, without
# building and merging N_REPEATS separate frame objects.
df = df_base.iloc[np.tile(np.arange(len(df_base)), N_REPEATS)].reset_index(drop=True)

In [85]:
# Linear LRS keyed on `route`, with `bmp` / `emp` as the begin / end columns
lrs1 = LRS(keys='route', beg='bmp', end='emp')

# Attach the LRS to the frame through the `lrs` accessor; this also builds one manager for it
df.lrs.set_lrs(lrs1)

In [89]:
# Re-hash the LRS columns, compare each digest with the stored one, then store the new digests.
# NOTE(review): this calls a private method directly; because save=True, running the cell a
# second time compares against the digests stored by the first run.
df.lrs.managers[0]._hash_lrs_data(save=True, compare=True)

In [90]:
# Result of the last comparison for the `beg` column (`bmp`): False means its digest changed.
# The execution counts show the `bmp`-doubling cell (In [88]) ran before this comparison.
df.lrs.managers[0]._beg_hash_status

False

In [88]:
# Mutates the `bmp` column in place on the frame the manager holds, to trigger a hash mismatch.
# NOTE(review): executed as In [88], i.e. before the comparison cells (In [89], In [90]) that
# appear earlier in the file, and not idempotent -- every re-run doubles `bmp` again.
df['bmp'] = df['bmp'] * 2