In [8]:
from abc import abstractmethod, abstractproperty, ABC
from typing import Any, TypeAlias, TypeVar, Generic
from enum import Enum

import pandas as pd
import numpy as np

# Using *Enums* for data type

In [9]:
# Abstract type
class BaseDataFormat(ABC):
    pass

# Define concrete types
class StructuredDataFormat(Enum):
    PD_DATAFRAME = pd.DataFrame
    NP_ARRAY = np.ndarray

class SemiStructuredDataFormat(Enum):
    JSONL = "jsonl"

class FileFormat(Enum):
    PATH = "path"
    PATH_LIST = "path_list"

# *Register* enums as concrete types
BaseDataFormat.register(FileFormat)
BaseDataFormat.register(StructuredDataFormat)
BaseDataFormat.register(SemiStructuredDataFormat);

## Parameterize Generics using *abstract* type

In [10]:
DataFormat = TypeVar("DataFormat", bound=BaseDataFormat)

class DataSetInterface(Generic[DataFormat]):
    @abstractmethod
    def __init__(self, data_format: DataFormat):
        pass

    @abstractmethod
    def to_format(self, data_format: DataFormat):
        pass

StructuredDataSetInterface = DataSetInterface[StructuredDataFormat]

print(
    isinstance(StructuredDataFormat.PD_DATAFRAME, BaseDataFormat),
    isinstance(StructuredDataFormat, BaseDataFormat)
)

True False


## Parameterize Generics listing individual *concrete* types

In [39]:
DataFormat_v2 = TypeVar(
    "DataFormat_v2", 
    StructuredDataFormat, SemiStructuredDataFormat, FileFormat,
)

class DataSetInterface(Generic[DataFormat_v2]):
    @abstractmethod
    def __init__(self, data_format: DataFormat_v2):
        pass

    @abstractmethod
    def to_format(self, data_format: DataFormat_v2):
        pass

StructuredDataSetInterface = DataSetInterface[StructuredDataFormat]

# Replace Enum with class hierarchy

In [3]:
%%writefile /var/tmp/generics_v3.py

from abc import abstractmethod, abstractproperty, ABC
from typing import Any, TypeAlias, NewType, TypeVar, Type, Generic

import pandas as pd
import numpy as np


# Data Formats
# ============

# Abstract type for data formats
class BaseDataFormat(ABC):
    @abstractproperty
    def object_type(self):
        pass

# Structured Data Formats
# -----------------------
class StructuredDataFormat(BaseDataFormat):
    pass

class PandasFormat(StructuredDataFormat):
    def object_type(self) -> type[pd.DataFrame]:
        return pd.DataFrame

class NumpyFormat(StructuredDataFormat):
    def object_type(self) -> type[np.ndarray]:
        return np.ndarray

# Semi-structured Data Formats
# ----------------------------
class SemiStructuredDataFormat(BaseDataFormat):
    pass

class JsonlinesFormat(SemiStructuredDataFormat):
    def object_type(self) -> type[dict]:
        return dict


# DataSetInterface
# ================

DataFormat = TypeVar(
    "DataFormat", 
    StructuredDataFormat, SemiStructuredDataFormat, JsonlinesFormat,
    covariant=False, contravariant=False,  # invariant
)

StructuredData: TypeAlias = pd.DataFrame | np.ndarray
SemiStructuredData: TypeAlias = dict | list[dict]
Data: TypeAlias = StructuredData | SemiStructuredData

class DataSetInterface(Generic[DataFormat]):
    @abstractmethod
    def __init__(self, data: Data, data_format: Type[Data]):
        pass

    @abstractmethod
    def to_format(self, data_format: Type[Data]) -> Data:
        pass


# Proper usage
StructuredDataSetInterface = DataSetInterface[StructuredDataFormat]

# Wrong usage
PandasDataSetInterface = DataSetInterface[PandasFormat]


# Implementation
# ==============

class StructuredDataSetImplementation(DataSetInterface[StructuredDataFormat]):
    def __init__(self, data_format: StructuredDataFormat, data: StructuredData):
        self.data = data 

    def to_format(self, data_format: StructuredDataFormat) -> :
        return self.data 

Overwriting /var/tmp/generics_v3.py


In [4]:
! mypy /var/tmp/generics_v3.py

/var/tmp/generics_v3.py:66: [1m[31merror:[m Value of type variable [m[1m"DataFormat"[m of [m[1m"DataSetInterface"[m cannot be [m[1m"PandasFormat"[m  [m[33m[type-var][m
/var/tmp/generics_v3.py:76: [1m[31merror:[m Argument 1 of [m[1m"to_format"[m is incompatible with supertype [m[1m"DataSetInterface"[m; supertype defines the argument type as [m[1m"type[DataFrame] | type[ndarray[Any, Any]]"[m  [m[33m[override][m
/var/tmp/generics_v3.py:76: [34mnote:[m This violates the Liskov substitution principle[m
/var/tmp/generics_v3.py:76: [34mnote:[m See [4mhttps://mypy.readthedocs.io/en/stable/common_issues.html#incompatible-overrides[m[m
[1m[31mFound 2 errors in 1 file (checked 1 source file)[m


# Replacing Generics with separate interfaces for each dataset type

In [None]:
# %%writefile /var/tmp/no_generics.py

from abc import abstractmethod, abstractproperty, ABC
from typing import Any, TypeAlias, NewType, TypeVar, Type, NoReturn

import pandas as pd
import numpy as np


# DataSetInterface
# ================


class DataSetInterface(ABC):
    pass

class StructuredDataSetInterface(DataSetInterface):
    @abstractmethod
    @classmethod
    def from_pandas(cls, pd.DataFrame):
        pass

    @abstractmethod
    def to_pandas(self) -> pd.DataFrame:
        pass

    @abstractmethod
    @classmethod
    def from_numpy(cls, np.ndarray):
        pass

    @abstractmethod
    def to_numpy(self) -> np.ndarray:
        pass

    @abstractmethod
    def get_column_names(self) -> list[str]:
        pass


# Implementation
# ==============

class StructuredDataSetImplementation(DataSetInterface):
    def __init__(self, pd_data_frame: pd.DataFrame):
        self.data = pd_data_frame
        

    @classmethod
    def from_pandas(cls, data: pd.DataFrame):
        return cls(pd_data_frame=data)

    def to_pandas(self) -> pd.DataFrame:
        return self.data    
            
    @classmethod
    def from_numpy(cls, data: np.ndarray):
        return cls(
            pd_data_frame=pd.DataFrame(data)
        )

    def to_numpy(self) -> np.ndarray:
        return self.data.to_numpy()
    
    def get_column_names(self) -> list[str]:
        return self.data.columns.tolist()