https://towardsdatascience.com/the-unreasonable-effectiveness-of-method-chaining-in-pandas-15c2109e3c69

In [12]:
import pandas as pd
import numpy as np
# Load the wine data and lower-case every column label in one chained step,
# then preview the first rows.
wine = pd.read_csv("wine-clustering.csv").rename(columns=str.lower)
wine.head()

Unnamed: 0,alcohol,malic_acid,ash,ash_alcanity,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280,proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [13]:
# Strong (alcohol > 14) wines that also pass the colour filter
# (hue > 1 and colour intensity > 7), strongest first.
(
    wine.rename(columns={"color_intensity": "ci"})
    .assign(
        # 1 where both conditions hold, 0 otherwise
        color_filter=lambda frame: (
            (frame["hue"] > 1) & (frame["ci"] > 7)
        ).astype(int)
    )
    .loc[lambda frame: (frame["alcohol"] > 14) & (frame["color_filter"] == 1)]
    .sort_values(by="alcohol", ascending=False)
    .reset_index(drop=True)
    .loc[:, ["alcohol", "ci", "hue"]]
)

Unnamed: 0,alcohol,ci,hue
0,14.38,7.5,1.2
1,14.19,8.7,1.23


In [15]:
def csnap(df, fn=lambda x: x.shape, msg=None):
    """Show a snapshot of ``df`` in the middle of a method chain.

    Intended for ``df.pipe(csnap, ...)``: the frame is handed back
    untouched so the chain can continue.

    df  - the object flowing through the chain.
    fn  - callable applied to ``df``; its result is what gets displayed
          (defaults to the frame's shape).
    msg - optional text printed before the snapshot; skipped when falsy.
    """
    if msg:
        print(msg)
    snapshot = fn(df)
    # `display` is not imported here; it is expected to be provided by the
    # notebook environment.
    display(snapshot)
    return df

In [16]:
# Same pipeline as before, with csnap checkpoints between the steps so the
# effect of each transformation can be inspected without breaking the chain.
(
    wine.pipe(csnap)  # default snapshot: shape of the incoming frame
    .rename(columns={"color_intensity": "ci"})
    .assign(color_filter=lambda x: np.where((x.hue > 1) & (x.ci > 7), 1, 0))
    .pipe(csnap)  # shape after the extra column was added
    .query("alcohol > 14")
    .pipe(csnap)  # shape after the row filter
    .sort_values("alcohol", ascending=False)
    .reset_index(drop=True)
    .loc[:, ["alcohol", "ci", "hue"]]
    # Fixed seed: without it the sampled preview differs on every run, so the
    # stored output could never be reproduced by Restart & Run All.
    .pipe(csnap, lambda x: x.sample(5, random_state=42))
)

(178, 13)

(178, 14)

(22, 14)

Unnamed: 0,alcohol,ci,hue
17,14.1,6.2,1.07
7,14.3,6.2,1.07
13,14.19,8.7,1.23
2,14.39,5.25,1.02
14,14.16,9.7,0.62


Unnamed: 0,alcohol,ci,hue
0,14.83,5.2,1.08
1,14.75,5.4,1.25
2,14.39,5.25,1.02
3,14.38,4.9,1.04
4,14.38,7.5,1.2
5,14.37,7.8,0.86
6,14.34,13.0,0.57
7,14.3,6.2,1.07
8,14.23,5.64,1.04
9,14.22,5.1,0.89


In [17]:
import logging
import sys

# Send DEBUG-and-above records from the root logger to mylog.log (append mode).
logger = logging.getLogger()
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
# Re-running this cell must not stack another FileHandler on the root logger:
# every extra handler would write each record one more time. The handler is
# therefore registered under a name and reused when it is already attached.
fhandler = next(
    (h for h in logger.handlers if h.get_name() == "mylog-file"), None
)
if fhandler is None:
    fhandler = logging.FileHandler(filename="mylog.log", mode="a")
    fhandler.set_name("mylog-file")
    logger.addHandler(fhandler)
fhandler.setFormatter(formatter)
logger.setLevel(logging.DEBUG)


def csnap(df, fn=lambda x: x.shape, msg=None):
    """Log a snapshot of ``df`` in the middle of a method chain.

    Intended for ``df.pipe(csnap, ...)``: the frame is handed back
    untouched so the chain can continue.

    df  - the object flowing through the chain.
    fn  - callable applied to ``df``; the string form of its result is
          logged at DEBUG level (defaults to the frame's shape).
    msg - optional text printed (not logged) before the snapshot; skipped
          when falsy.
    """
    if msg:
        print(msg)
    # No `global` statement is needed: `logging` is only read, never rebound.
    # The value is passed as a %-style argument so the logging machinery does
    # the string formatting, and only for records that are actually emitted.
    logging.debug("%s", fn(df))
    return df

In [18]:
def setcols(df, fn=lambda x: x.columns.map('_'.join), cols=None):
    """Return ``df`` relabelled with new column names (for use with ``.pipe``).

    df   - frame to relabel. It is left untouched; a relabelled frame is
           returned, so re-running a chain always starts from the same input.
    fn   - callable invoked as ``fn(df)`` to compute the new labels when
           ``cols`` is not given. The default maps ``'_'.join`` over the
           existing labels (presumably meant for tuple labels of a
           multi-level column index - confirm before relying on it for
           plain string labels).
    cols - explicit list-like of labels; when given, ``fn`` is not called.

    Returns the relabelled DataFrame.
    """
    # Compare against None instead of relying on truthiness: bool() of a
    # pandas Index or NumPy array with more than one element raises ValueError.
    new_labels = fn(df) if cols is None else cols
    # set_axis hands back a new frame instead of assigning to df.columns.
    return df.set_axis(new_labels, axis=1)

In [21]:
from sklearn import datasets
# Build the iris feature frame from scikit-learn's bundled data set.
data = datasets.load_iris()
iris = pd.DataFrame(data.data, columns=data.feature_names)

# Normalise the column labels inside the chain, with a snapshot on either side.
(
    iris.pipe(csnap, lambda x: x.head(), msg="Before")
    .pipe(
        setcols,
        # "(cm)" is matched as literal text with regex=False spelled out.
        # The previous escaped pattern relied on the regex default of
        # str.replace, which is False since pandas 2.0, so the suffix was
        # silently left in place on current versions.
        fn=lambda x: x.columns.str.lower()
        .str.replace("(cm)", "", regex=False)
        .str.strip()
        .str.replace(" ", "_", regex=False),
    )
    .pipe(csnap, lambda x: x.head(), msg="After")
)

Before
After


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [22]:
def cfilter(df, fn, axis="rows"):
    """Filter ``df`` with a selector computed from the frame itself.

    Intended for ``df.pipe(cfilter, ...)`` with a single-level index.

    df   - frame to filter.
    fn   - callable invoked as ``fn(df)``; expected to return a boolean
           vector matching the filtered axis in length.
    axis - "rows" applies the selector as ``df[selector]``;
           "columns" applies it as ``df.iloc[:, selector]``.

    Returns the filtered DataFrame.
    Raises ValueError if ``axis`` is neither "rows" nor "columns".
    """
    if axis == "rows":
        return df[fn(df)]
    if axis == "columns":
        return df.iloc[:, fn(df)]
    # Falling through would return None and make the next step of the chain
    # fail with an unrelated-looking error, so reject the value here.
    raise ValueError(f"axis must be 'rows' or 'columns', got {axis!r}")
    
# Normalise the column labels, then keep only the sepal measurements.
(
    iris.pipe(
        setcols,
        # "(cm)" is matched as literal text with regex=False spelled out.
        # The previous escaped pattern relied on the regex default of
        # str.replace, which is False since pandas 2.0, so the suffix was
        # silently left in place on current versions.
        fn=lambda x: x.columns.str.lower()
        .str.replace("(cm)", "", regex=False)
        .str.strip()
        .str.replace(" ", "_", regex=False),
    ).pipe(cfilter, lambda x: x.columns.str.contains("sepal"), axis="columns")
)

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
...,...,...
145,6.7,3.0
146,6.3,2.5
147,6.5,3.0
148,6.2,3.4
