# More on Piping, Intentions, and Column Expressions

In [1]:
import pandas as pd
from dfply import *
import matplotlib.pylab as plt
%matplotlib inline

In [2]:
artists = pd.read_csv("./data/Artists.csv")

In [3]:
# carried over from the last lecture
bad_lbls = (artists >> 
             filter_by(X.Nationality.str.lower().str.startswith('nation').astype('bool')) >>
             pull('Nationality')).unique()
recode_bad_lbls = {old_lbl:'Nationality unknown' for old_lbl in bad_lbls}
replace_zero = {0:np.NaN}

## Why we love piping? 

### Reason 1: Composition Baby!

It is very easy to put separate pipe together.

In [5]:
artists_renamed = (artists >>
                    rename(Wiki_QID = 'Wiki QID'))
artists_new = (artists_renamed >>
                mutate(Nationality = X.Nationality.replace(recode_bad_lbls)))
artists_new = (artists_new >>
                mutate(BeginDate = X.BeginDate.replace(replace_zero)))

## To compose separate pipes

1. Switch ending `)` to `>>`
2. Remove the next assignment
3. ??
4. Profit!

In [5]:
artists_renamed = (artists >>
                    rename(Wiki_QID = 'Wiki QID') >> #)
#artists_new = (artists >>
                mutate(Nationality = X.Nationality.replace(recode_bad_lbls)) >> #)
#artists_new = (artists_renamed >>
                mutate(BeginDate = X.BeginDate.replace(replace_zero)))

## End product ... full process in a single pipe

In [6]:
artists_renamed = (artists >>
                    rename(Wiki_QID = 'Wiki QID') >>
                    mutate(Nationality = X.Nationality.replace(recode_bad_lbls)) >>
                    mutate(BeginDate = X.BeginDate.replace(replace_zero)))

## Why we love piping? 

### Reason 2: Textual Gravity!

A pipe clearly expression the intention of our code by

1. Reading left-to-right and top-to-bottom
2. Putting the verbs up front

In [7]:
artists_renamed = (artists >>
                    rename(Wiki_QID = 'Wiki QID') >>
                    mutate(Nationality = X.Nationality.replace(recode_bad_lbls)) >>
                    mutate(BeginDate = X.BeginDate.replace(replace_zero)))

## Why we love piping? 

### Reason 3: Easy debugging

Comments make it easy to debug a pipe.

## Debugging Step 1 - Start at the top

Use comments to remove all part of the chain

*Don't forget the ending `)`*

In [11]:
artists_renamed = (artists
                    >> rename(Wiki_QID = 'Wiki QID')
                    >> mutate(Nationality = X.Nationality.replace(recode_bad_lbls))
                    >> mutate(BeginDate = X.BeginDate.replace(replace_zero))
                  )

## Debugging Step 2 - Work your way down the pipe

Add in each part, one-at-a-time, checking the results

*Don't forget the ending `)`*

In [9]:
artists_renamed = (artists >>
                    rename(Wiki_QID = 'Wiki QID') ) #>>
                    #mutate(Nationality = X.Nationality.replace(recode_bad_lbls)) >>
                    #mutate(BeginDate = X.BeginDate.replace(replace_zero)))

In [10]:
artists_renamed = (artists >>
                    rename(Wiki_QID = 'Wiki QID') >>
                    mutate(Nationality = X.Nationality.replace(recode_bad_lbls)) ) #>>
                    #mutate(BeginDate = X.BeginDate.replace(replace_zero)))

In [11]:
artists_renamed = (artists >>
                    rename(Wiki_QID = 'Wiki QID') >>
                    mutate(Nationality = X.Nationality.replace(recode_bad_lbls)) >>
                    mutate(BeginDate = X.BeginDate.replace(replace_zero)))

## `composable.pipeable`, `dfply.Intention`, and `pandas.Series`

In [4]:
from composable import pipeable
@pipeable
def identify(col):
    return col

In [5]:
artists >> identify

Unnamed: 0,ConstituentID,DisplayName,ArtistBio,Nationality,Gender,BeginDate,EndDate,Wiki QID,ULAN
0,1,Robert Arneson,"American, 1930–1992",American,Male,1930,1992,,
1,2,Doroteo Arnaiz,"Spanish, born 1936",Spanish,Male,1936,0,,
2,3,Bill Arnold,"American, born 1941",American,Male,1941,0,,
3,4,Charles Arnoldi,"American, born 1946",American,Male,1946,0,Q1063584,500027998.0
4,5,Per Arnoldi,"Danish, born 1941",Danish,Male,1941,0,,
...,...,...,...,...,...,...,...,...,...
15217,133006,Andrew Chesnutt,"American, 1861–1934",American,Male,1861,1934,,
15218,133007,Lewis Chesnutt,"American, 1860–1933",American,Male,1860,1933,,
15219,133026,Alfred Tritschler,"German, 1905 – 1970",German,,1905,1970,,
15220,133027,Studio of Dr. Paul Wolff & Tritschler,,,,0,0,,


In [6]:
artists.DisplayName >> identify

0                                      Robert Arneson
1                                      Doroteo Arnaiz
2                                         Bill Arnold
3                                     Charles Arnoldi
4                                         Per Arnoldi
                             ...                     
15217                                 Andrew Chesnutt
15218                                  Lewis Chesnutt
15219                               Alfred Tritschler
15220           Studio of Dr. Paul Wolff & Tritschler
15221    Hamburgrer Frobelspiel-Handlung, Herm. Evers
Name: DisplayName, Length: 15222, dtype: object

In [8]:
expr = X.DisplayName >> identify

In [9]:
expr.evaluate(artists)

AttributeError: 'Series' object has no attribute '__rshift__'

In [11]:
expr = X >> identify

In [12]:
expr.evaluate('a')

AttributeError: 'str' object has no attribute '__rshift__'

In [13]:
getattr('a', '__rshift__')

AttributeError: 'str' object has no attribute '__rshift__'

In [14]:
help(artists.Nationality.str.contains)

Help on method contains in module pandas.core.strings:

contains(pat, case=True, flags=0, na=nan, regex=True) method of pandas.core.strings.StringMethods instance
    Test if pattern or regex is contained within a string of a Series or Index.
    
    Return boolean Series or Index based on whether a given pattern or regex is
    contained within a string of a Series or Index.
    
    Parameters
    ----------
    pat : str
        Character sequence or regular expression.
    case : bool, default True
        If True, case sensitive.
    flags : int, default 0 (no flags)
        Flags to pass through to the re module, e.g. re.IGNORECASE.
    na : default NaN
        Fill value for missing values.
    regex : bool, default True
        If True, assumes the pat is a regular expression.
    
        If False, treats the pat as a literal string.
    
    Returns
    -------
    Series or Index of boolean values
        A Series or Index of boolean values indicating whether the
        gi

In [19]:
artists.Nationality.str.contains('America', na=False)

0         True
1        False
2         True
3         True
4        False
         ...  
15217     True
15218     True
15219    False
15220    False
15221    False
Name: Nationality, Length: 15222, dtype: bool

In [20]:
~artists.Nationality.str.contains('America', na=False)

0        False
1         True
2        False
3        False
4         True
         ...  
15217    False
15218    False
15219     True
15220     True
15221     True
Name: Nationality, Length: 15222, dtype: bool

In [21]:
getattr('a', '__rshift__')

AttributeError: 'str' object has no attribute '__rshift__'

In [22]:
getattr(X, '__rshift__')

<bound method _set_magic_method.<locals>.magic_method of <dfply.base.Intention object at 0x11fb36c50>>

In [23]:
from collections import defaultdict
from dfply import make_symbolic, pipe, symbolic_evaluation, Intention, dfpipe, rename, flatten, X
from dfply.base import Intention as DfplyIntention
import pandas as pd
import numpy as np
import re
from string import punctuation, whitespace
from composable import pipeable
from functools import reduce
from toolz import identity, last


In [24]:
def maybe_combine(acc, col):
    return acc.combine_first(col) if acc.isna().any() else acc


@symbolic_evaluation
def coalesce(*args):
    return reduce(maybe_combine, args)

In [26]:
equal_or_na = lambda s1, s2: ((s1 == s2) | pd.isna(s1)).all()
o1 = pd.Series([1, 2, 2, np.nan])
o2 = pd.Series([1, 2, 1, 2])
o3 = pd.Series([1, 2, 2, 2])
df = pd.DataFrame({'c1':[1,      2,      np.nan, np.nan],
                   'c2':[np.nan, 1,      2,      np.nan],
                   'c3':[np.nan, np.nan, 1,      2]})

In [27]:
assert equal_or_na(coalesce(df.c1, df.c2), o1)
assert equal_or_na(coalesce(df.c1, df.c3), o2)
assert equal_or_na(coalesce(df.c1, df.c2, df.c3), o3)

In [28]:
assert equal_or_na(coalesce(X.c1, X.c2).evaluate(df), o1)
assert equal_or_na(coalesce(X.c1, X.c3).evaluate(df), o2)
assert equal_or_na(coalesce(X.c1, X.c2, X.c3).evaluate(df), o3)

TypeError: __index__ returned non-int (type Intention)

In [29]:
coalesce(X.c1, X.c2)

TypeError: __index__ returned non-int (type Intention)