In [78]:
import os

import numpy as np
import pandas as pd
import seaborn as sns

# Display the value of every top-level expression in a cell, not only the last one
# (several cells below rely on this to show more than one result).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"  # default is 'last_expr'

## Categorical Type in Pandas
Pros: less memory usage and faster access speed.

In [38]:
# Demo data: a 4-label pattern repeated 10,000 times -> 40,000 rows.
fruits = 10000 * ['apple', 'orange', 'apple', 'apple']
N = len(fruits)

# One basket per row, with a random count between 3 and 14 (upper bound 15 is exclusive).
df = pd.DataFrame(
    {
        'basket_id': np.arange(N),
        'fruit': fruits,
        'count': np.random.randint(3, 15, size=N),
    },
    columns=['basket_id', 'fruit', 'count'],
)
df.head(3)

Unnamed: 0,basket_id,fruit,count
0,0,apple,11
1,1,orange,14
2,2,apple,6


#### For 40,000 rows of data, switching to Categorical data type results in <br> 
- about 30% less memory usage
- about 5 times faster computation for the given calculation (`value_counts`).

In [39]:
print('Without using Categorical data type:')
print('Memory usage:', df.memory_usage().sum())
print('Access time for value counts:')
%timeit df['fruit'].value_counts()

Without using Categorical data type:
Memory usage: 960080
Access time for value counts:
3.47 ms ± 82.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [40]:
# Converting to category is not free, but it is a one-time cost. The assignment
# replaces the 'fruit' column of df, so every later cell sees the categorical version.
%time df['fruit'] = df['fruit'].astype('category')

CPU times: user 3.07 ms, sys: 774 µs, total: 3.84 ms
Wall time: 2.98 ms


In [41]:
print('\nAfter using Categorical dtype:')

print('Memory usage:',df.memory_usage().sum())
print('Access time for value counts:')
%timeit df['fruit'].value_counts()


After using Categorical dtype:
Memory usage: 680176
Access time for value counts:
707 µs ± 14.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


#### Access the content of a categorical column

In [37]:
# For a category-dtype column, .values hands back the underlying Categorical object.
c = df['fruit'].values
# Show the container type together with the distinct labels ...
(type(c), c.categories)
# ... and the per-row integer codes that index into those labels.
c.codes

(pandas.core.arrays.categorical.Categorical,
 Index(['apple', 'orange'], dtype='object'))

array([0, 1, 0, ..., 1, 0, 0], dtype=int8)

## Advanced GroupBy
### Group Transforms 
#### Task: center (de-mean) the data using group means instead of the overall mean

In [100]:
# seaborn's example 'flights' dataset: one row per (year, month) with a passenger count.
# Note: this rebinds df, replacing the fruit frame used in the previous section.
df = sns.load_dataset(name='flights')
df.head(n=5)

Unnamed: 0,year,month,passengers
0,1949,January,112
1,1949,February,118
2,1949,March,132
3,1949,April,129
4,1949,May,121


##### Center column passengers by subtracting the overall mean

In [102]:
# Subtract the single overall mean from every row and preview the first three results.
df['passengers'].sub(df['passengers'].mean()).head(3)

0   -168.298611
1   -162.298611
2   -148.298611
Name: passengers, dtype: float64

##### Center column passengers by subtracting the GROUP mean of each month

In [104]:
# Center within each month: transform('mean') returns a Series aligned with df's rows
# that holds, for every row, the mean of that row's month, so the subtraction is
# element-wise. 'month' is categorical, so observed= is passed explicitly instead of
# relying on the groupby default (deprecated and warned about in newer pandas);
# observed=False keeps the behaviour this cell always had.
(
    df['passengers']
    - df.groupby('month', observed=False)['passengers'].transform('mean')
).head(3)

0   -129.750000
1   -117.000000
2   -138.166667
Name: passengers, dtype: float64

## Method Chaining 
### Using assign( )
Method chaining Pros and Cons:
    - Pros: fast implementation, fewer temporary variables
    - Cons: lower readability, may take more CPU time

In [77]:
import seaborn as sns

# NO chaining: every step is materialised in a named intermediate.
df = sns.load_dataset('flights')
df2 = df.copy()
df2['centered'] = df2['passengers'] - df2['passengers'].mean()
# 'month' is categorical, so observed= is passed explicitly rather than relying on
# the groupby default; observed=False keeps the original behaviour.
result = df2.groupby('month', observed=False)['centered'].mean()
result

# WITH chaining: the same computation as one expression. It starts from the
# untouched df (not from df2, which already carries a 'centered' column) so the
# chain is a self-contained equivalent of the steps above. The lambda receives the
# frame flowing through the chain, so no intermediate name is needed.
(
    df
    .assign(centered=lambda x: x['passengers'] - x['passengers'].mean())
    .groupby('month', observed=False)['centered']
    .mean()
)

month
January     -38.548611
February    -45.298611
March       -10.131944
April       -13.215278
May          -8.465278
June         31.368056
July         71.034722
August       70.784722
September    22.118056
October     -13.715278
November    -47.465278
December    -18.465278
Name: centered, dtype: float64

month
January     -38.548611
February    -45.298611
March       -10.131944
April       -13.215278
May          -8.465278
June         31.368056
July         71.034722
August       70.784722
September    22.118056
October     -13.715278
November    -47.465278
December    -18.465278
Name: centered, dtype: float64

### Using pipe( )
Enable method chaining when user-defined functions are involved, very powerful!

In [None]:
def my_read(csv_file):
    '''
    Load a raw CSV file and run it through the processing chain.

    Steps, in order: read the CSV, lower-case every column name, pipe the
    frame through my_func_1 and then my_func_2, and finally convert column
    'col1' to datetime and column 'col2' to categorical.

    'col1' and 'col2' are template column names; replace them with the real
    (lower-cased) column labels of the file being read.

    Parameters
    ----------
    csv_file : anything accepted by pd.read_csv (path, URL, or file-like object)

    Returns
    -------
    pandas.DataFrame
        The processed frame.
    '''
    df = (
        pd.read_csv(csv_file)
        .rename(columns=str.lower)
        .pipe(my_func_1)
        .pipe(my_func_2)
        # Column labels are quoted strings: a bare col1 / col2 would be looked up
        # as a Python variable and raise NameError.
        .assign(col1=lambda x: pd.to_datetime(x['col1']),
                col2=lambda x: pd.Categorical(x['col2']))
    )
    return df

def my_func_1(df):
    '''
    Cleaning stage of the pipeline (placeholder).

    Real cleaning logic belongs here; for now the frame is handed back
    exactly as it was received.
    '''
    cleaned = df
    return cleaned

def my_func_2(df):
    '''
    Transformation stage of the pipeline (placeholder).

    Real transformation logic belongs here; for now the frame is handed
    back exactly as it was received.
    '''
    transformed = df
    return transformed

# Cache the processed frame in an HDF5 file so the raw CSV only has to be
# parsed and processed on the first run.
output = 'examples/flights.h5'

if os.path.exists(output):
    # Cache hit: load the stored table instead of re-processing the CSV.
    df = pd.read_hdf(output, key='flights')
else:
    df = my_read("examples/flights_raw.csv")
    # key is passed by keyword: newer pandas deprecates positional arguments to
    # to_hdf other than the path.
    df.to_hdf(output, key='flights', format='table', data_columns=True)

df.info()

Some may complain that a long chain is hard to read, but let's watch a great example (adapted from Jeff Allen, RStudio) that demonstrates that a well-CHAINED story-telling style is much easier to understand than a NESTED function-calling style.

In [None]:
# Illustrative only: jack_jill and the verb functions are not defined in the
# cells shown here.

# Chaining: reads top to bottom, in the order the story happens.
# .pipe(func, arg) calls func(<current object>, arg), so each line is one step.
(
    jack_jill
    .pipe(went_up, "hill")
    .pipe(fetch, "water")
    .pipe(fell_down, "jack")
    .pipe(broke, "crown")
    .pipe(tumble_after, "jill")
)

# Nested function calls: the same story has to be read from the inside out.
tumble_after(
    broke(
        fell_down(
            fetch(went_up(jack_jill, "hill"), "water"),
            "jack"),
        "crown"),
    "jill"
)

The main con of chaining is debugging: intermediate results are not directly visible, so each step still needs to be checked sequentially before deployment.