# Advanced pandas

## 12.1 Categorical Data

### Background and Motivation

In [None]:
import numpy as np; import pandas as pd

In [None]:
values = pd.Series(['apple','orange','apple','apple']*2)

In [None]:
values

In [None]:
pd.unique(values)

In [None]:
# Use the Series method: the top-level pd.value_counts() is deprecated in
# recent pandas releases and yields the same per-value frequency table.
values.value_counts()

In [None]:
values = pd.Series([0,1,0,0]*2)

In [None]:
dim = pd.Series(['apple','orange'])

In [None]:
values

In [None]:
dim

In [None]:
dim.take(values)

In [None]:
pd.Series(['Volvo','BMW']).take([0,1,1,0]*2)

### Categorical Type in pandas

In [None]:
fruits = ['apple','orange','apple','apple'] * 2

In [None]:
N = len(fruits)

In [None]:
# One row per basket: the fruit label, a sequential basket id, a random
# integer count drawn from [3, 15) and a random float weight from [0, 4).
# The explicit `columns` list pins the column order.
df = pd.DataFrame(
    {
        'fruit': fruits,
        'basket_id': np.arange(N),
        'count': np.random.randint(3, 15, size=N),
        'weight': np.random.uniform(0, 4, size=N),
    },
    columns=['fruit', 'basket_id', 'count', 'weight'],
)

In [None]:
df

In [None]:
df['fruit']

In [None]:
fruit_cat = df['fruit'].astype('category')

In [None]:
fruit_cat

In [None]:
c = fruit_cat.values

In [None]:
c

In [None]:
type(c)

In [None]:
c.categories

In [None]:
c.codes

In [None]:
df['fruit'] = df['fruit'].astype('category')

In [None]:
df.fruit

In [None]:
my_c = pd.Categorical(['foo','bar','baz','foo','bar'])

In [None]:
my_c

In [None]:
categories = ['foo','bar','baz']

In [None]:
codes = [0,1,2,0,0,1]

In [None]:
my_cats2 = pd.Categorical.from_codes(codes, categories)

In [None]:
my_cats2

In [None]:
ordered_cats = pd.Categorical.from_codes(codes, categories, ordered=True)

In [None]:
ordered_cats

In [None]:
my_cats2.as_ordered()

### Computations With Categorical

In [None]:
np.random.seed(12345)

In [None]:
draws = np.random.randn(1000)

In [None]:
draws[:5]

In [None]:
bins = pd.qcut(draws, 4)

In [None]:
bins

In [None]:
bins = pd.qcut(draws, 4, labels=['Q1','Q2','Q3','Q4'])

In [None]:
bins

In [None]:
bins.codes[:10]

In [None]:
bins.categories

In [None]:
bins = pd.Series(bins, name='quartile')

In [None]:
# Summarise the draws per quartile label: group by `bins`, compute
# count/min/max for each group, then turn the group key back into a column.
results = (
    pd.Series(draws)
    .groupby(bins)
    .agg(['count', 'min', 'max'])
    .reset_index()
)

In [None]:
results

In [None]:
results['quartile']

In [None]:
results['count']

#### Better performance with categoricals

In [None]:
N = 10000000

In [None]:
draws = pd.Series(np.random.randn(N))

In [None]:
labels = pd.Series(['foo','bar','baz','qux']*(N//4))

In [None]:
categories = labels.astype('category')

In [None]:
labels.memory_usage()

In [None]:
categories.memory_usage()

In [None]:
%time _ = labels.astype('category')

### Categorical Methods

In [None]:
s = pd.Series(['a','b','c','d']*2)

In [None]:
cat_s = s.astype('category')

In [None]:
cat_s

In [None]:
cat_s.cat.codes

In [None]:
cat_s.cat.categories

In [None]:
cat_s2 = cat_s.cat.set_categories(['a','b','c','d','e'])

In [None]:
cat_s2

In [None]:
cat_s.value_counts()

In [None]:
cat_s2.value_counts()

In [None]:
cat_s.isin(['a','b'])

In [None]:
cat_s3 = cat_s[cat_s.isin(['a','b'])]

In [None]:
cat_s3

In [None]:
cat_s3.cat.remove_unused_categories()

#### Table 12-1. Categorical methods for Series in pandas

![Table%2012-1.%20Categorical%20methods%20for%20Series%20in%20pandas](images/Table%2012-1.%20Categorical%20methods%20for%20Series%20in%20pandas.png)

### Creating dummy variables for modeling

In [None]:
cat_s = pd.Series(['a','b','c','d']*2,dtype='category')

In [None]:
cat_s

In [None]:
pd.get_dummies(cat_s)

## 12.2 Advanced GroupBy Use

### Group Transforms and "unwrapped" GroupBys

In [None]:
df = pd.DataFrame({'key':['a','b','c']*4, 'value':np.arange(12.)})

In [None]:
df

In [None]:
g = df.groupby('key').value

In [None]:
g.mean()

In [None]:
g.transform(lambda x: x.mean())

In [None]:
g.transform('mean')

In [None]:
g.transform(lambda x: x*2)

In [None]:
g.transform(lambda x: x.rank(ascending=False))

In [None]:
def normalise(x):
    """Return x centred on its mean and scaled by its standard deviation.

    x must support ``.mean()``, ``.std()`` and element-wise arithmetic
    (for example a pandas Series or a NumPy array). The spread is whatever
    ``x.std()`` returns for that type.
    """
    centred = x - x.mean()
    return centred / x.std()

In [None]:
g.transform(normalise)

In [None]:
g.apply(normalise)

In [None]:
normalised = (df['value']- g.transform('mean'))/g.transform('std')

In [None]:
normalised

### Grouped Time Resampling

In [None]:
N = 15

In [None]:
times = pd.date_range('2018-02-25 00:00', freq='1min', periods=N)

In [None]:
df = pd.DataFrame({'time':times, 'value':np.arange(N)})

In [None]:
df

In [None]:
df.set_index('time').resample('5min').count()

In [None]:
# Three rows per timestamp: each time is repeated 3 times while the keys
# 'a', 'b', 'c' cycle N times, so every timestamp carries one row per key.
df2 = pd.DataFrame({
    'time': times.repeat(3),
    'key': np.tile(['a', 'b', 'c'], N),
    'value': np.arange(N * 3.),
})

In [None]:
df2[:7]

In [None]:
# pd.TimeGrouper has been removed from pandas; pd.Grouper(freq=...) is the
# supported way to build a 5-minute time-bucket key for groupby.
time_key = pd.Grouper(freq='5min')

In [None]:
resampled = (df2.set_index('time').groupby(['key', time_key]).sum())

In [None]:
resampled

In [None]:
resampled.reset_index()

## 12.3 Techniques for Method Chaining

```python
# Usual non-functional way: copy the frame, then mutate the copy in place.
# Item assignment is required here; `df2.k = v` would only set a Python
# attribute on the object and would not create a column.
df2 = df.copy()
df2['k'] = v

# Functional assign way: returns a new DataFrame that includes column k
df2 = df.assign(k=v)
```

### The pipe Method

```python
# A sequence of function calls, each one consuming the previous result
a = f(df, arg1=v1)
b = g(a, v2, arg3=v3)
c = h(b, arg4=v4)

# The same sequence rewritten using calls to pipe:
# df.pipe(f, ...) passes df as the first argument to f
result = (df.pipe(f, arg1=v1)
          .pipe(g, v2, arg3=v3)
          .pipe(h, arg4=v4))
```