# Grouping and Categorical Data Type

In [40]:
import numpy as np
import pandas as pd

## Grouping

In [41]:
# Demo frame for the grouping examples: "A" and "B" act as group keys, while
# "C" and "D" hold floats drawn from separately seeded generators so the
# values are reproducible on every run.
df = pd.DataFrame(
    dict(
        A=["foo"] * 4 + ["bar"] * 4,
        B=[1, 2, 1, 2, 1, 2, 3, 2],
        C=np.random.default_rng(42).random(8),
        D=np.random.default_rng(43).random(8),
    )
)
df

Unnamed: 0,A,B,C,D
0,foo,1,0.773956,0.652299
1,foo,2,0.438878,0.043775
2,foo,1,0.858598,0.02003
3,foo,2,0.697368,0.839213
4,bar,1,0.094177,0.587143
5,bar,2,0.975622,0.224705
6,bar,3,0.76114,0.751792
7,bar,2,0.786064,0.263692


In [42]:
# Total of every remaining column within each "A" group.
df.groupby(by="A").sum()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,8,2.617004,1.827333
foo,6,2.7688,1.555317


In [43]:
# Largest value of every remaining column within each "A" group.
df.groupby(by="A").max()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,3,0.975622,0.751792
foo,2,0.858598,0.839213


In [44]:
# Grouping by two keys yields a two-level (A, B) index on the result.
df.groupby(by=["A", "B"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,1,0.094177,0.587143
bar,2,1.761687,0.488397
bar,3,0.76114,0.751792
foo,1,1.632554,0.672329
foo,2,1.136246,0.882988


## Categorical Data Type

### Discrete Values

In [45]:
# Build the frame and cast "grade" to the categorical dtype in one chained
# step; the categories are inferred from the distinct values present.
df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4, 5, 6],
        "grade": ["a", "b", "e", "a", "a", "c"],
    }
).astype({"grade": "category"})
df

Unnamed: 0,id,grade
0,1,a
1,2,b
2,3,e
3,4,a
4,5,a
5,6,c


In [46]:
# Show the column on its own so the category listing is visible in the repr.
df.loc[:, "grade"]

0    a
1    b
2    e
3    a
4    a
5    c
Name: grade, dtype: category
Categories (4, object): ['a', 'b', 'c', 'e']

In [47]:
# Relabel the categories positionally: 'a' -> 1, 'b' -> 2, 'c' -> 3, 'e' -> 4.
# Assigning to `.cat.categories` is no longer supported (removed in pandas 2.0),
# so build the renamed column with `rename_categories` and assign it back.
df["grade"] = df["grade"].cat.rename_categories([1, 2, 3, 4])
df

Unnamed: 0,id,grade
0,1,1
1,2,2
2,3,4
3,4,1
4,5,1
5,6,3


In [48]:
# Order the rows by the "grade" column; returns a new frame, `df` is unchanged.
df.sort_values("grade")

Unnamed: 0,id,grade
0,1,1
3,4,1
4,5,1
1,2,2
5,6,3
2,3,4


In [49]:
# Count rows per grade. `observed=False` is passed explicitly because the
# grouper is categorical: it keeps every declared category in the result and
# avoids the FutureWarning newer pandas emits when the argument is omitted.
df.groupby("grade", observed=False).size()

grade
1    3
2    1
3    1
4    1
dtype: int64

### Continuous Values

In [50]:
# Eight reproducible integer scores drawn from [0, 100) with a fixed seed.
df = pd.DataFrame({"score": np.random.default_rng(42).integers(low=0, high=100, size=8)})
df

Unnamed: 0,score
0,8
1,77
2,65
3,43
4,43
5,85
6,8
7,69


In [51]:
# Cut the scores into five right-closed, 20-point bands:
# (0, 20], (20, 40], (40, 60], (60, 80], (80, 100].
# With pd.cut's defaults the lowest band is open on the left, so a score of
# exactly 0 falls into no band and becomes NaN.
bins = range(0, 120, 20)  # edges 0, 20, 40, 60, 80, 100
labels = list("EDCBA")  # one letter per band, lowest band first: E = (0, 20], A = (80, 100]
df["grade-nolabels"] = pd.cut(df["score"], bins=bins)  # categories are the intervals themselves
df["grade-labels"] = pd.cut(df["score"], bins=bins, labels=labels)  # same bands, shown as letters
df

Unnamed: 0,score,grade-nolabels,grade-labels
0,8,"(0, 20]",E
1,77,"(60, 80]",B
2,65,"(60, 80]",B
3,43,"(40, 60]",C
4,43,"(40, 60]",C
5,85,"(80, 100]",A
6,8,"(0, 20]",E
7,69,"(60, 80]",B


# References

- https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#grouping
- https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#categoricals
- https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html
- https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html