In [1]:
import numpy as np
import pandas as pd

##### pandas can include categorical data in a DataFrame. For full docs, see the categorical introduction and the API documentation.

In [2]:
# Example frame: six records, each with a numeric id and a raw letter grade.
df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4, 5, 6],
        "raw_grade": ["a", "b", "b", "a", "a", "e"],
    }
)

In [3]:
# Build a categorical version of the raw grades and store it in a new "grade"
# column; "raw_grade" itself is left unchanged.
grade_as_category = df["raw_grade"].astype("category")
df["grade"] = grade_as_category
df.dtypes


id              int64
raw_grade      object
grade        category
dtype: object

In [4]:
# Rename the categories to more meaningful names.
# An explicit old -> new mapping is used instead of a positional list, so the
# result does not depend on the inferred (sorted) order of the categories.
# With a dict-like argument, categories missing from the mapping are passed
# through and mapping keys that are not current categories are ignored, which
# also keeps this cell safe to re-run after the category set has changed.
new_categories = {"a": "very good", "b": "good", "e": "very bad"}
df["grade"] = df["grade"].cat.rename_categories(new_categories)
df

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [5]:
# Reorder the categories and, in the same step, add the ones that were missing
# (methods on the Series.cat accessor return a new Series by default):
grade_order = ["very bad", "bad", "medium", "good", "very good"]
df["grade"] = df["grade"].cat.set_categories(grade_order)
df

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [6]:
# Sorting a categorical column follows the order of its categories,
# not the lexical order of the labels:
df.sort_values("grade")

Unnamed: 0,id,raw_grade,grade
5,6,e,very bad
1,2,b,good
2,3,b,good
0,1,a,very good
3,4,a,very good
4,5,a,very good


In [7]:
# Grouping by a categorical column with observed=False also shows empty categories.
# observed is passed explicitly: relying on the implicit default raises a
# FutureWarning in pandas >= 2.1, and the default becomes observed=True in
# pandas 3.0, which would drop the unused "bad" and "medium" groups.
df.groupby("grade", observed=False).size()

grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64