In [1]:
import pandas as pd
import numpy as np

In [2]:
# Eleven letter grades, best first, each labelled with a coarse quality band
# that becomes the (non-unique) row index.
letter_grades = ["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D"]
quality_bands = ["excellent"] * 3 + ["good"] * 3 + ["ok"] * 3 + ["poor"] * 2
df = pd.DataFrame({"Grades": letter_grades}, index=quality_bands)
print(df)

          Grades
excellent     A+
excellent      A
excellent     A-
good          B+
good           B
good          B-
ok            C+
ok             C
ok            C-
poor          D+
poor           D


In [3]:
# Inspect the column dtype: the grade strings are held as generic `object` values
print(df.dtypes)

Grades    object
dtype: object


In [6]:
print(df["Grades"].astype("category").head())
# astype("category") infers the 11 distinct grades as categories; they are listed
# in lexical order ('A', 'A+', 'A-', ...) and carry no ranking yet

excellent    A+
excellent     A
excellent    A-
good         B+
good          B
Name: Grades, dtype: category
Categories (11, object): ['A', 'A+', 'A-', 'B', ..., 'C+', 'C-', 'D', 'D+']


In [7]:
# Spell out the grades from worst to best so pandas can rank them.
grade_order = ["D", "D+", "C-", "C", "C+", "B-", "B", "B+", "A-", "A", "A+"]
my_categories = pd.CategoricalDtype(categories=grade_order, ordered=True)
grades = df["Grades"].astype(my_categories)
print(grades.head())
# the categories now carry an explicit order: 'D' < 'D+' < ... < 'A' < 'A+'

excellent    A+
excellent     A
excellent    A-
good         B+
good          B
Name: Grades, dtype: category
Categories (11, object): ['D' < 'D+' < 'C-' < 'C' ... 'B+' < 'A-' < 'A' < 'A+']


In [8]:
print(df[df["Grades"] > "C"])
# wrong result: on plain strings ">" compares lexicographically, so "C+", "C-",
# "D+" and "D" count as greater than "C" while every A and B grade is dropped

     Grades
ok       C+
ok       C-
poor     D+
poor      D


In [9]:
# Same filter on the ordered categorical: ">" now follows the declared grade
# order, so everything ranked above "C" (C+ up to A+) is kept.
above_c = grades > "C"
print(grades[above_c])

excellent    A+
excellent     A
excellent    A-
good         B+
good          B
good         B-
ok           C+
Name: Grades, dtype: category
Categories (11, object): ['D' < 'D+' < 'C-' < 'C' ... 'B+' < 'A-' < 'A' < 'A+']


In [15]:
# Load the census extract and keep only the rows whose SUMLEV is 50
# (presumably the county-level summary rows — confirm against the data dictionary).
census = pd.read_csv("census.csv")
counties = census[census["SUMLEV"] == 50]
# Average CENSUS2010POP per state. Grouping directly on the STNAME column and
# calling the built-in .mean() replaces set_index + groupby(level=0) +
# agg(np.average): same per-state averages, without routing a NumPy callable
# through agg. The result is a Series indexed by STNAME; it stays bound to
# `df` because the next cell bins that name.
df = counties.groupby("STNAME")["CENSUS2010POP"].mean()
print(df.head())

STNAME
Alabama        71339.343284
Alaska         24490.724138
Arizona       426134.466667
Arkansas       38878.906667
California    642309.586207
Name: CENSUS2010POP, dtype: float64


In [16]:
# Bin the per-state averages into 10 equal-width intervals spanning their range;
# each state is labelled with the interval that contains its value
print(pd.cut(df, 10)) # 10 bins

STNAME
Alabama                   (11706.087, 75333.413]
Alaska                    (11706.087, 75333.413]
Arizona                 (390320.176, 453317.529]
Arkansas                  (11706.087, 75333.413]
California              (579312.234, 642309.586]
Colorado                 (75333.413, 138330.766]
Connecticut             (390320.176, 453317.529]
Delaware                (264325.471, 327322.823]
District of Columbia    (579312.234, 642309.586]
Florida                 (264325.471, 327322.823]
Georgia                   (11706.087, 75333.413]
Hawaii                  (264325.471, 327322.823]
Idaho                     (11706.087, 75333.413]
Illinois                 (75333.413, 138330.766]
Indiana                   (11706.087, 75333.413]
Iowa                      (11706.087, 75333.413]
Kansas                    (11706.087, 75333.413]
Kentucky                  (11706.087, 75333.413]
Louisiana                 (11706.087, 75333.413]
Maine                    (75333.413, 138330.766]
Maryland     