In [1]:
# A categorical variable takes on a limited, and usually fixed, number of possible values
# Example: gender, social class, blood type, country affiliation, observation time or rating via Likert scales
# Categorical data might have an order (e.g. 'strongly agree' vs. 'agree'), but numerical operations are not possible

import pandas as pd
import numpy as np

In [4]:
# OBJECT CREATION
# A Series becomes categorical at construction time when dtype="category"
# is passed; the categories are taken from the distinct values.
s1 = pd.Series(
    data=["a", "b", "c", "a"],
    dtype="category",
)
print(s1)

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]


In [5]:
# An existing Series / DataFrame column can be cast to the category dtype
# with astype; column "B" is the categorical copy of the plain column "A".
df1 = pd.DataFrame(data={"A": ["a", "b", "c", "a"]}).assign(
    B=lambda frame: frame["A"].astype("category")
)
print(df1)

   A  B
0  a  a
1  b  b
2  c  c
3  a  a


In [6]:
# Bin 20 random integers drawn from [0, 100) into ten-wide, left-closed
# intervals ([0, 10), [10, 20), ...) and label each bin with its inclusive
# integer range ("0 - 9", "10 - 19", ...).
# NOTE: no seed is set in this cell, so the drawn values differ between runs.
df2 = pd.DataFrame({"value": np.random.randint(low=0, high=100, size=20)})
labels = ["{} - {}".format(start, start + 9) for start in range(0, 100, 10)]
df2["group"] = pd.cut(
    df2["value"],
    bins=range(0, 105, 10),
    right=False,  # left-closed bins, so e.g. 20 falls into "20 - 29"
    labels=labels,
)
df2.head(10)

Unnamed: 0,value,group
0,21,20 - 29
1,70,70 - 79
2,52,50 - 59
3,24,20 - 29
4,51,50 - 59
5,84,80 - 89
6,20,20 - 29
7,0,0 - 9
8,25,20 - 29
9,57,50 - 59


In [7]:
# Creating a standalone Categorical object.
# General form: pd.Categorical(values, categories, ordered)
# Only the values are given here, so the categories are inferred from them.
cat = pd.Categorical(values=["a", "b", "c"] * 2)
print(cat)

[a, b, c, a, b, c]
Categories (3, object): [a, b, c]


In [10]:
# The second argument lists the allowed categories; any value that is not
# among them (here 'd') is treated as NaN.
cat1 = pd.Categorical(
    values=["a", "b", "c", "a", "b", "c", "d"],
    categories=["c", "b", "a"],
)
print(cat1)

[a, b, c, a, b, c, NaN]
Categories (3, object): [c, b, a]


In [11]:
# ordered=True makes the category list an ordering: c < b < a.
cat2 = pd.Categorical(
    values=["a", "b", "c", "a", "b", "c", "d"],
    categories=["c", "b", "a"],
    ordered=True,
)
print(cat2)

[a, b, c, a, b, c, NaN]
Categories (3, object): [c < b < a]


In [18]:
# describe() on categorical data yields the same kind of summary
# (count / unique / top / freq) as it does for string data.
# Column "cat" is categorical, column "s" holds the same values as plain objects.
cat = pd.Categorical(values=["a", "c", "c", np.nan], categories=["b", "a", "c"])
df = pd.DataFrame({"cat": cat, "s": ["a", "c", "c", np.nan]})

# Frame-level summary first, then the summary of the categorical column alone.
print(df.describe(), df["cat"].describe(), sep="\n")

       cat  s
count    3  3
unique   2  2
top      c  c
freq     2  2
count     3
unique    2
top       c
freq      2
Name: cat, dtype: object


In [19]:
# Inspecting the properties of a Categorical: `.categories` returns the
# category labels as an Index, in the order they were declared.
s2 = pd.Categorical(
    values=["a", "c", "c", np.nan],
    categories=["b", "a", "c"],
)
print(s2.categories)

Index(['b', 'a', 'c'], dtype='object')


In [20]:
# `.ordered` reports whether the Categorical's categories carry an order.
print(s2.ordered)

False


In [22]:
# Renaming categories
# rename_categories returns a new Series with the relabelled categories, so the
# result is bound back to s3. Assigning to `s3.cat.categories` directly was
# deprecated and is no longer supported in pandas 2.x.
s3 = pd.Series(["a", "b", "c", "a"], dtype="category")
s3 = s3.cat.rename_categories(["Group %s" % g for g in s3.cat.categories])

print(s3.cat.categories)

Index(['Group a', 'Group b', 'Group c'], dtype='object')


In [23]:
# Appending new categories: add_categories returns a new Series whose
# category list is extended (here with the integer 4).
s4 = pd.Series(data=["a", "b", "c", "a"], dtype="category")
s4 = s4.cat.add_categories(new_categories=[4])
print(s4.cat.categories)

Index(['a', 'b', 'c', 4], dtype='object')


In [24]:
# Removing categories: remove_categories returns a new Series in which values
# of the removed category show up as NaN; s5 itself is not reassigned here.
s5 = pd.Series(data=["a", "b", "c", "a"], dtype="category")
print("original object:", s5, sep="\n")
print("after removal:", s5.cat.remove_categories("a"), sep="\n")

original object:
0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]
after removal:
0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (2, object): [b, c]


In [25]:
# Comparison of categorical data
# Two ordered categoricals with the same categories (1 < 2 < 3) are compared
# element-wise. The dtype is built explicitly with CategoricalDtype because
# passing `categories=` / `ordered=` to astype("category", ...) was deprecated
# and later removed from pandas.
ordered_dtype = pd.api.types.CategoricalDtype(categories=[1, 2, 3], ordered=True)
cat4 = pd.Series([1, 2, 3]).astype(ordered_dtype)
cat5 = pd.Series([2, 2, 2]).astype(ordered_dtype)

print(cat4>cat5)

0    False
1    False
2     True
dtype: bool


  
  This is separate from the ipykernel package so we can avoid doing imports until


In [26]:
# Element-wise equality between the two categorical Series cat4 and cat5.
print(cat4 == cat5)

0    False
1     True
2    False
dtype: bool
