# Data Types

In [1]:
import numpy as np
import pandas as pd

## Categorical
The category data type looks and behaves like a string, but internally it is represented by an array of integers. This allows the data to be sorted in a custom order and stored more efficiently.

In [2]:
# Draw the demo values from a locally seeded generator so that the frame is
# identical on every Restart & Run All (an unseeded draw changes each run).
rng = np.random.default_rng(0)
df = pd.DataFrame({
    'month': ['May', 'October', 'January', 'July', 'September'],
    # Integers in [0, 100), one per month row.
    'value': rng.integers(0, 100, size=5),
})
df

Unnamed: 0,month,value
0,May,34
1,October,39
2,January,27
3,July,92
4,September,61


Convert a column to a Categorical with a specific order

In [3]:
# Month names in calendar order; used as the category order for the
# ordered Categorical.
months = [
    'January', 'February', 'March',
    'April', 'May', 'June',
    'July', 'August', 'September',
    'October', 'November', 'December',
]

In [4]:
# Describe the target dtype once: categories follow `months`, and the
# categorical is ordered so sorting/comparison use that order.
month_dtype = pd.api.types.CategoricalDtype(categories=months, ordered=True)
df['month'] = df['month'].astype(month_dtype)

In [5]:
# Inspect the dtype now stored for the `month` column.
df.dtypes['month']

CategoricalDtype(categories=['January', 'February', 'March', 'April', 'May', 'June',
                  'July', 'August', 'September', 'October', 'November',
                  'December'],
, ordered=True)

Convert the category to a string and concatenate it with the index.

In [6]:
# Cast both sides to plain strings first; then join them as "<index>-<month>".
index_text = df.index.astype(str)
month_text = df['month'].astype(str)
index_text + '-' + month_text

0          0-May
1      1-October
2      2-January
3         3-July
4    4-September
dtype: object

In [7]:
# Integer code stored for each row: the position of its month in the
# categories list.
df.month.cat.codes

0    4
1    9
2    0
3    6
4    8
dtype: int8

### Use Categorical to anonymize data

In [11]:
# Draw the demo values from a locally seeded generator so that the frame is
# identical on every Restart & Run All (an unseeded draw changes each run).
rng = np.random.default_rng(1)
df = pd.DataFrame({
    # Names repeat on purpose: the same customer appears in several rows.
    'customer': ['Gwen Stacy', 'Peter Parker', 'MJ Watson', 'Peter Parker', 'Gwen Stacy'],
    # Integers in [0, 100), one per row.
    'value': rng.integers(0, 100, size=5),
})
df

Unnamed: 0,customer,value
0,Gwen Stacy,6
1,Peter Parker,92
2,MJ Watson,14
3,Peter Parker,46
4,Gwen Stacy,73


In [12]:
# Each distinct customer name maps to one integer category code, so repeated
# names share the same code.
customer_codes = df['customer'].astype('category').cat.codes
# Build the replacement label from the code instead of the real name.
df['anonymized'] = 'Customer ' + customer_codes.astype(str)
df

Unnamed: 0,customer,value,anonymized
0,Gwen Stacy,6,Customer 0
1,Peter Parker,92,Customer 2
2,MJ Watson,14,Customer 1
3,Peter Parker,46,Customer 2
4,Gwen Stacy,73,Customer 0
