# Working with Categorical Data in Python

## Introduction to Categorical Data

In [1]:
import pandas as pd

In [2]:
# Load the Adult dataset from 'adult.csv' (path is relative to the working directory).
adult = pd.read_csv(filepath_or_buffer='adult.csv')

In [3]:
# Peek at the first five rows to see which columns are available.
adult.head(n=5)

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Country,Above/Below 50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:

# Summarise the income label column: row count, number of distinct values,
# the most frequent value and how often it occurs.
print(adult.loc[:, 'Above/Below 50k'].describe())

count      32561
unique         2
top        <=50K
freq       24720
Name: Above/Below 50k, dtype: object


In [5]:

# Absolute number of rows carrying each income label.
print(adult.loc[:, 'Above/Below 50k'].value_counts(normalize=False))

Above/Below 50k
<=50K    24720
>50K      7841
Name: count, dtype: int64


In [6]:
# Same breakdown expressed as a share of all rows instead of a raw count.
print(adult.loc[:, 'Above/Below 50k'].value_counts(normalize=True))

Above/Below 50k
<=50K    0.75919
>50K     0.24081
Name: proportion, dtype: float64


### Setting dtypes and saving memory

In [7]:
# Frequency of every occupation label, most common first.
# NOTE(review): the result includes a '?' entry, presumably a placeholder for
# an unknown occupation — confirm before treating it as a real category.
adult.loc[:, 'Occupation'].value_counts()

Occupation
Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
?                    1843
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: count, dtype: int64

In [8]:
# Pull the Occupation column out as a plain Python list so the two Series
# below can be built from exactly the same raw values.
list_of_occupations = adult['Occupation'].tolist()

In [9]:
# Baseline: no dtype is requested, so pandas picks one for the values itself.
series1 = pd.Series(data=list_of_occupations)

In [10]:

# Report how the baseline Series is stored and how large its data buffer is.
# With object dtype, nbytes counts only the per-row references
# (8 bytes x 32561 rows = 260488), not the strings they point to;
# Series.memory_usage(deep=True) would include the string contents as well.
print("series1 data type:", series1.dtype)
print("series1 number of bytes:", series1.nbytes)

series1 data type: object
series1 number of bytes: 260488


In [11]:

# Same values as series1, but stored with the categorical dtype.
series2 = pd.Series(data=list_of_occupations, dtype="category")

In [12]:
# Same report for the categorical Series, to compare against series1.
# The reported size is consistent with one byte per row for the category
# codes (32561) plus 8 bytes for each of the 15 distinct occupations (120).
print("series2 data type:", series2.dtype)
print("series2 number of bytes:", series2.nbytes)

series2 data type: category
series2 number of bytes: 32681


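### Setting dtype when reading data

The cells below inspect the dtypes pandas inferred from a plain `pd.read_csv` call. To choose a dtype at load time instead, pass a `dtype` mapping, for example `pd.read_csv('adult.csv', dtype={'Occupation': 'category'})`.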


In [13]:
# Show the first five rows again as a reminder of the columns before
# looking at their dtypes.
adult.head(n=5)

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Country,Above/Below 50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [14]:
# List the dtype pandas assigned to each column. Because the frame was loaded
# with a plain read_csv call, every text column is reported as `object`.
print(adult.dtypes)

Age                 int64
Workclass          object
fnlgwt              int64
Education          object
Education Num       int64
Marital Status     object
Occupation         object
Relationship       object
Race               object
Sex                object
Capital Gain        int64
Capital Loss        int64
Hours/Week          int64
Country            object
Above/Below 50k    object
dtype: object


### Using pandas functions effectively

- Define the columns for grouping
- Group the dataset by the specified columns
- Calculate and print the mean hours worked per week for each group


In [16]:
# Columns that define each group: education level and the income label.
user_list = ['Education', 'Above/Below 50k']
# Build the grouped view once; it is kept in `gb` so it can be reused.
gb = adult.groupby(user_list)
# Average weekly hours within each (Education, Above/Below 50k) pair.
print(gb['Hours/Week'].mean())

Education     Above/Below 50k
10th          <=50K              36.574053
              >50K               43.774194
11th          <=50K              33.322870
              >50K               45.133333
12th          <=50K              35.035000
              >50K               44.818182
1st-4th       <=50K              37.864198
              >50K               48.833333
5th-6th       <=50K              38.539432
              >50K               46.000000
7th-8th       <=50K              38.830033
              >50K               47.500000
9th           <=50K              37.667351
              >50K               44.851852
Assoc-acdm    <=50K              39.264339
              >50K               44.256604
Assoc-voc     <=50K              40.817826
              >50K               43.853186
Bachelors     <=50K              40.586152
              >50K               45.475462
Doctorate     <=50K              45.429907
              >50K               47.513072
HS-grad       <=50K     