In [1]:
# Libraries
import pandas as pd

In [2]:
# Load the main observations file
raw = pd.read_csv(
    "ce.data.0.AllCESSeries",
    sep="\t",          # tab-delimited
    dtype=str,         # read every column as text so zero-padded codes survive
    na_values=["."],   # a lone "." marks a missing value
)

# Load the lookup tables with the same tab-delimited, all-string treatment
series = pd.read_csv("ce.series", sep="\t", dtype=str)
industry = pd.read_csv("ce.industry", sep="\t", dtype=str)
data_type = pd.read_csv("ce.datatype.txt", sep="\t", dtype=str)
month = pd.read_csv("ce.period", sep="\t", dtype=str)
sector = pd.read_csv("ce.supersector.txt", sep="\t", dtype=str)

# Trim stray whitespace from the column headers of every frame
# (only the header labels are touched; cell values are left as read)
for _frame in (raw, series, industry, data_type, month, sector):
    _frame.columns = _frame.columns.str.strip()

In [3]:
# Merge the observations with each lookup table.
# - how="left" keeps every row of `raw`, even when a lookup has no match.
# - validate="many_to_one" makes pandas raise MergeError if a lookup table
#   contains duplicate join keys, instead of silently duplicating observations.
# - `raw` and `series` both carry a `footnote_codes` column, so the result has
#   `footnote_codes_x` (from raw) and `footnote_codes_y` (from series).
df = (
    raw
    # series metadata: series_title, seasonal, supersector_code, industry_code,
    # data_type_code, footnote_codes (-> footnote_codes_y), begin/end year and period
    .merge(series, on="series_id", how="left", validate="many_to_one")
    # industry attributes: industry_name, display_level, selectable, sort_sequence
    # (naics_code / publishing_status arrive via series or industry -- TODO confirm which)
    .merge(industry, on="industry_code", how="left", validate="many_to_one")
    # data type label: data_type_text
    .merge(data_type, on="data_type_code", how="left", validate="many_to_one")
    # period labels: mm, month
    .merge(month, on="period", how="left", validate="many_to_one")
    # supersector label: supersector_name
    .merge(sector, on="supersector_code", how="left", validate="many_to_one")
)

# Optional filter for data_type_code = 01 (Employment #'s).
# Left disabled here because this notebook exports the unfiltered data.
# df = df[df["data_type_code"] == "01"].copy()

In [4]:
# Preview the first five rows of the merged frame as a sanity check
df.head()

Unnamed: 0,series_id,year,period,value,footnote_codes_x,supersector_code,industry_code,data_type_code,seasonal,series_title,...,naics_code,publishing_status,industry_name,display_level,selectable,sort_sequence,data_type_text,mm,month,supersector_name
0,CES0000000001,1939,M01,29923,,0,0,1,S,"All employees, thousands, total nonfarm, seaso...",...,-,B,Total nonfarm,0,T,1,"ALL EMPLOYEES, THOUSANDS",JAN,January,Total nonfarm
1,CES0000000001,1939,M02,30100,,0,0,1,S,"All employees, thousands, total nonfarm, seaso...",...,-,B,Total nonfarm,0,T,1,"ALL EMPLOYEES, THOUSANDS",FEB,February,Total nonfarm
2,CES0000000001,1939,M03,30280,,0,0,1,S,"All employees, thousands, total nonfarm, seaso...",...,-,B,Total nonfarm,0,T,1,"ALL EMPLOYEES, THOUSANDS",MAR,March,Total nonfarm
3,CES0000000001,1939,M04,30094,,0,0,1,S,"All employees, thousands, total nonfarm, seaso...",...,-,B,Total nonfarm,0,T,1,"ALL EMPLOYEES, THOUSANDS",APR,April,Total nonfarm
4,CES0000000001,1939,M05,30299,,0,0,1,S,"All employees, thousands, total nonfarm, seaso...",...,-,B,Total nonfarm,0,T,1,"ALL EMPLOYEES, THOUSANDS",MAY,May,Total nonfarm


In [5]:
# Summarise the merged frame: row count, column names, dtypes and memory use
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8084418 entries, 0 to 8084417
Data columns (total 25 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   series_id          object
 1   year               object
 2   period             object
 3   value              object
 4   footnote_codes_x   object
 5   supersector_code   object
 6   industry_code      object
 7   data_type_code     object
 8   seasonal           object
 9   series_title       object
 10  footnote_codes_y   object
 11  begin_year         object
 12  begin_period       object
 13  end_year           object
 14  end_period         object
 15  naics_code         object
 16  publishing_status  object
 17  industry_name      object
 18  display_level      object
 19  selectable         object
 20  sort_sequence      object
 21  data_type_text     object
 22  mm                 object
 23  month              object
 24  supersector_name   object
dtypes: object(25)
memory usage: 1.5+ GB


In [6]:
# Write the full merged frame (no filtering applied) to disk, without the row index.
# Every column is text, so read the file back with dtype=str to keep zero-padded codes.
df.to_csv(
    "bls_employment_unfiltered.csv",
    index=False,
)