# Employment - Occupation

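The data is sourced from the Occupational Employment and Wage Statistics (OEWS) program database of the U.S. Bureau of Labor Statistics (BLS). The data link is the "All OEWS data" file for MSAs, available from 2003 to 2020; the code in this notebook downloads the May 2005 through May 2020 releases. The data is updated on a yearly basis in May.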

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
from os import *
import sys
sys.path.append("../")
import zipfile
import urllib.request
from shutil import copyfile
import shutil

In [3]:
# Go to the input folder
def change_dir_to_input(newpath):
    """Change the working directory to a folder under the project's ``input`` directory.

    The project root is taken to be the parent directory of ``sys.path[0]``;
    the target is ``<root>/input/<newpath>``.

    Args:
        newpath: Sub-folder of ``input`` to enter. Pass ``''`` to enter
            ``input`` itself. The value is converted with ``str()``.

    Raises:
        OSError: Propagated from ``os.chdir`` when the target directory does
            not exist or cannot be entered.
    """
    # os.path handles the platform's separator, so this no longer depends on
    # Windows-style "\\" paths.
    project_root = os.path.dirname(sys.path[0])
    target = os.path.join(project_root, "input", str(newpath))
    os.chdir(target)

In [4]:
# Get input files: range(5, 21) below covers the 2005 to 2020 archives.
# For each year: download the zip, load the MSA workbook(s), normalise the
# column names, keep the "major" occupation-group rows and write MSA<year>.csv.
change_dir_to_input('')
# Create the working folders if they are missing so the cell runs on a fresh checkout.
os.makedirs('occupation_orig', exist_ok=True)
os.makedirs('occupation_out', exist_ok=True)

BLS_BASE_URL = 'https://www.bls.gov/oes/special.requests/'
# Archive members whose names start with one of these prefixes are not loaded
# (same selection as before; presumably non-MSA / description workbooks -- TODO confirm).
SKIPPED_PREFIXES = ('a', 'BOS', 'field', 'file')
# Column names differ between yearly releases; map them onto one spelling.
COLUMN_ALIASES = {
    'OCC_GROUP': 'O_GROUP',
    'GROUP': 'O_GROUP',
    'AREA_NAME': 'AREA_TITLE',
}
UNUSED_COLUMNS = ['NAICS', 'NAICS_TITLE', 'I_GROUP', 'OWN_CODE',
                  'PCT_TOTAL', 'ANNUAL', 'HOURLY', 'OCC_TITLE']

for i in range(5, 21):
    yy = '{:02d}'.format(i)            # two-digit year used in the file names: 05 ... 20
    zip_filename = 'oesm' + yy + 'ma.zip'

    change_dir_to_input('occupation_orig')
    urllib.request.urlretrieve(BLS_BASE_URL + zip_filename, zip_filename)

    # The context managers close the archive and its members once they are read.
    with zipfile.ZipFile(zip_filename) as zf:
        if i < 14:
            # 2005-2013: the data is spread over several workbooks in the archive.
            sheets = []
            for member in zf.namelist():
                if member.startswith(SKIPPED_PREFIXES):
                    continue
                with zf.open(member) as handle:
                    sheets.append(pd.read_excel(handle))
            # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
            df = pd.concat(sheets, ignore_index=True)
        else:
            # 2014-2020: a single workbook with a predictable name.
            with zf.open('oesm' + yy + 'ma/MSA_M20' + yy + '_dl.xlsx') as handle:
                df = pd.read_excel(handle)

    df.columns = [str.upper(col) for col in df.columns]
    df = df.rename(columns=COLUMN_ALIASES)
    if 'PRIM_STATE' not in df.columns:
        # Derive the state from the text after the last ", " of the area title.
        df['PRIM_STATE'] = df['AREA_TITLE'].str.split(', ').str[-1]
    # Strip the MSA/PMSA designation. The longer token comes first in a single
    # pattern so that "PMSA" cannot be reduced to a stray "P".
    df['AREA_TITLE'] = df['AREA_TITLE'].replace(r'PMSA|MSA', '', regex=True)
    df['AREA_TITLE'] = df['AREA_TITLE'].str.strip()
    df = df[df.O_GROUP.str.contains("major", na=False)]
    df = df.drop(columns=UNUSED_COLUMNS, errors='ignore')

    change_dir_to_input('occupation_out')
    df['year'] = '20' + yy
    df.to_csv('MSA20' + yy + '.csv')

In [6]:
# Read the per-year MSA20YY.csv files (2005-2020) and stack them into one table
change_dir_to_input('occupation_out')
yearly_frames = []
for year_suffix in range(5, 21):
    # Zero-pad single-digit years: 5 -> "05", giving MSA2005.csv ... MSA2020.csv
    csv_name = 'MSA20' + str(year_suffix).zfill(2) + '.csv'
    yearly_frames.append(pd.read_csv(csv_name))

frame = pd.concat(yearly_frames, axis=0, ignore_index=True)

In [7]:
# Go to the Output folder: <parent of sys.path[0]>/output
# Built with os.path so the separator matches the platform instead of a
# hard-coded Windows "\\".
path = os.path.join(os.path.dirname(sys.path[0]), "output")
os.chdir(path)

In [8]:
# Write the combined table to a CSV file in the current (output) folder
combined_csv = 'occupations.csv'
frame.to_csv(combined_csv)

In [9]:
# Read the combined table back from disk and show its column labels
occupations_csv = 'occupations.csv'
df = pd.read_csv(occupations_csv)
df.columns

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Index(['Unnamed: 0', 'Unnamed: 0.1', 'PRIM_STATE', 'AREA', 'AREA_TITLE',
       'OCC_CODE', 'O_GROUP', 'TOT_EMP', 'EMP_PRSE', 'H_MEAN', 'A_MEAN',
       'MEAN_PRSE', 'H_PCT10', 'H_PCT25', 'H_MEDIAN', 'H_PCT75', 'H_PCT90',
       'A_PCT10', 'A_PCT25', 'A_MEDIAN', 'A_PCT75', 'A_PCT90', 'year',
       'JOBS_1000', 'LOC QUOTIENT', 'AREA_TYPE', 'LOC_QUOTIENT'],
      dtype='object')

In [10]:
# Drop irrelevant columns: keep only the identifiers, the employment count and the year
keep_columns = ['PRIM_STATE', 'AREA', 'AREA_TITLE', 'OCC_CODE', 'O_GROUP', 'TOT_EMP', 'year']
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
df.drop(columns=df.columns.difference(keep_columns), inplace=True)
df

Unnamed: 0,PRIM_STATE,AREA,AREA_TITLE,OCC_CODE,O_GROUP,TOT_EMP,year
0,AK,11260,"Anchorage, AK",11-0000,major,12880,2005
1,AK,11260,"Anchorage, AK",13-0000,major,5880,2005
2,AK,11260,"Anchorage, AK",15-0000,major,2590,2005
3,AK,11260,"Anchorage, AK",17-0000,major,4360,2005
4,AK,11260,"Anchorage, AK",19-0000,major,2820,2005
...,...,...,...,...,...,...,...
140570,MA,79600,"Worcester, MA-CT",45-0000,major,330,2020
140571,MA,79600,"Worcester, MA-CT",47-0000,major,9110,2020
140572,MA,79600,"Worcester, MA-CT",49-0000,major,9160,2020
140573,MA,79600,"Worcester, MA-CT",51-0000,major,18430,2020


In [11]:
# Exclude the rows whose primary state is Puerto Rico ('PR')
not_puerto_rico = df['PRIM_STATE'] != 'PR'
df = df[not_puerto_rico]
df

Unnamed: 0,PRIM_STATE,AREA,AREA_TITLE,OCC_CODE,O_GROUP,TOT_EMP,year
0,AK,11260,"Anchorage, AK",11-0000,major,12880,2005
1,AK,11260,"Anchorage, AK",13-0000,major,5880,2005
2,AK,11260,"Anchorage, AK",15-0000,major,2590,2005
3,AK,11260,"Anchorage, AK",17-0000,major,4360,2005
4,AK,11260,"Anchorage, AK",19-0000,major,2820,2005
...,...,...,...,...,...,...,...
140570,MA,79600,"Worcester, MA-CT",45-0000,major,330,2020
140571,MA,79600,"Worcester, MA-CT",47-0000,major,9110,2020
140572,MA,79600,"Worcester, MA-CT",49-0000,major,9160,2020
140573,MA,79600,"Worcester, MA-CT",51-0000,major,18430,2020


In [19]:
# Order the rows by area title
df = df.sort_values(by='AREA_TITLE')

In [20]:
# Reshape to wide format: 'year' is the innermost index level, so unstack()
# turns each year into its own TOT_EMP column
index_levels = ['PRIM_STATE', 'AREA', 'AREA_TITLE', 'OCC_CODE', 'O_GROUP', 'year']
employment = df.set_index(index_levels)['TOT_EMP']
df3 = employment.unstack()
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
PRIM_STATE,AREA,AREA_TITLE,OCC_CODE,O_GROUP,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
AK,11260,"Anchorage, AK",11-0000,major,12880,13200,13230,12430,11910,10720,10350,10040,10370,10000,10090,9820,9950,10610,11830,12000
AK,11260,"Anchorage, AK",13-0000,major,5880,6200,5860,6360,6940,7340,7270,7900,8180,8320,8980,9190,8940,8000,7920,7780
AK,11260,"Anchorage, AK",15-0000,major,2590,2510,2500,2760,3110,3320,3390,3500,3360,3190,3350,3370,3570,3300,3280,2980
AK,11260,"Anchorage, AK",17-0000,major,4360,4290,4520,4810,5370,5530,5440,5530,5260,5140,5270,5070,4820,4610,4440,4390
AK,11260,"Anchorage, AK",19-0000,major,2820,2530,2550,2840,2830,2690,2750,3000,3010,2870,2710,2590,2480,2390,2930,3050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WY,16940,"Cheyenne, WY",45-0000,major,,,,,,,,,,,30,40,50,40,30,
WY,16940,"Cheyenne, WY",47-0000,major,2260,2750,2560,2950,2720,2780,2600,2770,3220,3000,3330,3110,2930,2960,3120,3190
WY,16940,"Cheyenne, WY",49-0000,major,1930,1840,2100,1960,2100,2190,2300,2380,2300,2210,2180,2090,2300,2180,2250,2110
WY,16940,"Cheyenne, WY",51-0000,major,1430,1540,1690,1740,1420,1110,1200,1390,1410,1300,1130,1120,1240,1260,1560,1570


In [14]:
# Save the year-by-year table.
# NOTE(review): the original comment flagged "naming convention" for this file name -- confirm the intended name.
wide_csv = 'occs_by_year.csv'
df3.to_csv(wide_csv)