# Employment - Occupation

The data is sourced from the BLS Occupational Employment and Wage Statistics (OEWS) program database. The data link is the All OEWS data file for MSAs, available from 2003 to 2020; this notebook downloads the files for 2005 to 2020. The data is updated on a yearly basis in May.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
from os import *
import sys
sys.path.append("../")
import zipfile
import urllib.request
from shutil import copyfile
import shutil

In [2]:
# Change directory to input
def change_dir_to_input(newpath):
    """Make ``<parent of sys.path[0]>/input/<newpath>`` the working directory.

    The path is assembled with ``os.path`` so it resolves with the native
    separator instead of a hard-coded backslash.

    Args:
        newpath: Sub-folder of ``input`` to enter. It is converted with
            ``str()``; an empty string selects the ``input`` folder itself.

    Raises:
        OSError: If the resulting directory cannot be entered (for example
            because it does not exist).
    """
    # sys.path[0] is used as the anchor; its parent is treated as the
    # project root that holds the ``input`` folder.
    project_root = os.path.dirname(sys.path[0])
    os.chdir(os.path.join(project_root, "input", str(newpath)))

In [3]:
# Get input files from 2005 to 2020 (i is the two-digit year, 5 -> 2005)
change_dir_to_input('')
# Create the working folders on first run; a no-op when they already exist.
os.makedirs('occupation_orig', exist_ok=True)
os.makedirs('occupation_out', exist_ok=True)
for i in range(5, 21):
    change_dir_to_input('occupation_orig')

    # Archive names carry a zero-padded two-digit year: oesm05ma.zip ... oesm20ma.zip
    yy = str(i).zfill(2)
    zip_filename = 'oesm' + yy + 'ma.zip'
    url = 'https://www.bls.gov/oes/special.requests/' + zip_filename
    urllib.request.urlretrieve(url, zip_filename)

    # The context manager closes the archive once the workbooks are loaded.
    with zipfile.ZipFile(zip_filename) as zf:
        if i < 14:
            # Up to 2013 the archive holds several workbooks; every member is
            # read except those whose name starts with one of these prefixes.
            skip_prefixes = ('a', 'BOS', 'field', 'file')
            files = [name for name in zf.namelist()
                     if not name.startswith(skip_prefixes)]
            parts = [pd.read_excel(zf.open(name)) for name in files]
            # One concat replaces repeated DataFrame.append calls, which
            # pandas 2.0 removed.
            df = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()
        else:
            # From 2014 on a single workbook is read from the archive.
            df = pd.read_excel(zf.open('oesm' + yy + 'ma/MSA_M20' + yy + '_dl.xlsx'))

    # Harmonise column names across the yearly layouts.
    df.columns = [col.upper() for col in df.columns]
    df = df.rename(columns={
        'OCC_GROUP': 'O_GROUP',
        'GROUP': 'O_GROUP',
        'AREA_NAME': 'AREA_TITLE',
    })

    # Strip the MSA/PMSA marker with a single pattern. Replacing 'MSA' and
    # 'PMSA' as two separate patterns can leave a stray 'P' behind when the
    # shorter one is applied first.
    df['AREA_TITLE'] = df['AREA_TITLE'].replace('P?MSA', '', regex=True)
    df['AREA_TITLE'] = df['AREA_TITLE'].str.strip()

    # Derive the state from the cleaned title (text after the last ', ') so a
    # trailing MSA/PMSA marker cannot end up inside the state code and defeat
    # the Puerto Rico filter below.
    if 'PRIM_STATE' not in df.columns:
        df['PRIM_STATE'] = df['AREA_TITLE'].str.split(', ').str[-1]

    # Keep only rows whose O_GROUP contains "major" and exclude Puerto Rico.
    df = df[df.O_GROUP.str.contains("major", na=False)]
    df = df[df.PRIM_STATE != 'PR']
    df = df.drop(['NAICS', 'NAICS_TITLE', 'I_GROUP', 'OWN_CODE', 'PCT_TOTAL',
                  'ANNUAL', 'HOURLY', 'OCC_TITLE', 'AREA_TITLE'],
                 axis=1, errors='ignore')

    # Write one CSV per year, tagged with the four-digit year as a string.
    change_dir_to_input('occupation_out')
    year = '20' + yy
    df['year'] = year
    df.to_csv('MSA' + year + '.csv')

In [4]:
# Load each per-year CSV (MSA2005.csv ... MSA2020.csv) from occupation_out
# and stack them row-wise into a single frame with a fresh index.
change_dir_to_input('occupation_out')
li = [pd.read_csv('MSA20' + str(yr).zfill(2) + '.csv') for yr in range(5, 21)]

frame = pd.concat(li, axis=0, ignore_index=True)

In [5]:
# Save the combined all-years frame as one CSV in the current working
# directory (occupation_out, entered by the preceding cell).
# The row index is written as well (to_csv default), so it comes back as an
# 'Unnamed: 0' column when the file is read again.
frame.to_csv('occupations.csv')

In [6]:
# Reload the combined file and list its columns to decide which ones to keep.
# The 'Unnamed: 0*' entries in the listing are row indexes saved by the
# earlier to_csv calls, not source data.
df = pd.read_csv('occupations.csv')
df.columns

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Index(['Unnamed: 0', 'Unnamed: 0.1', 'PRIM_STATE', 'AREA', 'OCC_CODE',
       'O_GROUP', 'TOT_EMP', 'EMP_PRSE', 'H_MEAN', 'A_MEAN', 'MEAN_PRSE',
       'H_PCT10', 'H_PCT25', 'H_MEDIAN', 'H_PCT75', 'H_PCT90', 'A_PCT10',
       'A_PCT25', 'A_MEDIAN', 'A_PCT75', 'A_PCT90', 'year', 'JOBS_1000',
       'LOC QUOTIENT', 'AREA_TYPE', 'LOC_QUOTIENT'],
      dtype='object')

In [7]:
# Drop irrelevant columns: keep only the identifiers, the employment count
# and the year. The columns are passed by keyword because a positional
# ``axis`` argument to DataFrame.drop is no longer accepted by pandas 2.x.
columns_to_keep = ['AREA', 'OCC_CODE', 'O_GROUP', 'TOT_EMP', 'year']
df.drop(columns=df.columns.difference(columns_to_keep), inplace=True)
df

Unnamed: 0,AREA,OCC_CODE,O_GROUP,TOT_EMP,year
0,11260,11-0000,major,12880,2005
1,11260,13-0000,major,5880,2005
2,11260,15-0000,major,2590,2005
3,11260,17-0000,major,4360,2005
4,11260,19-0000,major,2820,2005
...,...,...,...,...,...
138129,79600,45-0000,major,330,2020
138130,79600,47-0000,major,9110,2020
138131,79600,49-0000,major,9160,2020
138132,79600,51-0000,major,18430,2020


In [8]:
# Reshape to wide form: one row per (AREA, OCC_CODE, O_GROUP) and one
# TOT_EMP column per year. Combinations with no row for a year become NaN.
index_cols = ['AREA', 'OCC_CODE', 'O_GROUP', 'year']
tot_emp = df.set_index(index_cols)['TOT_EMP']
df3 = tot_emp.unstack(level='year')
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
AREA,OCC_CODE,O_GROUP,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
10180,11-0000,major,2350,2150,2150,2220,2230,2370,2440,2480,2150,2190,2110,2030,1960,2210,2670,2910
10180,13-0000,major,1430,1470,1510,1970,2020,1870,1610,1960,1760,2100,2200,2230,2270,2130,2280,2150
10180,15-0000,major,390,380,380,600,680,720,710,740,800,770,1010,920,810,540,630,820
10180,17-0000,major,,510,490,570,540,530,560,540,600,460,670,710,770,680,770,840
10180,19-0000,major,220,230,210,230,250,270,350,330,340,440,410,380,290,180,330,370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79600,45-0000,major,70,,160,220,100,90,190,190,230,190,180,200,210,190,290,330
79600,47-0000,major,7950,8210,8160,8040,7520,6880,6910,7060,7040,6800,7670,7960,8700,9480,9800,9110
79600,49-0000,major,7850,8250,8480,8980,8000,8120,8080,8490,8220,8410,9640,9720,9770,9680,9790,9160
79600,51-0000,major,18680,19280,18830,18070,16210,16150,16180,16790,16710,17090,19210,18640,18870,19230,19050,18430


In [9]:
# Change the directory
def change_dir(folder):
    """Make ``<parent of sys.path[0]>/<folder>`` the working directory.

    The path is assembled with ``os.path`` so it resolves with the native
    separator instead of a hard-coded backslash.

    Args:
        folder: Name of the folder, next to the one in ``sys.path[0]``,
            to switch into.

    Raises:
        OSError: If the resulting directory cannot be entered (for example
            because it does not exist).
    """
    # sys.path[0] is used as the anchor; its parent is treated as the
    # project root that holds ``folder``.
    project_root = os.path.dirname(sys.path[0])
    os.chdir(os.path.join(project_root, folder))

In [10]:
# Switch to the project's output folder and write the wide table there
# (years as columns). The file name matches the intermediate CSV written
# earlier, but this one lands in the output folder.
change_dir('output')
df3.to_csv('occupations.csv')   # naming convention