In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Locate the ../kaggle directory (resolved against the current working
# directory) and collect the names of the CSV files it contains.
path = os.path.join(os.getcwd(), '../kaggle')
files = list(filter(lambda name: name.endswith('.csv'), os.listdir(path)))
files

['degrees-that-pay-back.csv',
 'salaries-by-college-type.csv',
 'salaries-by-region-id.csv',
 'salaries-by-region.csv',
 'salaries-by-region-updated.csv']

In [3]:
# Read every listed CSV into a DataFrame, keyed by the file name with its
# trailing four characters (the ".csv" suffix) removed.
dfs = {}
for csv_name in files:
    csv_path = os.path.join(path, csv_name)
    dfs[csv_name[:-4]] = pd.read_csv(csv_path, encoding='utf-8', low_memory=False)

In [4]:
# Keep only the rows of the region-id table that have a UNITID, renumber
# the index from zero, and store UNITID as an integer.
region_id = dfs['salaries-by-region-id']
test = (
    region_id[region_id['UNITID'].notna()]
    .reset_index(drop=True)
    .astype({'UNITID': int})
)

In [5]:
# Report the (rows, columns) shape, then preview the first two rows.
print(test.shape)
test.head(n=2)

(317, 9)


Unnamed: 0,UNITID,School Name,Region,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
0,243744,Stanford University,California,"$70,400.00","$129,000.00","$68,400.00","$93,100.00","$184,000.00","$257,000.00"
1,110404,California Institute of Technology (CIT),California,"$75,500.00","$123,000.00",,"$104,000.00","$161,000.00",


In [6]:
# Load the CBSA delineation workbook (first sheet, skipping the two rows
# above the header) and keep only the CBSA/CSA code and title columns.
cbsa_raw = pd.read_excel('2018_cbsa.xls', sheet_name=0, skiprows=2)
cbsa = (
    cbsa_raw[['CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']]
    # Drop rows in which all four selected columns are empty.
    .dropna(how='all')
    # Discard the last three remaining rows.
    # NOTE(review): presumably footnote rows at the end of the sheet - confirm.
    .iloc[:-3]
    # Build the integer code column with assign() rather than assigning into
    # the iloc slice, which triggers pandas' SettingWithCopyWarning.
    .assign(**{
        'CBSA Code': lambda frame: pd.to_numeric(frame['CBSA Code'], errors='coerce').astype(int)
    })
)
cbsa.shape

(1900, 4)

In [7]:
# Start from an empty metro lookup table with the three expected columns.
metro_columns = ['CBSA Code', 'CBSA Title', 'Metro']
metro_df = pd.DataFrame(columns=metro_columns)

In [8]:
# Glassdoor metro areas.
# Each key is the text searched for inside a CBSA title; each value is the
# metro name recorded for the matching rows.
areas = dict([
    ('Atlanta', 'Atlanta'),
    ('Boston', 'Boston'),
    ('Chicago', 'Chicago'),
    ('Houston', 'Houston'),
    ('Los Angeles', 'Los Angeles'),
    ('New York', 'New York City'),
    ('Philadelphia-Camden', 'Philadelphia'),
    ('San Francisco', 'San Francisco'),
    ('Seattle', 'Seattle'),
    ('DC', 'Washington DC'),
])

In [9]:
# Build the metro lookup table: for every lookup key, collect the distinct
# (CBSA Code, CBSA Title) pairs whose title contains the key and tag them
# with the metro name.
#
# The per-area frames are gathered in a list and concatenated once.
# DataFrame.append was removed in pandas 2.0, and rebuilding metro_df from
# scratch keeps this cell idempotent: re-running it no longer stacks
# duplicate rows onto the previous result.
metro_frames = []
for area, orig in areas.items():
    # regex=False: the key is matched as literal text, not as a pattern.
    # na=False: rows without a title are treated as non-matching.
    title_matches = cbsa['CBSA Title'].str.contains(area, regex=False, na=False)
    area_rows = (
        cbsa.loc[title_matches, ['CBSA Code', 'CBSA Title']]
        .drop_duplicates()
        .sort_values(['CBSA Code', 'CBSA Title'])
        .assign(Metro=orig)
    )
    metro_frames.append(area_rows)

metro_df = pd.concat(metro_frames, ignore_index=True)
metro_df['CBSA Code'] = metro_df['CBSA Code'].astype(int)

In [10]:
# Print the table's shape and column dtypes, then display the full table.
print(metro_df.shape, metro_df.dtypes, sep='\n')
metro_df

(10, 3)
CBSA Code      int64
CBSA Title    object
Metro         object
dtype: object


Unnamed: 0,CBSA Code,CBSA Title,Metro
0,12060,"Atlanta-Sandy Springs-Roswell, GA",Atlanta
1,14460,"Boston-Cambridge-Newton, MA-NH",Boston
2,16980,"Chicago-Naperville-Elgin, IL-IN-WI",Chicago
3,26420,"Houston-The Woodlands-Sugar Land, TX",Houston
4,31080,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles
5,35620,"New York-Newark-Jersey City, NY-NJ-PA",New York City
6,37980,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",Philadelphia
7,41860,"San Francisco-Oakland-Hayward, CA",San Francisco
8,42660,"Seattle-Tacoma-Bellevue, WA",Seattle
9,47900,"Washington-Arlington-Alexandria, DC-VA-MD-WV",Washington DC


In [11]:
# Save the metro lookup table as UTF-8 CSV, leaving out the row index.
metro_df.to_csv(
    'cbsa_metro_list.csv',
    encoding='utf-8',
    index=False,
)

In [12]:
# Load the ZIP-to-CBSA crosswalk (first sheet), rename its CBSA column to
# 'CBSA Code' and keep only the two columns needed for the join.
# NOTE(review): ZIP is read as an integer (see the dtypes printed below), so
# any leading zeros are not preserved - confirm that is acceptable downstream.
zip_sheet = pd.read_excel('ZIP_CBSA_122019.xlsx', sheet_name=0)
zips = zip_sheet.rename(columns={'CBSA': 'CBSA Code'})[['ZIP', 'CBSA Code']]

In [13]:
# Print the crosswalk's shape and column dtypes, then preview two rows.
print(zips.shape, zips.dtypes, sep='\n')
zips.head(n=2)

(50754, 2)
ZIP          int64
CBSA Code    int64
dtype: object


Unnamed: 0,ZIP,CBSA Code
0,501,35620
1,601,38660


In [14]:
# Inner-join the ZIP crosswalk to the metro table on 'CBSA Code', then drop
# exact duplicate rows and any row that contains a missing value.
cbsa_df = (
    pd.merge(zips, metro_df, how='inner', on='CBSA Code')
    .drop_duplicates()
    .dropna()
)

In [15]:
# Display the column dtypes of the merged ZIP-to-metro table.
cbsa_df.dtypes

ZIP            int64
CBSA Code      int64
CBSA Title    object
Metro         object
dtype: object

In [16]:
# Save the ZIP-to-metro table as UTF-8 CSV, leaving out the row index.
cbsa_df.to_csv(
    'cbsa_zip_list.csv',
    encoding='utf-8',
    index=False,
)