# Creating regions Table

In [None]:
# import dependencies
import os
import pandas as pd
from sqlalchemy import create_engine

In [None]:
# widen the pandas display limits so dataframes show in jupyter qtconsole
display_options = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1248,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# create a connection to the database
# NOTE(review): the user name and password are hard-coded in the URL below;
# consider supplying them from the environment instead.
# The URL contains no placeholders, so it is a plain string (no f-prefix).
engine = create_engine('postgresql://postgres:postgres@localhost:5432/HigherEducation')
# NOTE(review): the cells visible here pass `engine` (not `con`) to pandas and
# never close `con` -- confirm whether this open connection is still needed.
con = engine.connect()

### Process CSV files

### HUD-USPS ZIP Crosswalk Files
https://www.huduser.gov/portal/datasets/usps_crosswalk.html#data


In [None]:
# load the ZIP crosswalk Excel files (ZIP*.xlsx) from the current directory

# columns kept from every crosswalk file
zip_columns = ['zipcode', 'cbsa_code']

# accept both the lower-case and upper-case header spellings
zip_cols_renamed = {
    'zip': 'zipcode', 'cbsa': 'cbsa_code',
    'ZIP': 'zipcode', 'CBSA': 'cbsa_code',
}

# get current working directory
path = os.getcwd()

# loop through current working directory ZIP code excel files,
# collecting one dataframe per file
zip_frames = []
for filename in os.listdir(path):
    if filename.startswith('ZIP') and filename.endswith('.xlsx'):
        # the converters make pandas hand back the ZIP column as str
        df = pd.read_excel(os.path.join(path, filename), sheet_name=0, converters={'ZIP': str, 'zip': str})
        zip_frames.append(df.rename(columns=zip_cols_renamed)[zip_columns])

# combine everything in one step: DataFrame.append was removed in pandas 2.0
# and re-copied the accumulated table on every iteration
if zip_frames:
    zips_df = pd.concat(zip_frames, ignore_index=True)
else:
    # no matching files: keep an empty table with the expected columns
    zips_df = pd.DataFrame(columns=zip_columns)

print(zips_df.shape)

# drop any duplicate rows
zips_df = zips_df.drop_duplicates()

# sort table by zip code
zips_df = zips_df.sort_values(by=['zipcode'])

# reset the index
zips_df = zips_df.reset_index(drop=True)

print(zips_df.shape)
zips_df.head(2)

### CBSA data from Census
https://www2.census.gov/programs-surveys/metro-micro/geographies/reference-files/2018/delineation-files/list1.xls


In [None]:
# build the CBSA lookup table from the *_cbsa.xls files in the current directory

# columns kept from every CBSA file (after renaming)
cbsa_columns = [
    'cbsa_code', 'cbsa_title',
    'csa_code', 'title',
    'state_name', 'state_fips',
    'county', 'county_code',
]

# spreadsheet header -> table column name
cbsa_cols_renamed = {
    'CBSA Code': 'cbsa_code', 'CBSA Title': 'cbsa_title',
    'CSA Code': 'csa_code', 'CSA Title': 'title',
    'State Name': 'state_name', 'FIPS State Code': 'state_fips',
    'County/County Equivalent': 'county',
    'FIPS County Code': 'county_code',
}

# get current working directory
path = os.getcwd()

# loop through current working directory CBSA code excel files,
# collecting one dataframe per file
cbsa_frames = []
for filename in os.listdir(path):
    if filename.endswith('_cbsa.xls'):
        # read excel file (skipping the first two rows) and delete the
        # last three rows containing text
        df = pd.read_excel(os.path.join(path, filename), sheet_name=0, skiprows=2).iloc[:-3]
        cbsa_frames.append(df.rename(columns=cbsa_cols_renamed)[cbsa_columns])

# combine everything in one step: DataFrame.append was removed in pandas 2.0
if cbsa_frames:
    cbsa_df = pd.concat(cbsa_frames, ignore_index=True)
else:
    # no matching files: keep an empty table with the expected columns
    cbsa_df = pd.DataFrame(columns=cbsa_columns)

print(cbsa_df.shape)

# drop any duplicate rows
cbsa_df = cbsa_df.drop_duplicates()

# replace missing csa titles with the cbsa title
# (assigned back explicitly: an inplace fillna on the attribute-accessed
# column is a chained assignment and may not update the dataframe)
cbsa_df['title'] = cbsa_df['title'].fillna(cbsa_df['cbsa_title'])

# sort table by state FIPS code
cbsa_df = cbsa_df.sort_values(by=['state_fips'])

# reset the index
cbsa_df = cbsa_df.reset_index(drop=True)

# make sure that the codes are integers (missing codes become 0 first)
cbsa_int_cols = ['cbsa_code', 'csa_code', 'state_fips', 'county_code']
cbsa_df = cbsa_df.fillna({col: 0 for col in cbsa_int_cols})
cbsa_df = cbsa_df.astype({col: int for col in cbsa_int_cols})

# delete unneeded columns
del cbsa_df['cbsa_title']

# drop rows with any missing data
cbsa_df = cbsa_df.dropna(how='any')

# restructure columns; copy so later column assignments act on an
# independent dataframe rather than on a selection of the old one
cbsa_df = cbsa_df[['cbsa_code', 'title', 'state_name', 'state_fips']].copy()

print(cbsa_df.shape)
cbsa_df.head()

### Glassdoor Economic Research
https://www.glassdoor.com/research/job-market-report-historical/


In [None]:
# Glassdoor metro areas: (lookup key, Glassdoor metro name) pairs,
# stored as a dict keyed by the lookup key
areas = dict((
    ('Atlanta', 'Atlanta'),
    ('Boston', 'Boston'),
    ('Chicago', 'Chicago'),
    ('Houston', 'Houston'),
    ('Los Angeles', 'Los Angeles'),
    ('New York', 'New York City'),
    ('Philadelphia-Camden', 'Philadelphia'),
    ('San Francisco', 'San Francisco'),
    ('Seattle', 'Seattle'),
    ('DC', 'Washington DC'),
))

In [None]:
# create metro column in table with default string 'N/A'
cbsa_df['metro'] = 'N/A'

# loop through lookup keys
for area, metro in areas.items():
    # set glassdoor metro name where lookup key is found in CBSA title.
    # regex=False matches the key as a literal substring; na=False makes a
    # missing title count as "no match" instead of putting NaN in the mask.
    title_matches = cbsa_df['title'].str.contains(area, regex=False, na=False)
    cbsa_df.loc[title_matches, 'metro'] = metro

In [None]:
# merge transformations into one table for storing to the database
# (left join: every ZIP row is kept, CBSA details are added where the code matches)
regions_tbl = pd.merge(zips_df, cbsa_df, on=['cbsa_code'], how='left')

# drop any duplicate rows
regions_tbl = regions_tbl.drop_duplicates()

# keep only rows that have at least 3 non-missing values.
# `how` and `thresh` cannot be combined (pandas >= 1.5 raises TypeError);
# older versions applied only `thresh`, so that is the rule kept here.
regions_tbl = regions_tbl.dropna(thresh=3)

# reset the index
regions_tbl = regions_tbl.reset_index(drop=True)

# make sure that the codes are integers (missing codes become 0 first)
regions_int_cols = ['cbsa_code', 'state_fips']
regions_tbl = regions_tbl.fillna({col: 0 for col in regions_int_cols})
regions_tbl = regions_tbl.astype({col: int for col in regions_int_cols})

# set `zipcode` as the index
regions_tbl = regions_tbl.set_index('zipcode')

# display table information
print(regions_tbl.shape)
regions_tbl.head()

In [None]:
# extract table to CSV file
# regions_tbl.to_csv('../../resources/regions.csv', sep=',', encoding='utf-8', mode='w')

In [None]:
# write the table to the `regions` database table, replacing it if it exists;
# the dataframe index is written as well
regions_tbl.to_sql(
    con=engine,
    name='regions',
    index=True,
    if_exists='replace',
    method='multi',
)

In [None]:
# read the stored table back from the database, indexed by zipcode
regions_query = 'SELECT * FROM regions'
pd.read_sql(regions_query, con=engine, index_col='zipcode')