In [18]:
# import dependencies
import pandas as pd
import os
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect

In [19]:
# create a connection to the database
engine = create_engine(f'postgresql://postgres:postgres@localhost:5432/HigherEducation')
con = engine.connect()

### college_names table

In [20]:
### College Scorecard Data Dictionary

HIGHDEG = {
	0:	'Non-degree-granting',
	1:	'Certificate degree',
	2:	'Associate degree',
	3:	'Bachelors degree',
	4:	'Graduate degree'
}

ICLEVEL = {
	1:	'4-year',
	2:	'2-year',
	3:	'Less-than-2-year'
}

REGION = {
	0:	'U.S. Service Schools',
	1:	'New England (CT, ME, MA, NH, RI, VT)',
	2:	'Mid East (DE, DC, MD, NJ, NY, PA)',
	3:	'Great Lakes (IL, IN, MI, OH, WI)',
	4:	'Plains (IA, KS, MN, MO, NE, ND, SD)',
	5:	'Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)',
	6:	'Southwest (AZ, NM, OK, TX)',
	7:	'Rocky Mountains (CO, ID, MT, UT, WY)',
	8:	'Far West (AK, CA, HI, NV, OR, WA)',
	9:	'Outlying Areas (AS, FM, GU, MH, MP, PR, PW, VI)'
}

DISTANCEONLY = {
	0:	'Not distance-education only',
	1:	'Distance-education only'
}
    
CURROPER = {
	0:	'Not currently certified as an operating institution',
	1:	'Currently certified as operating'
}
    
SCHTYPE = { 
	1:	'Public',
	2:	'Private, Nonprofit',
	3:	'Private, For-profit'
}

In [21]:
# set files path
path = 'data/scorecard'

# columns to select from datasets
college_columns = [ 'UNITID','INSTNM','INSTURL',\
                    'CITY','STABBR','ZIP',\
                    'LATITUDE','LONGITUDE',\
                    'ST_FIPS', 'REGION','SCHTYPE',\
                    'HIGHDEG','CURROPER'
                  ]

# columns renamed as user friendly labels for use in database
college_cols_renamed = {
                        'UNITID'    : 'college_id',
                        'INSTNM'    : 'name',
                        'INSTURL'   : 'website',
                        'SCHTYPE'   : 'schtype',
                        'CITY'      : 'city',
                        'STABBR'    : 'state',
                        'ZIP'       : 'zipcode',
                        'ST_FIPS'   : 'state_fips',
                        'LATITUDE'  : 'latitude',
                        'LONGITUDE' : 'longitude',
                        'REGION'    : 'region'
                       }
# create an empty dataframe for storing the CSV files data
college_names_df = pd.DataFrame() 

# loop through files in current directory
for filename in os.listdir(path):
    if 'MERGED' in filename:
        year = filename[6:][:7]
        df = pd.read_csv(os.path.join(path, filename), encoding='utf-8', low_memory=False)
        df = df[college_columns]
        try:
            college_names_df = college_names_df.append(df, ignore_index=True)
        except:
            pass
            
# keep only the first 5 characters of the zip code
college_names_df['ZIP'] = college_names_df['ZIP'].str[:5]

cond1 = (college_names_df['CURROPER'] == 1)           # select currently operating schools
cond2 = (college_names_df['HIGHDEG'] > 0)             # select degree granting schools only
college_names_df = college_names_df[cond1 & cond2]    # filter data and save as updated dataframe

# rename columns
college_names_df = college_names_df.rename(columns=college_cols_renamed)

# drop last two rows
df_cols = len(college_names_df.columns)
college_names_df.drop(college_names_df.iloc[:, df_cols-2:df_cols], inplace = True, axis = 1)

# update region names using the data dictionary provided by College Scorecard
college_names_df['region'] = college_names_df['region'].replace(REGION)
college_names_df['schtype'] = college_names_df['schtype'].replace(SCHTYPE)

# Import schools from Kaggle CSV
schools_df = pd.read_csv('data/kaggle/salaries-by-region-id.csv', encoding='utf-8').iloc[:, 0:2]

# drop any rows with a missing college id
schools_df = schools_df[schools_df['UNITID'].isna() == False]

# convert the college id to an integer
schools_df = schools_df.astype({ 'UNITID': int })

# rename the `UNITID` column to `college_id`
schools_df.rename(columns={'UNITID': 'college_id', 'School Name': 'name'}, inplace=True)

# drop duplicates
schools_df.drop_duplicates(inplace=True)

# sort `college_id`
schools_df.sort_values(by='college_id', inplace=True)

# reset index
schools_df.reset_index(drop=True, inplace=True)

# Import school types from Kaggle CSV file
school_types_df = pd.read_csv('data/kaggle/salaries-by-college-type-id.csv', encoding='utf-8').iloc[:, 0:3]

# drop any rows with a missing college id
school_types_df = school_types_df[school_types_df['UNITID'].isna() == False]

# convert the college id to an integer
school_types_df = school_types_df.astype({ 'UNITID': int })

# rename the `UNITID` column to `college_id`
school_types_df.rename(columns={'UNITID': 'college_id', 'School Name': 'name', 'School Type': 'type'}, inplace=True)

# update school types of `Party` or `State` to `Public`
school_types_df.loc[school_types_df['type'].isin(['Party','State']), 'type'] = 'Public'

# drop duplicates
school_types_df.drop_duplicates(inplace=True)

# remove bad data
school_types_df = school_types_df.drop(school_types_df[(school_types_df['college_id']==233295) & (school_types_df['type']=='Public')].index)

# sort by `college_id`
school_types_df.sort_values(by='college_id', inplace=True)

# reset index
school_types_df.reset_index(drop=True,inplace=True)

# merge Kaggle schools data with Scorecard schools data
college_names_tbl = pd.merge(schools_df.merge(school_types_df, on=['college_id','name'], how='left'),
              college_names_df, on=['college_id'], how='left')

# replace missing Kaggle school type with Scorecard school type
college_names_tbl.type.fillna(college_names_tbl.schtype, inplace=True)

# delete unneeded columns
del college_names_tbl['schtype']
del college_names_tbl['name_y']

# rename `name` column
college_names_tbl.rename(columns={'name_x': 'name'}, inplace=True)

# set `college_id` as the index
college_names_tbl.set_index('college_id', inplace=True)

# store table in database
college_names_tbl.to_sql(name='college_names', con=engine, if_exists='replace', index=True, method='multi')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd3 in position 8: invalid continuation byte

### regions table

In [22]:
# set files path
path = 'data/cbsa'

# create dataframe to hold all zip code data from files
zips_df = pd.DataFrame(columns=['zipcode', 'cbsa_code'])

# loop through current working directory ZIP code excel files
for filename in os.listdir(path):
    if filename.startswith('ZIP')&filename.endswith('.xlsx'):
        df = pd.read_excel(os.path.join(path, filename), sheet_name=0, converters={'ZIP': str, 'zip': str})
        df = df.rename(columns={'zip': 'zipcode','cbsa': 'cbsa_code', 'ZIP': 'zipcode', 'CBSA': 'cbsa_code'})[['zipcode', 'cbsa_code']]
        zips_df = zips_df.append(df, ignore_index=True)

# drop any duplicate rows
zips_df.drop_duplicates(inplace=True)

# sort table by zip code
zips_df.sort_values(by=['zipcode'], inplace=True)

# reset the index
zips_df.reset_index(drop=True, inplace=True)

# cbsa dataframe column names
cbsa_columns = [ 
                 'cbsa_code', 'cbsa_title',
                 'csa_code', 'title', 
                 'state_name', 'state_fips', 
                 'county', 'county_code'
               ]

cbsa_cols_renamed = {
                        'CBSA Code': 'cbsa_code', 'CBSA Title': 'cbsa_title',
                        'CSA Code': 'csa_code', 'CSA Title': 'title', 
                        'State Name': 'state_name', 'FIPS State Code': 'state_fips',
                        'County/County Equivalent': 'county',
                        'FIPS County Code': 'county_code'
                    }

# create dataframe to hold all cbsa code data from files
cbsa_df = pd.DataFrame(columns=cbsa_columns)

# loop through current working directory CBSA code excel files
for filename in os.listdir(path):
    if filename.endswith('_cbsa.xls'):
        # read excel file and delete the last three rows containing text
        df = pd.read_excel(os.path.join(path, filename), sheet_name=0, skiprows=2).iloc[:-3]
        df = df.rename(columns=cbsa_cols_renamed)[cbsa_columns]
        cbsa_df = cbsa_df.append(df, ignore_index=True)


# drop any duplicate rows
cbsa_df.drop_duplicates(inplace=True)

# replace missing csa titles with the cbsa title
cbsa_df.title.fillna(cbsa_df.cbsa_title, inplace=True)

# sort table by zip code
cbsa_df.sort_values(by=['state_fips'], inplace=True)

# reset the index
cbsa_df.reset_index(drop=True, inplace=True)

# make sure that the codes are integers
cbsa_int_cols = ['cbsa_code', 'csa_code', 'state_fips', 'county_code']
cbsa_df.fillna({ col: 0 for col in cbsa_int_cols }, inplace=True)
cbsa_df = cbsa_df.astype({ col: int for col in cbsa_int_cols })

# delete unneeded columns
del cbsa_df['cbsa_title']

# drop rows with any missing data
cbsa_df.dropna(how='any', inplace=True)

# restructure columns
cbsa_df = cbsa_df[['cbsa_code', 'title','state_name','state_fips']]

# Glassdoor Metro Areas and lookup keys
areas = {
    'Atlanta': 'Atlanta',
    'Boston': 'Boston',
    'Chicago': 'Chicago',
    'Houston': 'Houston',
    'Los Angeles': 'Los Angeles',
    'New York': 'New York City',
    'Philadelphia-Camden': 'Philadelphia',
    'San Francisco': 'San Francisco',
    'Seattle': 'Seattle',
    'DC': 'Washington DC'
}

# create metro column in table with default string 'N/A'
cbsa_df['metro'] = 'N/A'

# loop through lookup keys
for area,metro in areas.items():
    # set glassdoor metro name where lookup key is found in CBSA title
    cbsa_df.loc[cbsa_df.title.str.contains(area), 'metro'] = metro
    
# merge transformations into one table for storing to the database
regions_tbl = pd.merge(zips_df, cbsa_df, on=['cbsa_code'], how='left')

# drop any duplicate rows
regions_tbl.drop_duplicates(inplace=True)

# drop rows with any missing data
regions_tbl.dropna(how='any', thresh=3, inplace=True)

# reset the index
regions_tbl.reset_index(drop=True, inplace=True)

# make sure that the codes are integers
regions_int_cols = ['cbsa_code', 'state_fips']
regions_tbl.fillna({ col: 0 for col in regions_int_cols }, inplace=True)
regions_tbl = regions_tbl.astype({ col: int for col in regions_int_cols })

# set `zipcode` as the index
regions_tbl.set_index('zipcode', inplace=True)

# store table in database
regions_tbl.to_sql(name='regions', con=engine, if_exists='replace', index=True, method='multi')

# Import National Salary For Each Major

In [36]:
degreesTPB_df = pd.read_csv(r"data/kaggle/degrees-that-pay-back.csv")
#degreesTPB_df.head(5)

degreesTPB_df = degreesTPB_df.rename(columns={"Undergraduate Major": "Majors"})
#degreesTPB_df.head(5)

#Transform Salary to Integers

for col in degreesTPB_df.columns:
    if 'Salary' in col:
        degreesTPB_df[col] = degreesTPB_df[col].replace( '[\$,)]','', regex=True ).astype(float)
        
#Drop Unnecessary Columns
degreesTPB_df = degreesTPB_df.drop(columns=['Percent change from Starting to Mid-Career Salary', 'Mid-Career 10th Percentile Salary', 'Mid-Career 25th Percentile Salary', 'Mid-Career 75th Percentile Salary', 'Mid-Career 90th Percentile Salary'])
degreesTPB_df.head(5)

#Connect to local database
rds_connection_string = "postgres:postgres@localhost:5432/HigherEducation"
engine = create_engine(f'postgresql://{rds_connection_string}')

#Check For Tables

engine.table_names()

#Use pandas to load csv converted DataFrame into database

if_exists_param = 'replace'

degreesTPB_df.to_sql(name='salaries_per_major', con=engine, if_exists=if_exists_param, index=False)
#degreesTPB_df['Major_id'].to_sql(name='school_majors', con=engine, if_exists=if_exists_param, index=False)

#get table names

#inspector = inspect(engine)
#inspector.get_table_names()