# Creating College Names and Regions Tables

In [1]:
# import dependencies
import os
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [2]:
engine = create_engine('postgresql://postgres:postgres@localhost:5432/HigherEducation')
con = engine.connect()

In [3]:
# read the datadictionary on the third sheet from excel file
datadict = pd.read_excel('CollegeScorecardDataDictionary.xlsx', sheet_name=3)

In [4]:
# retrieve datadictionary information for degrees
deg_cols = datadict[(datadict['dev-category']=='academics') &  (datadict['VARIABLE NAME'].str.contains('PCIP'))]

### College Scorecard Data Dictionary of Interest

In [5]:
HIGHDEG = {
	0:	'Non-degree-granting',
	1:	'Certificate degree',
	2:	'Associate degree',
	3:	'Bachelors degree',
	4:	'Graduate degree'
}

ICLEVEL = {
	1:	'4-year',
	2:	'2-year',
	3:	'Less-than-2-year'
}

REGION = {
	0:	'U.S. Service Schools',
	1:	'New England (CT, ME, MA, NH, RI, VT)',
	2:	'Mid East (DE, DC, MD, NJ, NY, PA)',
	3:	'Great Lakes (IL, IN, MI, OH, WI)',
	4:	'Plains (IA, KS, MN, MO, NE, ND, SD)',
	5:	'Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)',
	6:	'Southwest (AZ, NM, OK, TX)',
	7:	'Rocky Mountains (CO, ID, MT, UT, WY)',
	8:	'Far West (AK, CA, HI, NV, OR, WA)',
	9:	'Outlying Areas (AS, FM, GU, MH, MP, PR, PW, VI)'
}

DISTANCEONLY = {
	0:	'Not distance-education only',
	1:	'Distance-education only'
}
    
CURROPER = {
	0:	'Not currently certified as an operating institution',
	1:	'Currently certified as operating'
}
    
SCHTYPE = { 
	1:	'Public',
	2:	'Private, Nonprofit',
	3:	'Private, For-profit'
}

## Create college_names dataframe
### read all CSV files into a pandas dataframe in which the year is the key for the dataframe

In [102]:
# get current working directory
file_path = os.getcwd()

# create an empty dataframe for storing the CSV files data
college_names_df = pd.DataFrame() 

# loop through files in current directory
for filename in os.listdir(file_path):
    if 'MERGED' in filename:
        year = filename[6:][:7]
        df = pd.read_csv(os.path.join(file_path, filename), encoding='utf-8', low_memory=False)
        df = df[['UNITID','INSTNM','CITY','STABBR','ZIP',\
                 'LATITUDE','LONGITUDE',\
                 'REGION','SCHTYPE','HIGHDEG','CURROPER'
                 ]]
        #df['YEAR'] = int(year)
        try:
            college_names_df = college_names_df.append(df, ignore_index=True)
        except:
            print('CSV dataframe append failed')
            
# keep only the first 5 characters of the zip code
college_names_df['ZIP'] = college_names_df['ZIP'].str[:5]

cond1 = (college_names_df['CURROPER'] == 1)           # select currently operating schools
cond2 = (college_names_df['HIGHDEG'] > 0)             # select degree granting schools only
college_names_df = college_names_df[cond1 & cond2]    # filter data and save as updated dataframe

# rename columns
college_names_df = college_names_df.rename(columns={
                                                        'UNITID'    : 'college_id',
                                                        'INSTNM'    : 'name',
                                                        'SCHTYPE'   : 'schtype',
                                                        'CITY'      : 'city',
                                                        'STABBR'    : 'state',
                                                        'ZIP'       : 'zipcode',
                                                        'LATITUDE'  : 'latitude',
                                                        'LONGITUDE' : 'longitude',
                                                        'REGION'    : 'region'
                                                    })

# drop last two rows
df_cols = len(college_names_df.columns)
college_names_df.drop(college_names_df.iloc[:, df_cols-2:df_cols], inplace = True, axis = 1)

# update region names using the data dictionary provided by College Scorecard
college_names_df['region'] = college_names_df['region'].replace(REGION)
college_names_df['schtype'] = college_names_df['schtype'].replace(SCHTYPE)

print(college_names_df.shape)
college_names_df.head(2)

(6127, 9)


Unnamed: 0,college_id,name,city,state,zipcode,latitude,longitude,region,schtype
0,100654,Alabama A & M University,Normal,AL,35762,34.783368,-86.568502,"Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC,...",Public
1,100663,University of Alabama at Birmingham,Birmingham,AL,35294,33.505697,-86.799345,"Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC,...",Public


### Import CSVs from Kaggle with Schools Listed

In [56]:
schools_df = pd.read_csv('../kaggle/salaries-by-region-id.csv', encoding='utf-8').iloc[:, 0:2]

# drop any rows with a missing college id
schools_df = schools_df[schools_df['UNITID'].isna() == False]

# convert the college id to an integer
schools_df = schools_df.astype({ 'UNITID': int })

# rename the `UNITID` column to `college_id`
schools_df.rename(columns={'UNITID': 'college_id', 'School Name': 'name'}, inplace=True)

# drop duplicates
schools_df.drop_duplicates(inplace=True)

# set `college_id` as the index
#schools_df.set_index('college_id', inplace=True)

print(schools_df.shape)
schools_df.head()

(317, 2)


Unnamed: 0,college_id,name
0,243744,Stanford University
1,110404,California Institute of Technology (CIT)
2,115409,Harvey Mudd College
3,110635,"University of California, Berkeley"
4,120254,Occidental College


In [84]:
# retrieve school types from CSV file
school_types_df = pd.read_csv('../kaggle/salaries-by-college-type-id.csv', encoding='utf-8').iloc[:, 0:3]

# drop any rows with a missing college id
school_types_df = school_types_df[school_types_df['UNITID'].isna() == False]

# convert the college id to an integer
school_types_df = school_types_df.astype({ 'UNITID': int })

# rename the `UNITID` column to `college_id`
school_types_df.rename(columns={'UNITID': 'college_id', 'School Name': 'name', 'School Type': 'type'}, inplace=True)

# update school types of `Party` or `State` to `Public`
school_types_df.loc[school_types_df['type'].isin(['Party','State']), 'type'] = 'Public'

# drop duplicates
school_types_df.drop_duplicates(inplace=True)
school_types_df.reset_index(inplace=True)

# remove bad data
school_types_df = school_types_df.drop(school_types_df[(school_types_df['college_id']==233295) & (school_types_df['type']=='Public')].index)

# set `college_id` as the index
#school_types_df.set_index('college_id', inplace=True)

print(school_types_df.shape)
school_types_df.head()

(248, 4)


Unnamed: 0,index,college_id,name,type
0,0,166683,Massachusetts Institute of Technology (MIT),Engineering
1,1,110404,California Institute of Technology (CIT),Engineering
2,2,115409,Harvey Mudd College,Engineering
3,4,190372,Cooper Union,Engineering
4,5,168421,Worcester Polytechnic Institute (WPI),Engineering


In [86]:
school_types_df.groupby('type')['name'].count()

type
Engineering      18
Ivy League        8
Liberal Arts     47
Public          175
Name: name, dtype: int64

In [90]:
schools_df.merge(school_types_df, on=['college_id','name'], how='left')['type'].isna().value_counts()

False    247
True      70
Name: type, dtype: int64

In [94]:
test = schools_df.merge(school_types_df, on=['college_id','name'], how='left')

In [100]:
test.merge(college_names_df, on=['college_id'], how='left')[['college_id','type','schtype']]['type'].isna().value_counts()

False    247
True      70
Name: type, dtype: int64

In [None]:
metro_df = pd.DataFrame(columns=['ZIP','CBSA Code','CBSA Title','Metro','CSA Code','CSA Title'])

In [None]:
areas = {
    'Atlanta': 'Atlanta',
    'Boston': 'Boston',
    'Chicago': 'Chicago',
    'Houston': 'Houston',
    'Los Angeles': 'Los Angeles',
    'New York': 'New York City',
    'Philadelphia-Camden': 'Philadelphia',
    'San Francisco': 'San Francisco',
    'Seattle': 'Seattle',
    'DC': 'Washington DC'
}

In [None]:
for area, orig in areas.items():
    df = merged_df[merged_df['CBSA Title'].str.contains(area)].groupby(['ZIP','CBSA Code','CBSA Title','CSA Code','CSA Title']).count()
    df.reset_index(inplace=True)
    df['Metro'] = orig
    metro_df = metro_df.append(df, ignore_index=True)
metro_df = metro_df.astype({ 'CBSA Code': int, 'CSA Code': int })
metro_df = metro_df.drop_duplicates()

In [None]:
print(metro_df.shape)
print(metro_df.dtypes)
metro_df.head(2)

In [None]:
merged_df[(merged_df['CBSA Title'].isna()==True)&(merged_df['CSA Title'].isna()==True)]#.value_counts()

In [None]:
kaggle_files = [ file for file in os.listdir('kaggle') if file.endswith('.csv') ]

In [None]:
kaggle_dfs = { filename[:-4]: pd.read_csv(os.path.join(os.getcwd(), 'kaggle', filename), encoding='utf-8', low_memory=False) for filename in kaggle_files }

In [None]:
kaggle_dfs.keys()

In [None]:
region_test = kaggle_dfs['salaries-by-region-id'][kaggle_dfs['salaries-by-region-id']['UNITID'].isna() == False]
region_test.reset_index(drop=True, inplace=True)
region_test = region_test.astype({'UNITID': int})

In [None]:
#region_test['INSTNM'] = region_test.progress_apply(lambda x: str(x['School Name'].split(' (')[0]), axis=1)
#region_test['INSTNM'] = region_test.progress_apply(lambda x: str(x['School Name'].replace(',','-')), axis=1)

In [None]:
region_test.head()

In [None]:
region_test.dtypes

In [None]:
inst_test = test.merge(metro_df, on='ZIP', how='inner')

In [None]:
print(inst_test.shape)
#print(inst_test.dtypes)
inst_test.head(2)

In [None]:
inst_test[(inst_test['CBSA Title'].str.contains('New York'))&(inst_test['CITY'].str.contains('Elizabeth'))]

In [None]:
zips_df = pd.read_csv('../cbsa/cbsa_zip_list.csv', encoding='utf-8')

In [None]:
zips_df = pd.read_excel('../cbsa/ZIP_CBSA_122019.xlsx', sheet_name=0, converters={'ZIP': str})[['ZIP','CBSA']]
zips_df = zips_df.rename(columns={'CBSA': 'CBSA Code'})

In [None]:
zip_df = pd.read_excel('../cbsa/ZIP_CBSA_122017.xlsx', sheet_name=0, converters={'zip': str})[['zip','cbsa']]
zip_df = zip_df.rename(columns={'zip': 'ZIP', 'cbsa': 'CBSA Code'})
zips_df = zips_df.append(zip_df)
zips_df = zips_df.drop_duplicates()

In [None]:
print(zips_df.shape)
print(zips_df.dtypes)
zips_df.head(2)


In [None]:
cbsa_df = pd.read_excel('../cbsa/2018_cbsa.xls', sheet_name=0, skiprows=2)
cbsa_df = cbsa_df[['CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']].dropna(how='all')
cbsa_df = cbsa_df.iloc[:-3]
cbsa_df = cbsa_df.drop_duplicates()
cbsa_df = cbsa_df.astype({ 'CBSA Code': int })

In [None]:
cbs_df = pd.read_excel('../cbsa/2017_cbsa.xls', sheet_name=0, skiprows=2)
cbs_df = cbs_df[['CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']].dropna(how='all')
cbs_df = cbs_df.iloc[:-3]
cbs_df = cbs_df.drop_duplicates()
cbs_df = cbs_df.astype({ 'CBSA Code': int })
cbsa_df = cbsa_df.append(cbs_df)
cbsa_df = cbsa_df.drop_duplicates()

In [None]:
print(cbsa_df.shape)
print(cbsa_df.dtypes)
cbsa_df.head(2)

In [None]:
zips_df.merge(cbsa_df, on='CBSA Code', how='left')[zips_df.merge(cbsa_df, on='CBSA Code', how='left')['ZIP']=='06511']

In [None]:
merged_df = zips_df.merge(cbsa_df, on='CBSA Code', how='left')
merged_df = merged_df.drop_duplicates()
#.drop_duplicates(subset=None,keep='first',inplace=False)
merged_df = merged_df[merged_df['CBSA Title'].isna() == False]

In [None]:
print(merged_df.shape)
print(merged_df.dtypes)
merged_df.head(2)