# Creating college_names Table

In [1]:
# import dependencies
import os
import pandas as pd
from sqlalchemy import create_engine

In [130]:
# widen pandas' display limits so large dataframes are shown in the jupyter qtconsole
# (up to 500 rows / 500 columns, 1248 characters wide)
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1248),
):
    pd.set_option(option_name, option_value)

In [2]:
# create a connection to the database
# The connection URL is taken from the HIGHER_ED_DB_URL environment variable so that
# real credentials never have to be written into the notebook. The original local
# development URL is kept as the fallback, so existing setups keep working unchanged.
DEFAULT_DB_URL = 'postgresql://postgres:postgres@localhost:5432/HigherEducation'
engine = create_engine(os.environ.get('HIGHER_ED_DB_URL', DEFAULT_DB_URL))

# open a connection eagerly; this also surfaces connection problems right away
# NOTE(review): the cells visible in this notebook pass `engine` (not `con`) to pandas —
# confirm whether `con` is still needed, and close it when done if so
con = engine.connect()

In [3]:
# read the data dictionary sheet from the College Scorecard excel workbook
# NOTE(review): integer `sheet_name` values are zero-indexed in pandas, so sheet_name=3
# selects the FOURTH sheet of the workbook, not the third — confirm this is the intended sheet
datadict = pd.read_excel('CollegeScorecardDataDictionary.xlsx', sheet_name=3)

In [4]:
# retrieve the data dictionary rows for degrees: entries in the `academics`
# dev-category whose variable name contains `PCIP`
deg_cols = datadict.loc[
    datadict['dev-category'].eq('academics')
    & datadict['VARIABLE NAME'].str.contains('PCIP')
]

### College Scorecard Data Dictionary of Interest

In [5]:
# Lookup tables mapping the integer codes of a Scorecard column to a readable label.
# Each dictionary is named after the column it decodes.

# HIGHDEG code -> label (0 means the school grants no degree)
HIGHDEG = {
    0: 'Non-degree-granting',
    1: 'Certificate degree',
    2: 'Associate degree',
    3: 'Bachelors degree',
    4: 'Graduate degree',
}

# ICLEVEL code -> label
ICLEVEL = {
    1: '4-year',
    2: '2-year',
    3: 'Less-than-2-year',
}

# REGION code -> region name with the states / territories it covers
REGION = {
    0: 'U.S. Service Schools',
    1: 'New England (CT, ME, MA, NH, RI, VT)',
    2: 'Mid East (DE, DC, MD, NJ, NY, PA)',
    3: 'Great Lakes (IL, IN, MI, OH, WI)',
    4: 'Plains (IA, KS, MN, MO, NE, ND, SD)',
    5: 'Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)',
    6: 'Southwest (AZ, NM, OK, TX)',
    7: 'Rocky Mountains (CO, ID, MT, UT, WY)',
    8: 'Far West (AK, CA, HI, NV, OR, WA)',
    9: 'Outlying Areas (AS, FM, GU, MH, MP, PR, PW, VI)',
}

# DISTANCEONLY flag -> label
DISTANCEONLY = {
    0: 'Not distance-education only',
    1: 'Distance-education only',
}

# CURROPER flag -> label
CURROPER = {
    0: 'Not currently certified as an operating institution',
    1: 'Currently certified as operating',
}

# SCHTYPE code -> label
SCHTYPE = {
    1: 'Public',
    2: 'Private, Nonprofit',
    3: 'Private, For-profit',
}

## Create college_names dataframe
### Read every `MERGED` CSV file in the working directory into a single pandas dataframe

In [105]:
# get current working directory
file_path = os.getcwd()

# Scorecard columns needed for the college_names table. HIGHDEG and CURROPER are
# only used to filter rows below and are dropped again afterwards.
scorecard_cols = ['UNITID', 'INSTNM', 'INSTURL',
                  'CITY', 'STABBR', 'ZIP',
                  'LATITUDE', 'LONGITUDE',
                  'REGION', 'SCHTYPE',
                  'HIGHDEG', 'CURROPER']

# read every `MERGED` CSV file in the current directory, loading only the needed columns.
# Filenames are sorted so the row order does not depend on os.listdir's arbitrary ordering.
merged_frames = []
for filename in sorted(os.listdir(file_path)):
    if 'MERGED' in filename:
        df = pd.read_csv(os.path.join(file_path, filename), encoding='utf-8',
                         usecols=scorecard_cols, low_memory=False)
        # usecols keeps the file's own column order, so re-select to get the order above
        merged_frames.append(df[scorecard_cols])

# fail with a clear message instead of an opaque KeyError on 'ZIP' further down
if not merged_frames:
    raise FileNotFoundError("No 'MERGED' CSV files found in " + file_path)

# combine all files in a single step; DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated dataframe on every loop iteration
college_names_df = pd.concat(merged_frames, ignore_index=True)

# keep only the first 5 characters of the zip code
college_names_df['ZIP'] = college_names_df['ZIP'].str[:5]

# keep currently operating, degree granting schools only
is_operating = college_names_df['CURROPER'] == 1
grants_degree = college_names_df['HIGHDEG'] > 0
college_names_df = college_names_df[is_operating & grants_degree]

# drop the two filter-only columns (by name rather than by position) and rename the rest
college_names_df = (
    college_names_df
    .drop(columns=['HIGHDEG', 'CURROPER'])
    .rename(columns={
        'UNITID'    : 'college_id',
        'INSTNM'    : 'name',
        'INSTURL'   : 'website',
        'SCHTYPE'   : 'schtype',
        'CITY'      : 'city',
        'STABBR'    : 'state',
        'ZIP'       : 'zipcode',
        'LATITUDE'  : 'latitude',
        'LONGITUDE' : 'longitude',
        'REGION'    : 'region'
    })
)

# replace region and school type codes with their names, using the data dictionary
# provided by College Scorecard
college_names_df['region'] = college_names_df['region'].replace(REGION)
college_names_df['schtype'] = college_names_df['schtype'].replace(SCHTYPE)

print(college_names_df.shape)
college_names_df.head(2)

(6127, 10)


Unnamed: 0,college_id,name,website,city,state,zipcode,latitude,longitude,region,schtype
0,100654,Alabama A & M University,www.aamu.edu/,Normal,AL,35762,34.783368,-86.568502,"Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC,...",Public
1,100663,University of Alabama at Birmingham,www.uab.edu,Birmingham,AL,35294,33.505697,-86.799345,"Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC,...",Public


### Import CSVs from Kaggle with Schools Listed

In [112]:
# load the first two columns of the Kaggle salaries-by-region file and tidy them up
schools_df = (
    pd.read_csv('../kaggle/salaries-by-region-id.csv', encoding='utf-8')
    .iloc[:, 0:2]
    .dropna(subset=['UNITID'])                 # drop any rows with a missing college id
    .astype({'UNITID': int})                   # convert the college id to an integer
    .rename(columns={'UNITID': 'college_id',   # rename `UNITID` to `college_id`
                     'School Name': 'name'})
    .drop_duplicates()                         # drop duplicates
    .sort_values(by='college_id')              # sort by `college_id`
    .reset_index(drop=True)                    # reset index
)

print(schools_df.shape)
schools_df.head()

(317, 2)


Unnamed: 0,college_id,name
0,100663,University of Alabama at Birmingham (UAB)
1,100706,University of Alabama at Huntsville (UAH)
2,100751,"University of Alabama, Tuscaloosa"
3,100858,Auburn University
4,102553,"University of Alaska, Anchorage"


In [113]:
# retrieve school types from the first three columns of the Kaggle CSV file:
# drop rows without a college id, make the id an integer and rename the columns
school_types_df = (
    pd.read_csv('../kaggle/salaries-by-college-type-id.csv', encoding='utf-8')
    .iloc[:, 0:3]
    .dropna(subset=['UNITID'])
    .astype({'UNITID': int})
    .rename(columns={'UNITID': 'college_id', 'School Name': 'name', 'School Type': 'type'})
)

# update school types of `Party` or `State` to `Public`
school_types_df['type'] = school_types_df['type'].mask(
    school_types_df['type'].isin(['Party', 'State']), 'Public'
)

# drop duplicates
school_types_df = school_types_df.drop_duplicates()

# remove bad data: the `Public` entry recorded for college id 233295
is_bad_row = (school_types_df['college_id'] == 233295) & (school_types_df['type'] == 'Public')

# keep everything else, sorted by `college_id`, with a fresh index
school_types_df = (
    school_types_df[~is_bad_row]
    .sort_values(by='college_id')
    .reset_index(drop=True)
)

print(school_types_df.shape)
school_types_df.head()

(248, 3)


Unnamed: 0,college_id,name,type
0,100663,University of Alabama at Birmingham (UAB),Public
1,100706,University of Alabama at Huntsville (UAH),Public
2,100751,"University of Alabama, Tuscaloosa",Public
3,100858,Auburn University,Public
4,102553,"University of Alaska, Anchorage",Public


In [116]:
# sanity check: number of (non-null) school names recorded for each school type
school_types_df.groupby('type')['name'].count()

type
Engineering      18
Ivy League        8
Liberal Arts     47
Public          175
Name: name, dtype: int64

In [118]:
# trial merge: attach the Kaggle school type to every Kaggle school (left join keeps
# schools that have no matching type)
test = pd.merge(schools_df, school_types_df, on=['college_id', 'name'], how='left')

In [120]:
# count how many rows of the trial merge have a Kaggle school type (False) versus
# a missing one (True) once the Scorecard data is joined on
(
    test
    .merge(college_names_df, on=['college_id'], how='left')
    [['college_id', 'type', 'schtype']]
    ['type']
    .isna()
    .value_counts()
)

False    247
True      70
Name: type, dtype: int64

In [144]:
# merge Kaggle schools data with Scorecard schools data
# (left joins: every Kaggle school is kept, even without a type or a Scorecard match)
college_names_tbl = pd.merge(
    schools_df.merge(school_types_df, on=['college_id', 'name'], how='left'),
    college_names_df,
    on=['college_id'],
    how='left',
)

# replace missing Kaggle school type with Scorecard school type.
# The result is assigned back to the column explicitly: calling fillna(inplace=True) on
# `college_names_tbl.type` acts on a temporary Series and, under pandas Copy-on-Write,
# leaves the dataframe unchanged.
college_names_tbl['type'] = college_names_tbl['type'].fillna(college_names_tbl['schtype'])

# delete unneeded columns, restore the `name` column name (the merge suffixed the
# Kaggle name as `name_x`), and set `college_id` as the index
college_names_tbl = (
    college_names_tbl
    .drop(columns=['schtype', 'name_y'])
    .rename(columns={'name_x': 'name'})
    .set_index('college_id')
)

# display table
print(college_names_tbl.shape)
college_names_tbl.head()

In [152]:
# extract table to CSV file
#college_names_tbl.to_csv('../../resources/college_names.csv', sep=',', encoding='utf-8', mode='w')

In [146]:
# store table in database: writes `college_names` (replacing any existing table of that
# name), keeps the dataframe index as a column, and uses multi-row inserts
college_names_tbl.to_sql(
    name='college_names',
    con=engine,
    if_exists='replace',
    index=True,
    method='multi',
)

In [147]:
# query database table: read the stored `college_names` table back, using `college_id`
# as the dataframe index, to check what was written
pd.read_sql('SELECT * FROM college_names', con=engine, index_col='college_id')

Unnamed: 0_level_0,name,type,website,city,state,zipcode,latitude,longitude,region
college_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100663,University of Alabama at Birmingham (UAB),Public,www.uab.edu,Birmingham,AL,35294,33.505697,-86.799345,"Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC,..."
100706,University of Alabama at Huntsville (UAH),Public,www.uah.edu,Huntsville,AL,35899,34.724557,-86.640449,"Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC,..."
100751,"University of Alabama, Tuscaloosa",Public,www.ua.edu/,Tuscaloosa,AL,35487,33.211875,-87.545978,"Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC,..."
100858,Auburn University,Public,www.auburn.edu,Auburn,AL,36849,32.599378,-85.488258,"Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC,..."
102553,"University of Alaska, Anchorage",Public,www.uaa.alaska.edu,Anchorage,AK,99508,61.190163,-149.82619,"Far West (AK, CA, HI, NV, OR, WA)"
104151,Arizona State University (ASU),Public,www.asu.edu/,Tempe,AZ,85287,33.417721,-111.934383,"Southwest (AZ, NM, OK, TX)"
104179,University of Arizona,Public,www.arizona.edu,Tucson,AZ,85721,32.232672,-110.950815,"Southwest (AZ, NM, OK, TX)"
106397,University of Arkansas,Public,https://www.uark.edu,Fayetteville,AR,72701,36.070009,-94.176981,"Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC,..."
106458,Arkansas State University (ASU),Public,www.astate.edu/,Jonesboro,AR,72401,35.842388,-90.679988,"Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC,..."
106485,University of Arkansas - Monticello (UAM),Public,www.uamont.edu/,Monticello,AR,71656,33.590909,-91.811153,"Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC,..."
