In [2]:
# import dependencies
import os
import numpy as np
import pandas as pd

In [3]:
# Read the College Scorecard data dictionary from the Excel workbook.
# sheet_name=3 is a zero-based sheet position, i.e. the fourth sheet of the workbook.
# NOTE(review): the earlier comment said "third sheet" — confirm which sheet is intended.
datadict = pd.read_excel('CollegeScorecardDataDictionary.xlsx', sheet_name=3)

In [4]:
# Select the data-dictionary rows for the degree (PCIP*) variables: rows whose
# 'dev-category' is 'academics' and whose 'VARIABLE NAME' contains 'PCIP'.
# na=False treats rows with a missing 'VARIABLE NAME' as non-matches explicitly, so the
# mask is strictly boolean instead of relying on NaN being coerced during `&`.
deg_cols = datadict[
    (datadict['dev-category'] == 'academics')
    & datadict['VARIABLE NAME'].str.contains('PCIP', na=False)
]

In [5]:
"""
HIGHDEG
	0	Non-degree-granting
	1	Certificate degree
	2	Associate degree
	3	Bachelor's degree
	4	Graduate degree

ICLEVEL
	1	4-year
	2	2-year
	3	Less-than-2-year

REGION
	0	U.S. Service Schools
	1	New England (CT, ME, MA, NH, RI, VT)
	2	Mid East (DE, DC, MD, NJ, NY, PA)
	3	Great Lakes (IL, IN, MI, OH, WI)
	4	Plains (IA, KS, MN, MO, NE, ND, SD)
	5	Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)
	6	Southwest (AZ, NM, OK, TX)
	7	Rocky Mountains (CO, ID, MT, UT, WY)
	8	Far West (AK, CA, HI, NV, OR, WA)
	9	Outlying Areas (AS, FM, GU, MH, MP, PR, PW, VI)

DISTANCEONLY
	0	Not distance-education only
	1	Distance-education only
    
CURROPER
	0	Not currently certified as an operating institution
	1	Currently certified as operating
    
SCHTYPE
	1	Public            
	2	Private, Nonprofit
	3	Proprietary       
    
    
"""

"\nHIGHDEG\n\t0\tNon-degree-granting\n\t1\tCertificate degree\n\t2\tAssociate degree\n\t3\tBachelor's degree\n\t4\tGraduate degree\n\nICLEVEL\n\t1\t4-year\n\t2\t2-year\n\t3\tLess-than-2-year\n\nREGION\n\t0\tU.S. Service Schools\n\t1\tNew England (CT, ME, MA, NH, RI, VT)\n\t2\tMid East (DE, DC, MD, NJ, NY, PA)\n\t3\tGreat Lakes (IL, IN, MI, OH, WI)\n\t4\tPlains (IA, KS, MN, MO, NE, ND, SD)\n\t5\tSoutheast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)\n\t6\tSouthwest (AZ, NM, OK, TX)\n\t7\tRocky Mountains (CO, ID, MT, UT, WY)\n\t8\tFar West (AK, CA, HI, NV, OR, WA)\n\t9\tOutlying Areas (AS, FM, GU, MH, MP, PR, PW, VI)\n\nDISTANCEONLY\n\t0\tNot distance-education only\n\t1\tDistance-education only\n    \nCURROPER\n\t0\tNot currently certified as an operating institution\n\t1\tCurrently certified as operating\n    \nSCHTYPE\n\t1\tPublic            \n\t2\tPrivate, Nonprofit\n\t3\tProprietary       \n    \n    \n"

In [7]:
# Read every CSV in `file_path` whose name contains 'MERGED', keep the identification,
# location, school-type and degree (PCIP) columns, tag each row with the year taken
# from the file name, and stack all files into one dataframe.
file_path = '.'
keep_cols = [
    'UNITID', 'INSTNM',
    'CITY', 'STABBR', 'ZIP', 'REGION', 'LATITUDE', 'LONGITUDE',
    'SCHTYPE', 'HIGHDEG', 'ICLEVEL',
    'DISTANCEONLY', 'CURROPER',
    *deg_cols['VARIABLE NAME'].to_list(),
]

yearly_frames = []
# sorted() makes the row order independent of the order os.listdir happens to return
for filename in sorted(os.listdir(file_path)):
    if 'MERGED' not in filename:
        continue
    # characters 6..12 of the file name hold the year span (the saved output shows
    # YEAR 201819, i.e. a value such as '2018_19'); int() accepts the underscore as a
    # digit separator, so '2018_19' becomes 201819
    year = filename[6:13]
    df = pd.read_csv(os.path.join(file_path, filename), encoding='utf-8', low_memory=False)
    # .assign returns a new frame, avoiding a column assignment on a slice of `df`
    df = df[keep_cols].assign(YEAR=int(year))
    yearly_frames.append(df)

if not yearly_frames:
    # fail with a clear message instead of a KeyError further down
    raise FileNotFoundError(f"no 'MERGED' CSV files found in {file_path!r}")

# Concatenate once: DataFrame.append was removed in pandas 2.0 and growing a frame
# inside the loop copies all previous rows on every iteration. Errors are no longer
# swallowed by a bare `except`, so a failed load cannot go unnoticed.
scorecard_df = pd.concat(yearly_frames, ignore_index=True)

# Replace the PCIP* variable names with their human-readable labels from the data dictionary.
label_by_variable = dict(zip(deg_cols['VARIABLE NAME'], deg_cols['LABEL']))
scorecard_df = scorecard_df.rename(columns=label_by_variable)

# Keep only the first five characters of the ZIP code.
scorecard_df['ZIP'] = scorecard_df['ZIP'].str[:5]
print(scorecard_df.shape)

cond1 = (scorecard_df['CURROPER'] == 1)           # select currently operating schools
cond2 = (scorecard_df['HIGHDEG'] > 0)             # select degree granting schools only
scorecard_df = scorecard_df[cond1 & cond2]

print(scorecard_df.shape)
scorecard_df.head(2)

(13918, 52)
(6127, 52)


Unnamed: 0,UNITID,INSTNM,CITY,STABBR,ZIP,REGION,LATITUDE,LONGITUDE,SCHTYPE,HIGHDEG,...,Social Sciences,Construction Trades,Mechanic and Repair Technologies/Technicians,Precision Production,Transportation and Materials Moving,Visual and Performing Arts,Health Professions and Related Programs,"Business, Management, Marketing, and Related Support Services",History,YEAR
0,100654,Alabama A & M University,Normal,AL,35762,5,34.783368,-86.568502,1.0,4,...,0.0355,0.0,0.0,0.0,0.0,0.0237,0.0,0.1578,0.0,201819
1,100663,University of Alabama at Birmingham,Birmingham,AL,35294,5,33.505697,-86.799345,1.0,4,...,0.0315,0.0,0.0,0.0,0.0,0.0339,0.2255,0.1908,0.01,201819


In [None]:
# Empty frame that fixes the column layout of the metro lookup table; the loop over
# `areas` further down populates `metro_df`.
metro_df = pd.DataFrame(columns=['ZIP','CBSA Code','CBSA Title','Metro','CSA Code','CSA Title'])

In [None]:
# Metro areas of interest.
#   key   -> text searched for inside the 'CBSA Title' column (via str.contains)
#   value -> name written to the 'Metro' column for the matching rows
areas = {
    'Atlanta': 'Atlanta',
    'Boston': 'Boston',
    'Chicago': 'Chicago',
    'Houston': 'Houston',
    'Los Angeles': 'Los Angeles',
    'New York': 'New York City',
    'Philadelphia-Camden': 'Philadelphia',
    'San Francisco': 'San Francisco',
    'Seattle': 'Seattle',
    'DC': 'Washington DC'
}

In [None]:
# Build the metro lookup table: for every entry in `areas`, take the distinct
# ZIP / CBSA / CSA combinations whose 'CBSA Title' contains the search text and label
# them with the metro name.
# NOTE(review): `merged_df` is only assigned in a cell further down this notebook, so
# that cell has to be run first — consider moving this section below it.
group_keys = ['ZIP', 'CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']
metro_frames = []
for area, orig in areas.items():
    # groupby(...).count() + reset_index() leaves one row per distinct key combination;
    # groupby skips rows with a missing key by default, which is what allows the integer
    # cast of the code columns below
    df = merged_df[merged_df['CBSA Title'].str.contains(area)].groupby(group_keys).count()
    df = df.reset_index()
    df['Metro'] = orig
    metro_frames.append(df)

# DataFrame.append was removed in pandas 2.0; concatenating once is also cheaper than
# growing the frame per iteration and makes this cell safe to re-run. The column
# selection restores the layout defined when `metro_df` was first created.
metro_df = pd.concat(metro_frames, ignore_index=True)[list(metro_df.columns)]
metro_df = metro_df.astype({ 'CBSA Code': int, 'CSA Code': int })
metro_df = metro_df.drop_duplicates()

In [None]:
# Quick check of the metro lookup table: dimensions, column dtypes and the first two rows.
print(metro_df.shape)
print(metro_df.dtypes)
metro_df.head(2)

In [None]:
# Show the rows of `merged_df` in which both the CBSA title and the CSA title are missing.
merged_df[merged_df['CBSA Title'].isna() & merged_df['CSA Title'].isna()]

In [None]:
# Names of the CSV files found in the local 'kaggle' directory.
kaggle_files = [name for name in os.listdir('kaggle') if name.endswith('.csv')]

In [None]:
# Load every Kaggle CSV into a dataframe, keyed by the file name without its '.csv' suffix.
kaggle_dfs = {
    filename[:-len('.csv')]: pd.read_csv(
        os.path.join(os.getcwd(), 'kaggle', filename),
        encoding='utf-8',
        low_memory=False,
    )
    for filename in kaggle_files
}

In [None]:
# List the dataset names (CSV file names without extension) that were loaded.
kaggle_dfs.keys()

In [None]:
# From the 'salaries-by-region-id' dataset keep only the rows that have a UNITID,
# renumber them from 0 and store UNITID as an integer.
region_test = (
    kaggle_dfs['salaries-by-region-id']
    .loc[lambda frame: frame['UNITID'].notna()]
    .reset_index(drop=True)
    .astype({'UNITID': int})
)

In [None]:
#region_test['INSTNM'] = region_test.progress_apply(lambda x: str(x['School Name'].split(' (')[0]), axis=1)
#region_test['INSTNM'] = region_test.progress_apply(lambda x: str(x['School Name'].replace(',','-')), axis=1)

In [None]:
# Preview the first rows of the filtered salaries-by-region data.
region_test.head()

In [None]:
# Column dtypes of `region_test` (UNITID was cast to int when the frame was built).
region_test.dtypes

In [None]:
# Join the `test` frame to the metro lookup table on ZIP; the inner join keeps only
# rows whose ZIP appears in `metro_df`.
# NOTE(review): `test` is not assigned anywhere in the visible notebook — presumably it
# should be `scorecard_df`; confirm, otherwise this cell fails on a fresh kernel.
inst_test = test.merge(metro_df, on='ZIP', how='inner')

In [None]:
# Quick check of the joined frame: dimensions and the first two rows.
print(inst_test.shape)
#print(inst_test.dtypes)
inst_test.head(2)

In [None]:
# Spot check: rows whose CBSA title mentions 'New York' and whose city mentions 'Elizabeth'.
inst_test[
    inst_test['CBSA Title'].str.contains('New York')
    & inst_test['CITY'].str.contains('Elizabeth')
]

In [8]:
# Load '../cbsa/cbsa_zip_list.csv' into `zips_df`.
# NOTE(review): `zips_df` is reassigned from the 2019 Excel file in the next cell without
# this value being used in between — this read looks redundant; confirm and remove.
zips_df = pd.read_csv('../cbsa/cbsa_zip_list.csv', encoding='utf-8')

In [9]:
# ZIP -> CBSA pairs from the December 2019 workbook. ZIP is read through str() so it is
# kept as text; the CBSA column is renamed to 'CBSA Code'.
zips_df = (
    pd.read_excel('../cbsa/ZIP_CBSA_122019.xlsx', sheet_name=0, converters={'ZIP': str})
    .loc[:, ['ZIP', 'CBSA']]
    .rename(columns={'CBSA': 'CBSA Code'})
)

In [10]:
# ZIP -> CBSA pairs from the December 2017 workbook, whose columns are lower-case;
# they are renamed to match the layout of `zips_df`.
zip_df = (
    pd.read_excel('../cbsa/ZIP_CBSA_122017.xlsx', sheet_name=0, converters={'zip': str})
    .loc[:, ['zip', 'cbsa']]
    .rename(columns={'zip': 'ZIP', 'cbsa': 'CBSA Code'})
)
# Stack the 2017 pairs under the existing ones and drop exact duplicates.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
zips_df = pd.concat([zips_df, zip_df]).drop_duplicates()

In [11]:
# Quick check of the combined ZIP -> CBSA pairs: dimensions, dtypes and the first two rows.
print(zips_df.shape)
print(zips_df.dtypes)
zips_df.head(2)


(52426, 2)
ZIP          object
CBSA Code     int64
dtype: object


Unnamed: 0,ZIP,CBSA Code
0,501,35620
1,601,38660


In [None]:
# CBSA / CSA codes and titles from the 2018 delineation workbook.
cbsa_df = (
    pd.read_excel('../cbsa/2018_cbsa.xls', sheet_name=0, skiprows=2)  # skip the first two sheet rows
    .loc[:, ['CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']]
    .dropna(how='all')           # discard rows where all four fields are empty
    .iloc[:-3]                   # drop the last three rows — presumably footer notes; TODO confirm
    .drop_duplicates()
    .astype({'CBSA Code': int})
)

In [None]:
# CBSA / CSA codes and titles from the 2017 delineation workbook, cleaned with the same
# steps as the 2018 file.
cbs_df = (
    pd.read_excel('../cbsa/2017_cbsa.xls', sheet_name=0, skiprows=2)  # skip the first two sheet rows
    .loc[:, ['CBSA Code', 'CBSA Title', 'CSA Code', 'CSA Title']]
    .dropna(how='all')           # discard rows where all four fields are empty
    .iloc[:-3]                   # drop the last three rows — presumably footer notes; TODO confirm
    .drop_duplicates()
    .astype({'CBSA Code': int})
)
# Stack the 2017 rows under the existing `cbsa_df` and drop exact duplicates.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
cbsa_df = pd.concat([cbsa_df, cbs_df]).drop_duplicates()

In [None]:
# Quick check of the combined CBSA table: dimensions, dtypes and the first two rows.
print(cbsa_df.shape)
print(cbsa_df.dtypes)
cbsa_df.head(2)

In [None]:
# Spot check: CBSA/CSA rows joined to ZIP code '06511'.
# The join is computed once and filtered through a callable, instead of running the
# same merge twice (once for the frame and once for the mask).
zips_df.merge(cbsa_df, on='CBSA Code', how='left').loc[lambda joined: joined['ZIP'] == '06511']

In [None]:
# ZIP -> CBSA/CSA lookup: left-join the ZIP pairs to the CBSA table on 'CBSA Code',
# drop exact duplicate rows, and keep only rows that found a CBSA title.
merged_df = (
    zips_df
    .merge(cbsa_df, on='CBSA Code', how='left')
    .drop_duplicates()
    .loc[lambda joined: joined['CBSA Title'].notna()]
)

In [None]:
# Quick check of the ZIP -> CBSA/CSA lookup: dimensions, dtypes and the first two rows.
print(merged_df.shape)
print(merged_df.dtypes)
merged_df.head(2)