In [3]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, inspect

### Extract CSVs into DataFrames

In [4]:
# Kaggle CSV inputs for the extract step (paths are relative to the notebook).
salaries_file, majors_file = (
    "data/kaggle/salaries-by-college-type-id.csv",
    "data/kaggle/degrees-that-pay-back.csv",
)

In [5]:
# Load both raw CSV files into DataFrames; nothing is cleaned at this stage.
salaries_df = pd.read_csv(filepath_or_buffer=salaries_file)
majors_df = pd.read_csv(filepath_or_buffer=majors_file)

In [6]:
# Preview the first five rows of the raw salaries data.
salaries_df.head(n=5)

Unnamed: 0,UNITID,School Name,School Type,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
0,166683.0,Massachusetts Institute of Technology (MIT),Engineering,"$72,200.00","$126,000.00","$76,800.00","$99,200.00","$168,000.00","$220,000.00"
1,110404.0,California Institute of Technology (CIT),Engineering,"$75,500.00","$123,000.00",,"$104,000.00","$161,000.00",
2,115409.0,Harvey Mudd College,Engineering,"$71,800.00","$122,000.00",,"$96,000.00","$180,000.00",
3,,"Polytechnic University of New York, Brooklyn",Engineering,"$62,400.00","$114,000.00","$66,800.00","$94,300.00","$143,000.00","$190,000.00"
4,190372.0,Cooper Union,Engineering,"$62,200.00","$114,000.00",,"$80,200.00","$142,000.00",


In [7]:
# Keep only the college identifier and the two median-salary columns.
# .copy() gives an independent frame, so later edits do not touch salaries_df.
final_salaries = salaries_df.loc[
    :, ['UNITID', 'Starting Median Salary', 'Mid-Career Median Salary']
].copy()

In [8]:
# Rename the source headers to the snake_case column names used for the
# database table.
salaries_per_college = final_salaries.rename(
    {
        'UNITID': 'college_id',
        'Starting Median Salary': 'starting_median_salary',
        'Mid-Career Median Salary': 'midcareer_median_salary',
    },
    axis='columns',
)

In [9]:
# Inspect column dtypes before cleaning: the salary columns are still text
# (object) because of their "$72,200.00"-style formatting.
salaries_per_college.dtypes

college_id                 float64
starting_median_salary      object
midcareer_median_salary     object
dtype: object

In [10]:
# Display the renamed frame to check the raw salary strings before conversion.
salaries_per_college

Unnamed: 0,college_id,starting_median_salary,midcareer_median_salary
0,166683.0,"$72,200.00","$126,000.00"
1,110404.0,"$75,500.00","$123,000.00"
2,115409.0,"$71,800.00","$122,000.00"
3,,"$62,400.00","$114,000.00"
4,190372.0,"$62,200.00","$114,000.00"
...,...,...,...
264,219602.0,"$37,700.00","$59,200.00"
265,155681.0,"$40,400.00","$58,200.00"
266,230603.0,"$41,900.00","$56,500.00"
267,180179.0,"$37,900.00","$50,600.00"


In [11]:
# Strip the currency formatting ('$', thousands ',' and any ')') so the text
# can be converted to float. The pattern is a raw string because '\$' is not a
# valid escape sequence in a normal string literal and triggers a warning on
# recent Python versions.
salaries_per_college['starting_median_salary'] = (
    salaries_per_college['starting_median_salary']
    .replace(r'[\$,)]', '', regex=True)
    .astype(float)
)


In [12]:
# Strip the currency formatting ('$', thousands ',' and any ')') so the text
# can be converted to float. The pattern is a raw string because '\$' is not a
# valid escape sequence in a normal string literal and triggers a warning on
# recent Python versions.
salaries_per_college['midcareer_median_salary'] = (
    salaries_per_college['midcareer_median_salary']
    .replace(r'[\$,)]', '', regex=True)
    .astype(float)
)

In [13]:
# Re-check dtypes: both salary columns should now be float64.
salaries_per_college.dtypes

college_id                 float64
starting_median_salary     float64
midcareer_median_salary    float64
dtype: object

In [14]:
# Use college_id as the index so it is written as the key column by to_sql.
# Reassignment is used instead of inplace=True, matching how the majors frame
# is indexed later in this notebook.
# NOTE(review): the preview above shows at least one row with a missing UNITID;
# that row ends up with a NaN index value - confirm this is acceptable for the
# target table.
salaries_per_college = salaries_per_college.set_index("college_id")

In [15]:
# Keep only the major name; the double brackets keep the result a DataFrame,
# and .copy() makes it independent of majors_df.
majors = majors_df.loc[:, ['Undergraduate Major']].copy()

In [16]:
# Assign a 1-based sequential surrogate key. Generating it from the row count
# (rather than from the current index) yields the same ids 1..N even if this
# cell is re-run after Major_id has already become the index.
majors['Major_id'] = range(1, len(majors) + 1)



In [17]:
# Rename the source header to the column name used in the database table.
majors = majors.rename({'Undergraduate Major': 'Majors'}, axis='columns')

In [18]:
# Promote Major_id to the index so to_sql writes it as the key column.
majors = majors.set_index(keys='Major_id')

### Connect to the database

In [20]:
# Build the PostgreSQL connection without embedding the password in the
# notebook: it is read from the PGPASSWORD environment variable so the secret
# is not committed or shared with the code.
db_password = os.environ.get("PGPASSWORD")
if db_password is None:
    raise RuntimeError(
        "Set the PGPASSWORD environment variable before running this cell."
    )

# quote_plus escapes characters such as '@' or ':' that would otherwise break
# the URL.
connection_string = (
    f"postgres:{quote_plus(db_password)}@localhost:5432/HigherEducation"
)
engine = create_engine(f'postgresql://{connection_string}')

In [21]:
# Confirm which tables currently exist in the database.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is the supported replacement and works on both.
inspect(engine).get_table_names()

[]

### Load DataFrames into database

In [22]:
# How to_sql treats an existing table: 'replace' drops and recreates it on
# each run; switch to 'append' to add rows to the existing table instead.
if_exists_param = 'replace'

# Write the majors lookup table; index=True stores the Major_id index as a column.
majors.to_sql(
    name='majors',
    con=engine,
    if_exists=if_exists_param,
    index=True,
)

In [23]:
# Write the per-college salary table; index=True stores the college_id index
# as a column.
salaries_per_college.to_sql(
    name='salaries_per_college',
    con=engine,
    if_exists=if_exists_param,
    index=True,
)

In [24]:
# Read the majors table back from the database to verify the load.
pd.read_sql_query(sql='select * from majors', con=engine)

Unnamed: 0,Major_id,Majors
0,1,Accounting
1,2,Aerospace Engineering
2,3,Agriculture
3,4,Anthropology
4,5,Architecture
5,6,Art History
6,7,Biology
7,8,Business Management
8,9,Chemical Engineering
9,10,Chemistry
