## Import Dependencies

In [1]:
# Dependencies
# ----------------------------------
# Imports the method used for connecting to DBs
from sqlalchemy import create_engine

# Imports the methods needed to abstract classes into tables
from sqlalchemy.ext.declarative import declarative_base

# Allow us to declare column types
from sqlalchemy import Column, Integer, String, Float 

import psycopg2 as pg
import numpy as np
import pandas as pd

import sys
sys.path.append("..")
from cred.cred_user import username
from cred.cred_p import pgpass
from cred.cred_host import host_loc
from cred.cred_port import cred_port

db = "higher_learning"

## Kaggle: Crime and Incarceration in the United States: 21st century state crime and prison custody statistics

In [2]:
inc_data = "data/kaggle_corr/crime_and_incarceration_by_state.csv"
inc_df = pd.read_csv(inc_data)
inc_df.head()

Unnamed: 0,jurisdiction,includes_jails,year,prisoner_count,crime_reporting_change,crimes_estimated,state_population,violent_crime_total,murder_manslaughter,rape_legacy,rape_revised,robbery,agg_assault,property_crime_total,burglary,larceny,vehicle_theft
0,FEDERAL,False,2001,149852,,,,,,,,,,,,,
1,ALABAMA,False,2001,24741,False,False,4468912.0,19582.0,379.0,1369.0,,5584.0,12250.0,173253.0,40642.0,119992.0,12619.0
2,ALASKA,True,2001,4570,False,False,633630.0,3735.0,39.0,501.0,,514.0,2681.0,23160.0,3847.0,16695.0,2618.0
3,ARIZONA,False,2001,27710,False,False,5306966.0,28675.0,400.0,1518.0,,8868.0,17889.0,293874.0,54821.0,186850.0,52203.0
4,ARKANSAS,False,2001,11489,False,False,2694698.0,12190.0,148.0,892.0,,2181.0,8969.0,99106.0,22196.0,69590.0,7320.0


In [3]:
inc_df.columns

Index(['jurisdiction', 'includes_jails', 'year', 'prisoner_count',
       'crime_reporting_change', 'crimes_estimated', 'state_population',
       'violent_crime_total', 'murder_manslaughter', 'rape_legacy',
       'rape_revised', 'robbery', 'agg_assault', 'property_crime_total',
       'burglary', 'larceny', 'vehicle_theft'],
      dtype='object')

In [4]:
incarceration_df = inc_df[['jurisdiction', 'year', 'prisoner_count', 'state_population']].copy()
incarceration_df.columns = ['state', 'year', 'prisoner_count', 'state_population']

In [5]:
incarceration_df['year'] = incarceration_df['year'].astype(int, inplace = True)
incarceration_df['prisoner_count'] = incarceration_df['prisoner_count'].astype(int, inplace = True)

In [6]:
incarceration_df = incarceration_df[incarceration_df['year']>2005]
incarceration_df = incarceration_df[incarceration_df['state']!='FEDERAL']

In [7]:
incarceration_df['year_state'] = incarceration_df['year'].map(str) + '_' + incarceration_df['state'].map(str)

In [8]:
inc_na = incarceration_df[incarceration_df['state_population'].isnull()].copy()

In [9]:
inc_na

Unnamed: 0,state,year,prisoner_count,state_population,year_state
746,NEW YORK,2015,51485,,2015_NEW YORK


In [10]:
inc_na_idx = (incarceration_df[incarceration_df['state_population'].isnull()].index[0]).copy()
print(inc_na_idx)

746


In [11]:
# 2015 NY Population 
# https://population.us/ny/
# https://www.health.ny.gov/statistics/vital_statistics/2015/table02.htm
incarceration_df.update(incarceration_df.loc[[inc_na_idx]].fillna('19795791'))

In [12]:
inc_check_na = incarceration_df[(incarceration_df.state=='NEW YORK') & (incarceration_df.year == 2015)].copy()

In [13]:
inc_check_na

Unnamed: 0,state,year,prisoner_count,state_population,year_state
746,NEW YORK,2015.0,51485.0,19795791,2015_NEW YORK


In [14]:
incarceration_df['state_population'] = incarceration_df['state_population'].astype(int, inplace = True)

In [15]:
incarceration_df.head()

Unnamed: 0,state,year,prisoner_count,state_population,year_state
256,ALABAMA,2006.0,24103.0,4599030,2006_ALABAMA
257,ALASKA,2006.0,5052.0,670053,2006_ALASKA
258,ARIZONA,2006.0,35752.0,6166318,2006_ARIZONA
259,ARKANSAS,2006.0,12854.0,2810872,2006_ARKANSAS
260,CALIFORNIA,2006.0,172298.0,36457549,2006_CALIFORNIA


In [16]:
incarceration_df.tail()

Unnamed: 0,state,year,prisoner_count,state_population,year_state
811,VIRGINIA,2016.0,29882.0,8414380,2016_VIRGINIA
812,WASHINGTON,2016.0,17228.0,7280934,2016_WASHINGTON
813,WEST VIRGINIA,2016.0,5899.0,1828637,2016_WEST VIRGINIA
814,WISCONSIN,2016.0,23163.0,5772917,2016_WISCONSIN
815,WYOMING,2016.0,2352.0,584910,2016_WYOMING


## Kaggle: U.S. Education Datasets: Unification Project: K-12 financial, enrollment, and achievement data in one place

In [17]:
ed_data = "data/kaggle_ed/states_all_extended.csv"
ed_df = pd.read_csv(ed_data)
ed_df.head()

Unnamed: 0,PRIMARY_KEY,STATE,YEAR,ENROLL,TOTAL_REVENUE,FEDERAL_REVENUE,STATE_REVENUE,LOCAL_REVENUE,TOTAL_EXPENDITURE,INSTRUCTION_EXPENDITURE,...,GRADES_4_TRF,GRADES_8_TRF,GRADES_12_TRF,GRADES_1_8_TRF,GRADES_9_12_TRF,GRADES_ALL_TRF,AVG_MATH_4_SCORE,AVG_MATH_8_SCORE,AVG_READING_4_SCORE,AVG_READING_8_SCORE
0,1992_ALABAMA,ALABAMA,1992,,2678885.0,304177.0,1659028.0,715680.0,2653798.0,1481703.0,...,,,,,,,208.327876,252.187522,207.963517,
1,1992_ALASKA,ALASKA,1992,,1049591.0,106780.0,720711.0,222100.0,972488.0,498362.0,...,,,,,,,,,,258.859712
2,1992_ARIZONA,ARIZONA,1992,,3258079.0,297888.0,1369815.0,1590376.0,3401580.0,1435908.0,...,,,,,,,215.253932,265.366278,206.212716,262.169895
3,1992_ARKANSAS,ARKANSAS,1992,,1711959.0,178571.0,958785.0,574603.0,1743022.0,964323.0,...,,,,,,,210.206028,256.31209,208.634458,264.619665
4,1992_CALIFORNIA,CALIFORNIA,1992,,26260025.0,2072470.0,16546514.0,7641041.0,27138832.0,14358922.0,...,,,,,,,208.398961,260.892247,196.764414,


In [18]:
ed_df.columns

Index(['PRIMARY_KEY', 'STATE', 'YEAR', 'ENROLL', 'TOTAL_REVENUE',
       'FEDERAL_REVENUE', 'STATE_REVENUE', 'LOCAL_REVENUE',
       'TOTAL_EXPENDITURE', 'INSTRUCTION_EXPENDITURE',
       ...
       'GRADES_4_TRF', 'GRADES_8_TRF', 'GRADES_12_TRF', 'GRADES_1_8_TRF',
       'GRADES_9_12_TRF', 'GRADES_ALL_TRF', 'AVG_MATH_4_SCORE',
       'AVG_MATH_8_SCORE', 'AVG_READING_4_SCORE', 'AVG_READING_8_SCORE'],
      dtype='object', length=193)

In [19]:
ed_df_state_list = ed_df['STATE'].value_counts()

In [20]:
ed_df_state_list.describe()

count    80.000000
mean     18.650000
std      10.901283
min       1.000000
25%       4.000000
50%      26.000000
75%      26.000000
max      29.000000
Name: STATE, dtype: float64

In [21]:
ed_df_state_list[51:80]

PUERTO_RICO                                 23
GUAM                                        23
AMERICAN_SAMOA                              22
VIRGIN_ISLANDS                              22
NORTHERN_MARIANAS                           14
DOD_OVERSEAS                                 8
DOD_DOMESTIC                                 8
BUREAU_OF_INDIAN_AFFAIRS                     6
BI                                           4
DD                                           4
COMMONWEALTH_OF_MARIANAS                     3
NORTHERN_MARIANA_ISLANDS                     3
BIE                                          2
DOD_-_DOMESTIC                               2
BUREAU_OF_INDIAN_EDUCATION                   2
DOD_-_OVERSEAS                               2
AS                                           1
DEPARTMENT_OF_DEFENSE_EDUCATION_ACTIVITY     1
PR                                           1
DOD                                          1
VI                                           1
GU           

In [22]:
ed_df_states_exclude = ['DISTRICT_OF_COLUMBIA', 'GUAM', 'PUERTO_RICO', 'AMERICAN_SAMOA','VIRGIN_ISLANDS', 'NORTHERN_MARIANAS',
                  'DOD_OVERSEAS', 'DOD_DOMESTIC', 'BUREAU_OF_INDIAN_AFFAIRS', 'BI', 'DD', 'COMMONWEALTH_OF_MARIANAS', 'NORTHERN_MARIANA_ISLANDS', 
                  'DOD_-_OVERSEAS', 'BIE', 'DOD_-_DOMESTIC', 'BUREAU_OF_INDIAN_EDUCATION', 'VI',
                  'DOD_(OVERSEAS_AND_DOMESTIC_COMBINED)', 'MARIANAS', 'AS', 'PR', 'GU', 'DEPARTMENT_OF_DEFENSE',
                  'DOD_-_FOREIGN', 'BUREAU_OF_INDIAN_EDUCATIO', 'MP', 'DOD', 'DEPARTMENT_OF_DEFENSE_EDUCATION_ACTIVITY',
                  'U.S._VIRGIN_ISLANDS'
                 ]

In [23]:
ed_df = ed_df[~ed_df.STATE.isin(ed_df_states_exclude)]

In [24]:
ed_df['STATE'].describe()

count         1302
unique          50
top       VIRGINIA
freq            28
Name: STATE, dtype: object

In [25]:
ed_enroll_exp_df = ed_df[['PRIMARY_KEY', 'STATE', 'YEAR', 'ENROLL', 'TOTAL_EXPENDITURE']].copy()
ed_enroll_exp_df.columns = ['year_state', 'state', 'year', 'enrollment', 'total_expenditure']

In [26]:
ed_enroll_exp_df["enrollment"].fillna(0, inplace=True)
ed_enroll_exp_df["total_expenditure"].fillna(0, inplace=True)

In [27]:
ed_enroll_exp_df['year'] = ed_enroll_exp_df['year'].astype(int, inplace = True)
ed_enroll_exp_df['enrollment'] = ed_enroll_exp_df['enrollment'].astype(int, inplace = True)
ed_enroll_exp_df['total_expenditure'] = ed_enroll_exp_df['total_expenditure'].astype(int, inplace = True)

In [28]:
ed_enroll_exp_df = ed_enroll_exp_df[(ed_enroll_exp_df.year>2005) & (ed_enroll_exp_df.year<2017)]

In [29]:
ed_enroll_exp_df['state'].describe()

count          552
unique          50
top       VIRGINIA
freq            13
Name: state, dtype: object

In [30]:
ed_enroll_exp_df['state'].nunique()

50

In [31]:
ed_enroll_exp_df['state'] = ed_enroll_exp_df['state'].str.replace('_',' ')

In [32]:
ed_enroll_exp_df['year_state'].value_counts()

2008_VIRGINIA          3
2009_WEST_VIRGINIA     1
2006_ILLINOIS          1
2011_MAINE             1
2014_CALIFORNIA        1
2016_OKLAHOMA          1
2010_NEW_YORK          1
2015_NEBRASKA          1
2006_KENTUCKY          1
2014_IDAHO             1
2013_COLORADO          1
2008_COLORADO          1
2016_WEST_VIRGINIA     1
2013_NEW_JERSEY        1
2007_MINNESOTA         1
2015_FLORIDA           1
2011_ARKANSAS          1
2015_NEW_JERSEY        1
2008_ARKANSAS          1
2013_INDIANA           1
2013_SOUTH_DAKOTA      1
2011_SOUTH_DAKOTA      1
2008_RHODE_ISLAND      1
2006_ALASKA            1
2006_NEW_YORK          1
2011_GEORGIA           1
2009_LOUISIANA         1
2011_WEST_VIRGINIA     1
2014_VERMONT           1
2011_WASHINGTON        1
                      ..
2015_NEW_MEXICO        1
2012_LOUISIANA         1
2010_IOWA              1
2010_MINNESOTA         1
2012_WISCONSIN         1
2010_KENTUCKY          1
2008_MONTANA           1
2012_GEORGIA           1
2012_COLORADO          1


In [33]:
ed_duplicates = ed_enroll_exp_df[(ed_enroll_exp_df.year_state=='2008_VIRGINIA')]

In [34]:
ed_duplicates.head()

Unnamed: 0,year_state,state,year,enrollment,total_expenditure
863,2008_VIRGINIA,VIRGINIA,2008,1230857,15236306
864,2008_VIRGINIA,VIRGINIA,2008,1230857,15236306
865,2008_VIRGINIA,VIRGINIA,2008,1230857,15236306


In [35]:
ed_enroll_exp_df.drop_duplicates(subset ="year_state", keep = 'first', inplace = True) 

In [36]:
ed_duplicates_post = ed_enroll_exp_df[(ed_enroll_exp_df.year_state=='2008_VIRGINIA')]

In [37]:
ed_duplicates_post.head()

Unnamed: 0,year_state,state,year,enrollment,total_expenditure
863,2008_VIRGINIA,VIRGINIA,2008,1230857,15236306


In [38]:
ed_enroll_exp_df.head()

Unnamed: 0,year_state,state,year,enrollment,total_expenditure
714,2006_ALABAMA,ALABAMA,2006,743265,6591429
715,2006_ALASKA,ALASKA,2006,132893,1817656
716,2006_ARIZONA,ARIZONA,2006,947266,7934177
717,2006_ARKANSAS,ARKANSAS,2006,472609,4343877
718,2006_CALIFORNIA,CALIFORNIA,2006,6295994,68722432


In [39]:
ed_enroll_exp_df.tail()

Unnamed: 0,year_state,state,year,enrollment,total_expenditure
1275,2016_VIRGINIA,VIRGINIA,2016,1283493,16497520
1276,2016_WASHINGTON,WASHINGTON,2016,1083973,15253296
1277,2016_WEST_VIRGINIA,WEST VIRGINIA,2016,276764,3366566
1278,2016_WISCONSIN,WISCONSIN,2016,857736,11787535
1279,2016_WYOMING,WYOMING,2016,94511,2034229


## Urban Institute: State and Local Finance Data Query System: cost per prisoner by state (2006-2016)

In [40]:
pr_exp_data = "data/urbaninstitute/cost per prisoner by state (2006-2016).csv"
pr_exp_df = pd.read_csv(pr_exp_data)
pr_exp_df.head()

Unnamed: 0,Level: State and Local,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,Unit:Total (thousands),Nominal,,,
1,State,Year,(E021) Total Correct-Dir Exp,(E022) Total Correct-Cur Oper,(E023) Total Correct-Cap Out
2,United States,2006,"$62,642,834","$60,171,656","$2,471,178"
3,United States,2007,"$68,035,382","$64,811,380","$3,224,002"
4,United States,2008,"$72,752,623","$69,199,149","$3,553,474"


In [41]:
pr_exp_df.tail()

Unnamed: 0,Level: State and Local,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
571,Wyoming,2014,"$210,034","$207,690","$2,344"
572,Wyoming,2015,"$213,887","$210,756","$3,131"
573,Wyoming,2016,"$214,564","$213,090","$1,474"
574,,,,,
575,Observations with N/A,missing years or zero values should be checke...,,,


In [42]:
pr_exp_df.columns

Index(['Level: State and Local', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3',
       'Unnamed: 4'],
      dtype='object')

In [43]:
pr_exp_df['Level: State and Local'].describe()

count      575
unique      55
top       Iowa
freq        11
Name: Level: State and Local, dtype: object

In [44]:
pr_exp_df['Level: State and Local'].value_counts()

Iowa                      11
United States             11
New Jersey                11
Arkansas                  11
Texas                     11
Hawaii                    11
Florida                   11
Michigan                  11
Mississippi               11
Colorado                  11
New Mexico                11
Nevada                    11
Missouri                  11
Ohio                      11
Virginia                  11
California                11
Georgia                   11
New Hampshire             11
Kentucky                  11
Utah                      11
Wisconsin                 11
Massachusetts             11
North Dakota              11
Wyoming                   11
New York                  11
Connecticut               11
South Dakota              11
Tennessee                 11
Vermont                   11
Louisiana                 11
Nebraska                  11
South Carolina            11
Kansas                    11
Arizona                   11
Delaware      

In [45]:
pr_exp_states_exclude = ['United States', 'DC', 'Unit:Total (thousands)', 'Observations with N/A', 'State']
pr_exp_df = pr_exp_df[~pr_exp_df['Level: State and Local'].isin(pr_exp_states_exclude)]

In [46]:
pr_exp_df['Level: State and Local'].describe()

count      550
unique      50
top       Iowa
freq        11
Name: Level: State and Local, dtype: object

In [47]:
pr_exp_df.head()

Unnamed: 0,Level: State and Local,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
13,Alabama,2006,"$626,112","$597,654","$28,458"
14,Alabama,2007,"$764,056","$661,016","$103,040"
15,Alabama,2008,"$704,294","$682,245","$22,049"
16,Alabama,2009,"$741,164","$710,132","$31,032"
17,Alabama,2010,"$727,653","$722,513","$5,140"


In [48]:
prisoner_exp_df = pr_exp_df.drop(['Unnamed: 3', 'Unnamed: 4'], axis=1).copy()

In [49]:
prisoner_exp_df.head()

Unnamed: 0,Level: State and Local,Unnamed: 1,Unnamed: 2
13,Alabama,2006,"$626,112"
14,Alabama,2007,"$764,056"
15,Alabama,2008,"$704,294"
16,Alabama,2009,"$741,164"
17,Alabama,2010,"$727,653"


In [50]:
prisoner_exp_df.columns = ['state', 'year', 'total_expenditure']

In [51]:
prisoner_exp_df.head()

Unnamed: 0,state,year,total_expenditure
13,Alabama,2006,"$626,112"
14,Alabama,2007,"$764,056"
15,Alabama,2008,"$704,294"
16,Alabama,2009,"$741,164"
17,Alabama,2010,"$727,653"


In [52]:
prisoner_exp_df.tail()

Unnamed: 0,state,year,total_expenditure
570,Wyoming,2013.0,"$213,000"
571,Wyoming,2014.0,"$210,034"
572,Wyoming,2015.0,"$213,887"
573,Wyoming,2016.0,"$214,564"
574,,,


In [53]:
prisoner_exp_df = prisoner_exp_df.dropna()

In [54]:
prisoner_exp_df.tail()

Unnamed: 0,state,year,total_expenditure
569,Wyoming,2012,"$215,414"
570,Wyoming,2013,"$213,000"
571,Wyoming,2014,"$210,034"
572,Wyoming,2015,"$213,887"
573,Wyoming,2016,"$214,564"


In [55]:
prisoner_exp_df['year'] = prisoner_exp_df['year'].astype(int, inplace = True)

In [56]:
prisoner_exp_df['total_expenditure'] = prisoner_exp_df['total_expenditure'].str.replace('$','') 
prisoner_exp_df['total_expenditure'] = prisoner_exp_df['total_expenditure'].str.replace(' ','') 
prisoner_exp_df['total_expenditure'] = prisoner_exp_df['total_expenditure'].str.replace(',','') 
prisoner_exp_df['total_expenditure'] = prisoner_exp_df['total_expenditure'].astype(int, inplace = True)

In [57]:
prisoner_exp_df['state'] = prisoner_exp_df['state'].str.upper() 

In [58]:
prisoner_exp_df['year_state'] = prisoner_exp_df['year'].map(str) + '_' + prisoner_exp_df['state'].map(str)

In [59]:
prisoner_exp_df.head()

Unnamed: 0,state,year,total_expenditure,year_state
13,ALABAMA,2006,626112,2006_ALABAMA
14,ALABAMA,2007,764056,2007_ALABAMA
15,ALABAMA,2008,704294,2008_ALABAMA
16,ALABAMA,2009,741164,2009_ALABAMA
17,ALABAMA,2010,727653,2010_ALABAMA


In [60]:
prisoner_exp_df.tail()

Unnamed: 0,state,year,total_expenditure,year_state
569,WYOMING,2012,215414,2012_WYOMING
570,WYOMING,2013,213000,2013_WYOMING
571,WYOMING,2014,210034,2014_WYOMING
572,WYOMING,2015,213887,2015_WYOMING
573,WYOMING,2016,214564,2016_WYOMING


## The Annie E. Casey Foundation: Kids Count Data Center: Fourth grade reading achievement levels in the United States

In [61]:
fourth_grade_rd_data = "data/kidscount/Fourth grade reading achievement levels.xlsx"
fourth_grade_rd_df = pd.read_excel(fourth_grade_rd_data)
fourth_grade_rd_df.head()

Unnamed: 0,LocationType,Location,Achievement Level,TimeFrame,DataFormat,Data
0,Nation,United States,Below basic,2002,Percent,0.38
1,Nation,United States,At or above basic,2002,Percent,0.62
2,Nation,United States,Below proficient,2002,Percent,0.7
3,Nation,United States,At or above proficient,2002,Percent,0.3
4,Nation,United States,Below basic,2003,Percent,0.38


In [62]:
fourth_grade_rd_df.columns

Index(['LocationType', 'Location', 'Achievement Level', 'TimeFrame',
       'DataFormat', 'Data'],
      dtype='object')

In [63]:
fourth_grade_rd_df['LocationType'].value_counts()

State        1800
City           36
Nation         36
Territory      20
Name: LocationType, dtype: int64

In [64]:
location_type_exclude =['Nation', 'City', 'Territory']
fourth_grade_rd_df = fourth_grade_rd_df[~fourth_grade_rd_df['LocationType'].isin(location_type_exclude)]
fourth_grade_read_df = fourth_grade_rd_df.drop(['LocationType', 'DataFormat'], axis=1).copy()
fourth_grade_read_df.columns = ['state', 'achievement_level', 'year', 'data']

In [65]:
fourth_grade_read_df['data'].nunique()

71

In [66]:
fourth_grade_read_df['data'][0:30]

36    0.37
37    0.63
38    0.69
39    0.31
40    0.33
41    0.67
42    0.69
43    0.31
44    0.38
45    0.62
46    0.72
47    0.28
48    0.35
49    0.65
50    0.71
51    0.29
52    0.38
53    0.62
54    0.71
55    0.29
56    0.35
57    0.65
58    0.69
59    0.31
60    0.47
61    0.53
62    0.78
63    0.22
64    0.48
65    0.52
Name: data, dtype: object

In [67]:
fourth_grade_read_exclude = ['N.A.', 'S']
fourth_grade_read_df = fourth_grade_read_df[~fourth_grade_read_df['data'].isin(fourth_grade_read_exclude)]

In [68]:
fourth_grade_read_df['year'] = fourth_grade_read_df['year'].astype(int, inplace = True)

In [69]:
fourth_grade_read_df['data'] = fourth_grade_read_df['data'].str.replace('.','') 
fourth_grade_read_df['data'] = fourth_grade_read_df['data'].astype(int, inplace = True)
fourth_grade_read_df['data'] = fourth_grade_read_df['data'] / 100

In [70]:
fourth_grade_read_df = fourth_grade_read_df[(fourth_grade_read_df.year>2005) & (fourth_grade_read_df.year<2017)]

In [71]:
fourth_grade_read_df['year_state_ach_lvl'] = fourth_grade_read_df['year'].map(str) + '_' + fourth_grade_read_df['state'].map(str) + fourth_grade_read_df['achievement_level']

In [72]:
fourth_grade_read_df.head()

Unnamed: 0,state,achievement_level,year,data,year_state_ach_lvl
40,Alabama,Below basic,2011,0.33,2011_AlabamaBelow basic
41,Alabama,At or above basic,2011,0.67,2011_AlabamaAt or above basic
42,Alabama,Below proficient,2011,0.69,2011_AlabamaBelow proficient
43,Alabama,At or above proficient,2011,0.31,2011_AlabamaAt or above proficient
44,Alabama,Below basic,2009,0.38,2009_AlabamaBelow basic


In [73]:
fourth_grade_read_df.tail()

Unnamed: 0,state,achievement_level,year,data,year_state_ach_lvl
1819,Wyoming,At or above proficient,2009,0.33,2009_WyomingAt or above proficient
1820,Wyoming,Below basic,2007,0.27,2007_WyomingBelow basic
1821,Wyoming,At or above basic,2007,0.74,2007_WyomingAt or above basic
1822,Wyoming,Below proficient,2007,0.64,2007_WyomingBelow proficient
1823,Wyoming,At or above proficient,2007,0.36,2007_WyomingAt or above proficient


## Combined Table

In [92]:
merge_1_df = pd.merge(ed_enroll_exp_df, incarceration_df, on="year_state")

In [93]:
merge_1_df.head()

Unnamed: 0,year_state,state_x,year_x,enrollment,total_expenditure,state_y,year_y,prisoner_count,state_population
0,2006_ALABAMA,ALABAMA,2006,743265,6591429,ALABAMA,2006.0,24103.0,4599030
1,2006_ALASKA,ALASKA,2006,132893,1817656,ALASKA,2006.0,5052.0,670053
2,2006_ARIZONA,ARIZONA,2006,947266,7934177,ARIZONA,2006.0,35752.0,6166318
3,2006_ARKANSAS,ARKANSAS,2006,472609,4343877,ARKANSAS,2006.0,12854.0,2810872
4,2006_CALIFORNIA,CALIFORNIA,2006,6295994,68722432,CALIFORNIA,2006.0,172298.0,36457549


In [94]:
merge_1_df = merge_1_df[['year_state', 'state_x', 'year_x', 'enrollment', 'total_expenditure', 
                         'prisoner_count','state_population']].copy()

In [95]:
merge_1_df.head()

Unnamed: 0,year_state,state_x,year_x,enrollment,total_expenditure,prisoner_count,state_population
0,2006_ALABAMA,ALABAMA,2006,743265,6591429,24103.0,4599030
1,2006_ALASKA,ALASKA,2006,132893,1817656,5052.0,670053
2,2006_ARIZONA,ARIZONA,2006,947266,7934177,35752.0,6166318
3,2006_ARKANSAS,ARKANSAS,2006,472609,4343877,12854.0,2810872
4,2006_CALIFORNIA,CALIFORNIA,2006,6295994,68722432,172298.0,36457549


In [96]:
merge_1_df.columns = ['year_state', 'state', 'year', 'ed_enrollment', 'ed_total_exp', 'prisoner_count', 'state_population']

In [97]:
merge_1_df.head()

Unnamed: 0,year_state,state,year,ed_enrollment,ed_total_exp,prisoner_count,state_population
0,2006_ALABAMA,ALABAMA,2006,743265,6591429,24103.0,4599030
1,2006_ALASKA,ALASKA,2006,132893,1817656,5052.0,670053
2,2006_ARIZONA,ARIZONA,2006,947266,7934177,35752.0,6166318
3,2006_ARKANSAS,ARKANSAS,2006,472609,4343877,12854.0,2810872
4,2006_CALIFORNIA,CALIFORNIA,2006,6295994,68722432,172298.0,36457549


In [98]:
merge_2_df = pd.merge(merge_1_df, prisoner_exp_df, on="year_state")

In [99]:
merge_2_df.head()

Unnamed: 0,year_state,state_x,year_x,ed_enrollment,ed_total_exp,prisoner_count,state_population,state_y,year_y,total_expenditure
0,2006_ALABAMA,ALABAMA,2006,743265,6591429,24103.0,4599030,ALABAMA,2006,626112
1,2006_ALASKA,ALASKA,2006,132893,1817656,5052.0,670053,ALASKA,2006,212043
2,2006_ARIZONA,ARIZONA,2006,947266,7934177,35752.0,6166318,ARIZONA,2006,1428311
3,2006_ARKANSAS,ARKANSAS,2006,472609,4343877,12854.0,2810872,ARKANSAS,2006,446200
4,2006_CALIFORNIA,CALIFORNIA,2006,6295994,68722432,172298.0,36457549,CALIFORNIA,2006,10914128


In [100]:
merge_2_df = merge_2_df[['year_state', 'state_x', 'year_x', 'ed_enrollment', 'ed_total_exp', 
                         'prisoner_count','total_expenditure', 'state_population']].copy()

In [101]:
merge_2_df.head()

Unnamed: 0,year_state,state_x,year_x,ed_enrollment,ed_total_exp,prisoner_count,total_expenditure,state_population
0,2006_ALABAMA,ALABAMA,2006,743265,6591429,24103.0,626112,4599030
1,2006_ALASKA,ALASKA,2006,132893,1817656,5052.0,212043,670053
2,2006_ARIZONA,ARIZONA,2006,947266,7934177,35752.0,1428311,6166318
3,2006_ARKANSAS,ARKANSAS,2006,472609,4343877,12854.0,446200,2810872
4,2006_CALIFORNIA,CALIFORNIA,2006,6295994,68722432,172298.0,10914128,36457549


In [102]:
merge_2_df.columns = ['year_state', 'state', 'year', 'ed_enrollment', 'ed_total_exp', 'prisoner_count', 
                      'corr_total_exp','state_population']

In [103]:
merge_2_df.head()

Unnamed: 0,year_state,state,year,ed_enrollment,ed_total_exp,prisoner_count,corr_total_exp,state_population
0,2006_ALABAMA,ALABAMA,2006,743265,6591429,24103.0,626112,4599030
1,2006_ALASKA,ALASKA,2006,132893,1817656,5052.0,212043,670053
2,2006_ARIZONA,ARIZONA,2006,947266,7934177,35752.0,1428311,6166318
3,2006_ARKANSAS,ARKANSAS,2006,472609,4343877,12854.0,446200,2810872
4,2006_CALIFORNIA,CALIFORNIA,2006,6295994,68722432,172298.0,10914128,36457549


In [104]:
merge_2_df['cost_per_stu'] = merge_2_df['ed_total_exp'] / merge_2_df['ed_enrollment']
merge_2_df['stu_to_pop_percent'] = merge_2_df['ed_enrollment'] / merge_2_df['state_population']

merge_2_df['cost_per_prisoner'] = merge_2_df['corr_total_exp'] / merge_2_df['prisoner_count']
merge_2_df['prisoner_to_pop_percent'] = merge_2_df['prisoner_count'] / merge_2_df['state_population']

In [105]:
merge_2_df.head()

Unnamed: 0,year_state,state,year,ed_enrollment,ed_total_exp,prisoner_count,corr_total_exp,state_population,cost_per_stu,stu_to_pop_percent,cost_per_prisoner,prisoner_to_pop_percent
0,2006_ALABAMA,ALABAMA,2006,743265,6591429,24103.0,626112,4599030,8.868209,0.161613,25.976517,0.005241
1,2006_ALASKA,ALASKA,2006,132893,1817656,5052.0,212043,670053,13.67759,0.198332,41.97209,0.00754
2,2006_ARIZONA,ARIZONA,2006,947266,7934177,35752.0,1428311,6166318,8.37587,0.153619,39.95052,0.005798
3,2006_ARKANSAS,ARKANSAS,2006,472609,4343877,12854.0,446200,2810872,9.19127,0.168136,34.71293,0.004573
4,2006_CALIFORNIA,CALIFORNIA,2006,6295994,68722432,172298.0,10914128,36457549,10.915263,0.172694,63.344485,0.004726


In [106]:
ed_corr_data_df = merge_2_df.copy()

In [107]:
ed_corr_data_df.head()

Unnamed: 0,year_state,state,year,ed_enrollment,ed_total_exp,prisoner_count,corr_total_exp,state_population,cost_per_stu,stu_to_pop_percent,cost_per_prisoner,prisoner_to_pop_percent
0,2006_ALABAMA,ALABAMA,2006,743265,6591429,24103.0,626112,4599030,8.868209,0.161613,25.976517,0.005241
1,2006_ALASKA,ALASKA,2006,132893,1817656,5052.0,212043,670053,13.67759,0.198332,41.97209,0.00754
2,2006_ARIZONA,ARIZONA,2006,947266,7934177,35752.0,1428311,6166318,8.37587,0.153619,39.95052,0.005798
3,2006_ARKANSAS,ARKANSAS,2006,472609,4343877,12854.0,446200,2810872,9.19127,0.168136,34.71293,0.004573
4,2006_CALIFORNIA,CALIFORNIA,2006,6295994,68722432,172298.0,10914128,36457549,10.915263,0.172694,63.344485,0.004726


In [108]:
ed_corr_data_df.tail()

Unnamed: 0,year_state,state,year,ed_enrollment,ed_total_exp,prisoner_count,corr_total_exp,state_population,cost_per_stu,stu_to_pop_percent,cost_per_prisoner,prisoner_to_pop_percent
435,2016_VERMONT,VERMONT,2016,87974,2082696,1735.0,137103,623354,23.673995,0.14113,79.021902,0.002783
436,2016_VIRGINIA,VIRGINIA,2016,1283493,16497520,29882.0,2560724,8414380,12.853611,0.152536,85.694532,0.003551
437,2016_WASHINGTON,WASHINGTON,2016,1083973,15253296,17228.0,1727579,7280934,14.071657,0.148878,100.277397,0.002366
438,2016_WISCONSIN,WISCONSIN,2016,857736,11787535,23163.0,1553919,5772917,13.742614,0.148579,67.086258,0.004012
439,2016_WYOMING,WYOMING,2016,94511,2034229,2352.0,214564,584910,21.523727,0.161582,91.22619,0.004021


In [113]:
x = ed_corr_data_df[(ed_corr_data_df.year==2015)]

In [114]:
x

Unnamed: 0,year_state,state,year,ed_enrollment,ed_total_exp,prisoner_count,corr_total_exp,state_population,cost_per_stu,stu_to_pop_percent,cost_per_prisoner,prisoner_to_pop_percent
360,2015_ALABAMA,ALABAMA,2015,734974,7501799,25212.0,742594,4858979,10.20689,0.151261,29.45399,0.005189
361,2015_ALASKA,ALASKA,2015,130755,2968341,5247.0,346872,738432,22.701549,0.177071,66.108634,0.007106
362,2015_ARIZONA,ARIZONA,2015,944978,7902600,42204.0,1708409,6828065,8.362734,0.138396,40.479789,0.006181
363,2015_ARKANSAS,ARKANSAS,2015,479682,5350543,15784.0,587690,2978204,11.154354,0.161064,37.233274,0.0053
364,2015_CALIFORNIA,CALIFORNIA,2015,6226523,78365958,127815.0,15275175,39144818,12.58583,0.159064,119.510034,0.003265
365,2015_COLORADO,COLORADO,2015,872320,9557682,19959.0,1316236,5456574,10.956624,0.159866,65.946991,0.003658
366,2015_CONNECTICUT,CONNECTICUT,2015,505366,10542667,16024.0,699051,3590886,20.861449,0.140736,43.62525,0.004462
367,2015_DELAWARE,DELAWARE,2015,121845,1975093,6437.0,302261,945934,16.209881,0.128809,46.956812,0.006805
368,2015_FLORIDA,FLORIDA,2015,2743641,27277049,99485.0,4125808,20271272,9.941916,0.135346,41.471659,0.004908
369,2015_GEORGIA,GEORGIA,2015,1717805,18501103,52002.0,2278830,10214860,10.7702,0.168167,43.821968,0.005091


## SQL Connection

In [109]:
eng = create_engine('postgresql+psycopg2://'+ username +':' + pgpass + '@' +host_loc + ':' + cred_port + '/' + db)
conn = eng.connect()

In [110]:
print(eng.table_names())

['incarceration', 'ed_enroll_exp', 'prisoner_exp', 'fourth_grade_read', 'ed_corr_data']


In [94]:
incarceration_df.to_sql(name='incarceration', con=eng, if_exists='append', index=False)

In [95]:
ed_enroll_exp_df.to_sql(name='ed_enroll_exp', con=eng, if_exists='append', index=False) 

In [96]:
prisoner_exp_df.to_sql(name='prisoner_exp', con=eng, if_exists='append', index=False)

In [97]:
fourth_grade_read_df.to_sql(name='fourth_grade_read', con=eng, if_exists='append', index=False)

In [111]:
ed_corr_data_df.to_sql(name='ed_corr_data', con=eng, if_exists='append', index=False)