## Create a database

In [14]:
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd

In [15]:
# Define a database name (we're using a dataset on births, so we'll call it birth_db)
# Set your postgres username
dbname = 'prediction_db'
username = 'xingliu' # change this to your username

In [16]:
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))
print(engine.url)

postgres://xingliu@localhost/prediction_db


In [17]:
## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))

True


In [43]:
zone = 'HUDVL' #'GENESE'#'DUNWOD'#'CENTRL' #'CAPITL'
df_byzone = pd.read_csv('nyiso_' + zone + '_price.csv', parse_dates=['time_stamp', 'time_stamp_local'])

In [44]:
prev1week_index_price = (df_byzone.time_stamp_local >= pd.datetime(2017, 12, 25)) & \
                        (df_byzone.time_stamp_local <= pd.datetime(2017, 12, 31, 23)) 

df_byzone = df_byzone.loc[prev1week_index_price, :]

In [45]:
df_byzone.drop('time_stamp', axis = 1, inplace = True)

In [46]:
df_byzone = df_byzone.loc[:,['time_stamp_local', 'price']]

In [47]:
df_byzone.shape

(168, 2)

In [48]:
df_byzone.head(2)

Unnamed: 0,time_stamp_local,price
17376,2017-12-25 00:00:00,20.06
17377,2017-12-25 01:00:00,5.76


In [49]:
df_byzone.tail(2)

Unnamed: 0,time_stamp_local,price
17542,2017-12-31 22:00:00,134.74
17543,2017-12-31 23:00:00,142.58


In [50]:
df_byzone.to_sql(zone.lower() + '_table', engine, index = False, if_exists='replace')

In [None]:
# pred50187 = pd.read_csv('forecastplantid50187.csv', parse_dates = ['ds'])
# pred50187['plant_id'] = 50187

In [None]:
# pred3845 = pd.read_csv('forecastplantid3845.csv', parse_dates = ['ds'])
# pred3845['plant_id'] = 3845

In [None]:
# pred54268 = pd.read_csv('forecastplantid54268.csv', parse_dates = ['ds'])
# pred54268['plant_id'] = 54268

In [None]:
# plant_pred = pd.concat([pred50187, pred3845, pred54268], axis = 0)

In [None]:
# df_price = pd.read_csv('nyiso_price_2017.csv', parse_dates=['time_stamp', 'time_stamp_local'])

In [None]:
# df_price.tail()

In [None]:
# prev1week_index_price = df_price.time_stamp_local >= pd.datetime(2018, 1, 18, 15, 15)

# df_price = df_price.loc[prev1week_index_price, :]

In [None]:
# df_price.drop('time_stamp', axis = 1, inplace = True)

In [None]:
# df_price.columns = ['price', 'time_stamp']

In [None]:
# df_price.set_index('time_stamp', inplace = True)

In [None]:
# df_price.head()

In [None]:
# df_price_byhour = df_price.resample('1H').mean()

In [None]:
# df_price_byhour.reset_index(inplace=True)

In [None]:
# df_price_byhour.head(2)

In [None]:
# df_price_byhour['hour'] = df_price_byhour.time_stamp.apply(lambda x: x.hour)

In [None]:
# df_price_pred = df_price_byhour.loc[:,['hour', 'price']].groupby('hour').mean()

In [None]:
# df_price_pred.sort_values(by='price', inplace = True)

In [None]:
# df_price_pred.reset_index(inplace=True)

In [None]:
# df_price_pred

In [None]:
## insert data into database from Python (proof of concept - this won't be useful for big data, of course)
# df_price_pred.to_sql('prediction_table', engine, index = False, if_exists='replace')

The above line (to_sql) is doing a lot of heavy lifting.  It's reading a dataframe, it's creating a table, and adding the data to the table.  So ** SQLAlchemy is quite useful! **

## Working with PostgresSQL without Python

**Open up the PostgreSQL app, click on the "Open psql" button in the bottom right corner, ** <br>

or alternatively type <br>

    psql -h localhost

into the command line  

**Connect to the "birth_db" database we created**

    \c birth_db

**You should see something like the following**

`You are now connected to database "birth_db" as user "rockson".`


**Then try the following query:**

    SELECT * FROM birth_data_table;
    
Note that the semi-colon indicates an end-of-statement.

### You can see the table we created!  But it's kinda ugly and hard to read.

Try a few other sample queries.  Before you type in each one, ask yourself what you think the output will look like:

`SELECT * FROM birth_data_table WHERE infant_sex='M';`

`SELECT COUNT(infant_sex) FROM birth_data_table WHERE infant_sex='M';`

`SELECT COUNT(gestation_weeks), infant_sex FROM birth_data_table WHERE infant_sex = 'M' GROUP BY gestation_weeks, infant_sex;`

`SELECT gestation_weeks, COUNT(gestation_weeks) FROM birth_data_table WHERE infant_sex = 'M' GROUP BY gestation_weeks;`

All the above queries run, but they are difficult to visually inspect in the Postgres terminal.

## Working with PostgreSQL in Python

In [26]:
# Connect to make queries using psycopg2
con = None
con = psycopg2.connect(database = dbname, user = username)

query:
sql_query = """
SELECT * FROM centrl_table;
"""

netgen_from_sql = pd.read_sql_query(sql_query,con)
netgen_from_sql.head()

SyntaxError: invalid syntax (<ipython-input-26-e8da15ea824a>, line 5)

Once the data has been pulled into python, we can leverage pandas methods to work with the data.