In [1]:
import sys
import pandas as pd
from pandas.io import sql
from pandas.io.sql import read_sql
from pandas.io.sql import to_sql

#sqlalchemy
import sqlalchemy
from sqlalchemy import create_engine, Table, Column, Integer, String, MetaData, ForeignKey

# for postgres
import psycopg2

import warnings
warnings.filterwarnings("ignore")

print('OK')

OK


  """)


In [2]:
# open a new connection to pyanalysis
engine = create_engine('postgresql://postgres@localhost:5432/pyanalysis')
print("OK")

OK


In [3]:
# First, let's create the taable

engine.execute("CREATE TABLE char_data_types ("
               "varchar_column varchar(10),"
               "char_column char(10),"
               "text_column text)"
) # no semicolon at end of commands here
print('done')



done


In [8]:
# load the data up
# note that VALUES needs to be on same line
# That's different from the book example
engine.execute("INSERT INTO char_data_types VALUES ('abc', 'abc', 'abc'),"
               "('defghi', 'defghi', 'defghi')")

sql = "SELECT * FROM char_data_types"

df = pd.read_sql(sql, engine)

print(df.dtypes)
df.head(6)


varchar_column    object
char_column       object
text_column       object
dtype: object


Unnamed: 0,varchar_column,char_column,text_column
0,abc,abc,abc
1,defghi,defghi,defghi


In [9]:
# now let's try and write a file directly to the folder
engine.execute("COPY char_data_types TO '/Users/tbroderick/anaconda3/envs/pracSQL/Chapter_03/typetestpy.txt'"
               "WITH (FORMAT CSV, HEADER, DELIMITER '|')"
              )
print("done")

done


In [11]:
# now let's create the number data set, but let's try to do this all at once
engine.execute("CREATE TABLE number_data_types ("
    "numeric_column numeric(20,5),"
    "real_column real,"
    "double_column double precision)")

engine.execute("INSERT INTO number_data_types VALUES (.7, .7, .7),"
    "(2.13579, 2.13579, 2.13579),"
    "(2.1357987654, 2.1357987654, 2.1357987654)")

sql = "SELECT * FROM number_data_types"

dfnum = pd.read_sql(sql, engine)

print(dfnum.dtypes)
dfnum.head(6)

# pandas casts all these data types as float 64
# This talks about db types
# http://pbpython.com/pandas_dtypes.html

numeric_column    float64
real_column       float64
double_column     float64
dtype: object


Unnamed: 0,numeric_column,real_column,double_column
0,0.7,0.7,0.7
1,2.13579,2.13579,2.13579
2,2.1358,2.1358,2.135799


In [13]:
# let's see how the math test works
sql = """
    SELECT numeric_column * 10000000 AS "Fixed", 
    real_column * 10000000 AS "Float"
    FROM number_data_types
    WHERE numeric_column = .7
    """

dftest = pd.read_sql(sql, engine)

print(dftest.dtypes)
dftest.head(6)
# interesting. Pandas reports all as float64, but real data is coming in differently

Fixed    float64
Float    float64
dtype: object


Unnamed: 0,Fixed,Float
0,7000000.0,7000000.0


In [15]:
# now let's create the number data set, but let's try to do this all at once
engine.execute("""
    CREATE TABLE date_time_types (
    timestamp_column timestamp with time zone,
    interval_column interval)
    """
)

engine.execute("""
    INSERT INTO date_time_types VALUES
    ('2018-12-31 01:00 EST','2 days'),
    ('2018-12-31 01:00 -8','1 month'),
    ('2018-12-31 01:00 Australia/Melbourne','1 century'),
    (now(),'1 week')
    """
)

sql = "SELECT * FROM date_time_types"

dftime = pd.read_sql(sql, engine)

print(dftime.dtypes)
dftime.head(6)

# VERY interesting. Pandas reads the time columns accurately 
# without having to cast them as such

timestamp_column             object
interval_column     timedelta64[ns]
dtype: object


Unnamed: 0,timestamp_column,interval_column
0,2018-12-31 00:00:00-06:00,2 days
1,2018-12-31 03:00:00-06:00,30 days
2,2018-12-30 08:00:00-06:00,36500 days
3,2018-10-02 20:05:35.220381-05:00,7 days


In [16]:
# finally, let's see if we can do the math with time
sql = """
    SELECT
    timestamp_column,
    interval_column,
    timestamp_column - interval_column AS new_date
    FROM date_time_types;
    """

dftimetest = pd.read_sql(sql, engine)

print(dftimetest.dtypes)
dftimetest.head(6)
# success

timestamp_column             object
interval_column     timedelta64[ns]
new_date                     object
dtype: object


Unnamed: 0,timestamp_column,interval_column,new_date
0,2018-12-31 00:00:00-06:00,2 days,2018-12-29 00:00:00-06:00
1,2018-12-31 03:00:00-06:00,30 days,2018-11-30 03:00:00-06:00
2,2018-12-30 08:00:00-06:00,36500 days,1918-12-30 08:00:00-06:00
3,2018-10-02 20:05:35.220381-05:00,7 days,2018-09-25 20:05:35.220381-05:00


In [18]:
# let's try the cast requests that work
sql = """
    SELECT timestamp_column, CAST(timestamp_column AS varchar(10))
    FROM date_time_types
    """
dfcast1 = pd.read_sql(sql, engine)

print(dfcast1.dtypes)
print( dfcast1.head(6) )

print("----------")
sql2 = """
    SELECT numeric_column,
    CAST(numeric_column AS integer),
    CAST(numeric_column AS varchar(6))
    FROM number_data_types;
    """
dfcast2 = pd.read_sql(sql2, engine)

print(dfcast2.dtypes)
print( dfcast2.head(6) )



timestamp_column    object
timestamp_column    object
dtype: object
                   timestamp_column timestamp_column
0         2018-12-31 00:00:00-06:00       2018-12-31
1         2018-12-31 03:00:00-06:00       2018-12-31
2         2018-12-30 08:00:00-06:00       2018-12-30
3  2018-10-02 20:05:35.220381-05:00       2018-10-02
----------
numeric_column    float64
numeric_column      int64
numeric_column     object
dtype: object
  numeric_column numeric_column numeric_column
0        0.70000              1         0.7000
1        2.13579              2         2.1357
2        2.13580              2         2.1358
