In [None]:
# Slicing in pandas instead of SQL

In [1]:
import sys
import pandas as pd
import numpy as np

from pandas.io import sql
from pandas.io.sql import read_sql
from pandas.io.sql import to_sql

#sqlalchemy
import sqlalchemy
from sqlalchemy import create_engine, Table, Column, Integer, String, MetaData, ForeignKey

# for postgres
import psycopg2

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this filter is installed after the imports above, so any
# warning raised while those modules were being imported is still shown.
warnings.filterwarnings("ignore")

# Simple marker that the setup cell ran to completion.
print('OK')

OK


  """)


In [2]:
# Build the SQLAlchemy engine for the "pyanalysis" PostgreSQL database.
# The URL points at user "postgres" on localhost:5432 and carries no password.
pyanalysis_url = 'postgresql://postgres@localhost:5432/pyanalysis'
engine = create_engine(pyanalysis_url)
print("OK")

OK


In [3]:
# engine.execute("command") would run a SQL statement immediately.
# Here the statement is kept in a variable instead and handed to pandas,
# which runs it and returns the result set as a DataFrame.
#
# The variable is deliberately not called `sql`: that name is bound to the
# pandas.io.sql module by the imports at the top of the notebook, and
# re-binding it to a string would shadow the module for every later cell.
teachers_query = "SELECT * FROM teachers"

# parse_dates turns hire_date into datetime64 so it can be compared and
# sorted as a date rather than as text.
teachers = pd.read_sql(teachers_query, engine, parse_dates=['hire_date'])

# Column types, to confirm hire_date was parsed as a datetime.
print(teachers.dtypes)

teachers.head(6)


id                     int64
first_name            object
last_name             object
school                object
hire_date     datetime64[ns]
salary               float64
dtype: object


Unnamed: 0,id,first_name,last_name,school,hire_date,salary
0,1,Janet,Smith,F.D. Roosevelt HS,2011-10-30,36200.0
1,2,Lee,Reynolds,F.D. Roosevelt HS,1993-05-22,65000.0
2,3,Samuel,Cole,Myers Middle School,2005-08-01,43500.0
3,4,Samantha,Bush,Myers Middle School,2011-10-30,36200.0
4,5,Betty,Diaz,Myers Middle School,2005-08-30,43500.0
5,6,Kathleen,Roush,F.D. Roosevelt HS,2010-10-22,38500.0


In [4]:
# (number of rows, number of columns) of the teachers frame
teachers.shape

(6, 6)

In [6]:
# SQL equivalent: SELECT last_name, first_name, salary FROM teachers
# Passing a list of labels keeps only those columns, in the order listed.
name_salary_cols = ['last_name', 'first_name', 'salary']
df = teachers.loc[:, name_salary_cols]
df

Unnamed: 0,last_name,first_name,salary
0,Smith,Janet,36200.0
1,Reynolds,Lee,65000.0
2,Cole,Samuel,43500.0
3,Bush,Samantha,36200.0
4,Diaz,Betty,43500.0
5,Roush,Kathleen,38500.0


In [9]:
# SQL equivalent: SELECT DISTINCT school FROM teachers
# unique() hands back a bare array, so the frame built from it has a single
# column labelled 0 rather than "school".
distinct_schools = teachers['school'].unique()
df = pd.DataFrame(distinct_schools)
df

Unnamed: 0,0
0,F.D. Roosevelt HS
1,Myers Middle School


In [11]:
# SQL equivalent: SELECT DISTINCT school, salary FROM teachers
# drop_duplicates keeps the first row of each (school, salary) pair and
# preserves that row's original index label.
school_salary = teachers.loc[:, ['school', 'salary']]
df = school_salary.drop_duplicates()
df

Unnamed: 0,school,salary
0,F.D. Roosevelt HS,36200.0
1,F.D. Roosevelt HS,65000.0
2,Myers Middle School,43500.0
3,Myers Middle School,36200.0
5,F.D. Roosevelt HS,38500.0


In [13]:
# SQL equivalent:
#   SELECT last_name, first_name, salary FROM teachers ORDER BY salary DESC
df = (
    teachers
    .loc[:, ['last_name', 'first_name', 'salary']]
    .sort_values(by=['salary'], ascending=False)
)
df

Unnamed: 0,last_name,first_name,salary
1,Reynolds,Lee,65000.0
2,Cole,Samuel,43500.0
4,Diaz,Betty,43500.0
5,Roush,Kathleen,38500.0
0,Smith,Janet,36200.0
3,Bush,Samantha,36200.0


In [14]:
# SQL equivalent:
#   SELECT last_name, school, hire_date FROM teachers
#   ORDER BY school ASC, hire_date DESC
# The ascending list is matched to the sort keys position by position.
sort_keys = ['school', 'hire_date']
sort_ascending = [True, False]
df = (
    teachers
    .loc[:, ['last_name', 'school', 'hire_date']]
    .sort_values(by=sort_keys, ascending=sort_ascending)
)
df

Unnamed: 0,last_name,school,hire_date
0,Smith,F.D. Roosevelt HS,2011-10-30
5,Roush,F.D. Roosevelt HS,2010-10-22
1,Reynolds,F.D. Roosevelt HS,1993-05-22
3,Bush,Myers Middle School,2011-10-30
4,Diaz,Myers Middle School,2005-08-30
2,Cole,Myers Middle School,2005-08-01


In [18]:
# SQL equivalent:
#   SELECT last_name, school, hire_date FROM teachers
#   WHERE school = 'Myers Middle School'
# The boolean mask picks the rows; the label list picks the columns.
is_myers = teachers['school'] == "Myers Middle School"
df = teachers.loc[is_myers, ['last_name', 'school', 'hire_date']]
df

Unnamed: 0,last_name,school,hire_date
2,Cole,Myers Middle School,2005-08-01
3,Bush,Myers Middle School,2011-10-30
4,Diaz,Myers Middle School,2005-08-30


In [20]:
# SQL equivalent: SELECT school FROM teachers WHERE school != 'F.D. Roosevelt HS'
# The filtered Series is converted to a one-column frame for display.
not_roosevelt = teachers['school'] != "F.D. Roosevelt HS"
df = teachers.loc[not_roosevelt, 'school'].to_frame()
df

Unnamed: 0,school
2,Myers Middle School
3,Myers Middle School
4,Myers Middle School


In [21]:
# SQL equivalent:
#   SELECT first_name, last_name, hire_date FROM teachers
#   WHERE hire_date < '2000-01-01'
# hire_date is datetime64 (see the dtypes printed after loading), so the
# string on the right-hand side is compared as a date.
hired_before_2000 = teachers['hire_date'] < "2000-01-01"
df = teachers.loc[hired_before_2000, ['first_name', 'last_name', 'hire_date']]
df

Unnamed: 0,first_name,last_name,hire_date
1,Lee,Reynolds,1993-05-22


In [24]:
# SQL equivalent:
#   SELECT first_name, last_name, salary FROM teachers WHERE salary >= 43500
salary_at_least_43500 = teachers['salary'] >= 43500
df = teachers.loc[salary_at_least_43500, ['first_name', 'last_name', 'salary']]
df

Unnamed: 0,first_name,last_name,salary
1,Lee,Reynolds,65000.0
2,Samuel,Cole,43500.0
4,Betty,Diaz,43500.0


In [27]:
# Rows whose salary lies strictly between 40000 and 65000 (both bounds
# excluded). Each comparison is its own mask; & combines them row by row.
above_floor = teachers['salary'] > 40000
below_ceiling = teachers['salary'] < 65000
df = teachers.loc[above_floor & below_ceiling, ['first_name', 'last_name', 'salary']]
df

Unnamed: 0,first_name,last_name,salary
2,Samuel,Cole,43500.0
4,Betty,Diaz,43500.0


In [36]:
# Case-insensitive substring match: with case=False this behaves like SQL's
# ILIKE '%sam%' (leaving case at its default of True would behave like LIKE).
# https://pandas.pydata.org/docs/reference/api/pandas.Series.str.contains.html
#
# na=False makes a row with a missing first_name count as "no match", the
# way a SQL WHERE clause drops NULLs. Without it str.contains yields NaN for
# such rows and .loc rejects the mask with a ValueError.
matches_sam = teachers['first_name'].str.contains("sam", case=False, regex=True, na=False)
df = pd.DataFrame( teachers['first_name'].loc[matches_sam] )
df

Unnamed: 0,first_name
2,Samuel
3,Samantha


In [42]:
# OR: keep rows whose last_name contains "Cole" or contains "Bush".
# Each condition is built as its own boolean mask and | combines them
# element-wise.
surnames = teachers['last_name']
has_cole = surnames.str.contains("Cole")
has_bush = surnames.str.contains("Bush")
df = teachers.loc[has_cole | has_bush]
df

Unnamed: 0,id,first_name,last_name,school,hire_date,salary
2,3,Samuel,Cole,Myers Middle School,2005-08-01,43500.0
3,4,Samantha,Bush,Myers Middle School,2011-10-30,36200.0


In [43]:
# AND: school contains "Myers Middle School" and salary is below 40000.
at_myers = teachers['school'].str.contains("Myers Middle School")
under_40k = teachers['salary'] < 40000
df = teachers.loc[at_myers & under_40k]
df

Unnamed: 0,id,first_name,last_name,school,hire_date,salary
3,4,Samantha,Bush,Myers Middle School,2011-10-30,36200.0


In [44]:
# AND combined with a parenthesised OR: F.D. Roosevelt HS teachers whose
# salary is below 38000 or above 40000.
#
# regex=False matters here: str.contains interprets its pattern as a regular
# expression by default, so each "." in "F.D." would match ANY character and
# names such as "FxDx Roosevelt HS" would slip through. With regex=False the
# text is matched literally.
# na=False counts a missing school as "no match" instead of putting NaN into
# the mask.
at_roosevelt = teachers['school'].str.contains("F.D. Roosevelt HS", regex=False, na=False)
outside_band = (teachers['salary'] < 38000) | (teachers['salary'] > 40000)
df = teachers.loc[at_roosevelt & outside_band]
df

Unnamed: 0,id,first_name,last_name,school,hire_date,salary
0,1,Janet,Smith,F.D. Roosevelt HS,2011-10-30,36200.0
1,2,Lee,Reynolds,F.D. Roosevelt HS,1993-05-22,65000.0


In [46]:
# Filter, project and sort in one go: rows whose school contains "roo"
# (case-insensitive, like ILIKE '%roo%'), five selected columns, most recent
# hire_date first.
#
# na=False makes a row with a missing school count as "no match". Without it
# str.contains yields NaN for such rows and .loc rejects the mask with a
# ValueError.
school_has_roo = teachers['school'].str.contains("roo", case=False, regex=True, na=False)
df = (
    teachers
    .loc[school_has_roo, ['first_name','last_name','school','hire_date','salary']]
    .sort_values(['hire_date'], ascending=False)
)
df

Unnamed: 0,first_name,last_name,school,hire_date,salary
0,Janet,Smith,F.D. Roosevelt HS,2011-10-30,36200.0
5,Kathleen,Roush,F.D. Roosevelt HS,2010-10-22,38500.0
1,Lee,Reynolds,F.D. Roosevelt HS,1993-05-22,65000.0


In [None]:
# Further reading -- the official pandas "Comparison with SQL" guide:
# https://pandas.pydata.org/docs/getting_started/comparison/comparison_with_sql.html