# SQL for Data Science

## Connection

[PostgresSQL Tutorial](https://www.postgresqltutorial.com/postgresql-python/connect/)

### Import libraries

In [18]:
# !pip install psycopg2
import psycopg2
import pandas as pd

### Create Connection & Cursor

In [61]:
#change user and password to your settings ☜(ﾟヮﾟ☜)
# port: the port number that defaults to 5432 if it is not provided.
conn = psycopg2.connect(host="localhost",database="data_sci", user="postgres", password="password", port=5432)
cur = conn.cursor()

Code Template:
```python
cur = conn.cursor()
command = """SQL DDL/DML Command"""
cur.execute(command)
cur.fetchone()
cur.fetchall()
cur.close()
con.commit()
con.close()
```

## Executing Commands

Version Check and Connection

In [20]:
cur.execute("""SELECT version()
                """)
cur.fetchone()

('PostgreSQL 12.2, compiled by Visual C++ build 1914, 64-bit',)

To check whether the cursor or the connection is connected:

In [21]:
conn is None and cur is None

False

In [22]:
cur

<cursor object at 0x00000202B38A32E8; closed: 0>

In [23]:
conn

<connection object at 0x00000202B37D89D8; dsn: 'user=postgres password=xxx dbname=data_sci host=localhost port=5432', closed: 0>

## Show Tables using PostgreSQL

If you are coming from MySQL, you may miss the SHOW TABLES statement that displays all tables in a specific database. PostgreSQL does not provide the SHOW TABLES statement directly but give you something similar ¯\_(ツ)_/¯.

In [29]:
command = """SELECT
   *
FROM
   pg_catalog.pg_tables
WHERE
   schemaname != 'pg_catalog'
AND schemaname != 'information_schema';"""
cur.execute(command)
cols = [x[0] for x in cur.description]
pd.DataFrame(cur.fetchall(), columns=cols)

Unnamed: 0,schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity
0,public,company_divisions,postgres,,True,False,False,False
1,public,company_regions,postgres,,True,False,False,False
2,public,staff,postgres,,True,False,False,False


## Checking the Tables

In [30]:
command = """SELECT * FROM company_divisions;"""
cur.execute(command)
cols = [x[0] for x in cur.description]
pd.DataFrame(cur.fetchall(), columns=cols)

Unnamed: 0,department,company_division
0,Automotive,Auto & Hardware
1,Baby,Domestic
2,Beauty,Domestic
3,Clothing,Domestic
4,Computers,Electronic Equipment
5,Electronics,Electronic Equipment
6,Games,Domestic
7,Garden,Outdoors & Garden
8,Grocery,Domestic
9,Health,Domestic


In [31]:
command = """SELECT * FROM company_regions;"""
cur.execute(command)
cols = [x[0] for x in cur.description]
pd.DataFrame(cur.fetchall(), columns=cols)

Unnamed: 0,region_id,company_regions,country
0,1,Northeast,USA
1,2,Southeast,USA
2,3,Northwest,USA
3,4,Southwest,USA
4,5,British Columbia,Canada
5,6,Quebec,Canada
6,7,Nova Scotia,Canada


In [32]:
command = """SELECT * FROM staff;"""
cur.execute(command)
cols = [x[0] for x in cur.description]
pd.DataFrame(cur.fetchall(), columns=cols)

Unnamed: 0,id,last_name,email,gender,department,start_date,salary,job_title,region_id
0,1,Kelley,rkelley0@soundcloud.com,Female,Computers,2009-10-02,67470,Structural Engineer,2
1,2,Armstrong,sarmstrong1@infoseek.co.jp,Male,Sports,2008-03-31,71869,Financial Advisor,2
2,3,Carr,fcarr2@woothemes.com,Male,Automotive,2009-07-12,101768,Recruiting Manager,3
3,4,Murray,jmurray3@gov.uk,Female,Jewelery,2014-12-25,96897,Desktop Support Technician,3
4,5,Ellis,jellis4@sciencedirect.com,Female,Grocery,2002-09-19,63702,Software Engineer III,7
...,...,...,...,...,...,...,...,...,...
995,996,James,tjamesrn@soundcloud.com,Female,Games,2013-11-17,78433,Accountant II,7
996,997,Reynolds,dreynoldsro@blogtalkradio.com,Female,Computers,2007-04-16,120138,Statistician IV,1
997,998,Walker,kwalkerrp@unicef.org,Female,Games,2010-02-13,60363,Account Coordinator,1
998,999,Kennedy,lkennedyrq@edublogs.org,Male,Industrial,2004-09-22,48050,Graphic Designer,2


# Count

In [39]:
command = """SELECT gender, COUNT(gender) FROM staff
             GROUP BY gender;"""
cur.execute(command)
cols = [x[0] for x in cur.description]
pd.DataFrame(cur.fetchall(), columns=cols)

Unnamed: 0,gender,count
0,Female,496
1,Male,504


# MIN, MAX

In [43]:
command = """SELECT department, gender, MIN(salary), MAX(salary) FROM staff
             GROUP BY department, gender
             ORDER BY department, gender;"""
cur.execute(command)
cols = [x[0] for x in cur.description]
pd.DataFrame(cur.fetchall(), columns=cols)

#Pretty Cool huh! ☜(ﾟヮﾟ☜)

Unnamed: 0,department,gender,min,max
0,Automotive,Female,42602,136448
1,Automotive,Male,44377,146167
2,Baby,Female,50448,141464
3,Baby,Male,42924,148687
4,Beauty,Female,41299,143853
5,Beauty,Male,40254,149099
6,Books,Female,47131,145284
7,Books,Male,42714,146745
8,Clothing,Female,42797,148408
9,Clothing,Male,43067,130188


# Sum & Average & Variance & Standard Deviation

In [55]:
command = """SELECT 
             department, 
             SUM(salary) AS total_salary, 
             AVG(salary) AS average_salary,
             var_pop(salary) AS variance_salary,
             stddev_pop(salary) AS standard_deviation_salary 
             FROM staff
             GROUP BY department
             ORDER BY total_salary;"""
cur.execute(command)
cols = [x[0] for x in cur.description]
pd.DataFrame(cur.fetchall(), columns=cols)

Unnamed: 0,department,total_salary,average_salary,variance_salary,standard_deviation_salary
0,Music,3274767,88507.21621621621,1001696601.520818,31649.59085866
1,Kids,3543027,93237.55263157895,1072987422.9314404,32756.48673059
2,Movies,3632825,100911.80555555556,876898879.1010802,29612.47843564
3,Sports,3756041,93901.025,1049150239.874375,32390.58875467
4,Toys,3943674,96187.17073170733,1150709491.312314,33922.10918136
5,Shoes,3968118,92281.81395348835,849306565.1746889,29142.86473864
6,Jewelery,4039362,87812.21739130435,875256537.7788278,29584.73487762
7,Tools,4095808,105020.71794871794,795868561.0230112,28211.14249766
8,Baby,4218724,93749.42222222223,912220932.8217283,30202.99542797
9,Industrial,4366340,92900.85106382977,889271955.2756902,29820.66322662


# Rounding by n decimal points

In [57]:
command = """SELECT 
             department, 
             SUM(salary) AS total_salary, 
             ROUND(AVG(salary),2) AS average_salary,
             ROUND(var_pop(salary),2) AS variance_salary,
             ROUND(stddev_pop(salary),2) AS standard_deviation_salary 
             FROM staff
             GROUP BY department
             ORDER BY total_salary;"""
cur.execute(command)
cols = [x[0] for x in cur.description]
pd.DataFrame(cur.fetchall(), columns=cols)

Unnamed: 0,department,total_salary,average_salary,variance_salary,standard_deviation_salary
0,Music,3274767,88507.22,1001696601.52,31649.59
1,Kids,3543027,93237.55,1072987422.93,32756.49
2,Movies,3632825,100911.81,876898879.1,29612.48
3,Sports,3756041,93901.03,1049150239.87,32390.59
4,Toys,3943674,96187.17,1150709491.31,33922.11
5,Shoes,3968118,92281.81,849306565.17,29142.86
6,Jewelery,4039362,87812.22,875256537.78,29584.73
7,Tools,4095808,105020.72,795868561.02,28211.14
8,Baby,4218724,93749.42,912220932.82,30203.0
9,Industrial,4366340,92900.85,889271955.28,29820.66


# Filtering & Joining

In [62]:
command = """SELECT *
             FROM staff
             INNER JOIN company_regions
             USING (region_id)
             INNER JOIN company_divisions
             USING (department)
             WHERE company_division IN ('Electronic Equipment', 'Games & Sports')
             AND
             department = 'Electronics'
             AND
             last_name ILIKE 's%'"""
cur.execute(command)
cols = [x[0] for x in cur.description]
pd.DataFrame(cur.fetchall(), columns=cols)

Unnamed: 0,department,region_id,id,last_name,email,gender,start_date,salary,job_title,company_regions,country,company_division
0,Electronics,1,985,Stevens,hstevensrc@hugedomains.com,Male,2006-02-28,118791,Safety Technician IV,Northeast,USA,Electronic Equipment
1,Electronics,1,906,Spencer,sspencerp5@mtv.com,Male,2014-04-18,110881,Electrical Engineer,Northeast,USA,Electronic Equipment
2,Electronics,2,679,Stone,pstoneiu@narod.ru,Male,2014-05-15,40218,Paralegal,Southeast,USA,Electronic Equipment
3,Electronics,7,762,Stanley,cstanleyl5@springer.com,Male,2013-08-09,103237,Account Executive,Nova Scotia,Canada,Electronic Equipment


In [63]:
#if the joining column names are not the same then:

command = """SELECT *
             FROM staff s
             INNER JOIN company_regions cr
             ON cr.region_id=s.region_id
             INNER JOIN company_divisions cd
             ON s.department=cd.department
             WHERE company_division IN ('Electronic Equipment', 'Games & Sports')
	         AND
	         s.department = 'Electronics'
	         AND
	         last_name ILIKE 's%'"""
cur.execute(command)
cols = [x[0] for x in cur.description]
pd.DataFrame(cur.fetchall(), columns=cols)



Unnamed: 0,id,last_name,email,gender,department,start_date,salary,job_title,region_id,region_id.1,company_regions,country,department.1,company_division
0,985,Stevens,hstevensrc@hugedomains.com,Male,Electronics,2006-02-28,118791,Safety Technician IV,1,1,Northeast,USA,Electronics,Electronic Equipment
1,906,Spencer,sspencerp5@mtv.com,Male,Electronics,2014-04-18,110881,Electrical Engineer,1,1,Northeast,USA,Electronics,Electronic Equipment
2,679,Stone,pstoneiu@narod.ru,Male,Electronics,2014-05-15,40218,Paralegal,2,2,Southeast,USA,Electronics,Electronic Equipment
3,762,Stanley,cstanleyl5@springer.com,Male,Electronics,2013-08-09,103237,Account Executive,7,7,Nova Scotia,Canada,Electronics,Electronic Equipment


In [None]:
cur.close()
conn.close()