In [None]:
from sqlalchemy import *
import pandas as pd
# Shared SQLAlchemy engine for the lecture database; the query cells below
# pass it to pandas as `con`.
# NOTE(review): the user name and password are embedded in this URL.
engine = create_engine(
    'postgresql://student_ies:PythonData@localhost:5432/student_ies'
)

# Lecture 7 - Intro to databases

### Contents:
* Databases
* DataTypes
* Tables
* Schema
* Joins
* Python - SQLAlchemy
* Pandas implementation!



## Relational databases

* huge data 
* simultaneous access to the data and changes to it
* read it very fast
* Raw data
* Combine various sources of data
* manage access to data
* include inside business processes
* Many different applications!
    * Business
    * Web-servers
    * Big data

## SQL
*Structured Query Language*
* Human readable
* Different implementations
    * SQLite, MySQL, Oracle, PostgreSQL
* SQL is only a language
* Data are stored in *Tables* 
* Connected via *Relations*
* NoSQL
* Allow for limited aggregation, filtering and analysis

## How to use it? 
* Command-line
* Programming interface
* GUI Interface - [DBeaver](https://dbeaver.io/)
* Integration with existing software - MS Office, GIS, etc

### Data Layers
* Value < Rows < Tables < Schemas < Databases < Database servers

### Data Types
depends on specific application
* numeric
    * INT, INTEGER, REAL, FLOAT, DOUBLE etc.
* strings
    * STRING, TEXT, VARCHAR
* more specialized
    * DATE, TIME etc.


### IES database
The data from the previous lecture are stored in three tables:

*people*, *courses* and *theses*

Try to explore:

In [None]:
# Load the whole `people` table and display only its column labels.
people_sql = '''
SELECT * FROM people
'''
pd.read_sql_query(sql=people_sql, con=engine).columns

### SELECT statement

I want to find the most successful teachers at IES in terms of thesis supervision (bachelor's and master's theses combined).

In [None]:
# Top ten people by share of awarded theses (bachelor + master combined).
# Rows whose share is NaN are filtered out by the WHERE clause.
# The statement is sent through pandas because bare SQL in a Python code cell
# is a SyntaxError and would stop a Restart & Run All.
pd.read_sql_query(
'''
SELECT id,
    (master_awarded + bachelor_awarded) AS awarded,
    (master_all + bachelor_all) AS all,
    (master_awarded + bachelor_awarded)/(master_all + bachelor_all) AS award_share
FROM people
WHERE (master_awarded + bachelor_awarded)/(master_all + bachelor_all) != 'NaN'
ORDER BY award_share DESC
LIMIT 10
''',con=engine)


In [None]:
# Same ranking idea, now with the person's name and without any filter on the
# computed share; the first ten rows in descending order are displayed.
share_sql = '''
SELECT id,name,
    (bachelor_awarded + master_awarded)/(bachelor_all + master_all) as share_awarded
FROM people
ORDER BY share_awarded DESC
LIMIT 10
'''
pd.read_sql_query(sql=share_sql, con=engine)

What is missing? 

### Relations!

see other two tables:

*people_courses* and *people_theses*

In [None]:
# Preview the first rows of the `people_courses` relation table.
people_courses_sql = '''
SELECT * FROM people_courses
'''
people_courses = pd.read_sql_query(sql=people_courses_sql, con=engine)
people_courses.head()

### JOINS 

* connecting tables - relations!



<img src='https://www.dofactory.com/Images/sql-joins.png'/>


### Inner
* most common - give me the match!
* when you see match, keep it, otherwise drop it.


In [None]:
# INNER JOIN: only course/person pairs that have a matching row in
# `people_courses` survive; the first rows of the result are shown.
inner_join_sql = '''
SELECT c.name,p.researcher FROM courses c
INNER JOIN people_courses pc ON c.id = pc."courses-id" 
INNER JOIN people p ON p.id = pc."people-id"
'''
pd.read_sql_query(sql=inner_join_sql, con=engine).head()

### Left 
INNER + rows from LEFT with no match in the RIGHT

In [None]:
# LEFT JOIN: every row of `courses` is kept, with the `people_courses`
# columns filled in where a match exists.
left_join_sql = '''
SELECT * FROM courses c
LEFT JOIN people_courses pc ON c.id = pc."courses-id"
'''
pd.read_sql_query(sql=left_join_sql, con=engine).head()

### Right
INNER + rows from RIGHT with no match in the LEFT

In [None]:
# RIGHT JOIN: every row of `people_courses` is kept, with the `courses`
# columns filled in where a match exists.
right_join_sql = '''
SELECT * FROM courses c
RIGHT JOIN people_courses pc ON c.id = pc."courses-id"
'''
pd.read_sql_query(sql=right_join_sql, con=engine).head()

### FULL 
INNER + rows from both LEFT and RIGHT with no match in other table

In [None]:
# FULL JOIN: rows from both `courses` and `people_courses` are kept, matched
# where possible.
full_join_sql = '''
SELECT * FROM courses c
FULL JOIN people_courses pc ON c.id = pc."courses-id"
'''
pd.read_sql_query(sql=full_join_sql, con=engine).head()

### MULTIPLE JOINS

In [None]:
# Two chained INNER JOINs: people -> people_courses -> courses, giving one
# row per (researcher, course name) pair.
multi_join_sql = '''
SELECT p.researcher,c.name FROM people p
INNER JOIN people_courses pc ON p.id = pc."people-id"
INNER JOIN courses c ON pc."courses-id" = c.id
'''
pd.read_sql_query(sql=multi_join_sql, con=engine).head()

## CREATE TABLE

jump to DBeaver

explore *courses*, *theses* and *people*

## Integration to Pandas
* using SQLAlchemy
* Compatible across databases
* beyond the scope of this lecture

BUT! 
### Connection Strings:
` create_engine('postgresql://student_ies:PythonData@localhost:5432/student_ies')` 

or 

` create_engine('sqlite:///sqlite.db')` 

### pd.read_sql_table

In [None]:
# Build an engine from the connection string and read the whole `people`
# table by name (no SQL needed), showing its first rows.
conn = create_engine(
    'postgresql://student_ies:PythonData@localhost:5432/student_ies'
)
pd.read_sql_table(table_name='people', con=conn).head()

### DataFrame.to_sql

In [None]:
df = pd.DataFrame()
?df.to_sql

In [None]:
# Per-person thesis totals and share of awarded theses, best share first.
# Rows whose share is NaN are dropped by the WHERE clause.
awarded_sql = '''
SELECT id,
    (master_awarded + bachelor_awarded) AS awarded,
    (master_all + bachelor_all) AS all,
    (master_awarded + bachelor_awarded)/(master_all + bachelor_all) AS award_share  
FROM people
WHERE (master_awarded + bachelor_awarded)/(master_all + bachelor_all) != 'NaN'
ORDER BY award_share DESC
'''
awarded = pd.read_sql_query(sql=awarded_sql, con=engine)

In [None]:
from bokeh.models import ColumnDataSource
from bokeh.io import show, output_file,output_notebook
from bokeh.plotting import figure

# Render the chart into a standalone HTML file.
output_file('index.html')

# First ten rows of `awarded` (its query orders by award_share, descending).
# The bar chart uses a categorical x-axis, which needs string factors, so the
# ids are cast to str once and the same values feed both the data source and
# the x_range. The cast is a no-op when the ids are already strings.
top10 = awarded.head(10).copy()
top10['id'] = top10['id'].astype(str)
source = ColumnDataSource(data={col: top10[col] for col in top10.columns})

# Hover fields: "@name" looks up a column of the data source, whereas "$name"
# is reserved for Bokeh's special variables ($index, $x, $y, ...). The
# previous "$id" / "$awarded" / "$all" therefore never showed column values.
TOOLTIPS = [
    ("id", "@id"),
    ("awarded theses", "@awarded"),
    ("all theses", "@all"),
]

p = figure(
    x_range=list(top10['id']),
    plot_height=250,
    title="Best people (by share of awarded theses)",
    tooltips=TOOLTIPS,
)

# One bar per person; bar height is the share of awarded theses.
p.vbar(x='id', top='award_share', width=0.9, source=source)

# Vertical grid lines add nothing on a categorical axis.
p.xgrid.grid_line_color = None

show(p)