In [2]:
from sqlalchemy import *
import pandas as pd
engine = create_engine('postgresql://student_ies:PythonData@195.181.209.73:5432/student_ies')

  """)


# Lecture 7 - Intro to databases

### Contents:
* Databases
* DataTypes
* Tables
* Schema
* Joins
* Python - SQLAlchemy
* Pandas implementation!



## Relational databases

* huge data 
* simultanous access and changes of it
* read it very fast
* Raw data
* Combine various sources of data
* manage access to data
* include inside business processes
* Many different applications!
    * Business
    * Web-servers
    * Big data

## SQL
*Structured Query Language*
* Human readable
* Different implementations
    * SQLite, MySQL, Oracle, PostgreSQL
* SQL is only a language
* Data are stored in *Tables* 
* Connected via *Relations*
* NoSQL
* Allow for limited aggregation, filtering and analysis

## How to use it? 
* Command-line
* Programming interface
* GUI Interface - [DBeaver](https://dbeaver.io/)
* Integration with existing software - MS Office, GIS, etc

### Basic philosophy
![alt text](sql_filosofie.png "Základní filosofie")

### Database Layers
![alt text](sql_struktura.png "Struktura databáze")


### Data Types
depends on specific application
* numeric
    * INT, INTEGER, REAL, FLOAT, DOUBLE etc.
* strings
    * STRING, TEXT, VARCHAR
* more specialized
    * DATE, TIME etc.


### IES database
The data from past lecture are stored in three tables:

*people*, *courses* and *theses*

Try to explore:

In [6]:
pd.read_sql_query(
'''
SELECT * FROM people
''',con=engine,index_col='id').head(1)

Unnamed: 0_level_0,position,field_of_interest,membership,office,email,phone,available,organisation_memberships,education,job_history,extra_activities,bachelor_theses,master_theses,name,category,bachelor_all,bachelor_awarded,master_all,master_awarded
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
barunik,Associate Professor,"Financial Economics, Financial Econometrics","Internal, Macroeconomics and Econometrics",503,barunik [AT] fsv [DOT] cuni [DOT] cz,+420(776)259273,upon request,"The Econometric Society, The Society for Finan...","2011 PhD in Economics, Charles University in P...",2017 - research visit Humboldt-Universität zu ...,2018+ President of the Czech Econometric Socie...,I welcome any topic in the field of Applied Fi...,My research interest is in Econometrics of Fin...,doc. PhDr. Jozef Baruník Ph.D.,Current faculty,4.0,0.0,54.0,20.0


### SELECT statement

I want to know all the most succesful teachers on IES in terms of theses (bachelor and masters) supervision

In [13]:
pd.read_sql_query(
'''
select
    id, 
    (bachelor_awarded + master_awarded) as awarded
from public.people
where (bachelor_awarded + master_awarded) != 'NaN'
order by awarded desc
''',con=engine) 

Unnamed: 0,id,awarded
0,mejstrik,33.0
1,horvath,29.0
2,teply,26.0
3,gregor,22.0
4,barunik,20.0
5,kristoufek,18.0
6,dedek,11.0
7,vosvrda,11.0
8,cahlik,11.0
9,schneider,11.0


In [14]:
pd.read_sql_query(
'''
    SELECT id,
    (master_awarded + bachelor_awarded) AS awarded,
    (master_all + bachelor_all) AS all,
    (master_awarded + bachelor_awarded)/(master_all + bachelor_all) AS award_share  
FROM people
WHERE (master_awarded + bachelor_awarded)/(master_all + bachelor_all) != 'NaN'
ORDER BY award_share DESC

''',con=engine)


Unnamed: 0,id,awarded,all,award_share
0,mateju,2.0,2.0,1.0
1,vosvrda,11.0,16.0,0.6875
2,gregor,22.0,51.0,0.431373
3,bauerm,6.0,14.0,0.428571
4,visek,6.0,14.0,0.428571
5,mejstrik,33.0,79.0,0.417722
6,scasny,10.0,24.0,0.416667
7,jakubik,8.0,20.0,0.4
8,antosova,2.0,5.0,0.4
9,vacha,4.0,10.0,0.4


In [None]:
pd.read_sql_query(
'''
SELECT id,name,
    (bachelor_awarded + master_awarded)/(bachelor_all + master_all) as share_awarded
FROM people
WHERE (bachelor_awarded + master_awarded)/(bachelor_all + master_all) != 'NaN'
ORDER BY share_awarded DESC
LIMIT 10
''',con=engine)

What is missing? 

### Relations!

see other two tables:

*people_courses* and *people_theses*

In [15]:
pd.read_sql_query(
'''
SELECT * FROM people_courses
''',con=engine).head()

Unnamed: 0,people-id,courses-id
0,barunik,JEM005
1,barunik,"JED412,413"
2,vosvrda,"JED412,413"
3,barunik,JEM059
4,vacha,JEM059


## ER Diagram

![alt text](sql_diagram.png "ER diagram")

### JOINS 

* connecting tables - relations!



<img src='https://www.dofactory.com/Images/sql-joins.png'/>


### Inner
* most common - give me the match!
* when you see match, keep it, otherwise drop it.


In [16]:
pd.read_sql_query(
'''
select c.name from courses c
''',con=engine)

Unnamed: 0,name
0,JEM005 - Advanced Econometrics
1,"JED412,413 - Nonlinear Dynamic Economic System..."
2,JEM059 - Quantitative Finance I
3,JEM061 - Quantitative Finance II
4,JED414 - Quantitative Methods I
5,JED415 - Quantitative Methods II
6,JEM116 - Applied Econometrics
7,JEM123 - Economics of Least Developed Countries
8,JEM136 - Experimental Economics
9,JEB147 - Behavioral Economics and Finance


In [24]:
pd.read_sql_query(
'''
SELECT 
    c.name as coursename,
    p.name as person    
FROM courses c
INNER JOIN people_courses as pc ON c.id = pc."courses-id" 
INNER JOIN people as p ON p.id = pc."people-id"
''',con=engine).head() 

Unnamed: 0,coursename,person
0,JEM005 - Advanced Econometrics,doc. PhDr. Jozef Baruník Ph.D.
1,"JED412,413 - Nonlinear Dynamic Economic System...",doc. PhDr. Jozef Baruník Ph.D.
2,"JED412,413 - Nonlinear Dynamic Economic System...",prof. Ing. Miloslav Vošvrda CSc.
3,JEM059 - Quantitative Finance I,doc. PhDr. Jozef Baruník Ph.D.
4,JEM059 - Quantitative Finance I,Mgr. Lukáš Vácha Ph.D.


### Left 
INNER + rows from LEFT with no match in the RIGHT

In [26]:
pd.read_sql_query(
'''
SELECT * FROM courses c
LEFT JOIN people_courses pc ON c.id = pc."courses-id"
''',con=engine)

Unnamed: 0,id,name,credit,status,literature,description,people-id,courses-id
0,JEM005,JEM005 - Advanced Econometrics,6,CFS - coreEnglishMasters - allMasters - coreME...,"For the topics covered during the semester, we...",The objective of the course is to help student...,barunik,JEM005
1,"JED412,413","JED412,413 - Nonlinear Dynamic Economic System...",5,DoctoralSemester - summerSemester - winter,"C. Gandolfo: Economics Dynamics, Springer, 199...",The aim of this seminar is an analysis of macr...,barunik,"JED412,413"
2,"JED412,413","JED412,413 - Nonlinear Dynamic Economic System...",5,DoctoralSemester - summerSemester - winter,"C. Gandolfo: Economics Dynamics, Springer, 199...",The aim of this seminar is an analysis of macr...,vosvrda,"JED412,413"
3,JEM059,JEM059 - Quantitative Finance I,6,CFS - electiveEEI and EP - electiveEnglishET -...,"Campbell, Lo and MacKinlay (CLM): The Economet...",The objective of the course is to introduce ad...,barunik,JEM059
4,JEM059,JEM059 - Quantitative Finance I,6,CFS - electiveEEI and EP - electiveEnglishET -...,"Campbell, Lo and MacKinlay (CLM): The Economet...",The objective of the course is to introduce ad...,vacha,JEM059
5,JEM061,JEM061 - Quantitative Finance II,6,CFS - electiveEEI and EP - electiveEnglishET -...,"Beran, J. (1994): Statistics for Long - Memory...",The objective of the course is to introduce ad...,barunik,JEM061
6,JEM061,JEM061 - Quantitative Finance II,6,CFS - electiveEEI and EP - electiveEnglishET -...,"Beran, J. (1994): Statistics for Long - Memory...",The objective of the course is to introduce ad...,vacha,JEM061
7,JED414,JED414 - Quantitative Methods I,5,DoctoralEnglishSemester - winter,,This doctoral course focuses on the presentati...,barunik,JED414
8,JED414,JED414 - Quantitative Methods I,5,DoctoralEnglishSemester - winter,,This doctoral course focuses on the presentati...,horvath,JED414
9,JED415,JED415 - Quantitative Methods II,5,DoctoralEnglishSemester - summer,,This doctoral course focuses on the presentati...,barunik,JED415


### Right
INNER + rows from RIGHT with no match in the LEFT

In [27]:
pd.read_sql_query(
'''
SELECT * FROM courses c
RIGHT JOIN people_courses pc ON c.id = pc."courses-id"
''',con=engine).head()

Unnamed: 0,id,name,credit,status,literature,description,people-id,courses-id
0,JEM005,JEM005 - Advanced Econometrics,6,CFS - coreEnglishMasters - allMasters - coreME...,"For the topics covered during the semester, we...",The objective of the course is to help student...,barunik,JEM005
1,"JED412,413","JED412,413 - Nonlinear Dynamic Economic System...",5,DoctoralSemester - summerSemester - winter,"C. Gandolfo: Economics Dynamics, Springer, 199...",The aim of this seminar is an analysis of macr...,barunik,"JED412,413"
2,"JED412,413","JED412,413 - Nonlinear Dynamic Economic System...",5,DoctoralSemester - summerSemester - winter,"C. Gandolfo: Economics Dynamics, Springer, 199...",The aim of this seminar is an analysis of macr...,vosvrda,"JED412,413"
3,JEM059,JEM059 - Quantitative Finance I,6,CFS - electiveEEI and EP - electiveEnglishET -...,"Campbell, Lo and MacKinlay (CLM): The Economet...",The objective of the course is to introduce ad...,barunik,JEM059
4,JEM059,JEM059 - Quantitative Finance I,6,CFS - electiveEEI and EP - electiveEnglishET -...,"Campbell, Lo and MacKinlay (CLM): The Economet...",The objective of the course is to introduce ad...,vacha,JEM059


### FULL 
INNER + rows from both LEFT and RIGHT with no match in other table

In [28]:
pd.read_sql_query(
'''
SELECT * FROM courses c
FULL JOIN people_courses pc ON c.id = pc."courses-id"
''',con=engine).head()

Unnamed: 0,id,name,credit,status,literature,description,people-id,courses-id
0,JEM005,JEM005 - Advanced Econometrics,6,CFS - coreEnglishMasters - allMasters - coreME...,"For the topics covered during the semester, we...",The objective of the course is to help student...,barunik,JEM005
1,"JED412,413","JED412,413 - Nonlinear Dynamic Economic System...",5,DoctoralSemester - summerSemester - winter,"C. Gandolfo: Economics Dynamics, Springer, 199...",The aim of this seminar is an analysis of macr...,barunik,"JED412,413"
2,"JED412,413","JED412,413 - Nonlinear Dynamic Economic System...",5,DoctoralSemester - summerSemester - winter,"C. Gandolfo: Economics Dynamics, Springer, 199...",The aim of this seminar is an analysis of macr...,vosvrda,"JED412,413"
3,JEM059,JEM059 - Quantitative Finance I,6,CFS - electiveEEI and EP - electiveEnglishET -...,"Campbell, Lo and MacKinlay (CLM): The Economet...",The objective of the course is to introduce ad...,barunik,JEM059
4,JEM059,JEM059 - Quantitative Finance I,6,CFS - electiveEEI and EP - electiveEnglishET -...,"Campbell, Lo and MacKinlay (CLM): The Economet...",The objective of the course is to introduce ad...,vacha,JEM059


### MULTIPLE JOINS

In [30]:
pd.read_sql_query('''
SELECT p.name,c.name FROM people p
INNER JOIN people_courses pc ON p.id = pc."people-id"
INNER JOIN courses c ON pc."courses-id" = c.id
''',con=engine).head()

Unnamed: 0,name,name.1
0,doc. PhDr. Jozef Baruník Ph.D.,JEM005 - Advanced Econometrics
1,doc. PhDr. Jozef Baruník Ph.D.,"JED412,413 - Nonlinear Dynamic Economic System..."
2,prof. Ing. Miloslav Vošvrda CSc.,"JED412,413 - Nonlinear Dynamic Economic System..."
3,doc. PhDr. Jozef Baruník Ph.D.,JEM059 - Quantitative Finance I
4,Mgr. Lukáš Vácha Ph.D.,JEM059 - Quantitative Finance I


## CREATE TABLE

jump to DBeaver

explore *courses*, *theses* and *people*

## Integration to Pandas
* using SQLAlchemy
* Compatible across databases
* beyond the scope of this lecture

BUT! 
### Connection Strings:
` create_engine('<DB_TYPE>://<USER>:<PASSWORD>@<SERVER_ADDRESS>:<SERVER_PORT>/<DATABASE_NAME>')` 

or 

` create_engine('sqlite:///sqlite.db')` 

### pd.read_sql_query

In [None]:
pd.read_sql_table('people',con=engine).head()

### pd.read_sql_table

In [31]:
df = pd.DataFrame()
?df.to_sql

[0;31mSignature:[0m
[0mdf[0m[0;34m.[0m[0mto_sql[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m[[0m[0;34m'name'[0m[0;34m,[0m [0;34m'con'[0m[0;34m,[0m [0;34m'schema=None'[0m[0;34m,[0m [0;34m"if_exists='fail'"[0m[0;34m,[0m [0;34m'index=True'[0m[0;34m,[0m [0;34m'index_label=None'[0m[0;34m,[0m [0;34m'chunksize=None'[0m[0;34m,[0m [0;34m'dtype=None'[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Write records stored in a DataFrame to a SQL database.

Databases supported by SQLAlchemy [1]_ are supported. Tables can be
newly created, appended to, or overwritten.

Parameters
----------
name : string
    Name of SQL table.
con : sqlalchemy.engine.Engine or sqlite3.Connection
    Using SQLAlchemy makes it possible to use any DB supported by that
    library. Legacy support is provided for sqlite3.Connection objects.
schema : string, optional
    Specify the schema (if database flavor supports th

In [None]:
awarded = pd.read_sql_query(
'''
SELECT id,
    (master_awarded + bachelor_awarded) AS awarded,
    (master_all + bachelor_all) AS all,
    (master_awarded + bachelor_awarded)/(master_all + bachelor_all) AS award_share  
FROM people
WHERE (master_awarded + bachelor_awarded)/(master_all + bachelor_all) != 'NaN'
ORDER BY award_share DESC
''',con=engine)

In [None]:
from bokeh.models import ColumnDataSource
from bokeh.io import show, output_file,output_notebook
from bokeh.plotting import figure

output_file('index.html')
source = ColumnDataSource(data = {col:awarded[col][:10] for col in awarded.columns})

TOOLTIPS = [
    ("id", "$id"),
    ("awarded theses", "$awarded"),
    ("all theses",'$all')

]


p = figure(x_range=awarded.id[:10], plot_height=250, title="Best people (by share of awarded theses)",tooltips=TOOLTIPS)

p.vbar(x='id', top='award_share', width=0.9,source=source)

p.xgrid.grid_line_color = None

show(p)

https://jupyterhub.vitekzkytek.cz/user/vitekzkytek/files/PythonDataIES/08/index.html