## SQL Connection logic

In [1]:
import pandas as pd
from sqlalchemy import create_engine
import os

In [2]:
# Load the `sql` IPython extension; the %sql / %%sql magics used in later cells presumably come from it.
%load_ext sql
# Connect to the local `testdb` PostgreSQL database.
# NOTE(review): the same URL is hard-coded again inside toDB() - keep the two in sync.
%sql postgresql://toofanmacpro@localhost:5432/testdb


In [3]:
%%sql
-- Preview at most five rows of the Employee table.
-- NOTE(review) this cell runs before the cell that writes Employee via toDB,
-- so it presumably relies on the table surviving from an earlier session.
SELECT * FROM "Employee" LIMIT 5;

 * postgresql://toofanmacpro@localhost:5432/testdb
3 rows affected.


index,id,salary
0,1,100
1,2,200
2,3,300


In [4]:

def toDB(da, tableName):
    """Write a DataFrame to the local ``testdb`` PostgreSQL database.

    An existing table with the same name is replaced. Failures are reported
    with ``print`` instead of being raised, so the notebook keeps running.

    Parameters
    ----------
    da : pandas.DataFrame
        Frame to persist. ``to_sql`` is called with its defaults, so the
        frame's index is stored as an extra column.
    tableName : str
        Name of the destination table.

    Returns
    -------
    None
    """
    try:
        engine = create_engine('postgresql://toofanmacpro@localhost:5432/testdb')
    except Exception as e:
        # The f-prefix matters: without it the literal text "{e}" is printed
        # and the real cause of the failure is lost.
        print(f"error occured at create_engine: {e}")
        return

    try:
        da.to_sql(tableName, engine, if_exists='replace')
    except Exception as e:
        print(f"error occurred: {e}")
    finally:
        # Each call builds its own engine; release its pooled connections.
        engine.dispose()

In [5]:
# Sample Employee data: three employees, each with a different salary.
# The columns are given inline so that no module-level name shadows the
# builtin `id`.
df = pd.DataFrame({'id': [1, 2, 3], 'salary': [100, 200, 300]})

In [6]:
# Persist the sample frame as the "Employee" table queried by the SQL cells.
toDB(da=df, tableName='Employee')

## Find second highest salary

-- if not found, return null

### SQL Solution - mine

_First, let's pull out the unique salary values and sort them._

In [7]:
%%sql 
-- List every distinct salary once, sorted in ascending order.
SELECT 
    DISTINCT salary 
FROM 
    "Employee"
ORDER BY salary;

 * postgresql://toofanmacpro@localhost:5432/testdb
3 rows affected.


salary
100
200
300


A combination of LIMIT and OFFSET can be used to get the desired output.

OFFSET skips the specified number of rows » so we skip the top row  
LIMIT caps how many rows are returned » once the second best becomes the top row, we limit the result to 1

In [8]:
%%sql
-- Second highest distinct salary.
-- Sort from highest to lowest so that OFFSET 1 skips the maximum and
-- LIMIT 1 keeps the runner-up. An ascending sort would return the second
-- lowest salary instead.
-- Returns no row at all when fewer than two distinct salaries exist.
SELECT 
    DISTINCT salary AS SecondHighestSalary
FROM 
    "Employee"
ORDER BY salary DESC
LIMIT 1 OFFSET 1;

 * postgresql://toofanmacpro@localhost:5432/testdb
1 rows affected.


secondhighestsalary
200


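_Now let's deal with the "return null if not found" requirement: wrapping the query in an outer SELECT turns an empty result into a single NULL row._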

In [9]:
%%sql
-- Second highest distinct salary, or NULL when there is none.
-- The inner query sorts from highest to lowest and skips the maximum.
-- Using it as a scalar subquery makes the outer SELECT always return
-- exactly one row, which is NULL when the inner query finds nothing.
SELECT(
    SELECT 
        DISTINCT salary
    FROM 
        "Employee"
    ORDER BY salary DESC
    LIMIT 1 OFFSET 1
) AS SecondHighestSalary;

 * postgresql://toofanmacpro@localhost:5432/testdb
1 rows affected.


secondhighestsalary
200


### My pandas solution

In [10]:
import numpy as np

# Distinct salaries, highest first. unique() keeps first-seen order, so
# sorting in descending order beforehand yields a descending array.
salaries = (df
            .salary
            .sort_values(ascending = False)
            .unique()
            )

# With fewer than two distinct salaries there is no runner-up: report None
# (a null cell) instead of failing with IndexError on salaries[1].
secondHigh = int(salaries[1]) if len(salaries) > 1 else None
da = pd.DataFrame({"secondHighestSalary": [secondHigh]}, index = [0])

da

Unnamed: 0,secondHighestSalary
0,200


### Pandas - Best solution:

In [11]:
# The two largest distinct salaries, biggest first (fewer if the data has fewer).
unique = df.salary.drop_duplicates().nlargest(2)

# The runner-up sits at position 1; when it is missing the answer is None.
print(
    pd.DataFrame(
        {"SecondHighestSalary": [unique.iloc[1] if len(unique) >= 2 else None]},
        index = [0],
    )
)

   SecondHighestSalary
0                  200


## DENSE_RANK() -- different implementations

In [12]:
# Six sample rows: ids run from 1 to 6 and some scores repeat, so the
# ranking queries below have ties to deal with.
Scores = pd.DataFrame(
    dict(
        id=range(1, 7),
        score=[3.50, 3.65, 4.00, 3.85, 4.00, 3.65],
    )
)

Scores

Unnamed: 0,id,score
0,1,3.5
1,2,3.65
2,3,4.0
3,4,3.85
4,5,4.0
5,6,3.65


In [13]:
# Persist the sample frame as the "Scores" table used by the ranking queries.
toDB(da=Scores, tableName='Scores')

In [14]:
%%sql
-- Show every row and column of the Scores table.
SELECT * FROM "Scores";

 * postgresql://toofanmacpro@localhost:5432/testdb
6 rows affected.


index,id,score
0,1,3.5
1,2,3.65
2,3,4.0
3,4,3.85
4,5,4.0
5,6,3.65


In [15]:
%%sql
-- Dense rank via a correlated subquery. For each row s1 the subquery counts
-- how many distinct scores are greater than or equal to s1.score, and that
-- count is reported as the rank.
SELECT s1.score, 
       (SELECT COUNT(DISTINCT s2.score) 
        FROM "Scores" s2 
        WHERE s2.score >= s1.score) AS rank
FROM "Scores" s1
ORDER BY s1.score DESC;

 * postgresql://toofanmacpro@localhost:5432/testdb
6 rows affected.


score,rank
4.0,1
4.0,1
3.85,2
3.65,3
3.65,3
3.5,4


Explanation: 

Things to keep in mind: SQL's logical query processing order or execution order

1. FROM and JOIN
2. WHERE
3. GROUP BY
4. HAVING
5. SELECT
6. DISTINCT
7. ORDER BY
8. LIMIT/OFFSET (TOP, FETCH FIRST in SQL Server)

So,   
first: SELECT would choose each score1 from table 1  

second: it will do multiple things in this order:  

» filter the new table to have only values greater than or equal to score1   
- _(ex: >= 4 --> 4,4 = 2 rows) ( >= 3.85 --> 4,4,3.85 = 3 rows) ( >= 3.65 --> 4,4,3.85,3.65,3.65 = 5 rows)_  

» then COUNT the number of distinct scores in that table   
- _(distinct count = 1 -- ignoring 4's repetitions) (distinct count = 2 -- ignoring 4's repetitions) (distinct count = 3 -- ignoring 4's and 3.65's repetitions)_  

» use that as rank   
- _(4 is ranked 1) (3.85 is ranked 2) (3.65's are ranked 3)_  

Scores.sort_values(['score'])

In [16]:
# Order the rows from highest to lowest score. The sorted frame is rebound
# to the same name instead of using inplace=True, which gives no benefit
# and prevents method chaining.
Scores = Scores.sort_values(by = ['score'], ascending = False)

In [17]:
# Display Scores with a dense rank column (ties share a rank, no gaps).
# assign() returns a new frame, so Scores itself is left unchanged.
Scores.assign(
    rank = lambda frame: frame.score.rank(method = 'dense', ascending = False)
)

Unnamed: 0,id,score,rank
2,3,4.0,1.0
4,5,4.0,1.0
3,4,3.85,2.0
1,2,3.65,3.0
5,6,3.65,3.0
0,1,3.5,4.0


### Dense Rank in python

In [18]:
def denseRnk(s: pd.DataFrame) -> pd.DataFrame:
    """Return the ``score`` column with its dense rank, best rank first.

    Parameters
    ----------
    s : pd.DataFrame
        Frame that has a ``score`` column.

    Returns
    -------
    pd.DataFrame
        New frame with columns ``score`` and ``rank``, sorted by ``rank``.
        The highest score gets rank 1.0 and tied scores share a rank.
    """
    # Dense ranking: the highest score is 1, ties share a rank, no gaps.
    dense = s["score"].rank(method="dense", ascending=False)
    ranked = s.assign(rank=dense)
    return ranked[["score", "rank"]].sort_values(by=["rank"])

In [19]:
# Show the dense ranking of the sample scores.
denseRnk(s=Scores)

Unnamed: 0,score,rank
2,4.0,1.0
4,4.0,1.0
3,3.85,2.0
1,3.65,3.0
5,3.65,3.0
0,3.5,4.0


In [20]:
%%sql
-- Join-based dense rank. Each s1 row is paired with every s2 row whose score
-- is at least as high, and the distinct s2 scores are counted per s1.score.
-- Grouping by score alone gives one output row per distinct score.
SELECT 
    s1.score,
    COUNT(DISTINCT s2.score) AS rank
FROM "Scores" s1
LEFT JOIN "Scores" s2 
ON s1.score <= s2.score
GROUP BY s1.score
ORDER BY s1.score DESC;

 * postgresql://toofanmacpro@localhost:5432/testdb
4 rows affected.


score,rank
4.0,1
3.85,2
3.65,3
3.5,4


In [21]:
%%sql
-- Same join-based dense rank, but grouped by the stored index column as
-- well, so every original row keeps its own output row.
SELECT
    s1.index,
    s1.score,
    COUNT(DISTINCT s2.score) AS rank
FROM "Scores" s1
LEFT JOIN "Scores" s2
    ON s1.score <= s2.score
GROUP BY s1.index, s1.score
ORDER BY s1.score DESC;

 * postgresql://toofanmacpro@localhost:5432/testdb
6 rows affected.


index,score,rank
2,4.0,1
4,4.0,1
3,3.85,2
1,3.65,3
5,3.65,3
0,3.5,4


In [22]:
%%sql
-- Show every row and column of the Scores table.
SELECT * FROM "Scores";

 * postgresql://toofanmacpro@localhost:5432/testdb
6 rows affected.


index,id,score
0,1,3.5
1,2,3.65
2,3,4.0
3,4,3.85
4,5,4.0
5,6,3.65


#### Dense ranking with Joins

In [23]:
%%sql
-- Dense rank with a self join. For each s1 row, count the distinct scores
-- that are greater than or equal to its own. Grouping by id keeps one output
-- row per input row.
SELECT s1.score, COUNT(DISTINCT s2.score) AS rank
FROM "Scores" s1
LEFT JOIN "Scores" s2 ON s1.score <= s2.score
GROUP BY s1.id, s1.score
ORDER BY s1.score DESC;

 * postgresql://toofanmacpro@localhost:5432/testdb
6 rows affected.


score,rank
4.0,1
4.0,1
3.85,2
3.65,3
3.65,3
3.5,4


#### Sparse ranking with Joins

In [24]:
%%sql
-- Sparse rank with a self join. For each s1 row, count every s2 row with a
-- strictly higher score (duplicates included) and add 1. Ties share a rank
-- and the ranks after a tie are skipped.
SELECT s1.score, COUNT(s2.score)+1 AS rank
FROM "Scores" s1
LEFT JOIN "Scores" s2 ON s1.score < s2.score
GROUP BY s1.id, s1.score
ORDER BY s1.score DESC;

 * postgresql://toofanmacpro@localhost:5432/testdb
6 rows affected.


score,rank
4.0,1
4.0,1
3.85,3
3.65,4
3.65,4
3.5,6


In [25]:
%%sql
-- Sparse rank with a correlated subquery. For each s1 row the subquery counts
-- the rows with a strictly higher score and adds 1.
-- NOTE(review) the GROUP BY looks redundant if id is unique per row - confirm.
SELECT 
    s1.score,
    (
        SELECT
            COUNT(s2.score) + 1 AS rank
        FROM "Scores" AS s2
        WHERE s1.score < s2.score
    )
FROM "Scores" AS s1
GROUP BY s1.id, s1.score
ORDER BY s1.score DESC;

 * postgresql://toofanmacpro@localhost:5432/testdb
6 rows affected.


score,rank
4.0,1
4.0,1
3.85,3
3.65,4
3.65,4
3.5,6


## Largest Number - python

In [26]:
from typing import List
import itertools

class Solution:
    """Container for the "Largest Number" solution."""

    def largestNumber(self, nums: List[int]) -> str:
        """Arrange ``nums`` so that their concatenation is as large as possible.

        Parameters
        ----------
        nums : List[int]
            Integers to arrange (assumed non-negative).

        Returns
        -------
        str
            The largest concatenation; ``"0"`` when every value is zero and
            ``""`` for an empty list.
        """
        from functools import cmp_to_key

        num_strings = [str(num) for num in nums]

        def _bigger_first(left: str, right: str) -> int:
            # Negative when left+right is the larger concatenation, which
            # makes `left` sort before `right`. Works for any digit count.
            return (left + right < right + left) - (left + right > right + left)

        num_strings.sort(key=cmp_to_key(_bigger_first))

        joined = "".join(num_strings)
        # A leading "0" after sorting means every entry was 0; collapse
        # "00..." to a single "0".
        return "0" if joined.startswith("0") else joined
  

In [27]:
# Sample input for trying out the string sort key in the cells below.
a = [10,2]

In [28]:
# Convert each number to its decimal string so it can be compared as text.
strA = list(map(str, a))

strA

['10', '2']

In [29]:
# Sort in place, largest first, comparing each string repeated ten times.
strA.sort(key=lambda digits: digits * 10, reverse=True)

strA

['2', '10']

In [49]:
# Sample Weather data: one temperature reading for each of four consecutive days.
# The id values are given inline so that no module-level name shadows the
# builtin `id`.
recordDate = ['2015-01-01', '2015-01-02', '2015-01-03', '2015-01-04']
temperature = [10,25,20,30]


Weather = pd.DataFrame({'id': [1, 2, 3, 4], 'record_date': recordDate, 'temperature': temperature})

Weather

Unnamed: 0,id,record_date,temperature
0,1,2015-01-01,10
1,2,2015-01-02,25
2,3,2015-01-03,20
3,4,2015-01-04,30


In [62]:
# Convert the record_date strings to datetime values.
Weather["record_date"] = pd.to_datetime(Weather.record_date)

Weather

Unnamed: 0,id,record_date,temperature
0,1,2015-01-01,10
1,2,2015-01-02,25
2,3,2015-01-03,20
3,4,2015-01-04,30


In [63]:
# Persist the sample frame as the "Weather" table used by the queries below.
toDB(da=Weather, tableName="Weather")

In [64]:
%%sql
-- Show every row and column of the Weather table.
SELECT *
FROM "Weather";

 * postgresql://toofanmacpro@localhost:5432/testdb
4 rows affected.


index,id,record_date,temperature
0,1,2015-01-01 00:00:00,10
1,2,2015-01-02 00:00:00,25
2,3,2015-01-03 00:00:00,20
3,4,2015-01-04 00:00:00,30


In [65]:
%%sql
-- Show only the record_date column to see how the dates were stored.
SELECT record_date
FROM "Weather";

 * postgresql://toofanmacpro@localhost:5432/testdb
4 rows affected.


record_date
2015-01-01 00:00:00
2015-01-02 00:00:00
2015-01-03 00:00:00
2015-01-04 00:00:00


In [66]:
# MySQL version, kept for reference only (every line is commented out, so nothing runs).
# MySQL offers DATEDIFF(); the PostgreSQL cells that follow use INTERVAL arithmetic instead.

# %%sql
# SELECT *
# FROM "Weather" AS tdy
# JOIN "Weather" AS ydy
# ON DATEDIFF(tdy.record_date, ydy.record_date) = 1;

## INTERVAL

In [71]:
%%sql
-- Ids of days that were warmer than the day before.
-- The self join pairs each row (tdy) with the row dated exactly one day
-- earlier (ydy), and the WHERE clause keeps the pairs where it got warmer.
SELECT 
    tdy.id
FROM "Weather" AS tdy
JOIN "Weather" AS ydy
ON tdy.record_date = ydy.record_date + INTERVAL '1 day'
WHERE tdy.temperature > ydy.temperature;

 * postgresql://toofanmacpro@localhost:5432/testdb
2 rows affected.


id
2
4


## LAG()

In [78]:
%%sql
-- Same question answered with window functions instead of a self join.
-- The CTE attaches the previous row's temperature and date (in record_date
-- order) to every row.
WITH previousWeatherData AS
(
    SELECT 
        id,
        record_date,
        temperature,
        LAG(temperature, 1) OVER (ORDER BY record_date) AS prevTemp,
        LAG(record_date, 1) OVER (ORDER BY record_date) AS prevDate
    FROM
        "Weather"
)
-- Keep rows that are warmer than the previous row, and only when that
-- previous row is dated exactly one day earlier.
SELECT
    id
FROM previousWeatherData
WHERE temperature > prevTemp
AND record_date = prevDate + INTERVAL '1 day';

 * postgresql://toofanmacpro@localhost:5432/testdb
2 rows affected.


id
2
4
