In [None]:
"""Find the employees with the second highest salary"""
# There might be multiple employees with the same salary

In [None]:
"""SQL table creation and data insertion"""
drop table if exists  `salaries` ; 
CREATE TABLE `salaries` (
`emp_no` int(11) NOT NULL,
`salary` int(11) NOT NULL,
`from_date` date NOT NULL,
`to_date` date NOT NULL,
PRIMARY KEY (`emp_no`,`from_date`));

INSERT INTO salaries VALUES(10001,88958,'2002-06-22','9999-01-01');
INSERT INTO salaries VALUES(10002,72527,'2001-08-02','9999-01-01');
INSERT INTO salaries VALUES(10003,43311,'2001-12-01','9999-01-01');

In [None]:
"""SQL solution"""
# 1
SELECT emp_no, salary
FROM salaries 
WHERE salary = 
(SELECT DISTINCT salary 
FROM salaries
ORDER BY salary DESC
LIMIT 1
OFFSET 1)

# 2
SELECT emp_no, salary
FROM salaries
WHERE salary = 
(SELECT salary 
FROM salaries
GROUP BY salary
ORDER BY salary DESC
LIMIT 1, 1)

# 3
SELECT emp_no, salary
FROM 
(SELECT emp_no, salary, DENSE_RANK() OVER(ORDER BY salary DESC) AS r 
FROM salaries) AS rk
WHERE r = 2
"""
Comparison of window function
suppose there are 2 employees with the highest salary, 3 employees with the second highest saalry
ROW_NUMER() returns 12345 (continuous + no repeated number)
RANK() returns 11333 (not continuous, + repeated number)
DENSE_RANK returns 11222 (continuous + repeated number) --> most appropriate 
"""

In [1]:
"""Pandas dataframe creation"""
import numpy as np
import pandas as pd

ts = np.array([[10001,88958,'2002-06-22','9999-01-01'],
               [10002,72527,'2001-08-02','9999-01-01'],
               [10003,43311,'2001-12-01','9999-01-01'],
              ])
salaries = pd.DataFrame(data=ts, columns=['emp_no', 'salary', 'from_date', 'to_date'])
salaries.head()

Unnamed: 0,emp_no,salary,from_date,to_date
0,10001,88958,2002-06-22,9999-01-01
1,10002,72527,2001-08-02,9999-01-01
2,10003,43311,2001-12-01,9999-01-01


In [5]:
df = salaries[['emp_no', 'salary']].sort_values('salary', ascending=False)
df

Unnamed: 0,emp_no,salary
0,10001,88958
1,10002,72527
2,10003,43311


In [25]:
df['salary_rank'] = 1
for i in range(len(df)):
   df.iloc[i, 2] = df.iloc[i-1, 2] + (df.iloc[i, 1] < df.iloc[i-1, 1])
df

Unnamed: 0,emp_no,salary,salary_rank
0,10001,88958,1
1,10002,72527,2
2,10003,43311,3


In [28]:
df.loc[df.salary_rank == 2, ['emp_no', 'salary']]

Unnamed: 0,emp_no,salary
1,10002,72527


In [35]:
df2 = salaries[['emp_no', 'salary']]
df2['salary_rank'] = df2['salary'].rank(ascending=False)
df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['salary_rank'] = df2['salary'].rank(ascending=False)


Unnamed: 0,emp_no,salary,salary_rank
0,10001,88958,1.0
1,10002,72527,2.0
2,10003,43311,3.0


In [37]:
df2.loc[df2.salary_rank == 2, ['emp_no', 'salary']]

Unnamed: 0,emp_no,salary
1,10002,72527


In [42]:
# data with duplicates
ts = np.array([[10001,88958,'2002-06-22','9999-01-01'],
               [10002,72527,'2001-08-02','9999-01-01'],
               [10003,43311,'2001-12-01','9999-01-01'],
               [10004,88958,'2001-12-01','9999-01-01'],
               [10005,72527,'2001-12-01','9999-01-01'],
               [10006,72527,'2001-12-01','9999-01-01']])
salaries = pd.DataFrame(data=ts, columns=['emp_no', 'salary', 'from_date', 'to_date'])
salaries.head()

Unnamed: 0,emp_no,salary,from_date,to_date
0,10001,88958,2002-06-22,9999-01-01
1,10002,72527,2001-08-02,9999-01-01
2,10003,43311,2001-12-01,9999-01-01
3,10004,88958,2001-12-01,9999-01-01
4,10005,72527,2001-12-01,9999-01-01


In [43]:
df3 = salaries[['emp_no', 'salary']].sort_values('salary', ascending=False)
df3['salary_rank'] = 1
for i in range(len(df3)):
   df3.iloc[i, 2] = df3.iloc[i-1, 2] + (df3.iloc[i, 1] < df3.iloc[i-1, 1])
df3

Unnamed: 0,emp_no,salary,salary_rank
0,10001,88958,1
3,10004,88958,1
1,10002,72527,2
4,10005,72527,2
5,10006,72527,2
2,10003,43311,3


In [50]:
df4 = salaries[['emp_no','salary']]
df4['salary'] = df4['salary'].astype('int')
# method = 'dense': works like dense_rank() in SQL
# with continuous and repeated rank
df4['salary_rank'] = df4['salary'].rank(method='dense', ascending=False)
df4.head(6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['salary'] = df4['salary'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['salary_rank'] = df4['salary'].rank(method='dense', ascending=False)


Unnamed: 0,emp_no,salary,salary_rank
0,10001,88958,1.0
1,10002,72527,2.0
2,10003,43311,3.0
3,10004,88958,1.0
4,10005,72527,2.0
5,10006,72527,2.0


In [51]:
# default method = 'average', returns average rank for duplicates
df4['salary_rank'] = df4['salary'].rank(method='average', ascending=False)
df4.head(6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['salary_rank'] = df4['salary'].rank(method='average', ascending=False)


Unnamed: 0,emp_no,salary,salary_rank
0,10001,88958,1.5
1,10002,72527,4.0
2,10003,43311,6.0
3,10004,88958,1.5
4,10005,72527,4.0
5,10006,72527,4.0


In [53]:
# method = 'min': works like rank() in SQL, 
# returns lowest rank in the group, similarly there is an option 'max'
# with repeated rank but not continuous
df4['salary_rank'] = df4['salary'].rank(method='min', ascending=False)
df4.head(6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['salary_rank'] = df4['salary'].rank(method='min', ascending=False)


Unnamed: 0,emp_no,salary,salary_rank
0,10001,88958,1.0
1,10002,72527,3.0
2,10003,43311,6.0
3,10004,88958,1.0
4,10005,72527,3.0
5,10006,72527,3.0


In [54]:
# method = 'first': rank by the value used for comparison and the row number, no repeated rank
df4['salary_rank'] = df4['salary'].rank(method='first', ascending=False)
df4.head(6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['salary_rank'] = df4['salary'].rank(method='first', ascending=False)


Unnamed: 0,emp_no,salary,salary_rank
0,10001,88958,1.0
1,10002,72527,3.0
2,10003,43311,6.0
3,10004,88958,2.0
4,10005,72527,4.0
5,10006,72527,5.0
