In [None]:
"""Find out average salaries for employees with different titles, order by average salary ascending"""

In [None]:
"""SQL table creation and data insertion"""
drop table if exists  `salaries` ; 
drop table if exists  titles;
CREATE TABLE `salaries` (
`emp_no` int(11) NOT NULL,
`salary` int(11) NOT NULL,
`from_date` date NOT NULL,
`to_date` date NOT NULL,
PRIMARY KEY (`emp_no`,`from_date`));

CREATE TABLE titles (
`emp_no` int(11) NOT NULL,
`title` varchar(50) NOT NULL,
`from_date` date NOT NULL,
`to_date` date DEFAULT NULL);

INSERT INTO salaries VALUES(10001,88958,'1986-06-26','9999-01-01');
INSERT INTO salaries VALUES(10003,43311,'2001-12-01','9999-01-01');
INSERT INTO salaries VALUES(10004,74057,'1995-12-01','9999-01-01');
INSERT INTO salaries VALUES(10006,43311,'2001-08-02','9999-01-01');
INSERT INTO salaries VALUES(10007,88070,'2002-02-07','9999-01-01');

INSERT INTO titles VALUES(10001,'Senior Engineer','1986-06-26','9999-01-01');
INSERT INTO titles VALUES(10003,'Senior Engineer','2001-12-01','9999-01-01');
INSERT INTO titles VALUES(10004,'Senior Engineer','1995-12-01','9999-01-01');
INSERT INTO titles VALUES(10006,'Senior Engineer','2001-08-02','9999-01-01');
INSERT INTO titles VALUES(10007,'Senior Staff','1996-02-11','9999-01-01');

In [None]:
"""SQL solution"""
SELECT a.title, AVG(a.salary) as avgs
FROM
(SELECT t.emp_no, t.title, s.salary
FROM titles AS t
INNER JOIN salaries AS s
ON t.emp_no = s.emp_no) AS a
GROUP BY a.title
ORDER BY avgs

In [1]:
"""Pandas dataframe creation"""
import pandas as pd
import numpy as np

ts = np.array([[10001,88958,'1986-06-26','9999-01-01'],
               [10003,43311,'2001-12-01','9999-01-01'],
               [10004,74057,'1995-12-01','9999-01-01'],
               [10006,43311,'2001-08-02','9999-01-01'],
               [10007,88070,'2002-02-07','9999-01-01']])
tt = np.array([[10001,'Senior Engineer','1986-06-26','9999-01-01'],
               [10003,'Senior Engineer','2001-12-01','9999-01-01'],
               [10004,'Senior Engineer','1995-12-01','9999-01-01'],
               [10006,'Senior Engineer','2001-08-02','9999-01-01'],
               [10007,'Senior Staff','1996-02-11','9999-01-01']])

salaries = pd.DataFrame(data=ts, columns=['emp_no', 'salary', 'from_date', 'hire_date'])
titles = pd.DataFrame(data=tt, columns=['emp_no', 'title', 'from_date', 'to_date'])
salaries.head()

Unnamed: 0,emp_no,salary,from_date,hire_date
0,10001,88958,1986-06-26,9999-01-01
1,10003,43311,2001-12-01,9999-01-01
2,10004,74057,1995-12-01,9999-01-01
3,10006,43311,2001-08-02,9999-01-01
4,10007,88070,2002-02-07,9999-01-01


In [2]:
titles.head()

Unnamed: 0,emp_no,title,from_date,to_date
0,10001,Senior Engineer,1986-06-26,9999-01-01
1,10003,Senior Engineer,2001-12-01,9999-01-01
2,10004,Senior Engineer,1995-12-01,9999-01-01
3,10006,Senior Engineer,2001-08-02,9999-01-01
4,10007,Senior Staff,1996-02-11,9999-01-01


In [18]:
# merge the two tables
df1 = pd.merge(salaries, titles, on='emp_no', how='inner')
df1

Unnamed: 0,emp_no,salary,from_date_x,hire_date,title,from_date_y,to_date
0,10001,88958,1986-06-26,9999-01-01,Senior Engineer,1986-06-26,9999-01-01
1,10003,43311,2001-12-01,9999-01-01,Senior Engineer,2001-12-01,9999-01-01
2,10004,74057,1995-12-01,9999-01-01,Senior Engineer,1995-12-01,9999-01-01
3,10006,43311,2001-08-02,9999-01-01,Senior Engineer,2001-08-02,9999-01-01
4,10007,88070,2002-02-07,9999-01-01,Senior Staff,1996-02-11,9999-01-01


In [20]:
df1.dtypes

emp_no         object
salary         object
from_date_x    object
hire_date      object
title          object
from_date_y    object
to_date        object
dtype: object

In [22]:
# convert salary to numeric type
df1['salary'] = df1['salary'].astype('int')
df1.dtypes

emp_no         object
salary          int64
from_date_x    object
hire_date      object
title          object
from_date_y    object
to_date        object
dtype: object

In [23]:
# merge only the columns needed
df2 = pd.merge(salaries[['emp_no', 'salary']], titles[['emp_no', 'title']], on='emp_no', how='inner')
df2

Unnamed: 0,emp_no,salary,title
0,10001,88958,Senior Engineer
1,10003,43311,Senior Engineer
2,10004,74057,Senior Engineer
3,10006,43311,Senior Engineer
4,10007,88070,Senior Staff


In [24]:
df2['salary'] = df1['salary'].astype('int')
df2.dtypes

emp_no    object
salary     int64
title     object
dtype: object

In [31]:
# group and aggregate + sort
# returns a series, not a dataframe
df1.groupby('title')['salary'].mean().sort_values()

title
Senior Engineer    62409.25
Senior Staff       88070.00
Name: salary, dtype: float64

In [33]:
df2.groupby('title')['salary'].mean().sort_values(ascending=False)

title
Senior Staff       88070.00
Senior Engineer    62409.25
Name: salary, dtype: float64