# From SQL to pandas challenge 11

In [2]:
# import libraries
import pandas as pd

# load data
# This code is made to load our data stored on Google Drive
def gd_path(file_id):
    """Build the direct-download URL for the Google Drive file with this id."""
    url_template = "https://drive.google.com/uc?export=download&id={}"
    return url_template.format(file_id)

# Google Drive file ids, keyed by the name of the table each file holds
files_id = dict(
    titleauthor="1F1JOiYXStWacOBca6coNVfyVtoST7ZgD",
    titles="1PLdn50N9GRa53ZbuVWo0l47F_IXdvlEm",
    sales="1fzFc9rwYmVIPaGOFmhLVxCi3kg19vNU2",
    roysched="1zPRZPoFPEMKyrNR5VSENeYFHGCBZmxbs",
    publishers="1s9E8_AVOziTrowb3wyh2jg3PV763VOyq",
    employee="1h9mUjsVqpP74b1w0x7KOw37n_n9Ulkt5",
    authors="1fEF89Nhe61EebAljKlwFwfEuokK0o6aJ",
)

# Download every semicolon-separated CSV from Google Drive, one DataFrame per table
sales, titles, publishers, employee, authors, titleauthor, roysched = (
    pd.read_csv(gd_path(files_id[table_name]), sep=";")
    for table_name in (
        "sales",
        "titles",
        "publishers",
        "employee",
        "authors",
        "titleauthor",
        "roysched",
    )
)

## 1. Using LEFT JOIN: in which city is the publisher of "Is Anger the Enemy?" based?

In [3]:
# Keep only the requested title, left-join its publisher row, and read the city.
(
    titles
    .loc[titles['title'] == 'Is Anger the Enemy?']
    .join(publishers.set_index('pub_id'), on='pub_id', how='left')
    ['city']
)

11    Boston
Name: city, dtype: object

### Hint:

In SQL the syntax is:

```sql
SELECT p.city
FROM publishers AS p
LEFT JOIN titles AS t
ON p.pub_id = t.pub_id
WHERE t.title = 'Is Anger the Enemy?';
```

## 2. Select all the book titles that have a link to the employee Howard Snyder
(he works for the publisher that has published those books).

In [4]:
# Filter to the one employee, then attach every title published by his publisher.
# how='inner' mirrors the plain SQL JOIN: DataFrame.join defaults to a LEFT join,
# which would emit a NaN title if the employee's publisher had published no books.
(
    employee[(employee.fname == 'Howard') & (employee.lname == 'Snyder')]
    .join(titles.set_index('pub_id'), on='pub_id', how='inner')
    ['title']
)

13                  You Can Combat Computer Stress!
13                              Is Anger the Enemy?
13                                Life Without Fear
13    Prolonged Data Deprivation: Four Case Studies
13              Emotional Security: A New Algorithm
Name: title, dtype: object

### Hint:

In SQL the syntax is:

```sql
SELECT t.title
FROM employee e
JOIN titles t
ON e.pub_id = t.pub_id
WHERE e.fname = 'Howard'
AND e.lname = 'Snyder';
```

## 3. Using the `merge` of your choice: Select the book title with the highest number of sales (qty)

In [5]:
# Total the quantity sold per title, look up each title's name,
# and keep the single row with the largest total.
(
    sales
    .groupby('title_id')['qty']
    .sum()
    .to_frame('sales')
    .join(titles.set_index('title_id'), on='title_id')
    .sort_values(by='sales', ascending=False)
    .loc[:, ['title', 'sales']]
    .iloc[:1]
)

Unnamed: 0_level_0,title,sales
title_id,Unnamed: 1_level_1,Unnamed: 2_level_1
PS2091,Is Anger the Enemy?,108


### Hint:

In SQL the syntax is:

```sql
SELECT t.title, SUM(qty)
FROM sales AS s
JOIN titles t
ON s.title_id = t.title_id
GROUP BY t.title_id
ORDER BY SUM(qty) desc
LIMIT 1;
```

## 4. Select all book titles and the full name of their author(s).

- If a book has multiple authors, all authors must be displayed (in
  multiple rows).

- Books with no authors and authors with no books should not be displayed.

In [6]:
# titles -> titleauthor -> authors; the inner join on au_id keeps only
# rows whose au_id matches an author.
(
    titles
    .join(titleauthor.set_index('title_id'), on='title_id')
    .join(authors.set_index('au_id'), how='inner', on='au_id')
    .sort_values(by=['title', 'au_ord'])
    .loc[:, ['title', 'au_fname', 'au_lname']]
)

Unnamed: 0,title,au_fname,au_lname
7,But Is It User Friendly?,Cheryl,Carson
10,Computer Phobic AND Non-Phobic Individuals: Be...,Livia,Karsen
10,Computer Phobic AND Non-Phobic Individuals: Be...,Stearns,MacFeather
1,Cooking with Computers: Surreptitious Balance ...,Stearns,MacFeather
1,Cooking with Computers: Surreptitious Balance ...,Michael,O'Leary
14,Emotional Security: A New Algorithm,Charlene,Locksley
16,Fifty Years in Buckingham Palace Kitchens,Reginald,Blotchet-Halls
11,Is Anger the Enemy?,Albert,Ringer
11,Is Anger the Enemy?,Anne,Ringer
12,Life Without Fear,Albert,Ringer


### Hint:

In SQL the syntax is:

```sql
SELECT
    t.title,
    a.au_fname,
    a.au_lname
FROM titles t
INNER JOIN titleauthor ta
ON t.title_id = ta.title_id
INNER JOIN authors a
ON ta.au_id = a.au_id;
```

## 5. Select the full name of authors of Psychology books

   Bonus hint: if you want to prevent duplicates but allow authors with shared
   last names to be displayed, you can concatenate the first and last names
   with CONCAT(), and use the DISTINCT clause on the concatenated names.

In [7]:
# Variant 1: de-duplicate (first name, last name) pairs through the groupby keys.
(
    authors
    .join(titleauthor.set_index('au_id'), how='inner', on='au_id')
    .join(titles.set_index('title_id'), how='inner', on='title_id')
    .loc[lambda joined: joined['type'] == 'psychology', ['au_fname', 'au_lname']]
    .groupby(by=['au_fname', 'au_lname'])
    .groups
    .keys()
)

dict_keys([('Albert', 'Ringer'), ('Anne', 'Ringer'), ('Charlene', 'Locksley'), ('Johnson', 'White'), ('Livia', 'Karsen'), ('Stearns', 'MacFeather')])

In [8]:
# Variant 2: concatenate first and last name, then de-duplicate the full names.
(
    authors
    .join(titleauthor.set_index('au_id'), how='inner', on='au_id')
    .join(titles.set_index('title_id'), how='inner', on='title_id')
    .loc[lambda joined: joined['type'] == 'psychology']
    .assign(full_name=lambda psych: psych['au_fname'] + ' ' + psych['au_lname'])
    ['full_name']
    .unique()
)

array(['Johnson White', 'Charlene Locksley', 'Stearns MacFeather',
       'Livia Karsen', 'Anne Ringer', 'Albert Ringer'], dtype=object)

In [9]:
# Variant 3: collect the au_ids linked to psychology titles, then filter authors by them.
(
    authors
    .loc[
        authors['au_id'].isin(
            titles
            .loc[titles['type'] == 'psychology']
            .join(titleauthor.set_index('title_id'), how='inner', on='title_id')
            ['au_id']
            .unique()
        ),
        ['au_fname', 'au_lname'],
    ]
)

Unnamed: 0,au_fname,au_lname
0,Johnson,White
9,Charlene,Locksley
16,Stearns,MacFeather
17,Livia,Karsen
21,Anne,Ringer
22,Albert,Ringer


### Hint:

In SQL the syntax is:

```sql
SELECT DISTINCT CONCAT(a.au_fname, " ", a.au_lname) AS full_name
FROM authors a
INNER JOIN titleauthor ta ON a.au_id = ta.au_id
INNER JOIN titles t ON ta.title_id = t.title_id
WHERE t.type = "psychology";
```

## 6. Explore the table roysched and try to grasp the meaning of each column.
   The notes below will help:
   
   - "Royalty" means the percentage of the sale price paid to the author(s).
   
   - Sometimes, the royalty may be smaller for the first few sales (which have
     to cover the publishing costs to the publisher) but higher for the sales
     above a certain threshold.
     
   - In the "roysched" table each title_id can appear multiple times, with
     different royalty values for each range of sales.
     
   - Select all rows for a particular title_id, for example "BU1111", and explore
     the data.

In [10]:
# Every royalty tier recorded for title BU1111
roysched.loc[roysched['title_id'] == 'BU1111']

Unnamed: 0,title_id,lorange,hirange,royalty
49,BU1111,0,4000,10
50,BU1111,4001,8000,12
51,BU1111,8001,10000,14
52,BU1111,12001,16000,16
53,BU1111,16001,20000,18
54,BU1111,20001,24000,20
55,BU1111,24001,28000,22
56,BU1111,28001,50000,24


### Hint:

In SQL the syntax is:

```sql
SELECT * FROM roysched WHERE title_id = "BU1111";
```

## 7. Select all the book titles and the maximum royalty they can reach.
    Display only titles that are present in the roysched table.

In [11]:
# Print roysched's column names, non-null counts and dtypes before aggregating royalties.
roysched.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86 entries, 0 to 85
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title_id  86 non-null     object
 1   lorange   86 non-null     int64 
 2   hirange   86 non-null     int64 
 3   royalty   86 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 2.8+ KB


In [12]:
# Inner join keeps only titles that appear in roysched; then take the highest
# royalty per title_id, carrying the title text along.
(
    titles
    .loc[:, ['title_id', 'title']]
    .join(roysched.set_index('title_id'), how='inner', on='title_id')
    .groupby(by='title_id')
    .agg(title=('title', 'first'), max_royalty=('royalty', 'max'))
    .sort_values(by='max_royalty', ascending=False)
)

Unnamed: 0_level_0,title,max_royalty
title_id,Unnamed: 1_level_1,Unnamed: 2_level_1
BU1111,Cooking with Computers: Surreptitious Balance ...,24
BU2075,You Can Combat Computer Stress!,24
BU7832,Straight Talk About Computers,24
MC3021,The Gourmet Microwave,24
TC3218,"Onions, Leeks, and Garlic: Cooking Secrets of ...",24
TC4203,Fifty Years in Buckingham Palace Kitchens,22
MC2222,Silicon Valley Gastronomic Treats,20
PC1035,But Is It User Friendly?,18
PS1372,Computer Phobic AND Non-Phobic Individuals: Be...,18
PC8888,Secrets of Silicon Valley,16


### Hint:

In SQL the syntax is:

```sql
SELECT t.title, MAX(r.royalty) max_royalty
FROM titles t
INNER JOIN roysched r
ON t.title_id = r.title_id
GROUP BY t.title
ORDER BY max_royalty DESC;
```