# From SQL to pandas challenge 10

In [1]:
# import libraries
import pandas as pd
import numpy as np

# load data
# This code is made to load our data stored on Google Drive
def gd_path(file_id):
    """Build the direct-download URL for a file stored on Google Drive.

    Parameters
    ----------
    file_id : str
        The Google Drive file id to embed in the URL's ``id`` query parameter.

    Returns
    -------
    str
        A ``https://drive.google.com/uc`` URL with ``export=download`` set
        and ``id`` equal to ``file_id``.
    """
    endpoint = "https://drive.google.com/uc"
    return "{}?export=download&id={}".format(endpoint, file_id)

# Google Drive file ids, keyed by table name
files_id = {
    "titles": "1PLdn50N9GRa53ZbuVWo0l47F_IXdvlEm",
    "sales": "1fzFc9rwYmVIPaGOFmhLVxCi3kg19vNU2",
}

# Download both semicolon-separated files from Google Drive (sales first,
# then titles) and bind each resulting DataFrame to its table name.
sales, titles = (
    pd.read_csv(gd_path(files_id[table]), sep=";")
    for table in ("sales", "titles")
)

## 1. Select everything from the sales table and create a new column called "sales_category" to categorise qty:
   
		qty >= 50 high sales
		20 <= qty < 50 medium sales
		qty < 20 low sales

In [4]:
# pandas equivalent of SQL's CASE WHEN: conditions are checked in order and
# the first one that holds decides the label; rows matching none get 'low'.
sales.assign(
    sales_category=lambda frame: np.select(
        [frame.qty >= 50, frame.qty >= 20],
        ['high', 'medium'],
        default='low',
    )
)

Unnamed: 0,stor_id,ord_num,ord_date,qty,payterms,title_id,sales_category
0,6380,6871,1994-09-14 00:00:00,5,Net 60,BU1032,low
1,6380,722a,1994-09-13 00:00:00,3,Net 60,PS2091,low
2,7066,A2976,1993-05-24 00:00:00,50,Net 30,PC8888,high
3,7066,QA7442.3,1994-09-13 00:00:00,75,ON invoice,PS2091,high
4,7067,D4482,1994-09-14 00:00:00,10,Net 60,PS2091,low
5,7067,P2121,1992-06-15 00:00:00,40,Net 30,TC3218,medium
6,7067,P2121,1992-06-15 00:00:00,20,Net 30,TC4203,medium
7,7067,P2121,1992-06-15 00:00:00,20,Net 30,TC7777,medium
8,7131,N914008,1994-09-14 00:00:00,20,Net 30,PS2091,medium
9,7131,N914014,1994-09-14 00:00:00,25,Net 30,MC3021,medium


### Hint:

In SQL the syntax is:

```sql
SELECT *,
CASE
    WHEN qty >= 50 THEN 'high sales'
    WHEN qty >= 20 THEN 'medium sales'
    ELSE 'low sales'
END AS sales_category
FROM sales;
```

## 2. Building on your answer to the previous question, find the total number of books sold (sum of qty) in each sales category
    i.e. how many books were sold in the high, medium, and low sales categories

In [12]:
(
    sales
    # Bucket each sale by qty; first matching threshold wins, else 'low'.
    .assign(
        sales_category=lambda frame: np.select(
            [frame.qty >= 50, frame.qty >= 20],
            ['high', 'medium'],
            default='low',
        )
    )
    # SUM(qty) ... GROUP BY sales_category
    .groupby('sales_category')['qty']
    .sum()
    .rename('total_sales')
    # Present the buckets from smallest to largest rather than alphabetically.
    .loc[['low', 'medium', 'high']]
    .reset_index()
)

Unnamed: 0,sales_category,total_sales
0,low,83
1,medium,285
2,high,125


### Hint:

In SQL the syntax is:

```sql
SELECT sum(qty),
CASE
	WHEN qty>=50 THEN 'high sales'
    WHEN (qty>=20 AND qty<50) THEN 'medium sales'
    ELSE 'low sales'
END AS sales_category
FROM sales
GROUP BY sales_category;
```

## 3. Building on your answers to the previous questions: output only those sales categories whose SUM(qty) is greater than 100, ordered by SUM(qty) in descending order

In [17]:
(
    sales
    # CASE WHEN equivalent: first matching threshold wins, else 'low'.
    .assign(
        sales_category=lambda frame: np.select(
            [frame.qty >= 50, frame.qty >= 20],
            ['high', 'medium'],
            default='low',
        )
    )
    .groupby('sales_category')
    .agg(total_sales=('qty', 'sum'))
    # HAVING SUM(qty) > 100 -- strictly greater, so a category totalling
    # exactly 100 is excluded, as the task statement requires.
    .loc[lambda totals: totals.total_sales > 100]
    # ORDER BY SUM(qty) DESC
    .sort_values(by='total_sales', ascending=False)
)

Unnamed: 0_level_0,total_sales
sales_category,Unnamed: 1_level_1
medium,285
high,125


### Hint:

In SQL the syntax is:

```sql
SELECT sum(qty),
CASE
    WHEN qty>=50 THEN 'high sales'
    WHEN (qty>=20 AND qty<50) THEN 'medium sales'
    ELSE 'low sales'
END AS sales_category
FROM sales
GROUP BY sales_category
HAVING sum(qty)>100
ORDER BY sum(qty) DESC;
```

## 4. Find out the average book price, per publisher, for the following book types and price categories:
		book types: business, traditional cook and psychology
		price categories: <= 5 super low, <= 10 low, <= 15 medium, > 15 high
        
        - When displaying the average prices, use ROUND() to hide decimals.

In [31]:
(
    titles
    # Keep only the three book types of interest.
    .loc[lambda books: books['type'].isin(['business', 'trad_cook', 'psychology'])]
    # Derive the category from the rows flowing through the chain (not from
    # the unfiltered outer `titles`), so the result never depends on index
    # alignment. Conditions are checked in order; a price matching none of
    # them (including a missing price) lands in 'high', like the SQL ELSE.
    .assign(
        price_category=lambda books: np.select(
            [books.price <= 5, books.price <= 10, books.price <= 15],
            ['super low', 'low', 'medium'],
            default='high',
        )
    )
    .groupby(['pub_id', 'type', 'price_category'])
    .agg(average_price=('price', 'mean'))
    # ROUND(AVG(price)) shown without decimals.
    .assign(
        average_price=lambda prices: prices.average_price.round().astype('int')
    )
)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,average_price
pub_id,type,price_category,Unnamed: 3_level_1
736,business,super low,3
736,psychology,high,20
736,psychology,low,7
736,psychology,medium,11
877,psychology,high,22
877,trad_cook,high,21
877,trad_cook,medium,13
1389,business,high,20
1389,business,medium,12


### Hint:

In SQL the syntax is:

```sql
SELECT
    ROUND(AVG(price)),
    type,
    pub_id,
CASE
    WHEN price <= 5 THEN 'super low'
    WHEN (price > 5 AND price <= 10) THEN 'low'
    WHEN (price > 10 AND price <= 15) THEN 'medium'
    ELSE 'high'
END AS price_category
FROM titles
GROUP BY
    pub_id,
    type,
    price_category
HAVING
    type IN ('business', 'trad_cook', 'psychology');
```