In [None]:
#: imports!

import numpy as np
import babypandas as bpd

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline

# Lecture 10

## Grouping with Subgroups, Merge

## Prediction

In [None]:
# Read the Galton data, then swap the two long column names for shorter ones.
galton = bpd.read_csv('data/galton.csv')
# Step 1: copy each height column under its new, shorter label.
galton = galton.assign(
    midparent=galton.get('midparentHeight'),
    child=galton.get('childHeight'),
)
# Step 2: discard the originals so only the short labels remain.
galton = galton.drop(columns=['midparentHeight', 'childHeight'])
galton

### Can we predict the height of a child, given the midparent height?

* Given the midparent height, restrict to nearby examples in the dataset (within 0.5 inches).
* Take the average child height within these nearby examples.
* This average is our guess!

In [None]:
# Scatter of every (midparent, child) pair in the table.
galton.plot(kind='scatter', x='midparent', y='child')
# Red vertical lines at 67.5 and 68.5 bracket a midparent height of 68.
plt.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plt.plot([68.5, 68.5], [50, 85], color='red', lw=2)
# Gold marker at (68, 66.24) -- presumably the average child height between
# the red lines; confirm against predict_child(68).
plt.scatter(68, 66.24, color='gold', s=40);

### Can we predict the height of a child, given the midparent height?

In [None]:
def predict_child(mp, radius=0.5):
    '''Return a child's predicted height, given the midparent height, mp.

    The prediction is the average of the `child` column over the rows of
    `galton` whose `midparent` value lies within `radius` of `mp`
    (both endpoints included).

    mp: midparent height to predict for, in the units of the `midparent` column.
    radius: half-width of the window of "nearby" midparent heights
        (defaults to 0.5).

    Returns the mean child height inside the window. If no row falls inside
    the window the result is the mean of an empty selection (nan in pandas).
    '''
    midparents = galton.get('midparent')
    # Keep only the examples whose midparent height is close to mp.
    is_nearby = (midparents >= mp - radius) & (midparents <= mp + radius)
    return galton[is_nearby].get('child').mean()

# Try the function out on a midparent height of 68.
predict_child(68)

### Can we predict the height of a child, given the midparent height?
* Apply our function to all our examples
* Create a new column called `prediction` and plot the output 

In [None]:
# Run predict_child on every value in the `midparent` column,
# giving one prediction per row of `galton`.
predictions = galton.get('midparent').apply(predict_child)
predictions

In [None]:
# Attach the predictions to the table as a new `prediction` column.
with_predictions = galton.assign(prediction=predictions)
with_predictions

In [None]:
# Draw the actual child heights first and keep the axes...
ax = with_predictions.plot(kind='scatter', x='midparent', y='child')
# ...then overlay the predictions on those same axes (ax=ax) in a second color.
with_predictions.plot(kind='scatter', x='midparent', y='prediction', ax=ax, color='C2')

# Grouping with Subgroups

## Our familiar NBA data...

In [None]:
#: read from csv and relabel
# Load the salary table with player names as the row labels.
nba = bpd.read_csv('data/nba_salaries.csv').set_index('PLAYER')
# Copy the salary column under the simpler label SALARY...
nba = nba.assign(SALARY=nba.get("'15-'16 SALARY"))
# ...and drop the column carrying the original label.
nba = nba.drop(columns="'15-'16 SALARY")
nba

## How big is each team?

- We know how to do this: `.groupby()`.
- **Notice**: team names become the row labels.

## How much does each team pay in payroll?

- Instead of counting, we want to sum the `SALARY` column.

## How many of each position does each team have?

- We want to count...
    - but sizes of groups within groups.
- i.e., sizes of position groups within team groups.

## `.groupby()` with subgroups

- To make groups within groups (within groups, etc.)...
- Pass a list of column names to `.groupby()`:

```
table.groupby([col_1, col_2, col_3])
```
- Groups by `col_1` first.
- Within each group, groups by `col_2`.
- And so on.

## Notice the index...

- This is called a "[MultiIndex](https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html)"
- We won't worry about those...
- Use `.reset_index()` to move index back to columns.

In [None]:
# Group by TEAM first, then by POSITION within each team,
# and count the rows in each subgroup.
nba.groupby(['TEAM', 'POSITION']).count()

## Does order matter?

In [None]:
# Same grouping with the columns swapped: POSITION first, then TEAM within each position.
nba.groupby(['POSITION', 'TEAM']).count()

## Which team has the most centers?

In [None]:
# Count the players in each (TEAM, POSITION) subgroup, then move TEAM and
# POSITION out of the index and back into ordinary columns so they can be
# used in later queries.
position_counts = nba.groupby(['TEAM', 'POSITION']).count().reset_index()
position_counts

In [None]:
# select only the centers


## Example: Sea Temperatures

- The sea surface temperature in La Jolla, every day from August 22, 1916 to May 31, 2019

In [None]:
# Load the sea surface temperature data.
sea_temp = bpd.read_csv('data/sea_temp.csv')
sea_temp

## Discussion question: Which month had the highest average temperature?

What kind of grouping should we do here?

- A) `sea_temp.groupby(['SURFACE_TEMP'])`
- B) `sea_temp.groupby(['MONTH'])`
- C) `sea_temp.groupby(['YEAR', 'MONTH'])`
- D) `sea_temp.groupby(['MONTH', 'DAY'])`
- E) `sea_temp.groupby(['MONTH', 'SURFACE_TEMP'])`


In [None]:
# Build `hottest` from `sea_temp` one step at a time.

# Average every column within each (YEAR, MONTH) pair.
hottest = sea_temp.groupby(['YEAR', 'MONTH']).mean()
# Turn YEAR and MONTH back into ordinary columns.
hottest = hottest.reset_index()
# Put the largest average SURFACE_TEMP first.
hottest = hottest.sort_values('SURFACE_TEMP', ascending=False)

hottest

In [None]:
# YEAR of the first row of `hottest`, which is sorted with the largest SURFACE_TEMP first.
hottest.get('YEAR').iloc[0]

In [None]:
# MONTH of that same first row.
hottest.get('MONTH').iloc[0]

## Bonus Plot

- Yearly average surface temperature

In [None]:
# Average every column within each YEAR, then plot the yearly SURFACE_TEMP averages.
sea_temp.groupby(['YEAR']).mean().plot(y='SURFACE_TEMP') #why not group by ['YEAR', 'MONTH']?

## Summary: `.groupby`

- Pass a list of columns to make subgroups.
- *Always* use `.reset_index()` afterwards to move the index back into columns.

# Merge

Combining columns from two different tables

## Example

In [None]:
# Build the table column by column; position i of each list forms row i,
# i.e. one product sold at one location together with its price.
products = bpd.DataFrame().assign(
    Location=['Cups', 'Cups', 'Cups', 'Art of Espresso', 'Art of Espresso', 'Perks', 'Perks'],
    Product=['Green Tea', 'Latte', 'Drip Coffee', 'Espresso', 'Latte', 'Drip Coffee', 'Green Tea'],
    Price=[1.25, 2.50, 1.00, 2.00, 3.00, 1.25, 1.50]
)
products

## Example

In [None]:
# One discount per location; note that there is no row for 'Perks'.
coupons = bpd.DataFrame().assign(
    Location=['Cups', 'Art of Espresso'],
    Discount=[.25, .10]
)
coupons

## How do we calculate discounted price of each product?

- Idea: "cross-reference" tables.
- I.e., for each row in `products`, find discount in `coupons` for that row's `Location`.
- This is what `.merge()` does:

In [None]:
# Match each row of `products` with the `coupons` row that has the same `Location`.
with_discounts = products.merge(coupons, left_on='Location', right_on='Location')
with_discounts

In [None]:
# Scale each price by the fraction that remains after the discount
# (1 - Discount) and show the result as a new DiscountedPrice column.
with_discounts.assign(
    DiscountedPrice=(1 - with_discounts.get('Discount')) * with_discounts.get('Price')
)

## Merging

- Pick a "left" table and a "right" table.
- Choose a column from each to "merge on".

<img src="data/merge.png" />

## `.merge()` method

```python
left_table.merge(
    right_table, 
    left_on=left_column_name,
    right_on=right_column_name
)
```
- `left_on` and `right_on` should be column names (don't have to be the same)
- One row for every match
- Rows that don't have a match in the other table are left out of the result!

## What if column names don't match?

In [None]:
# Same data as `coupons`, but with the location column labelled `Cafe`.
# First copy `Location` under the new label...
cafes = coupons.assign(Cafe=coupons.get('Location'))
# ...then remove the column with the old label.
cafes = cafes.drop(columns='Location')
cafes

In [None]:
# The columns to match on have different names: `Location` in `products`, `Cafe` in `cafes`.
products.merge(cafes, left_on='Location', right_on='Cafe')

## Does order matter?

In [None]:
# Same merge with the left and right tables swapped.
cafes.merge(products, left_on="Cafe", right_on='Location')

## What if we want to "merge on" an index?

- Instead of using `left_on` or `right_on`, use `left_index=True` or `right_index=True`

In [None]:
# Move `Location` from an ordinary column into the index.
coupons_by_location = coupons.set_index('Location')
coupons_by_location

In [None]:
# `Location` is a column in `products` but the index of `coupons_by_location`,
# so match the left table's column against the right table's index.
products.merge(
    coupons_by_location, 
    left_on='Location', 
    right_index=True
)