<a href="https://colab.research.google.com/github/swilsonmfc/pandas/blob/main/3_Lesson.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lesson 3
* Indexes
* Joins
* Concatenations

# Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn

# Data

# Indexes

In [None]:
# An index holds the labels along an axis: row labels (df.index) and
# column labels (df.columns). No index is given here, so rows are labelled 0..n-1.
df1 = pd.DataFrame(
    data=[
        ('Walter', 'White'),
        ('Hank',   'Schrader'),
        ('Jesse',  'Pinkman'),
        ('Saul',   'Goodman'),
    ],
    columns=['Name', 'Surname'],
)
df1

Unnamed: 0,Name,Surname
0,Walter,White
1,Hank,Schrader
2,Jesse,Pinkman
3,Saul,Goodman


## Set

In [None]:
# Promote the 'Name' column to the row index.
# set_index returns a new frame; the result is not assigned back, so df1
# itself keeps its default 0..n-1 index.
df1.set_index('Name')

Unnamed: 0_level_0,Surname
Name,Unnamed: 1_level_1
Walter,White
Hank,Schrader
Jesse,Pinkman
Saul,Goodman


In [None]:
# Passing a list of columns builds a multi-level index (MultiIndex),
# here with levels 'Name' and 'Surname'; no data columns remain.
df1.set_index(['Name', 'Surname'])

Name,Surname
Walter,White
Hank,Schrader
Jesse,Pinkman
Saul,Goodman


In [None]:
# NOTE(review): same call and same output as the preceding multi-index cell;
# presumably left over from editing -- confirm whether another example was intended.
df1.set_index(['Name', 'Surname'])

Name,Surname
Walter,White
Hank,Schrader
Jesse,Pinkman
Saul,Goodman


## Reset

In [None]:
# reset_index() replaces the current index with a fresh 0..n-1 index.
# Note: by default the old index is not discarded; it is inserted as a new
# column (named 'index' here because the old index had no name).
df1.reset_index()

Unnamed: 0,index,Name,Surname
0,0,Walter,White
1,1,Hank,Schrader
2,2,Jesse,Pinkman
3,3,Saul,Goodman


In [None]:
# drop=True discards the old index instead of inserting it as a column
df1.reset_index(drop=True)

Unnamed: 0,Name,Surname
0,Walter,White
1,Hank,Schrader
2,Jesse,Pinkman
3,Saul,Goodman


In [None]:
# col_fill only matters when the columns are a MultiIndex (it names the other
# column levels for the inserted index column). df1 has flat columns, so the
# result is the same as plain reset_index().
# NOTE(review): confirm this is the example intended for col_fill.
df1.reset_index(col_fill='Name')

Unnamed: 0,index,Name,Surname
0,0,Walter,White
1,1,Hank,Schrader
2,2,Jesse,Pinkman
3,3,Saul,Goodman


# Join
* Construct two data frames
* Join & Concat

In [None]:
# Left frame for the join example: first names only, default 0..3 index
df1 = pd.DataFrame({'Name': ['Walter', 'Hank', 'Jesse', 'Saul']})
df1

Unnamed: 0,Name
0,Walter
1,Hank
2,Jesse
3,Saul


In [None]:
# Right frame for the join example: surnames only, default 0..3 index
df2 = pd.DataFrame({'Surname': ['White', 'Schrader', 'Pinkman', 'Goodman']})
df2

Unnamed: 0,Surname
0,White
1,Schrader
2,Pinkman
3,Goodman


## Join 
* Match up rows and put frames together

In [None]:
# DataFrame.join attaches the columns of another frame to this one.
# Rows are matched on index labels (0..3 in both frames here), not on
# column values.
df1.join(df2)

Unnamed: 0,Name,Surname
0,Walter,White
1,Hank,Schrader
2,Jesse,Pinkman
3,Saul,Goodman


## Index vs Column

In [None]:
# People, each with a Job that will serve as the join key
df1 = pd.DataFrame(
    data=[
        ('Walter', 'White',    'Entrepreneur'),
        ('Hank',   'Schrader', 'Law Enforcement'),
        ('Jesse',  'Pinkman',  'Entrepreneur'),
        ('Saul',   'Goodman',  'Lawyer'),
    ],
    columns=['Name', 'Surname', 'Job'],
)
df1

Unnamed: 0,Name,Surname,Job
0,Walter,White,Entrepreneur
1,Hank,Schrader,Law Enforcement
2,Jesse,Pinkman,Entrepreneur
3,Saul,Goodman,Lawyer


In [None]:
# Pay per job; 'Teacher' is a job that no person in the people frame holds
df2 = pd.DataFrame(
    data=[
        ('Entrepreneur',    1_000_000),
        ('Law Enforcement', 75_000),
        ('Lawyer',          250_000),
        ('Teacher',         50_000),
    ],
    columns=['Job', 'Pay'],
)
df2

Unnamed: 0,Job,Pay
0,Entrepreneur,1000000
1,Law Enforcement,75000
2,Lawyer,250000
3,Teacher,50000


In [None]:
# Can't join as-is: join() keeps every column from both frames, and both
# have a 'Job' column. Without lsuffix / rsuffix pandas cannot name the
# duplicates and raises ValueError.
# The error is the point of this cell, so it is caught and shown rather than
# left to abort a "Restart & Run All".
try:
    df1.join(df2)
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: ignored

In [None]:
# What if we move the Job field to the index of the right frame only?
# Not an intuitive result: join() still matches index to index. df1's index
# is 0..3 while the right frame is now indexed by job title, so nothing
# matches and Pay is missing for every row.
df1.join(df2.set_index('Job'))

Unnamed: 0,Name,Surname,Job,Pay
0,Walter,White,Entrepreneur,
1,Hank,Schrader,Law Enforcement,
2,Jesse,Pinkman,Entrepreneur,
3,Saul,Goodman,Lawyer,


In [None]:
# Move the column we want to join on ('Job') into the index of BOTH frames
# so that the index-to-index match lines the rows up.
# The result is indexed by Job; Teacher is absent because no row of the
# left frame has that job.
df1.set_index('Job').join(df2.set_index('Job'))

Unnamed: 0_level_0,Name,Surname,Pay
Job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Entrepreneur,Walter,White,1000000
Entrepreneur,Jesse,Pinkman,1000000
Law Enforcement,Hank,Schrader,75000
Lawyer,Saul,Goodman,250000


In [None]:
# join() also has an "on" parameter, but it is easy to misread.
# You'd like it to mean "match the Job column in both frames", but "on"
# names column(s) of the LEFT (calling) frame only; they are matched against
# the INDEX of the right frame, which is why df2 still needs set_index('Job').
# The left frame keeps its original 0..3 index and its Job column.
df1.join(df2.set_index('Job'), on=['Job'])

Unnamed: 0,Name,Surname,Job,Pay
0,Walter,White,Entrepreneur,1000000
1,Hank,Schrader,Law Enforcement,75000
2,Jesse,Pinkman,Entrepreneur,1000000
3,Saul,Goodman,Lawyer,250000


## Inner, Left, Right, Outer

In [None]:
# Flip the direction (jobs -> people) to expose the default join type, 'left'.
# A left join keeps every row of the calling frame, so Teacher survives with
# missing Name / Surname because nobody in df1 has that job.
df2.set_index('Job').join(df1.set_index('Job'))

Unnamed: 0_level_0,Pay,Name,Surname
Job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Entrepreneur,1000000,Walter,White
Entrepreneur,1000000,Jesse,Pinkman
Law Enforcement,75000,Hank,Schrader
Lawyer,250000,Saul,Goodman
Teacher,50000,,


In [None]:
# The join type is chosen with the "how" parameter
# (e.g. 'left', 'right', 'inner', 'outer').
# An inner join keeps only keys present in both frames, so the unmatched
# Teacher row disappears.
df2.set_index('Job').join(df1.set_index('Job'), how='inner')

Unnamed: 0_level_0,Pay,Name,Surname
Job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Entrepreneur,1000000,Walter,White
Entrepreneur,1000000,Jesse,Pinkman
Law Enforcement,75000,Hank,Schrader
Lawyer,250000,Saul,Goodman


## Suffixes
* Sometimes, our join results in common column names
* We could replace the existing names of columns
* But it could be helpful to add a suffix to every column name the frames have in common

In [None]:
# People with their job and their current pay
df1 = pd.DataFrame(
    data=[
        ('Walter', 'White',    'Entrepreneur',    1_500_000),
        ('Hank',   'Schrader', 'Law Enforcement', 100_000),
        ('Jesse',  'Pinkman',  'Entrepreneur',    1_500_000),
        ('Saul',   'Goodman',  'Lawyer',          500_000),
    ],
    columns=['Name', 'Surname', 'Job', 'Pay'],
)
df1

Unnamed: 0,Name,Surname,Job,Pay
0,Walter,White,Entrepreneur,1500000
1,Hank,Schrader,Law Enforcement,100000
2,Jesse,Pinkman,Entrepreneur,1500000
3,Saul,Goodman,Lawyer,500000


In [None]:
# Typical pay per job; shares the column names 'Job' and 'Pay' with the people frame
df2 = pd.DataFrame(
    data=[
        ('Entrepreneur',    200_000),
        ('Law Enforcement', 75_000),
        ('Lawyer',          150_000),
        ('Teacher',         50_000),
    ],
    columns=['Job', 'Pay'],
)
df2

Unnamed: 0,Job,Pay
0,Entrepreneur,200000
1,Law Enforcement,75000
2,Lawyer,150000
3,Teacher,50000


In [None]:
# Both frames have a 'Pay' column. lsuffix / rsuffix are appended to the
# overlapping column names of the left and right frame respectively,
# giving Pay_Current (from df1) and Pay_Typical (from df2).
df1.join(df2.set_index('Job'), on=['Job'], lsuffix='_Current', rsuffix='_Typical')

Unnamed: 0,Name,Surname,Job,Pay_Current,Pay_Typical
0,Walter,White,Entrepreneur,1500000,200000
1,Hank,Schrader,Law Enforcement,100000,75000
2,Jesse,Pinkman,Entrepreneur,1500000,200000
3,Saul,Goodman,Lawyer,500000,150000


# Merge
* Pandas has a merge method
* Merge is more flexible than join
  * Can use indexes or columns
  * Supports left, right, outer, inner

![](https://i.stack.imgur.com/1rb1R.jpg)

In [None]:
# Left frame for the merge examples: people, their job and current pay
df1 = pd.DataFrame(
    data=[
        ('Walter', 'White',    'Entrepreneur',    1_500_000),
        ('Hank',   'Schrader', 'Law Enforcement', 100_000),
        ('Jesse',  'Pinkman',  'Entrepreneur',    1_500_000),
        ('Saul',   'Goodman',  'Lawyer',          500_000),
    ],
    columns=['Name', 'Surname', 'Job', 'Pay'],
)
df1

Unnamed: 0,Name,Surname,Job,Pay
0,Walter,White,Entrepreneur,1500000
1,Hank,Schrader,Law Enforcement,100000
2,Jesse,Pinkman,Entrepreneur,1500000
3,Saul,Goodman,Lawyer,500000


In [None]:
# Right frame for the merge examples: typical pay per job
df2 = pd.DataFrame(
    data=[
        ('Entrepreneur',    200_000),
        ('Law Enforcement', 75_000),
        ('Lawyer',          150_000),
        ('Teacher',         50_000),
    ],
    columns=['Job', 'Pay'],
)
df2

Unnamed: 0,Job,Pay
0,Entrepreneur,200000
1,Law Enforcement,75000
2,Lawyer,150000
3,Teacher,50000


## Default

In [None]:
# merge() performs an inner join by default.
# With no on / left_on / right_on given, it joins on every column name the
# two frames share -- here both 'Job' and 'Pay' -- rather than on the index.
# No row has the same Job AND the same Pay in both frames, so the result
# is empty.
df1.merge(df2)

Unnamed: 0,Name,Surname,Job,Pay


In [None]:
# pd.merge(left, right) is the function form of left.merge(right).
# The defaults are the same, so this gives the same empty result.
pd.merge(df1, df2)

Unnamed: 0,Name,Surname,Job,Pay


## Column Merge

In [None]:
# "on" names the shared column(s) to use as the join key.
# Note: same-named columns that are not part of the key ('Pay') are kept
# from both sides and told apart by the default suffixes _x (left), _y (right).
pd.merge(df1, df2, on='Job')

Unnamed: 0,Name,Surname,Job,Pay_x,Pay_y
0,Walter,White,Entrepreneur,1500000,200000
1,Jesse,Pinkman,Entrepreneur,1500000,200000
2,Hank,Schrader,Law Enforcement,100000,75000
3,Saul,Goodman,Lawyer,500000,150000


In [None]:
# "suffixes" replaces the default (_x, _y) pair: (left suffix, right suffix)
pd.merge(df1, df2, on='Job', suffixes=('_left', '_right'))

Unnamed: 0,Name,Surname,Job,Pay_left,Pay_right
0,Walter,White,Entrepreneur,1500000,200000
1,Jesse,Pinkman,Entrepreneur,1500000,200000
2,Hank,Schrader,Law Enforcement,100000,75000
3,Saul,Goodman,Lawyer,500000,150000


In [None]:
# left_on / right_on name the key column(s) on each side separately.
# Useful when the key is named differently in the two frames
# (here both happen to be 'Job').
# Note: each accepts a single column name or a list of column names.
pd.merge(df1, df2, left_on=['Job'], right_on=['Job'])

Unnamed: 0,Name,Surname,Job,Pay_x,Pay_y
0,Walter,White,Entrepreneur,1500000,200000
1,Jesse,Pinkman,Entrepreneur,1500000,200000
2,Hank,Schrader,Law Enforcement,100000,75000
3,Saul,Goodman,Lawyer,500000,150000


## Indicator
* Helpful when you're debugging or want to keep some metadata about where each row came from

In [None]:
# Where did the row come from? indicator=True adds a _merge column whose
# value is 'both', 'left_only' or 'right_only'.
# With the default inner join every row is 'both'.
pd.merge(df1, df2, left_on=['Job'], right_on=['Job'], indicator=True)

Unnamed: 0,Name,Surname,Job,Pay_x,Pay_y,_merge
0,Walter,White,Entrepreneur,1500000,200000,both
1,Jesse,Pinkman,Entrepreneur,1500000,200000,both
2,Hank,Schrader,Law Enforcement,100000,75000,both
3,Saul,Goodman,Lawyer,500000,150000,both


In [None]:
# An outer join keeps unmatched keys from both sides, so the Teacher row
# (a job nobody in df1 holds) appears, flagged 'right_only'.
# Pay_x is shown as a float because it now contains a missing value.
pd.merge(df1, df2, left_on=['Job'], right_on=['Job'], how='outer', indicator=True)

Unnamed: 0,Name,Surname,Job,Pay_x,Pay_y,_merge
0,Walter,White,Entrepreneur,1500000.0,200000,both
1,Jesse,Pinkman,Entrepreneur,1500000.0,200000,both
2,Hank,Schrader,Law Enforcement,100000.0,75000,both
3,Saul,Goodman,Lawyer,500000.0,150000,both
4,,,Teacher,,50000,right_only


## Indexes
* We have to specify either the columns or the index to use in the join
* Here we look at index matching and mixed matching

In [None]:
# Match rows by index label instead of by column.
# No column acts as the key, so both overlapping columns (Job and Pay) are
# kept from each side with _x / _y suffixes. Rows are paired purely by
# label 0..3, which is why Jesse (Entrepreneur) ends up next to Lawyer.
pd.merge(df1, df2, left_index=True, right_index=True)

Unnamed: 0,Name,Surname,Job_x,Pay_x,Job_y,Pay_y
0,Walter,White,Entrepreneur,1500000,Entrepreneur,200000
1,Hank,Schrader,Law Enforcement,100000,Law Enforcement,75000
2,Jesse,Pinkman,Entrepreneur,1500000,Lawyer,150000
3,Saul,Goodman,Lawyer,500000,Teacher,50000


In [None]:
# Mixed merge: a column on the left ('Job') is matched against the index on
# the right (df2 re-indexed by Job via set_index).
pd.merge(df1, df2.set_index('Job'), left_on=['Job'], right_index=True)

Unnamed: 0,Name,Surname,Job,Pay_x,Pay_y
0,Walter,White,Entrepreneur,1500000,200000
2,Jesse,Pinkman,Entrepreneur,1500000,200000
1,Hank,Schrader,Law Enforcement,100000,75000
3,Saul,Goodman,Lawyer,500000,150000


# Concat
* Build out two dataframes (like before)
* We'd like to stack one frame on the other, horizontally and vertically

![](https://miro.medium.com/max/1694/1*0wu6DunCzPC4o9FIyRTW4w.png)

## Frames

In [None]:
# First names, then relabel the rows 30, 20, 10, 0 (the default 0..3 index
# reversed and scaled by 10) so the index no longer lines up with df2's.
df1 = pd.DataFrame({'Name': ['Walter', 'Hank', 'Jesse', 'Saul']})
df1.index = df1.index[::-1] * 10
df1

Unnamed: 0,Name
30,Walter
20,Hank
10,Jesse
0,Saul


In [None]:
# Surnames with the default 0..3 index
df2 = pd.DataFrame({'Surname': ['White', 'Schrader', 'Pinkman', 'Goodman']})
df2

Unnamed: 0,Surname
0,White
1,Schrader
2,Pinkman
3,Goodman


## Vertical
* Default axis (0) stacks rows: columns are aligned by name and each frame's index labels are kept as they are

In [None]:
# axis=0 (the default) stacks df2's rows under df1's. Columns are aligned by
# name, so each frame gets NaN in the column it lacks, and the original
# index labels are kept (note the repeated label 0).
pd.concat([df1, df2])

Unnamed: 0,Name,Surname
30,Walter,
20,Hank,
10,Jesse,
0,Saul,
0,,White
1,,Schrader
2,,Pinkman
3,,Goodman


## Horizontal

In [None]:
# axis=1 places the frames side by side, aligning rows on index label.
# The default join='outer' keeps the union of labels; only label 0 exists
# in both frames, so every other row is half empty.
pd.concat([df1, df2], axis=1)

Unnamed: 0,Name,Surname
0,Saul,White
1,,Schrader
2,,Pinkman
3,,Goodman
10,Jesse,
20,Hank,
30,Walter,


In [None]:
# join='inner' keeps only index labels present in every frame -- just 0 here
pd.concat([df1, df2], axis=1, join='inner')

Unnamed: 0,Name,Surname
0,Saul,White


In [None]:
# We can choose to ignore the index with ignore_index=True, but it drops
# the labels along the concatenation axis only. With axis=1 that axis is the
# columns, so the column names are replaced by 0, 1 while rows are still
# aligned on the row index -- still not the effect we're looking for.
pd.concat([df1, df2], ignore_index=True, axis=1)

Unnamed: 0,0,1
0,Saul,White
1,,Schrader
2,,Pinkman
3,,Goodman
10,Jesse,
20,Hank,
30,Walter,


In [None]:
# Give both frames the same fresh 0..n-1 index first, so that rows are
# paired by position when the frames are placed side by side.
pd.concat(
    [df1.reset_index(drop=True), df2.reset_index(drop=True)],
    axis=1,
)

Unnamed: 0,Name,Surname
0,Walter,White
1,Hank,Schrader
2,Jesse,Pinkman
3,Saul,Goodman


In [None]:
# Alternatively, give df2 the index of df1 (labels are assigned by position)
# before concatenating. This gives better control over the index: the result
# keeps df1's labels (30, 20, 10, 0) instead of a fresh 0..n-1 range.
pd.concat([df1, df2.set_index(df1.index)], axis=1)

Unnamed: 0,Name,Surname
30,Walter,White
20,Hank,Schrader
10,Jesse,Pinkman
0,Saul,Goodman


# Join, Merge, Concat

![](https://www.ualberta.ca/science/media-library/news/2020/july/rock-paper-scissors-webinar.jpg)

* Join 
  * You have two dataframes 
  * You want to associate using the index
  * Can control the type of join
* Merge 
  * More flexible
  * Can use the index or columns in the dataframe 
  * Can control the type of join
* Concat
  * You want to stack horizontally or vertically
  * Remember the index when you append columns & rows

# Extras

## Data Tables

In [None]:
# Colab-only: load the google.colab.data_table extension, then display df1
# so it is rendered through that extension (an interactive table in Colab).
%load_ext google.colab.data_table
df1

Unnamed: 0,Name
30,Walter
20,Hank
10,Jesse
0,Saul


In [None]:
# Unload the extension to go back to the standard DataFrame rendering
%unload_ext google.colab.data_table