# joining-tutorial.ipynb
# WESmith 11/5/22
### see https://www.datacamp.com/tutorial/joining-dataframes-pandas

In [None]:
import pandas as pd
import numpy as  np
import os

In [None]:
# WS first sample table: five rows keyed by a string 'id' ('1'-'5') plus two
# single-letter feature columns
dummy_data1 = dict(
    id=['1', '2', '3', '4', '5'],
    Feature1=['A', 'C', 'E', 'G', 'I'],
    Feature2=['B', 'D', 'F', 'H', 'J'],
)
# column order follows the dict's insertion order: id, Feature1, Feature2
df1 = pd.DataFrame(dummy_data1, columns=list(dummy_data1))
df1

In [None]:
# WS second sample table: same columns as the first table; ids '1' and '2'
# overlap with it, ids '6'-'8' do not
dummy_data2 = dict(
    id=['1', '2', '6', '7', '8'],
    Feature1=['K', 'M', 'O', 'Q', 'S'],
    Feature2=['L', 'N', 'P', 'R', 'T'],
)
# column order follows the dict's insertion order: id, Feature1, Feature2
df2 = pd.DataFrame(dummy_data2, columns=list(dummy_data2))
df2

In [None]:
# WS third sample table: ten string ids (note: no '6') with one integer feature
dummy_data3 = dict(
    id=['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
    Feature3=[12, 13, 14, 15, 16, 17, 15, 12, 13, 23],
)
# column order follows the dict's insertion order: id, Feature3
df3 = pd.DataFrame(dummy_data3, columns=list(dummy_data3))
df3

## CONCAT (NOTE: CONCAT() MAKES A FULL COPY)

In [None]:
# WS stack the two data sets along rows (axis=0 is concat's default, spelled out here);
# each frame keeps its own index labels, so 0-4 appear twice in the result
df_row = pd.concat(objs=[df1, df2], axis=0)
df_row

In [None]:
# WS same row-wise stack, but discard the original index labels and renumber from 0
df_row_reindex = pd.concat(
    objs=[df1, df2],
    axis=0,
    ignore_index=True,
)
df_row_reindex

In [None]:
# WS tag each input with a key so the result records where every row came from:
# the keys become the outer level of a two-level row index
frames = [df1, df2]
df_keys = pd.concat(
    objs=frames,
    keys=['x', 'y'],
)
df_keys

In [None]:
# WS pull one source data set back out by its key (all rows under 'y', every column)
df_keys.loc['y', :]

In [None]:
# WS alternative to keys=: pass a dictionary of labelled data sets; the dict keys
# are used as the outer index labels, giving the same result
pieces = dict(x=df1, y=df2)
df_piece = pd.concat(objs=pieces)
df_piece

In [None]:
# WS concat side by side (along columns) instead of stacking rows;
# rows are aligned on the index labels
df_col = pd.concat(
    objs=[df1, df2],
    axis=1,
)
df_col

## MERGE

In [None]:
# WS show df_row (df1 and df2 stacked along rows) again: it is the left input to the merge below
df_row

In [None]:
# WS show df3 (id + Feature3) again: it is the right input to the merge below
df3

In [None]:
# WS merge() defaults to an inner join (spelled out here), so an id that is not present
# in BOTH data sets is dropped: ids 6, 9, 10, 11 are missing from the result.
# ids 1 and 2 each appear twice because they occur twice in the left data set (df_row).
df_merge_col = pd.merge(
    left=df_row,
    right=df3,
    how='inner',
    on='id',
)
df_merge_col

In [None]:
# WS when the key columns are NAMED differently in the two data sets, name each one
# separately with left_on / right_on (here both happen to be 'id', so the result
# matches the on='id' merge)
df_merge_difkey = pd.merge(
    left=df_row,
    right=df3,
    left_on='id',
    right_on='id',
)
df_merge_difkey

## APPEND (DEPRECATED IN PANDAS 1.4, REMOVED IN PANDAS 2.0: USE CONCAT)

In [None]:
# WS a single new record as a Series: the index labels are the column names
# the values belong to
add_row = pd.Series({
    'id': '10',
    'Feature1': 'X1',
    'Feature2': 'X2',
    'Feature3': 'X3',
})
add_row

In [None]:
# WS DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0,
# where calling it raises AttributeError. Keep the original call for older
# installs (to show what it looked like) and fall back to the concat()
# equivalent otherwise, so this cell runs on any pandas version.
if hasattr(pd.DataFrame, 'append'):
    df_add_row = df_merge_col.append(add_row, ignore_index=True)
else:
    # make a one-row DF of the Series (DataFrame + transpose), then concat it
    # onto the end and renumber the index
    df_add_row = pd.concat([df_merge_col, pd.DataFrame(add_row).T],
                           ignore_index=True)
df_add_row

In [None]:
# WS workaround for DataFrame.append() (append was deprecated because it is very slow)
# see https://stackoverflow.com/questions/70837397/
#             good-alternative-to-pandas-append-method-now-that-it-is-being-deprecated
# The key step: wrapping the Series in a DataFrame gives a single COLUMN, so
# transpose it to get a single ROW whose column labels are the Series index.
add_row_as_df = pd.DataFrame(data=add_row).transpose()
add_row_as_df

In [None]:
# WS with the new record in one-row DataFrame form, concat it onto the end
# of the merged table and renumber the index
df_add_row = pd.concat(
    objs=[df_merge_col, add_row_as_df],
    ignore_index=True,
)
df_add_row

## JOIN

### FULL OUTER JOIN USING MERGE()

In [None]:
# WS show df1 again: it is the left input for the joins in this section
df1

In [None]:
# WS show df2 again: it is the right input for the joins in this section
df2

In [None]:
# WS full outer join: keep every record from both DataFrames and fill in NaNs
# where one side has no data for an id. Because both inputs have Feature1 and
# Feature2 columns, merge() automatically relabels them (default suffixes
# _x / _y) to show which input each value came from.
df_outer = pd.merge(
    left=df1,
    right=df2,
    how='outer',
    on='id',
)
df_outer

In [None]:
# WS the suffixes appended to the overlapping column labels can be chosen explicitly
df_suffix = pd.merge(
    left=df1,
    right=df2,
    how='outer',
    left_on='id',
    right_on='id',
    suffixes=('_left', '_right'),
)
df_suffix

### INNER JOIN USING MERGE()

In [None]:
# WS inner join: keep only the ids present in both df1 and df2
df_inner = pd.merge(
    left=df1,
    right=df2,
    how='inner',
    on='id',
)
df_inner

### RIGHT JOIN USING MERGE()

In [None]:
# WS right join: keep every row of the right input (df2); columns coming from
# df1 are NaN for ids that df1 does not have
df_right = pd.merge(
    left=df1,
    right=df2,
    how='right',
    on='id',
)
df_right

### LEFT JOIN USING MERGE()

In [None]:
# WS left join: keep every row of the left input (df1); columns coming from
# df2 are NaN for ids that df2 does not have
df_left = pd.merge(
    left=df1,
    right=df2,
    how='left',
    on='id',
)
df_left

### JOINING ON INDEX USING MERGE()

In [None]:
# WS show df1 again: note its row index (0-4), which the index join below matches on
df1

In [None]:
# WS show df2 again: note its row index (0-4), which the index join below matches on
df2

In [None]:
# WS join on the row indexes of both inputs (here 0,1,2,3,4) rather than on a column;
# all three columns overlap, so each one gets a source suffix in the result
df_index = pd.merge(
    left=df1,
    right=df2,
    left_index=True,
    right_index=True,
)
df_index

### TIME-SERIES MERGING USING MERGE_ASOF()

In [None]:
# WS sample trades: all timestamps fall in the same second (2016-05-25 13:30:00),
# so only the millisecond part is listed and the full strings are built from it
trades = pd.DataFrame(
    {
        'time': pd.to_datetime(
            [f'20160525 13:30:00.{ms:03d}' for ms in (23, 38, 48, 48, 48)]
        ),
        'ticker': ['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'],
        'price': [51.95, 51.95, 720.77, 720.92, 98.00],
        'quantity': [75, 155, 100, 100, 100],
    },
    columns=['time', 'ticker', 'price', 'quantity'],
)
trades

In [None]:
# WS sample bid/ask quotes: all timestamps fall in the same second
# (2016-05-25 13:30:00), so only the millisecond part is listed and the full
# strings are built from it
quotes = pd.DataFrame(
    {
        'time': pd.to_datetime(
            [f'20160525 13:30:00.{ms:03d}'
             for ms in (23, 23, 30, 41, 48, 49, 72, 75)]
        ),
        'ticker': ['GOOG', 'MSFT', 'MSFT', 'MSFT', 'GOOG', 'AAPL', 'GOOG', 'MSFT'],
        'bid': [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
        'ask': [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
    },
    columns=['time', 'ticker', 'bid', 'ask'],
)
quotes

In [None]:
# WS for every trade, attach the most recent quote at or before the trade time
# (merge_asof searches backward by default), considering only quotes with the
# same ticker; a trade with no earlier quote for its ticker gets NaN bid/ask
df_merge_asof = pd.merge_asof(
    left=trades,
    right=quotes,
    on='time',
    by='ticker',
)
df_merge_asof

In [None]:
# WS add a time tolerance: a quote only matches if it is at most 2ms older than the trade.
# The 13:30:00.038 MSFT trade loses its match (its latest quote, at 13:30:00.030,
# is 8ms old): the row is still in the result, but with NaN bid/ask.
df_merge_asof_tolerance = pd.merge_asof(
    left=trades,
    right=quotes,
    on='time',
    by='ticker',
    tolerance=pd.Timedelta('2ms'),
)
df_merge_asof_tolerance