# Merging Data

Combining and reshaping data from multiple sources

In this module, we will explore techniques for combining and reshaping data from multiple sources. You will learn about:

* Combining data using the pandas library
* Validating Merges
* Debugging Chains
* Exporting to Excel


In [None]:
import numpy as np
import pandas as pd
import zipfile

In [None]:
# https://www.kaggle.com/datasets/dgomonov/new-york-city-airbnb-open-data?resource=download
# List the archive members without extracting anything. The handle is named
# `archive` rather than `zip` so the builtin `zip` stays usable in later cells.
with zipfile.ZipFile('data/AB_NYC_2019.csv.zip') as archive:
    print(archive.namelist())

In [None]:
# Read the Airbnb listings directly from the zipped CSV, using the pyarrow
# parser and pyarrow-backed column dtypes.
ab = pd.read_csv(
    'data/AB_NYC_2019.csv.zip',
    engine='pyarrow',
    dtype_backend='pyarrow',
)
ab

In [None]:
# Read the companion CSV that is merged onto the listings in later cells;
# its first column becomes the index.
temps = pd.read_csv(
    'data/nyc-ab-temp.csv',
    index_col=0,
    engine='pyarrow',
    dtype_backend='pyarrow',
)

In [None]:
# Show the frame loaded from data/nyc-ab-temp.csv.
temps

## Merging

In [None]:
# Expected to fail: no join keys are given, so pandas falls back to the
# column names the two frames have in common. The cells that follow inspect
# the column names and then pass explicit keys.
ab.merge(temps)

In [None]:
# Column labels present in both frames -- these are what a keyless merge
# would join on.
ab.columns.intersection(temps.columns)

In [None]:
# Column labels of the listings frame.
ab.columns

In [None]:
# Column labels of the temps frame, for comparison with ab.columns.
temps.columns

In [None]:
# Join on coordinates: the key columns are named differently on each side,
# so they are paired up explicitly (latitude<->lat, longitude<->lon).
(
    ab
    .merge(
        temps,
        left_on=['latitude', 'longitude'],
        right_on=['lat', 'lon'],
    )
)

## Validating Merges

In [None]:
# Two tiny frames for the join demos. 'Jose' appears twice in `left`, and
# 'Sally' appears only in `right`.
left = pd.DataFrame(
    {
        'name': ['Ravi', 'Jose', 'Jose'],
        'pet': ['Dog', 'Cat', 'Dog'],
    }
)
right = pd.DataFrame(
    {
        'name': ['Ravi', 'Jose', 'Sally'],
        'age': [10, 17, 5],
    }
)

In [None]:
# Render both demo frames, one after the other, in the cell output.
display(left, right)

In [None]:
# Inner join: keep only names found in both frames (Sally is dropped).
# 'name' is the only shared column, so naming it matches the default key.
left.merge(right, on='name', how='inner')

In [None]:
# Left join: keep every row of `left`, attaching the age where the name
# also exists in `right`.
left.merge(right, on='name', how='left')

In [None]:
# Right join: keep every name in `right`; Sally has no match in `left`,
# so her pet is missing.
left.merge(right, on='name', how='right')

In [None]:
# Outer join: keep names from either frame, leaving gaps where one side
# has no match.
left.merge(right, on='name', how='outer')

In [None]:
# Cross join: pair every row of `left` with every row of `right`
# (3 x 3 = 9 rows). No join key is used for this mode.
pd.merge(left, right, how='cross')

In [None]:
# Look at `left` again before validating: 'Jose' appears on two rows.
left

In [None]:
# validate='1:1' requires unique keys on both sides. 'Jose' is duplicated
# in `left`, so this is expected to raise rather than return a frame.
left.merge(right, on='name', how='inner', validate='1:1')

In [None]:
# validate='m:1' only requires unique keys on the right side, which holds
# here (Ravi, Jose, Sally), so the merge goes through.
left.merge(right, on='name', how='inner', validate='m:1')

## Debugging Trick

In [None]:
def limit(df, n_rows, n_cols):
    """Return the top-left corner of ``df``.

    Keeps the first ``n_rows`` rows and the first ``n_cols`` columns by
    position, which makes it handy as a ``.pipe`` step for shrinking a
    frame while inspecting a chain.
    """
    row_window = slice(n_rows)
    col_window = slice(n_cols)
    return df.iloc[row_window, col_window]

# Peek at a 3-row, 6-column corner of the listings via .pipe.
(
    ab
    .pipe(limit, n_rows=3, n_cols=6)
)

In [None]:
# add debug to helpers
def debug(df, extra=''):
    """Print ``extra`` followed by ``df.shape`` and return ``df`` untouched.

    Because the frame is handed straight back, this can be dropped between
    any two steps of a method chain with ``.pipe(debug, extra='label')``.
    """
    message = f'{extra} df.shape={df.shape!r}'
    print(message)
    return df

# Report the shape on either side of the merge to see how many rows and
# columns the join adds or drops.
(
    ab
    .pipe(debug, extra='before')
    .merge(temps, left_on=['latitude', 'longitude'], right_on=['lat', 'lon'])
    .pipe(debug, extra='after')
)

In [None]:
# Same chain, extended with a per-borough mean and a third shape report.
# NOTE(review): mean() is called without numeric_only here; the following
# cell notes that pandas 2 requires it, so this version presumably exists
# to show the failure -- confirm.
(
    ab
    .pipe(debug, extra='before')
    .merge(temps, left_on=['latitude', 'longitude'], right_on=['lat', 'lon'])
    .pipe(debug, extra='after')
    .groupby('neighbourhood_group')
    .mean()
    .pipe(debug, extra='summary')
)

In [None]:
# pandas 2 requires numeric_only: restrict the mean to numeric columns so
# the text columns no longer break the aggregation.
(
    ab
    .pipe(debug, extra='before')
    .merge(temps, left_on=['latitude', 'longitude'], right_on=['lat', 'lon'])
    .pipe(debug, extra='after')
    .groupby('neighbourhood_group')
    .mean(numeric_only=True)
    .pipe(debug, extra='summary')
)

## Cleanup Columns

In [None]:
# List the columns of the merged frame to decide which ones to keep.
(
    ab
    .pipe(debug, extra='before')
    .merge(temps, left_on=['latitude', 'longitude'], right_on=['lat', 'lon'])
    .pipe(debug, extra='after')
    .columns
)

In [None]:
# Explicitly list columns: keep the listing fields plus `temp`. The
# right-hand join keys (`lat`, `lon`) are not in the list and so are left out.
(
    ab
    .pipe(debug, extra='before')
    .merge(temps, left_on=['latitude', 'longitude'], right_on=['lat', 'lon'])
    .pipe(debug, extra='after')
    .loc[:, [
        'id',
        'name',
        'host_id',
        'host_name',
        'neighbourhood_group',
        'neighbourhood',
        'latitude',
        'longitude',
        'room_type',
        'price',
        'minimum_nights',
        'number_of_reviews',
        'last_review',
        'reviews_per_month',
        'calculated_host_listings_count',
        'availability_365',
        'temp',
    ]]
    .pipe(debug, extra='limit cols')
)

## Export to Excel

In [None]:
# Build the merged, column-trimmed frame first, so a failure in the chain
# never leaves a half-open workbook behind.
xl = (ab
 .pipe(debug, extra='before')
 .merge(temps, left_on=['latitude', 'longitude'],
        right_on=['lat', 'lon']
       )
 .pipe(debug, extra='after')
 .loc[:, ['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'temp']]
)

# Write two sheets to one workbook. The context manager saves and closes the
# file even if one of the writes raises, replacing the manual close() call.
with pd.ExcelWriter('data/airbnb.xlsx') as xls_out:
    xl.to_excel(xls_out, sheet_name='all')
    # Second sheet holds only the Brooklyn listings (sheet name typo
    # 'Brookly' corrected).
    (xl
     .query('neighbourhood_group=="Brooklyn"')
     .to_excel(xls_out, sheet_name='Brooklyn')
    )

In [None]:
# Hand the exported workbook to the shell's `open` command via the `!` escape.
# NOTE(review): `open` is presumably the macOS launcher; other platforms
# would need a different command -- confirm.
!open data/airbnb.xlsx