In [1]:
# Create data tables 

import pandas as pd

prices = pd.DataFrame({
    "date": ["2025-01-01","2025-01-01","2025-01-02","2025-01-02"],
    "ticker": ["GOOG","MSFT","GOOG","MSFT"],
    "close": [135.1, 410.5, 136.8, 412.2]
})

fundamentals = pd.DataFrame({
    "ticker": ["GOOG","MSFT","NVDA"],
    "sector": ["Technology","Technology","Semiconductors"],
    "pe_ratio": [28.1, 35.4, 72.5]
})


1. Merge prices with fundamentals to add sector and PE data to each price row.

Expected columns:
date, ticker, close, sector, pe_ratio

In [2]:
prices.merge(fundamentals[['ticker', 'pe_ratio']], on = 'ticker')

Unnamed: 0,date,ticker,close,pe_ratio
0,2025-01-01,GOOG,135.1,28.1
1,2025-01-02,GOOG,136.8,28.1
2,2025-01-01,MSFT,410.5,35.4
3,2025-01-02,MSFT,412.2,35.4


2. Perform a left merge:

- Keep all rows from prices

- Add fundamentals where available

In [3]:
prices.merge(fundamentals, on = 'ticker',how = 'left')

Unnamed: 0,date,ticker,close,sector,pe_ratio
0,2025-01-01,GOOG,135.1,Technology,28.1
1,2025-01-01,MSFT,410.5,Technology,35.4
2,2025-01-02,GOOG,136.8,Technology,28.1
3,2025-01-02,MSFT,412.2,Technology,35.4


3. Perform an inner join and compare the number of rows with the left join.

In [4]:
prices.merge(fundamentals, on = 'ticker',how = 'inner')

Unnamed: 0,date,ticker,close,sector,pe_ratio
0,2025-01-01,GOOG,135.1,Technology,28.1
1,2025-01-02,GOOG,136.8,Technology,28.1
2,2025-01-01,MSFT,410.5,Technology,35.4
3,2025-01-02,MSFT,412.2,Technology,35.4


4. Perform a right join of fundamentals to prices.

In [5]:
prices.merge(fundamentals, on = 'ticker',how = 'right')

Unnamed: 0,date,ticker,close,sector,pe_ratio
0,2025-01-01,GOOG,135.1,Technology,28.1
1,2025-01-02,GOOG,136.8,Technology,28.1
2,2025-01-01,MSFT,410.5,Technology,35.4
3,2025-01-02,MSFT,412.2,Technology,35.4
4,,NVDA,,Semiconductors,72.5


5. Perform an outer join and fill all missing values with:

- "Unknown" for text

- 0 for numbers

In [6]:
df = prices.merge(fundamentals, on = 'ticker',how = 'outer')
df['date'] = df['date'].fillna(0)
df['close'] = df['close'].fillna('Unknown')
df

Unnamed: 0,date,ticker,close,sector,pe_ratio
0,2025-01-01,GOOG,135.1,Technology,28.1
1,2025-01-02,GOOG,136.8,Technology,28.1
2,2025-01-01,MSFT,410.5,Technology,35.4
3,2025-01-02,MSFT,412.2,Technology,35.4
4,0,NVDA,Unknown,Semiconductors,72.5


In [7]:
market_sentiment = pd.DataFrame({
    "date": ["2025-01-01","2025-01-02"],
    "ticker": ["GOOG", "GOOG"],
    "sentiment": [0.8, 0.75]
})


6. Merge prices and market_sentiment using both date and ticker.

In [8]:
prices.merge(market_sentiment, on = ['date', 'ticker'])

Unnamed: 0,date,ticker,close,sentiment
0,2025-01-01,GOOG,135.1,0.8
1,2025-01-02,GOOG,136.8,0.75


7. Merge with validate="many_to_one" and explain why it passes or fails:

In [9]:
prices.merge(fundamentals, on="ticker", validate="many_to_one")


Unnamed: 0,date,ticker,close,sector,pe_ratio
0,2025-01-01,GOOG,135.1,Technology,28.1
1,2025-01-02,GOOG,136.8,Technology,28.1
2,2025-01-01,MSFT,410.5,Technology,35.4
3,2025-01-02,MSFT,412.2,Technology,35.4


The many_to_one merge passes here because it checks if the merge keys are unique in the right table. Here the right table is fundamentals and it has unique values in the ticker column, which is the merge key.

8. Check whether fundamentals['ticker'] contains duplicates.
If duplicates exist, how would they affect a merge?

In [10]:
fundamentals['ticker'].is_unique

True

The fundamentals['ticker'] column does not have duplicates. If duplicates exist, each row of fundamentals table with each ticker will join with every row of the same ticker in the prices table. This will result in creating rows with incorrect data. 

9. Find which tickers in fundamentals do not appear in prices.

In [11]:
fundamentals['ticker'][~fundamentals['ticker'].isin(prices['ticker'])]

2    NVDA
Name: ticker, dtype: object

10. Write a function that:

takes two DataFrames and a join key

prints:

- rows in left-only

- rows in right-only

- rows matched

- rows duplicated on either side

In [23]:
def report(left_table, right_table, join_key):
    left_rows = left_table
    right_rows = right_table
    matched_rows = left_table.merge(right_table, on = join_key, how = 'inner')
    left_duplicated_rows = left_table[left_table.duplicated()]
    right_duplicated_rows = right_table[right_table.duplicated()]
    return left_rows, right_rows, matched_rows, left_duplicated_rows, right_duplicated_rows


In [24]:
left_rows, right_rows, matched_rows, left_duplicated_rows, right_duplicated_rows= report(prices, fundamentals, 'ticker')
print('left_rows:', left_rows)
print('right_rows:', right_rows)
print('matched_rows:', matched_rows)
print('left_duplicated_rows:', left_duplicated_rows)
print('right_duplicated_rows:', right_duplicated_rows)

left_rows:          date ticker  close
0  2025-01-01   GOOG  135.1
1  2025-01-01   MSFT  410.5
2  2025-01-02   GOOG  136.8
3  2025-01-02   MSFT  412.2
right_rows:   ticker          sector  pe_ratio
0   GOOG      Technology      28.1
1   MSFT      Technology      35.4
2   NVDA  Semiconductors      72.5
matched_rows:          date ticker  close      sector  pe_ratio
0  2025-01-01   GOOG  135.1  Technology      28.1
1  2025-01-02   GOOG  136.8  Technology      28.1
2  2025-01-01   MSFT  410.5  Technology      35.4
3  2025-01-02   MSFT  412.2  Technology      35.4
left_duplicated_rows: Empty DataFrame
Columns: [date, ticker, close]
Index: []
right_duplicated_rows: Empty DataFrame
Columns: [ticker, sector, pe_ratio]
Index: []
