In [23]:
import pandas as pd
import numpy as np
# Relative path from notebooks/ to data/
file_path = "../data/google_stock_data.xlsx"

# Read the Excel file
price = pd.read_excel(file_path)

# Round all numerical columns to 2 decimal places
price = price.round(2)

# update the date format to YYYY-MM-DD
price['Date'] = pd.to_datetime(price['Date']).dt.strftime('%Y-%m-%d')

# Rename columns
price.rename(columns={'Date': 'date', 'Close_GOOG': 'close', 'High_GOOG': 'high'
                   ,'Low_GOOG': 'low', 'Open_GOOG': 'open'
                   ,'Volume_GOOG': 'volume'}, inplace=True)


1. Set date as index

In [24]:
price['date'] = pd.to_datetime(price['date'])
price = price.sort_values('date')

price.set_index('date', inplace=True)

2. Create a column daily_return defined as:

(close − previous_day_close)/previous_day_close


Constraints:

- Do not use apply

- Use vectorized pandas logic

In [25]:
price['daily_return'] = (price['close'] - price['close'].shift(1)) / price['close'].shift(1)
price

Unnamed: 0_level_0,close,high,low,open,volume,daily_return
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-02,67.90,67.94,66.62,66.62,28132000,
2020-01-03,67.57,68.16,66.82,66.93,23728000,-0.004860
2020-01-06,69.24,69.35,67.04,67.04,34646000,0.024715
2020-01-07,69.19,69.67,69.05,69.42,30054000,-0.000722
2020-01-08,69.74,70.10,69.07,69.13,30560000,0.007949
...,...,...,...,...,...,...
2024-12-24,196.93,197.03,194.57,195.54,6809800,0.008036
2024-12-26,196.46,197.52,195.24,196.10,7907900,-0.002387
2024-12-27,193.41,196.16,191.35,195.84,14693000,-0.015525
2024-12-30,192.07,193.15,189.75,190.25,12209500,-0.006928


3. Create a column close_7d_avg representing the rolling 7-day average of close.

In [26]:
price['close_7d_avg'] = price['close'].rolling(7).mean()
price

Unnamed: 0_level_0,close,high,low,open,volume,daily_return,close_7d_avg
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-02,67.90,67.94,66.62,66.62,28132000,,
2020-01-03,67.57,68.16,66.82,66.93,23728000,-0.004860,
2020-01-06,69.24,69.35,67.04,67.04,34646000,0.024715,
2020-01-07,69.19,69.67,69.05,69.42,30054000,-0.000722,
2020-01-08,69.74,70.10,69.07,69.13,30560000,0.007949,
...,...,...,...,...,...,...,...
2024-12-24,196.93,197.03,194.57,195.54,6809800,0.008036,193.894286
2024-12-26,196.46,197.52,195.24,196.10,7907900,-0.002387,193.742857
2024-12-27,193.41,196.16,191.35,195.84,14693000,-0.015525,193.304286
2024-12-30,192.07,193.15,189.75,190.25,12209500,-0.006928,193.665714


4. Create a column volatility_7d defined as the rolling standard deviation of daily_return over 7 days.

In [27]:
price['volatility_7d'] = price['daily_return'].rolling(7).std()
price

Unnamed: 0_level_0,close,high,low,open,volume,daily_return,close_7d_avg,volatility_7d
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-02,67.90,67.94,66.62,66.62,28132000,,,
2020-01-03,67.57,68.16,66.82,66.93,23728000,-0.004860,,
2020-01-06,69.24,69.35,67.04,67.04,34646000,0.024715,,
2020-01-07,69.19,69.67,69.05,69.42,30054000,-0.000722,,
2020-01-08,69.74,70.10,69.07,69.13,30560000,0.007949,,
...,...,...,...,...,...,...,...,...
2024-12-24,196.93,197.03,194.57,195.54,6809800,0.008036,193.894286,0.022301
2024-12-26,196.46,197.52,195.24,196.10,7907900,-0.002387,193.742857,0.017750
2024-12-27,193.41,196.16,191.35,195.84,14693000,-0.015525,193.304286,0.018600
2024-12-30,192.07,193.15,189.75,190.25,12209500,-0.006928,193.665714,0.012111


5. Create a boolean column volume_spike that is True when:

volume > 1.5 × rolling 7-day average volume

Rules:

- No loops

- No apply

In [28]:
price['volume_spike'] = price['volume'] > (1.5 * price['close_7d_avg'])
price

Unnamed: 0_level_0,close,high,low,open,volume,daily_return,close_7d_avg,volatility_7d,volume_spike
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-01-02,67.90,67.94,66.62,66.62,28132000,,,,False
2020-01-03,67.57,68.16,66.82,66.93,23728000,-0.004860,,,False
2020-01-06,69.24,69.35,67.04,67.04,34646000,0.024715,,,False
2020-01-07,69.19,69.67,69.05,69.42,30054000,-0.000722,,,False
2020-01-08,69.74,70.10,69.07,69.13,30560000,0.007949,,,False
...,...,...,...,...,...,...,...,...,...
2024-12-24,196.93,197.03,194.57,195.54,6809800,0.008036,193.894286,0.022301,True
2024-12-26,196.46,197.52,195.24,196.10,7907900,-0.002387,193.742857,0.017750,True
2024-12-27,193.41,196.16,191.35,195.84,14693000,-0.015525,193.304286,0.018600,True
2024-12-30,192.07,193.15,189.75,190.25,12209500,-0.006928,193.665714,0.012111,True


6. Create an integer column trend_score with the following logic:

- +1 if close > close.shift(1)
- +1 if close > close_7d_avg
- +1 if volume > volume_7d_avg

Scores range from 0 to 3.

In [29]:
price['volume_7d_avg'] = price['volume'].rolling(7).mean()

price['trend_score'] = 0
price.loc[price['close'] > price['close'].shift(1), 'trend_score'] += 1
price.loc[price['close'] > price['close_7d_avg'], 'trend_score'] += 1
price.loc[price['volume'] > price['volume_7d_avg'], 'trend_score'] += 1
price

Unnamed: 0_level_0,close,high,low,open,volume,daily_return,close_7d_avg,volatility_7d,volume_spike,volume_7d_avg,trend_score
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-02,67.90,67.94,66.62,66.62,28132000,,,,False,,0
2020-01-03,67.57,68.16,66.82,66.93,23728000,-0.004860,,,False,,0
2020-01-06,69.24,69.35,67.04,67.04,34646000,0.024715,,,False,,1
2020-01-07,69.19,69.67,69.05,69.42,30054000,-0.000722,,,False,,0
2020-01-08,69.74,70.10,69.07,69.13,30560000,0.007949,,,False,,1
...,...,...,...,...,...,...,...,...,...,...,...
2024-12-24,196.93,197.03,194.57,195.54,6809800,0.008036,193.894286,0.022301,True,2.548044e+07,2
2024-12-26,196.46,197.52,195.24,196.10,7907900,-0.002387,193.742857,0.017750,True,2.200320e+07,1
2024-12-27,193.41,196.16,191.35,195.84,14693000,-0.015525,193.304286,0.018600,True,2.065513e+07,1
2024-12-30,192.07,193.15,189.75,190.25,12209500,-0.006928,193.665714,0.012111,True,1.845100e+07,0


7. Filter and return only dates where trend_score == 3.

In [30]:
price = price.reset_index()
price.loc[price['trend_score'] == 3, ['date','trend_score']]


Unnamed: 0,date,trend_score
6,2020-01-10,3
7,2020-01-13,3
11,2020-01-17,3
12,2020-01-21,3
21,2020-02-03,3
...,...,...
1239,2024-12-04,3
1242,2024-12-09,3
1243,2024-12-10,3
1244,2024-12-11,3


8. Aggregate the data by month and calculate:

- average close

- max high

- total volume

- average volatility

In [31]:
price.set_index('date', inplace=True)
price['average_close'] = price['close'].resample('M').transform('mean')
price

Unnamed: 0_level_0,close,high,low,open,volume,daily_return,close_7d_avg,volatility_7d,volume_spike,volume_7d_avg,trend_score,average_close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-01-02,67.90,67.94,66.62,66.62,28132000,,,,False,,0,71.344762
2020-01-03,67.57,68.16,66.82,66.93,23728000,-0.004860,,,False,,0,71.344762
2020-01-06,69.24,69.35,67.04,67.04,34646000,0.024715,,,False,,1,71.344762
2020-01-07,69.19,69.67,69.05,69.42,30054000,-0.000722,,,False,,0,71.344762
2020-01-08,69.74,70.10,69.07,69.13,30560000,0.007949,,,False,,1,71.344762
...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-24,196.93,197.03,194.57,195.54,6809800,0.008036,193.894286,0.022301,True,2.548044e+07,2,187.639048
2024-12-26,196.46,197.52,195.24,196.10,7907900,-0.002387,193.742857,0.017750,True,2.200320e+07,1,187.639048
2024-12-27,193.41,196.16,191.35,195.84,14693000,-0.015525,193.304286,0.018600,True,2.065513e+07,1,187.639048
2024-12-30,192.07,193.15,189.75,190.25,12209500,-0.006928,193.665714,0.012111,True,1.845100e+07,0,187.639048


9. For each month:

- Identify the date with max close

- Identify the date with min close

In [32]:
price = price.reset_index()
price['month'] = price['date'].dt.month
price.loc[price.groupby('month')['close'].idxmax(), ['month', 'date', 'close']]

Unnamed: 0,month,date,close
1024,1,2024-01-29,153.79
1033,2,2024-02-09,149.2
1066,3,2024-03-28,151.23
1086,4,2024-04-26,172.51
1103,5,2024-05-21,178.32
1128,6,2024-06-27,185.8
1136,7,2024-07-10,191.57
1152,8,2024-08-01,171.47
1193,9,2024-09-30,166.46
1215,10,2024-10-30,175.37


In [33]:
#price = price.reset_index()
price['month'] = price['date'].dt.month
price.loc[price.groupby('month')['close'].idxmin(), ['month', 'date', 'close']]

Unnamed: 0,month,date,close
1,1,2020-01-03,67.57
38,2,2020-02-27,65.46
55,3,2020-03-23,52.47
64,4,2020-04-03,54.52
83,5,2020-05-01,65.58
122,6,2020-06-26,67.53
125,7,2020-07-01,71.41
148,8,2020-08-04,72.75
183,9,2020-09-23,70.28
192,10,2020-10-06,72.18


10. Why is apply() usually worse than vectorized operations for these problems? explain in words, no code.

Apply function gets executed row by row, which takes time for the iteration across the rows. In contrast, vectorized operations perform operations column-wise, which uses underlying optimized implementation. 