In [1]:
import pandas as pd
import numpy as np
# Workbook location, relative to the notebooks/ directory
file_path = "../data/google_stock_data.xlsx"

# Load the sheet and keep every numeric column at two decimal places
df = pd.read_excel(file_path).round(2)

# Store Date as 'YYYY-MM-DD' text
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates.dt.strftime('%Y-%m-%d')

In [2]:
# Drop the ticker suffix so the price/volume columns have plain names.
# rename() returns a new frame; rebinding df avoids inplace=True, which
# gives no speed benefit and prevents method chaining.
df = df.rename(columns={
    'Close_GOOG': 'Close',
    'High_GOOG': 'High',
    'Low_GOOG': 'Low',
    'Open_GOOG': 'Open',
    'Volume_GOOG': 'Volume',
})

# Preview the first five rows of the renamed frame
df.head()

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2020-01-02,67.9,67.94,66.62,66.62,28132000
1,2020-01-03,67.57,68.16,66.82,66.93,23728000
2,2020-01-06,69.24,69.35,67.04,67.04,34646000
3,2020-01-07,69.19,69.67,69.05,69.42,30054000
4,2020-01-08,69.74,70.1,69.07,69.13,30560000


1. Group the data by month and calculate the average volume per month.

In [3]:
# Parse Date into datetime64 so the .dt accessor can be used
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates
df['Month'] = parsed_dates.dt.month

# Mean traded Volume for each calendar month number, pooled over all years
df.groupby('Month')['Volume'].mean()

Month
1     2.877193e+07
2     3.429635e+07
3     3.597954e+07
4     2.967110e+07
5     2.739296e+07
6     2.689744e+07
7     2.552884e+07
8     2.173888e+07
9     2.551709e+07
10    2.534728e+07
11    2.440357e+07
12    2.393215e+07
Name: Volume, dtype: float64

2. For each month, compute:

- average Open

- maximum High

- minimum Low

- total Volume

In [4]:
# Per-month summary: mean Open, highest High, lowest Low and summed Volume
monthly_stats = dict(Open='mean', High='max', Low='min', Volume='sum')
df.groupby('Month').agg(monthly_stats)

Unnamed: 0_level_0,Open,High,Low,Volume
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,106.790594,154.15,66.62,2905965200
2,110.442396,151.07,63.12,3292449700
3,106.517387,152.16,50.33,3993728800
4,113.648835,175.22,53.62,3056123500
5,117.942857,178.73,64.51,2876260500
6,119.859905,186.44,66.89,2824231200
7,124.709429,192.21,70.01,2680528400
8,125.465225,174.68,72.44,2413015300
9,122.420194,166.63,69.85,2628260100
10,123.524771,182.99,71.31,2762853400


3. Create a new column, `range = High - Low`.
   Then compute the average price range by month.

In [5]:
# Daily trading range: the gap between the day's High and Low
df['range'] = df['High'].sub(df['Low'])

# Mean daily range for each calendar month number
df['range'].groupby(df['Month']).mean()

Month
1     2.435644
2     2.712292
3     2.828649
4     2.661942
5     2.598571
6     2.389238
7     2.764095
8     2.383874
9     2.561553
10    2.786881
11    2.558932
12    2.723302
Name: range, dtype: float64

4. Find the month with the highest average daily price range.

In [6]:
# Mean daily High-Low range for each calendar month number
monthly_avg_range = df.groupby('Month')['range'].mean()

# Ranking table (largest average range first), kept under its original name
sorted_df = monthly_avg_range.sort_values(ascending=False).reset_index()

# idxmax() returns the Month label of the largest mean directly, so the
# answer does not depend on sorting and positional lookup
monthly_avg_range.idxmax()

3

5. Compute a 7-day rolling mean for the Close price.

In [7]:
# Rolling mean of Close over a fixed window of 7 consecutive rows; entries
# are NaN until the window has been filled
close_prices = df['Close']
close_prices.rolling(window=7).mean()

0              NaN
1              NaN
2              NaN
3              NaN
4              NaN
           ...    
1253    193.894286
1254    193.742857
1255    193.304286
1256    193.665714
1257    193.771429
Name: Close, Length: 1258, dtype: float64

6. Calculate each year's overall average closing price.

In [8]:
# Calendar year of each row, taken from the datetime Date column
date_column = df['Date']
df['Year'] = date_column.dt.year

In [9]:
# Mean closing price over all rows of each year
df['Close'].groupby(df['Year']).mean()

Year
2020     73.566719
2021    124.677222
2022    114.410837
2023    118.798920
2024    164.348810
Name: Close, dtype: float64

7. Calculate the monthly total volume for each year.

In [10]:
# Total traded Volume for every (Year, Month) pair
year_month_keys = ['Year', 'Month']
df.groupby(year_month_keys)['Volume'].agg('sum')

Year  Month
2020  1         673832000
      2         741612000
      3        1427842000
      4         926702000
      5         637326000
      6         776568000
      7         704940000
      8         756618000
      9         811352000
      10        785084000
      11        700708000
      12        629318000
2021  1         661654000
      2         588580000
      3         695884000
      4         614494000
      5         509114000
      6         548656000
      7         456776000
      8         372312000
      9         505198000
      10        496572000
      11        443172000
      12        479260000
2022  1         615510000
      2         732650000
      3         637420000
      4         564940000
      5         717486000
      6         627132000
      7         638766600
      8         430961500
      9         532993100
      10        584537500
      11        603449000
      12        498585400
2023  1         526178100
      2         754400500


### Pivot Tables

8. Create a pivot table with:

- index = month

- columns = year

- values = average closing price

In [11]:
# Month-by-year grid of the mean closing price
pd.pivot_table(df, values='Close', index='Month', columns='Year', aggfunc='mean')

Year,2020,2021,2022,2023,2024
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,71.344762,89.803684,135.1185,93.3775,144.437143
2,72.708947,102.714737,135.648947,96.151053,143.0885
3,59.015455,102.293043,134.436522,97.888261,142.506
4,61.28619,113.714762,127.7355,105.625789,157.653182
5,68.5875,116.83,112.756667,115.951364,172.393636
6,71.065,124.218636,112.058571,122.39,178.175789
7,75.163182,131.439048,112.5555,122.7155,181.163636
8,76.771429,138.373182,116.31913,130.256957,164.384091
9,75.411905,140.917619,103.401429,134.279,158.602
10,76.816818,139.492857,98.787619,134.434545,166.177826


9. Make a pivot table that shows, for each month:

- mean closing price

- total volume

In [12]:
# Month-by-year grid holding two measures: mean Close and summed Volume.
# The result has two-level columns: (measure, year).
measure_aggregations = {'Close': 'mean', 'Volume': 'sum'}
pivot = pd.pivot_table(
    df,
    values=['Close', 'Volume'],
    index='Month',
    columns='Year',
    aggfunc=measure_aggregations,
)

10. From your pivot table, determine which month had the highest total volume across all years.

In [13]:
# Sum the per-year 'Volume' columns for each month. Selecting the top-level
# 'Volume' key picks up every year present in the pivot, so no year list has
# to be maintained by hand (a hard-coded list raises KeyError when a year is
# missing and silently ignores any year that is not listed).
total_volume_by_month = pivot['Volume'].sum(axis=1)
pivot['Total_Volume'] = total_volume_by_month

# Month whose total volume across all years is the largest
total_volume_by_month.idxmax()

3