In [14]:
import pandas as pd
import numpy as np
# Relative path from notebooks/ to data/
file_path = "../data/google_stock_data.xlsx"

# Read the Excel file
df = pd.read_excel(file_path)

# Round all numerical columns to 2 decimal places
df = df.round(2)

# update the date format to YYYY-MM-DD
df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')

In [15]:
# Rename columns
df.rename(columns={'Close_GOOG': 'Close', 'High_GOOG': 'High'
                   ,'Low_GOOG': 'Low', 'Open_GOOG': 'Open'
                   ,'Volume_GOOG': 'Volume'}, inplace=True)

In [16]:
# Display the columns of the DataFrame
df.head()

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2020-01-02,67.9,67.94,66.62,66.62,28132000
1,2020-01-03,67.57,68.16,66.82,66.93,23728000
2,2020-01-06,69.24,69.35,67.04,67.04,34646000
3,2020-01-07,69.19,69.67,69.05,69.42,30054000
4,2020-01-08,69.74,70.1,69.07,69.13,30560000


1. Sort by a single column

In [17]:
df['High'].sort_values()

55       53.20
52       54.95
64       55.80
63       55.96
62       56.10
         ...  
1253    197.03
1254    197.52
1249    198.05
1247    199.99
1248    202.23
Name: High, Length: 1258, dtype: float64

2. Sort by multiple columns

In [18]:
df.sort_values(by= ['Volume', 'Close'], ascending = [True, False])

Unnamed: 0,Date,Close,High,Low,Open,Volume
1253,2024-12-24,196.93,197.03,194.57,195.54,6809800
248,2020-12-24,86.35,86.71,85.87,86.16,6936000
1132,2024-07-03,186.33,186.56,184.33,185.24,7409100
1254,2024-12-26,196.46,197.52,195.24,196.10,7907900
731,2022-11-25,96.94,98.27,96.87,97.80,8567800
...,...,...,...,...,...,...
50,2020-03-16,53.85,57.22,53.36,54.43,85048000
122,2020-06-26,67.53,71.19,67.14,71.08,85354000
210,2020-10-30,80.50,83.78,79.68,83.04,86582000
526,2022-02-02,147.03,151.07,144.57,150.83,89750000


3. Find basic descriptive statistics

In [19]:
df.describe()

Unnamed: 0,Close,High,Low,Open,Volume
count,1258.0,1258.0,1258.0,1258.0,1258.0
mean,119.128609,120.422067,117.804459,119.040739,27411600.0
std,32.724273,32.955081,32.53096,32.759203,12719130.0
min,52.47,53.2,50.33,52.47,6809800.0
25%,93.55,94.75,92.0225,93.535,18850220.0
50%,120.195,121.48,119.275,120.225,24457900.0
75%,140.7325,142.0375,139.5425,140.755,32134000.0
max,197.52,202.23,196.06,197.89,97798600.0


4. Create a new column:
Range = High - Low

In [20]:
df['Range'] = df['High'] - df['Low']
df

Unnamed: 0,Date,Close,High,Low,Open,Volume,Range
0,2020-01-02,67.90,67.94,66.62,66.62,28132000,1.32
1,2020-01-03,67.57,68.16,66.82,66.93,23728000,1.34
2,2020-01-06,69.24,69.35,67.04,67.04,34646000,2.31
3,2020-01-07,69.19,69.67,69.05,69.42,30054000,0.62
4,2020-01-08,69.74,70.10,69.07,69.13,30560000,1.03
...,...,...,...,...,...,...,...
1253,2024-12-24,196.93,197.03,194.57,195.54,6809800,2.46
1254,2024-12-26,196.46,197.52,195.24,196.10,7907900,2.28
1255,2024-12-27,193.41,196.16,191.35,195.84,14693000,4.81
1256,2024-12-30,192.07,193.15,189.75,190.25,12209500,3.40


5. Calculate the average closing price across the entire dataset.

In [21]:
df['Close'].mean()

119.12860890302068

6. Compute total volume for the full dataset.

In [22]:
df['Volume'].sum()

34483792300

7. Group data by month and calculate:

- average Close

- total Volume

In [34]:
df['Date'] = pd.to_datetime(df['Date'])
df['Month'] = df['Date'].dt.month

# monthly average close
df.groupby('Month')['Close'].mean()


Month
1     107.006040
2     110.406458
3     106.708739
4     113.640680
5     118.284095
6     120.047619
7     124.807905
8     125.622613
9     122.057961
10    123.610183
11    126.944854
12    129.367264
Name: Close, dtype: float64

In [35]:
# monthly total volume
df.groupby('Month')['Volume'].sum()

Month
1     2905965200
2     3292449700
3     3993728800
4     3056123500
5     2876260500
6     2824231200
7     2680528400
8     2413015300
9     2628260100
10    2762853400
11    2513567800
12    2536808400
Name: Volume, dtype: int64

8. Return the row where the Closing price is highest.

In [40]:
df.loc[df['Close'].idxmax()]

Date      2024-12-16 00:00:00
Close                  197.52
High                   199.99
Low                    193.48
Open                   193.74
Volume               32248600
Range                    6.51
Month                      12
Name: 1247, dtype: object

9. Add a 5-day rolling average of Close price.

In [41]:
df['Close'].rolling(5).mean()

0           NaN
1           NaN
2           NaN
3           NaN
4        68.728
         ...   
1253    192.652
1254    194.036
1255    194.900
1256    194.846
1257    193.740
Name: Close, Length: 1258, dtype: float64

10. Find days where Volume > (mean + 2 × std).

In [43]:
vol_mean = df['Volume'].mean()
vol_std = df['Volume'].std()

df[df['Volume'] > vol_mean + (2*vol_std)]


Unnamed: 0,Date,Close,High,Low,Open,Volume,Range,Month
21,2020-02-03,73.79,73.99,72.45,72.6,60736000,1.54,2
22,2020-02-04,71.86,72.98,70.83,72.36,78660000,2.15,2
35,2020-02-24,70.6,71.36,70.09,70.82,57342000,1.27,2
38,2020-02-27,65.46,68.12,65.41,67.64,59566000,2.71,2
39,2020-02-28,66.51,66.6,63.12,63.44,75782000,3.48,2
44,2020-03-06,64.48,64.87,62.62,63.42,53212000,2.25,3
45,2020-03-09,60.36,62.31,59.59,59.86,67308000,2.72,3
48,2020-03-12,55.37,59.29,55.29,55.92,84534000,4.0,3
49,2020-03-13,60.57,60.57,55.48,58.55,74002000,5.09,3
50,2020-03-16,53.85,57.22,53.36,54.43,85048000,3.86,3
