In [15]:
import pandas as pd
import numpy as np
# Relative path from notebooks/ to data/
file_path = "../data/google_stock_data.xlsx"

# Read the Excel file
df = pd.read_excel(file_path)

In [16]:
# Display the column names of the DataFrame
df.columns

Index(['Date', 'Close_GOOG', 'High_GOOG', 'Low_GOOG', 'Open_GOOG',
       'Volume_GOOG'],
      dtype='object')

Clean the dataset.

In [17]:
# Round all numerical columns to 2 decimal places
df = df.round(2)

# update the date format to YYYY-MM-DD
df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')

1. Select only the Date and Close columns.

In [18]:
df[["Date", "Close_GOOG"]]

Unnamed: 0,Date,Close_GOOG
0,2020-01-02,67.90
1,2020-01-03,67.57
2,2020-01-06,69.24
3,2020-01-07,69.19
4,2020-01-08,69.74
...,...,...
1253,2024-12-24,196.93
1254,2024-12-26,196.46
1255,2024-12-27,193.41
1256,2024-12-30,192.07


2. Select all rows where Close > 105.

In [19]:
df[df['Close_GOOG'] > 105]

Unnamed: 0,Date,Close_GOOG,High_GOOG,Low_GOOG,Open_GOOG,Volume_GOOG
282,2021-02-16,105.37,106.90,104.50,104.50,22676000
283,2021-02-17,105.69,105.96,104.23,104.29,21418000
284,2021-02-18,105.14,105.91,104.47,104.80,22432000
299,2021-03-11,105.02,105.56,102.91,103.00,24782000
314,2021-04-01,106.16,106.42,104.13,104.18,33980000
...,...,...,...,...,...,...
1253,2024-12-24,196.93,197.03,194.57,195.54,6809800
1254,2024-12-26,196.46,197.52,195.24,196.10,7907900
1255,2024-12-27,193.41,196.16,191.35,195.84,14693000
1256,2024-12-30,192.07,193.15,189.75,190.25,12209500


3. Select rows where Close > 105 and Volume > 1,000,000.

In [20]:
df[(df['Close_GOOG'] > 105) & (df['Volume_GOOG'] > 1000000)]

Unnamed: 0,Date,Close_GOOG,High_GOOG,Low_GOOG,Open_GOOG,Volume_GOOG
282,2021-02-16,105.37,106.90,104.50,104.50,22676000
283,2021-02-17,105.69,105.96,104.23,104.29,21418000
284,2021-02-18,105.14,105.91,104.47,104.80,22432000
299,2021-03-11,105.02,105.56,102.91,103.00,24782000
314,2021-04-01,106.16,106.42,104.13,104.18,33980000
...,...,...,...,...,...,...
1253,2024-12-24,196.93,197.03,194.57,195.54,6809800
1254,2024-12-26,196.46,197.52,195.24,196.10,7907900
1255,2024-12-27,193.41,196.16,191.35,195.84,14693000
1256,2024-12-30,192.07,193.15,189.75,190.25,12209500


4. Select all rows between "2024-01-01" and "2024-01-10"

In [21]:
df[(df['Date'] >= '2024-01-01') & (df['Date'] <= '2024-01-10')]

Unnamed: 0,Date,Close_GOOG,High_GOOG,Low_GOOG,Open_GOOG,Volume_GOOG
1006,2024-01-02,138.61,139.66,136.8,138.65,20071900
1007,2024-01-03,139.41,140.13,137.49,137.66,18974300
1008,2024-01-04,137.1,139.68,137.07,138.9,18253300
1009,2024-01-05,136.46,137.87,135.92,137.41,15439500
1010,2024-01-08,139.57,139.68,136.94,137.06,17645300
1011,2024-01-09,141.59,141.83,138.84,139.11,19579700
1012,2024-01-10,142.82,143.54,141.49,141.55,16641900


5. Select the first 5 rows and last 5 rows using indexing

In [22]:
print(df.iloc[:5])
print(df.iloc[-5:])

         Date  Close_GOOG  High_GOOG  Low_GOOG  Open_GOOG  Volume_GOOG
0  2020-01-02       67.90      67.94     66.62      66.62     28132000
1  2020-01-03       67.57      68.16     66.82      66.93     23728000
2  2020-01-06       69.24      69.35     67.04      67.04     34646000
3  2020-01-07       69.19      69.67     69.05      69.42     30054000
4  2020-01-08       69.74      70.10     69.07      69.13     30560000
            Date  Close_GOOG  High_GOOG  Low_GOOG  Open_GOOG  Volume_GOOG
1253  2024-12-24      196.93     197.03    194.57     195.54      6809800
1254  2024-12-26      196.46     197.52    195.24     196.10      7907900
1255  2024-12-27      193.41     196.16    191.35     195.84     14693000
1256  2024-12-30      192.07     193.15    189.75     190.25     12209500
1257  2024-12-31      189.83     192.63    188.97     191.82     14355200


6. Select rows where Date is '2020-01-03'.

In [23]:
df[df['Date'] == '2020-01-03']

Unnamed: 0,Date,Close_GOOG,High_GOOG,Low_GOOG,Open_GOOG,Volume_GOOG
1,2020-01-03,67.57,68.16,66.82,66.93,23728000


7. Add a column 'High_Volume' that is True if Volume > 1,200,000, otherwise False.

In [24]:
df['High_volume'] = np.where(df['High_GOOG']> 1200000, True, False)
df.head()

Unnamed: 0,Date,Close_GOOG,High_GOOG,Low_GOOG,Open_GOOG,Volume_GOOG,High_volume
0,2020-01-02,67.9,67.94,66.62,66.62,28132000,False
1,2020-01-03,67.57,68.16,66.82,66.93,23728000,False
2,2020-01-06,69.24,69.35,67.04,67.04,34646000,False
3,2020-01-07,69.19,69.67,69.05,69.42,30054000,False
4,2020-01-08,69.74,70.1,69.07,69.13,30560000,False
