# Dependencies

In [1]:
!pip install yfinance



In [None]:
!pip install gnews

In [2]:
# Attach Google Drive to the Colab filesystem at /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


# Target Stocks and Dates

In [3]:
# Tickers to process, in processing order.
stocks = ['AAPL', 'META', 'NVDA']

# Search keywords per ticker.
# NOTE(review): 'Metaverse' is a topic rather than the company name — confirm
# this is the intended keyword for META.
related_terms = dict(
    AAPL=['AAPL', 'Apple'],
    META=['META', 'Metaverse'],
    NVDA=['NVDA', 'NVIDIA'],
)

# Company URL names for Investing.com, keyed by ticker.
url_names = dict(
    AAPL='apple-computer-inc',
    META='facebook-inc',
    NVDA='nvidia-corp',
)

# Date range for the study, as 'YYYY-MM-DD' strings.
start, end = "2022-01-01", "2024-01-01"

# Retrieve Stock Data

In [4]:
import yfinance as yf
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [5]:
stock_dfs = {}
price_columns = ['Adj Close', 'Open', 'High', 'Low', 'Close', 'Volume']

# Download the daily price history of every ticker, save it as a CSV on Drive,
# and keep each frame in `stock_dfs` keyed by ticker.
for stock in stocks:
    # Request unadjusted prices explicitly so the 'Adj Close' column is present.
    # NOTE(review): newer yfinance releases default to auto_adjust=True, which
    # drops 'Adj Close' — confirm against the installed version.
    raw_df = yf.download(stock, start=start, end=end, auto_adjust=False)

    # Fail loudly instead of writing an empty CSV and feeding it downstream.
    if raw_df.empty:
        raise ValueError(f"No price data returned for {stock} between {start} and {end}")

    # Some yfinance releases return (field, ticker) MultiIndex columns even for
    # a single ticker; keep only the field level so the CSV has a flat header.
    if isinstance(raw_df.columns, pd.MultiIndex):
        raw_df.columns = raw_df.columns.get_level_values(0)

    stock_df = raw_df[price_columns].round(3)
    stock_df = stock_df.reset_index()  # move the date index into a regular column
    stock_df.to_csv(f"/content/drive/My Drive/WPI/Senior Year/CS539 (ML)/{stock}_stock.csv", index=False)
    stock_dfs[stock] = stock_df

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


# Retrieve News Articles

In [None]:
from gnews import GNews
import pandas as pd

def convert_date(date_str):
    """Split a 'YYYY-MM-DD' string into a ``(year, month, day)`` tuple of ints.

    Raises ``ValueError`` if the string does not have exactly three
    dash-separated parts or if a part is not an integer.
    """
    year, month, day = map(int, date_str.split('-'))
    return (year, month, day)

def format_date(date_str):
    """Parse a date string with pandas and return it formatted as 'YYYY-MM-DD'."""
    parsed = pd.to_datetime(date_str)
    return parsed.strftime('%Y-%m-%d')

articles_by_stock = {}
max_retries = 5  # Maximum number of attempts per date window before giving up on it

# For each ticker, query Google News over every window between two consecutive
# dates in the stock data, collect the accepted batches, and save them to Drive.
for ticker, stock_df in stock_dfs.items():
    all_dfs = []
    all_dates = stock_df['Date'].dt.strftime('%Y-%m-%d').tolist()

    # Pair each date with the next one; the last date has no successor, so stop one short.
    for i in tqdm(range(len(all_dates) - 1)):
        window_start, window_end = all_dates[i], all_dates[i+1]

        # The client depends only on the window, so build it once per window
        # instead of once per retry.
        google_news = GNews(
            language='en',
            country='US',
            start_date=convert_date(window_start),
            end_date=convert_date(window_end),
            max_results=10,
        )

        retries = 0
        articles_fetched = False
        while not articles_fetched and retries < max_retries:
            # Try the search terms in order and keep the first batch that passes the date check.
            for term in related_terms[ticker]:
                json_resp = google_news.get_news(term)
                if json_resp:
                    df = pd.DataFrame(json_resp)
                    df['published date'] = df['published date'].apply(format_date)
                    # Accept the batch only if every article falls inside the window (inclusive).
                    # 'YYYY-MM-DD' strings compare chronologically, so a range check works and,
                    # unlike a membership test on the two end dates, it also accepts articles
                    # published on days between them (e.g. over a weekend gap in the stock dates).
                    if all(window_start <= date <= window_end for date in df['published date'].tolist()):
                        all_dfs.append(df)
                        articles_fetched = True
                        break  # Exit the loop once articles are fetched
            retries += 1  # Increment retry count

    if all_dfs:
        all_news_df = pd.concat(all_dfs, ignore_index=True)
    else:
        # No window produced an accepted batch; save and store an empty frame.
        all_news_df = pd.DataFrame()

    all_news_df.to_csv(f"/content/drive/My Drive/WPI/Senior Year/CS539 (ML)/{ticker}_news.csv", index=False)
    print(f"{ticker} news saved.")
    articles_by_stock[ticker] = all_news_df