## Import the List of Companies

In [1]:
from config import companies_of_interest

# prints a list of all Companies of Interest
# for company in companies_of_interest:
#         print(company)

## Importing Necessary Libraries

In [2]:
import json
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

## Helper Functions

In [3]:
def write_to_file(data, filename='hindenburg_data.json'):
    """Write ``data`` to ``filename`` as indented JSON, replacing any existing file.

    Args:
        data: A JSON-serializable object.
        filename: Destination path; defaults to ``hindenburg_data.json``.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialize.
        ValueError: If ``data`` contains a circular reference.
        OSError: If the file cannot be opened or written.
    """
    # Serialize *before* opening the file: mode 'w' truncates on open, so a
    # serialization error raised mid-dump would otherwise destroy the previous
    # contents and leave a partial, invalid JSON file behind.
    serialized = json.dumps(data, indent=1)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(serialized)

## Main Scraping Function

In [4]:
def _extract_article(article):
    """Extract title, link and publication date from one post-preview element.

    Returns:
        A dict with keys ``title``, ``link`` and ``date_published``, or
        ``None`` when the element has no ``h2.post-title`` containing a link
        with an ``href``.
    """
    title_tag = article.find("h2", class_="post-title")
    anchor = title_tag.find("a") if title_tag else None
    if anchor is None or not anchor.get("href"):
        # Malformed preview: skip it instead of crashing the whole crawl.
        return None

    # Prefix match on the class attribute: accepts "entry-date published" as
    # well as class values that continue past that prefix.
    date_tag = article.find(
        "time",
        class_=lambda value: value and value.startswith("entry-date published"),
    )
    date_published = date_tag.text.strip() if date_tag else "Not found"

    return {
        "title": anchor.text.strip(),
        "link": anchor["href"],
        "date_published": date_published,
    }


def scrape_hindenburg_research(company_names):
    """Collect Hindenburg Research posts whose title mentions a given company.

    Starts at the site's front page and follows the "next" pagination link
    until there is none. A post is kept when any entry of ``company_names``
    occurs in its title (case-sensitive substring match). The collected records
    are written to disk with ``write_to_file`` and also returned.

    Args:
        company_names: Iterable of company name strings to look for in titles.

    Returns:
        A list of dicts, each with keys ``title``, ``link`` and
        ``date_published`` (``"Not found"`` when the page shows no date).

    Raises:
        requests.RequestException: If a page cannot be fetched, times out, or
            answers with an HTTP error status.
    """
    base_url = "https://hindenburgresearch.com/"
    request_timeout = 30  # seconds; without a timeout requests can block forever
    current_url = base_url
    visited_urls = set()
    scraped_data = []

    # The visited set protects against a pagination link that loops back.
    while current_url and current_url not in visited_urls:
        visited_urls.add(current_url)

        response = requests.get(current_url, timeout=request_timeout)
        # Fail loudly on 4xx/5xx rather than parsing an error page, which
        # would silently end the crawl with incomplete results.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        for article in soup.find_all("div", class_="post-preview"):
            data = _extract_article(article)
            if data is None:
                continue
            # Keep the post only if its title contains one of the company names.
            if any(company_name in data["title"] for company_name in company_names):
                scraped_data.append(data)

        # Find the "next" link; resolve it against the current page so a
        # relative href still yields a fetchable URL.
        next_button = soup.find("a", class_="next page-numbers")
        next_href = next_button.get("href") if next_button else None
        current_url = urljoin(current_url, next_href) if next_href else None

        # Be polite to the server, but only when another request will follow.
        if current_url:
            time.sleep(1)

    # Save the scraped data to a file.
    write_to_file(scraped_data)
    return scraped_data

In [5]:
if __name__ == "__main__":
    # Crawl the site for posts naming any company of interest. The scraper
    # already saves its results through write_to_file before returning, so no
    # separate save step is needed here.
    hindenburg_data = scrape_hindenburg_research(company_names=companies_of_interest)
    # To inspect the result, display `hindenburg_data` in a following cell.