In [34]:
import requests
from bs4 import BeautifulSoup

try:
    # URL für S&P 500 Ticker
    sp500_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    sp500_response = requests.get(sp500_url)
    sp500_response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

    sp500_soup = BeautifulSoup(sp500_response.text, "html.parser")

    # Try to find the table, catch AttributeError if not found
    sp500_table = sp500_soup.find("table", {"class": "wikitable"})

    if sp500_table is None:
        raise AttributeError("Table with class 'wikitable' not found.")

    # Extract tickers from the first column of the table
    tickers = [row.td.a.text.strip() for row in sp500_table.find_all('tr')[1:]]  # Skip the header row

except requests.exceptions.RequestException as e:
    print(f"Error: Could not crawl data from {sp500_url}. Exception: {e}")

except AttributeError as e:
    print(f"Error: {e}")





In [36]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import requests
import pandas as pd
import datetime
import time
import csv
import random
from urllib.error import HTTPError

def crawl(url):
    try:
        req = Request(url=url, headers={"user-agent": "my-app"})
        response = urlopen(req)
        html = BeautifulSoup(response, 'html.parser')
        
        insider_table = html.find(class_="body-table styled-table-new is-rounded p-0 mt-2")
        if insider_table is None:
            print("Table from " + url + " could not be found.")
            return None

        req = Request(url = url, headers={"user-agent": "my-app"})
        response = urlopen(req)

        html = BeautifulSoup(response, 'html')
        
        insider_table = html.find(class_="body-table styled-table-new is-rounded p-0 mt-2")
        print(insider_table)
        if insider_table is None:
            print("Tabelle aus "+ url + " kann nicht gefunden werden")
            return None
        
        #print(insider_table)
        parsed_data = []
    
        for row in insider_table.find_all("tr")[1:]:
            insider_name = row.find('td')
            # Get the Tradername in the first td element
            if insider_name is not None:
                insider_name = insider_name.text.strip()
            else:
                insider_name = None
            # Get the title of the Insider in the second td element
            insider_title = row.find_all('td')[1]
            if insider_title is not None:
                insider_title = insider_title.text.strip()
            else:
                insider_title = None
            # Get the date of the transaction in the third td element
            transaction_date = row.find_all('td')[2]
            if transaction_date is not None:
                transaction_date = transaction_date.text.strip()
            else:
                transaction_date = None
            # Get the type of transaction in the fourth td element
            transaction_type = row.find_all('td')[3]
            if transaction_type is not None:
                transaction_type = transaction_type.text.strip()
            else:
                transaction_type = None
            # Get the Cost of transaction in the fifth td element
            cost = row.find_all('td')[4]
            if cost is not None:
                cost = cost.text.strip()
            else:
                cost = None
            # Get the num of shares in the sixth td element
            num_shares = row.find_all('td')[5]
            if num_shares is not None:
                num_shares = num_shares.text.strip()
            else:
                num_shares = None
            # Get the Value of shares in the seventh td element
            value_shares = row.find_all('td')[6]
            if value_shares is not None:
                value_shares = value_shares.text.strip()
            else:
                value_shares = None
            # Get the num of shares the insider owns in the eighth td element
            total_num_shares_insider_owns = row.find_all('td')[7]
            if total_num_shares_insider_owns is not None:
                total_num_shares_insider_owns = total_num_shares_insider_owns.text.strip()
            else:
                total_num_shares_insider_owns = None
            # Get the date of the SEC Form, ka was das ist,  the ninth td element
            SEC_date = row.find_all('td')[8]
            if SEC_date is not None:
                SEC_date = SEC_date.text.strip()
            else:
                SEC_date = None

            parsed_data.append([insider_name, insider_title, transaction_date, transaction_type,
                                cost, num_shares, value_shares, total_num_shares_insider_owns, SEC_date])
        return parsed_data

    except HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}. Unable to crawl data from {url}")
        return None
    

def enrich_date(df):
    import datetime
    # the table in Finviz has the dateformat dd.mm and i want to enrich the date with the year of the insider trade
    current_year = datetime.datetime.now().year

    # Convert the 'Date' column to datetime objects
    df['Date'] = pd.to_datetime(df['transaction_date'], format='%b %d')

    # Add the current year to the datetime objects
    df['Date'] = df['Date'].apply(lambda x: x.replace(year=current_year))

    return df
    
def create_initial_df(path):

    list = crawl(url)
    #print(list)

    if list is None:
        return None
    else:
        df = pd.DataFrame(list, columns = ["insider" , "title", "transaction_date", "transaction_type",
                                "cost", "num_shares", "value_shares", "total_num_shares_insider_owns", "SEC_date"])
        enrich_date(df)
        #save df to local storage
        df.to_csv(path, index=False)
  
for ticker in tickers:

    root = "https://finviz.com/quote.ashx?t="
    url = root + ticker
    path = "C:/Users/trist/code/Data/Finviz_insider/Data/" + ticker + ".csv"


    create_initial_df(path)


delay_seconds = random.uniform(0, 1)
time.sleep(delay_seconds)


<table cellpadding="0" cellspacing="0" class="body-table styled-table-new is-rounded p-0 mt-2" width="100%">
<thead>
<tr>
<th align="left" class="">Insider Trading</th>
<th align="left" class="">Relationship</th>
<th align="left" class="">Date</th>
<th align="center" class="">Transaction</th>
<th align="right" class="">Cost</th>
<th align="right" class="">#Shares</th>
<th align="right" class="">Value ($)</th>
<th align="right" class="">#Shares Total</th>
<th align="center" class="">SEC Form 4</th>
</tr>
</thead>
<tr class="fv-insider-row is-option" valign="top"><td><a class="tab-link" href="insidertrading.ashx?oc=1820595&amp;tc=7">Lavers Jeffrey R</a></td><td style="white-space:nowrap">Group President</td><td>Jun 30</td><td align="center" class="transaction" style="white-space:nowrap"><span>Option Exercise</span></td><td align="right" class="value">100.09</td><td align="right" class="value">7,783</td><td align="right" class="value">779,000</td><td align="right" class="value">14,460</td