In [None]:
%load_ext autoreload
%autoreload 2

TODOs:

- Remove sentence limitation

In [None]:
import pandas as pd
from bs4 import BeautifulSoup

from datetime import datetime
import json
import yaml
import hashlib

from financial_report_analyzer.database_conntector import DatabaseConnector
from financial_report_analyzer.scraping import SECScraper
from financial_report_analyzer.model import ScoringModel
from financial_report_analyzer.content_extractor import TextExtractor

In [None]:
# Read the ticker table from the system clipboard. The table has to be copied
# before this cell runs, so the result is not reproducible from code alone.
nasdaq = pd.read_clipboard()

In [None]:
# Rewrite number strings such as "1.234,56" to "1234.56": every "." is removed,
# then every "," becomes ".". Non-string cells (e.g. NaN) are passed through
# unchanged instead of raising AttributeError.
# NOTE: the rewrite touches every string cell and is not idempotent; running
# this cell twice also strips the decimal point produced by the first run.
# DataFrame.map is the replacement for the deprecated DataFrame.applymap
# (pandas >= 2.1); older versions fall back to applymap.
nasdaq = (nasdaq.map if hasattr(nasdaq, "map") else nasdaq.applymap)(
    lambda x: x.replace(".", "").replace(",", ".") if isinstance(x, str) else x
)

In [None]:
# Show today's date; the same expression is stamped into every ticker record
# built in the next cell.
datetime.now().date()

In [None]:
# Map each symbol to its name, market cap, last sale price and the date on
# which the record was built. A repeated symbol keeps the values of its last row.
nasdaq_tickers = {
    record["Symbol"]: {
        "name": record["Name"],
        "market_cap": record["Market Cap"],
        "last_sale": record["Last Sale"],
        "date": datetime.now().date(),
    }
    for _, record in nasdaq.iterrows()
}

In [None]:
# Persist the ticker mapping as YAML. The path is relative to the notebook's
# working directory, and the file is overwritten on every run.
with open("../financial_report_analyzer/defaults/nasdaq_tickers.yaml", "w") as f:
    yaml.dump(nasdaq_tickers, f)

In [None]:
# Scraper instance used by the archive-exploration cells that follow.
scraper = SECScraper()

In [None]:
# Request the archive for the hard-coded identifier "ADBE" (presumably a ticker
# symbol), then derive the filing URLs from that response. The next cell
# indexes the result as archive_urls[2024]["filings_url"].
archive = scraper.request_archive("ADBE")
archive_urls = scraper.fetch_filing_urls(archive)

In [None]:
# Pick the "filings_url" entry stored under the key 2024.
filing_url = archive_urls[2024]["filings_url"]

In [None]:
# Fetch the page behind filing_url and parse its text with Python's built-in
# "html.parser" backend.
filings_response = scraper.request_filings(filing_url)
archive_soup = BeautifulSoup(filings_response.text, "html.parser")

In [None]:
# Display the parsed filings page for inspection.
archive_soup

In [None]:
# Parse the archive response and collect every <tr> of the first table with
# class "tableFile2". soup.find returns None when no such table exists, in
# which case the find_all call below fails.
soup = BeautifulSoup(archive.text, "html.parser")
table = soup.find("table", class_="tableFile2")
rows = table.find_all("tr")

In [None]:
# Display the located table element.
table

In [None]:
columns = ['Description', 'Document']
data = []
cell_dict = {}

# Sample only the first three data rows; rows[0] is the header row and is skipped.
# enumerate supplies the running position that keys cell_dict.
for position, table_row in enumerate(rows[1:4]):
    cells = table_row.find_all('td')
    # Keep the raw <td> cells per row for inspection in a later cell.
    cell_dict[position] = cells
    # First cell: description text. Second cell: href of its first <a>.
    data.append([cells[0].text.strip(), cells[1].find("a")["href"]])

# One DataFrame row per sampled table row.
df = pd.DataFrame(data, columns=columns)

In [None]:
# Print the href of the first <a> in every row after rows[0].
# row.find("a") returns None for a row without a link, which makes the
# subscript below fail.
for row in rows[1:]:
    url = row.find("a")["href"]
    print(url)

In [None]:
# Inspect the raw <td> cells captured for each sampled row.
cell_dict

In [None]:
# Display the Description/Document table built from the sampled rows.
df

In [None]:
# Copy df to the system clipboard without the index column.
df.to_clipboard(index=False)

In [None]:
def create_hash(texts: list):
    """Return the SHA-256 hex digest of all strings in ``texts`` concatenated.

    The strings are joined without a separator and encoded with the default
    codec of ``str.encode()`` (UTF-8) before hashing, so only the concatenated
    text matters, not how it is split across list items.
    """
    hasher = hashlib.sha256()
    hasher.update("".join(texts).encode())
    return hasher.hexdigest()

def merged(df, col1="ticker", col2="year"):
    """Build a per-row composite key by concatenating two columns as text.

    Both columns are converted to ``str`` first, so the key can be built even
    when one column holds text and the other holds numbers (plain ``+`` would
    raise ``TypeError`` in that case).

    Args:
        df: DataFrame containing both columns.
        col1: Name of the column that forms the first part of the key.
        col2: Name of the column that forms the second part of the key.

    Returns:
        A Series aligned with ``df`` whose values are ``str(col1) + str(col2)``.
    """
    return df[col1].astype(str) + df[col2].astype(str)

### Initialize

In [None]:
# Instantiate the project components used by the rest of the notebook:
# sec_scraper fetches reports, model scores sentences, and connector reads
# from and writes to the database.
sec_scraper = SECScraper()
model = ScoringModel()
connector = DatabaseConnector()

### Load Data

In [None]:
# Bare expression in the middle of a cell: it is evaluated but not displayed,
# because only the last expression of a cell is echoed.
connector.table_names

# Load both tables used by the analysis section: filings (indexed by "id") and
# the already stored scores. `scores` has to be defined here, otherwise the
# "Analyze Reports" and "Store scores" cells fail with NameError on a fresh kernel.
filings = connector.fetch_data("sec_filings").set_index("id")
scores = connector.fetch_data("scores")

### Hashing

In [None]:
def get_filing_url(filings, ticker, year):
    """Return the ``filing_url`` of the first filing matching ticker and year.

    Both ``ticker`` and ``year`` are converted to ``str`` before the comparison,
    so the ``ticker`` and ``year`` columns are compared as text.

    Args:
        filings: DataFrame with ``ticker``, ``year`` and ``filing_url`` columns.
        ticker: Ticker to look up.
        year: Year to look up; ints are accepted and compared as text.

    Returns:
        The ``filing_url`` value of the first matching row.

    Raises:
        IndexError: If no row matches the given ticker and year.
    """
    ticker = str(ticker)
    year = str(year)

    is_match = (filings["ticker"] == ticker) & (filings["year"] == year)
    matches = filings.loc[is_match, "filing_url"]
    if matches.empty:
        # Same exception type as indexing an empty result, with a usable message.
        raise IndexError(f"no filing found for ticker={ticker!r}, year={year!r}")
    return matches.iloc[0]

In [None]:
# Look up the filing URL for AAPL / 2023. This rebinds filing_url, which the
# archive-exploration cells earlier in the notebook also assign.
filing_url = filings.pipe(get_filing_url, "AAPL", 2023)

In [None]:
# Download the report behind filing_url; later cells read report.content and
# report.text.
report = sec_scraper.fetch_report(filing_url)

In [None]:
# The parser is chosen through `features` (BeautifulSoup's second argument);
# a `parser=` keyword does not select it. "xml" needs lxml to be installed.
soup = BeautifulSoup(report.content, features="xml")

In [None]:
# Write the pretty-printed document to disk. The encoding is set explicitly so
# non-ASCII characters in the report do not depend on the platform's default
# codec. The "output" directory must already exist.
with open('output/aapl_2023.xml', 'w', encoding='utf-8') as file:
    file.write(soup.prettify())

In [None]:
from arelle import Cntlr

In [None]:
# Load the same filing URL through Arelle's model manager; later cells iterate
# over xbrl.facts.
xbrl = Cntlr.Cntlr().modelManager.load(filing_url)

In [None]:
# One row per XBRL fact: its concept's qualified name, its value, whether it is
# numeric, and the id and start/end datetimes of its context.
factData = pd.DataFrame(
    data=[
        (
            xbrl_fact.concept.qname,
            xbrl_fact.value,
            xbrl_fact.isNumeric,
            xbrl_fact.contextID,
            xbrl_fact.context.startDatetime,
            xbrl_fact.context.endDatetime,
        )
        for xbrl_fact in xbrl.facts
    ],
    columns=["qname", "value", "isnumeric", "context_id", "start_date", "end_date"],
)

In [None]:
# Split the "prefix:name" text of each qname into two columns, drop the
# original qname column, and move "name" to the front (set_index followed by
# reset_index makes it the first column).
factData = (
    factData
    .assign(
        qname_type=lambda frame: frame["qname"].apply(lambda q: str(q).split(":")[0]),
        name=lambda frame: frame["qname"].apply(lambda q: str(q).split(":")[1]),
    )
    .drop("qname", axis=1)
    .set_index("name")
    .reset_index()
)

In [None]:
# Peek at a single non-numeric us-gaap fact value; position 12 of the filtered
# rows is hard-coded.
factData.query("qname_type=='us-gaap' & isnumeric==False")["value"].iloc[12]#.to_clipboard(index=False)

In [None]:
# List all non-numeric us-gaap facts.
factData.query("qname_type=='us-gaap' & isnumeric==False")

In [None]:
# Value of the first fact named StockholdersEquityNoteDisclosureTextBlock.
# .values[0] raises IndexError when no such fact exists.
sendtb = factData.query("name=='StockholdersEquityNoteDisclosureTextBlock'")["value"].values[0]

In [None]:
# The parser is chosen through `features` (BeautifulSoup's second argument);
# a `parser=` keyword does not select it. This rebinds `soup` to the text block.
soup = BeautifulSoup(sendtb, features="lxml")

In [None]:
# Stripped text of the next sibling <div> after the <div> whose string is
# exactly "Share Repurchase Program". Fails with AttributeError if either
# lookup returns None.
soup.find('div', string='Share Repurchase Program').find_next_sibling('div').text.strip()

In [None]:
# Title: text of the first <p> element in the parsed text block.
title = soup.find('p').text
print(f"Title: {title}")

# Extract information about the Share Repurchase Program: the text of the next
# sibling <div> after the heading <div>. `string=` is the current name of the
# text filter; `text=` is its deprecated alias.
share_repurchase_program_info = soup.find('div', string='Share Repurchase Program').find_next_sibling('div').text.strip()
print(f"Share Repurchase Program Information: {share_repurchase_program_info}")

# Take the first table of the text block and split it into rows.
# This rebinds `table` and `rows` from the earlier archive cells.
table = soup.find('table')
rows = table.find_all('tr')

# The first row is assumed to hold the headers; its cells are <td> elements.
headers = [header_cell.text.strip() for header_cell in rows[0].find_all('td')]

# Print every remaining row as a header -> cell-text mapping. zip stops at the
# shorter sequence, so surplus cells or headers are dropped silently.
for row in rows[1:]:
    cells = [td.text.strip() for td in row.find_all('td')]
    row_data = dict(zip(headers, cells))
    print(row_data)

In [None]:
# Print the qualified name and value of every fact in the loaded XBRL model.
for fact in xbrl.facts:
    print(fact.qname, fact.value)

In [None]:
import re

In [None]:
# Bind html_content before displaying it, so this cell also works on a fresh
# kernel. Only a preview is shown, since the full report text is very long.
html_content = report.text
html_content[:2000]

In [None]:
# Raw text of the downloaded report.
html_content = report.text

# Matches <a name="ITEM..."> anchors and captures the text between the opening
# and closing tag. "." does not cross line breaks and the match is case-sensitive.
pattern = r'<a name="ITEM[^"]*"[^>]*>(.*?)</a>'

# Collect the captured text of every matching anchor.
item_headers = re.compile(pattern).findall(html_content)

In [None]:
# Display the item header texts extracted by the regular expression.
item_headers

In [None]:
# Find every <a> whose "name" attribute is present and contains "ITEM".
# This rebinds item_headers from a list of strings to a list of tags.
# NOTE(review): in top-to-bottom order `soup` was last bound to the soup parsed
# from `sendtb`, not to the full report. Confirm this is the intended document.
item_headers = soup.find_all('a', attrs={'name': lambda x: x and 'ITEM' in x})

# Extract the text from the item headers
item_headers_text = [item_header.get_text() for item_header in item_headers]


### Analyze Reports

In [None]:
# Boolean mask over filings: True where the filing's ticker+year key also
# appears among the ticker+year keys of `scores`. `scores` must already be
# loaded from the database before this cell runs.
analyzed = (filings.pipe(merged)).isin((scores.pipe(merged)).tolist())

# Split the filings into those that already have scores and those that do not.
analyzed_filings = filings[analyzed]
not_analyzed_filings = filings[~analyzed]

In [None]:
def create_report_scores(filing, sentences_limit=False):
    """Fetch one filing's report, score its sentences and return the score record.

    Args:
        filing: Mapping or row providing ``"ticker"``, ``"year"`` and
            ``"filing_url"``.
        sentences_limit: When truthy, only the first 50 sentences are scored.
            When falsy (the default), every extracted sentence is scored.

    Returns:
        The mapping returned by ``model.calculate_report_scores``, updated in
        place with ``ticker``, ``year``, ``analysis_timestamp`` and
        ``text_hash``.
    """
    max_sentences = 50  # cap applied only when sentences_limit is truthy

    ticker = filing["ticker"]
    year = filing["year"]
    filing_url = filing["filing_url"]

    report = sec_scraper.fetch_report(filing_url)
    sentences = TextExtractor(report).get_sentences()

    # Hash the full sentence list before any truncation, so the hash identifies
    # the whole report text regardless of the limit.
    text_hash = create_hash(sentences)

    if sentences_limit:
        sentences = sentences[:max_sentences]

    # Score exactly the sentences selected above; the cap is not re-applied
    # here, so sentences_limit=False really scores the whole report.
    report_scores = model.calculate_report_scores(sentences)
    report_scores.update(
        {
            "ticker": ticker,
            "year": year,
            "analysis_timestamp": datetime.now(),
            "text_hash": text_hash,
        }
    )
    return report_scores

In [None]:
# Score every filing that has no stored scores yet, with the sentence limit
# switched on, collecting one score record per filing.
new_report_scores = [
    create_report_scores(pending_filing, sentences_limit=True)
    for _, pending_filing in not_analyzed_filings.iterrows()
]

## Store scores in database

In [None]:
# Append the new score records to the existing scores. Index labels are kept
# as-is by pd.concat, so the combined frame can contain duplicate index values.
upload_scores = pd.concat([scores, pd.DataFrame(new_report_scores)])

In [None]:
# Hand the combined scores to the database connector for storage.
# NOTE(review): upload_scores contains the previously stored rows as well;
# confirm store_data replaces the table rather than appending to it.
connector.store_data(upload_scores)