# Init

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import os
from financial_report_analyzer.database_conntector import DatabaseConnector
from financial_report_analyzer.scraping import SECScraper
from financial_report_analyzer.model import ScoringModel
from financial_report_analyzer.content_extractor import TextExtractor
from tqdm import tqdm
from datetime import datetime

# SEC 10-K Filings

In [None]:
# TODO: replace the placeholder with your database path; it is handed to
# DatabaseConnector below.
DB_PATH = "YOUR_PATH"

In [None]:
# Connect to the database at DB_PATH and load the two tables used below
# (filings first, then scores).
connector = DatabaseConnector(DB_PATH)
filings, scores = (
    connector.fetch_data(table_name) for table_name in ("filings", "scores")
)

In [None]:
# Keep only the first row per (year, ticker), then order by ticker and year
# with a fresh 0..n-1 index.
filings = (
    filings.drop_duplicates(subset=["year", "ticker"], keep="first")
    .sort_values(by=["ticker", "year"])
    .reset_index(drop=True)
)

In [None]:
# Same clean-up for the scores table: first row per (year, ticker), ordered
# by ticker and year, index rebuilt.
scores = (
    scores.drop_duplicates(subset=["year", "ticker"], keep="first")
    .sort_values(by=["ticker", "year"])
    .reset_index(drop=True)
)

In [None]:
# Helpers for the SEC run: the scraper fetches reports, the model scores them.
scraper, model = SECScraper(), ScoringModel()

In [None]:
# Debug switch: when truthy, only the first 50 sentences of each report are
# passed to the scoring model in the analysis loop below.
limit = False

## Select Tickers

In [None]:
# Build a "<ticker>_<year>" key on both tables so rows can be matched across them.
filings["ticker_years"] = filings["ticker"] + "_" + filings["year"]
scores["ticker_years"] = scores["ticker"] + "_" + scores["year"]

# Filings whose key does not appear in the scores table have not been scored yet.
scored_keys = scores["ticker_years"].tolist()
is_scored = filings["ticker_years"].isin(scored_keys)
not_analyzed = filings[~is_scored]
not_analyzed_tickers = list(not_analyzed["ticker"].unique())

In [None]:
# Restrict this session to the first ticker that still has unscored filings
# (an empty list if there is none), and display the selection.
tickers = not_analyzed_tickers[:1]
tickers

## Run Analysis

In [None]:
# Score the filings of the selected tickers and collect one result dict per report.
# NOTE(review): the inner loop walks every row of `filings` for the ticker, not
# only the rows in `not_analyzed` -- confirm that re-scoring years which already
# have a score is intended.
session_scores = []

for ticker in tickers:
    ticker_filings = filings.query("ticker==@ticker")
    progress = tqdm(ticker_filings.iterrows(), ncols=60, desc=ticker)

    for _, filing in progress:
        url, url_type, year = filing["url"], filing["url_type"], filing["year"]

        # Fetch the report through the scraper and split it into sentences.
        report = scraper.fetch_report(url)
        extractor = TextExtractor(report)
        sentences = extractor.get_sentences(url_type=url_type)
        text_hash = extractor.create_hash(sentences)

        # With `limit` set, only the first 50 sentences are scored.
        sample_sentences = sentences[:50] if limit else sentences

        report_scores = model.calculate_report_scores(sample_sentences)
        metadata = {
            "ticker": ticker,
            "year": year,
            "analysis_timestamp": datetime.now(),
            "text_hash": text_hash,
        }
        report_scores.update(metadata)
        session_scores.append(report_scores)

In [None]:
# Turn this session's results into a table, then reload the stored scores and
# keep only the first row per (year, ticker).
session_output = pd.DataFrame(session_scores)
stored_scores = connector.fetch_data("scores")
prev_scores = stored_scores.drop_duplicates(subset=["year", "ticker"], keep="first")

In [None]:
# Append this session's rows below the previously stored scores.
new_scores = pd.concat([prev_scores, session_output])

In [None]:
# Order the combined table by ticker and year, rebuild the index, and display it.
new_scores = new_scores.sort_values(by=["ticker", "year"])
new_scores = new_scores.reset_index(drop=True)
new_scores

## Store Output

In [None]:
# Hand the combined scores table to the connector for storage under "scores".
connector.store_data(new_scores, "scores")

# Annual Reports of DAX Companies

In [None]:
# TODO: replace the placeholder with the directory that holds the DAX report
# PDFs; it is walked recursively below.
DAX_REPORTS_DIR = "YOUR_PATH"

In [None]:
# Index every PDF below DAX_REPORTS_DIR. File names are parsed as
# "<isin>_<year>.pdf"; a name with a different number of underscores or a
# non-numeric year raises ValueError.
FILING_COLUMNS = ["isin", "year", "isin_years", "path"]

filings_data = []
for root, _dirs, files in os.walk(DAX_REPORTS_DIR):
    for filename in files:
        if not filename.endswith(".pdf"):
            continue
        # splitext removes only the trailing extension, whereas
        # str.replace(".pdf", "") would also strip ".pdf" inside the name.
        stem, _ext = os.path.splitext(filename)
        isin, year = stem.split("_")
        filings_data.append(
            {
                "isin": isin,
                "year": int(year),
                "isin_years": f"{isin}_{year}",
                "path": os.path.join(root, filename),
            }
        )

# Explicit columns keep lookups such as filings_dax["isin_years"] working even
# when no PDF was found and the list is empty.
filings_dax = pd.DataFrame(filings_data, columns=FILING_COLUMNS)
filings_dax

In [None]:
# TODO: replace the placeholder with the CSV file that holds the existing DAX
# scores. pd.read_csv() without a path raises TypeError.
DAX_SCORES_PATH = "YOUR_PATH"
scores_dax = pd.read_csv(DAX_SCORES_PATH)
scores_dax

In [None]:
# Fresh scraper and scoring model instances for the DAX run.
scraper, model = SECScraper(), ScoringModel()

In [None]:
# Debug switch: when truthy, only the first 50 sentences of each report are
# passed to the scoring model in the analysis loop below.
limit = False

## Select ISINs

In [None]:
# Reports whose "<isin>_<year>" key is missing from the stored DAX scores have
# not been scored yet; collect the distinct ISINs they belong to.
already_scored = scores_dax["isin_years"].tolist()
is_scored = filings_dax["isin_years"].isin(already_scored)
not_analyzed = filings_dax[~is_scored]
not_analyzed_tickers = list(not_analyzed["isin"].unique())

In [None]:
# Restrict this session to the first ISIN that still has unscored reports
# (an empty list if there is none) and display the selection. The previous
# `isin` showed a stale loop variable from the file walk, not this list.
isins = not_analyzed_tickers[:1]
isins

## Run Analysis

In [None]:
# Score the reports of the selected ISINs and collect one result dict per report.
session_scores = []

# Iterate over the selection in `isins`. The query previously relied on an
# `isin` variable left over from the file walk, and the progress bar was
# labelled with `ticker` from the SEC section.
for isin in isins:
    for _, row in tqdm(
        filings_dax.query("isin==@isin").iterrows(), ncols=60, desc=isin
    ):
        path = row["path"]
        year = row["year"]
        isin_years = row["isin_years"]

        # NOTE(review): method name kept exactly as written in the original
        # ("scentences") -- confirm it matches TextExtractor's API.
        extractor = TextExtractor(path)
        sentences = extractor.get_scentences_dax()
        text_hash = extractor.create_hash(sentences)

        # With `limit` set, only the first 50 sentences are scored.
        sample_sentences = sentences[:50] if limit else sentences

        report_scores = model.calculate_report_scores(sample_sentences)
        report_scores.update(
            {
                "isin": isin,
                "year": year,
                "isin_years": isin_years,  # comma was missing (syntax error)
                "analysis_timestamp": datetime.now(),
                "text_hash": text_hash,
            }
        )
        session_scores.append(report_scores)

In [None]:
# Turn this session's results into a table and reload the stored DAX scores.
session_output = pd.DataFrame(session_scores)
# TODO: replace the placeholder with the CSV file `scores_dax` was read from.
# pd.read_csv() without a path raises TypeError.
prev_dax_scores = pd.read_csv("YOUR_PATH")

In [None]:
# Append this session's rows below the previously stored DAX scores.
new_dax_scores = pd.concat([prev_dax_scores, session_output])

In [None]:
# Order the combined table by ISIN and year, rebuild the index, and display it.
# DAX rows are keyed by "isin"; the session rows have no "ticker" column, so
# sorting by "ticker" (copied from the SEC section) was wrong here.
new_dax_scores = new_dax_scores.sort_values(by=["isin", "year"]).reset_index(drop=True)
new_dax_scores

## Store Output

In [None]:
# TODO: replace the placeholder with the CSV file the DAX scores are kept in.
# Without a path, to_csv() only returns the CSV text and stores nothing.
# index=False: the index was just rebuilt and carries no information; writing
# it would add an extra unnamed column each time the file is read back.
new_dax_scores.to_csv("YOUR_PATH", index=False)