# Init

In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules before
# each cell executes so edits to imported code take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [25]:
import pandas as pd
import os
from financial_report_analyzer.database_conntector import DatabaseConnector
from financial_report_analyzer.scraping import SECScraper
from financial_report_analyzer.model import ScoringModel
from financial_report_analyzer.content_extractor import TextExtractor
from tqdm import tqdm
from datetime import datetime

# SEC 10-K Filings

In [26]:
# Location of the database handed to DatabaseConnector in the next cell.
# The value used to be left blank (with a pointer to database_connector.py), which made
# this cell a SyntaxError. It is read from the DB_PATH environment variable instead so
# the notebook parses and no machine-specific path is committed.
DB_PATH = os.environ.get("DB_PATH", "")
if not DB_PATH:
    # Fail here with an actionable message rather than deep inside the connector.
    raise RuntimeError(
        "DB_PATH is not set: export DB_PATH with the database location "
        "(see database_connector.py) or assign it in this cell."
    )

In [27]:
# Open the database once, then pull the "filings" and "scores" tables through the
# same connector (filings first, scores second).
connector = DatabaseConnector(DB_PATH)

filings, scores = connector.fetch_data("filings"), connector.fetch_data("scores")

In [28]:
# Keep the first row seen for each (year, ticker) pair, then order the table by
# ticker and year with a fresh 0..n-1 index. The bare name at the end displays it.
filings = (
    filings
    .drop_duplicates(subset=["year", "ticker"], keep="first")
    .sort_values(by=["ticker", "year"])
    .reset_index(drop=True)
)
filings

Unnamed: 0,ticker,year,url,url_type,ticker_years
0,21C,2009,https://www.sec.gov/Archives/edgar/data/130816...,htm,21C_2009
1,21C,2010,https://www.sec.gov/Archives/edgar/data/130816...,htm,21C_2010
2,21C,2011,https://www.sec.gov/Archives/edgar/data/130816...,htm,21C_2011
3,21C,2012,https://www.sec.gov/Archives/edgar/data/130816...,htm,21C_2012
4,21C,2013,https://www.sec.gov/Archives/edgar/data/130816...,htm,21C_2013
...,...,...,...,...,...
7681,ZTS,2019,https://www.sec.gov/Archives/edgar/data/155528...,htm,ZTS_2019
7682,ZTS,2020,https://www.sec.gov/Archives/edgar/data/155528...,htm,ZTS_2020
7683,ZTS,2021,https://www.sec.gov/Archives/edgar/data/155528...,htm,ZTS_2021
7684,ZTS,2022,https://www.sec.gov/Archives/edgar/data/155528...,htm,ZTS_2022


In [29]:
# Same clean-up as for the filings table: one row per (year, ticker), ordered by
# ticker and year, index renumbered from zero. The bare name at the end displays it.
scores = (
    scores
    .drop_duplicates(subset=["year", "ticker"], keep="first")
    .sort_values(by=["ticker", "year"])
    .reset_index(drop=True)
)
scores

Unnamed: 0,environmental,social,governance,ticker,year,analysis_timestamp,text_hash,ticker_years
0,0.002580,0.002212,0.016587,21C,2009,2024-05-19 10:05:10.924960,c6c1501770298e9f26d9670997bb5456040a41a539ec04...,21C_2009
1,0.001159,0.001159,0.017002,21C,2010,2024-05-19 10:08:15.166698,83cc3e74595947aa42893057c29f4637ad13191d586bf4...,21C_2010
2,0.001436,0.000718,0.020108,21C,2011,2024-05-19 10:11:32.237642,eac93509a71c28ff2754c0b3d4be3e3c26a37edf393ad7...,21C_2011
3,0.001402,0.001752,0.018570,21C,2012,2024-05-19 10:14:55.024970,a799e094f6d514cc38b7e762db105be0729acaa5ab67f8...,21C_2012
4,0.001821,0.000729,0.018215,21C,2013,2024-05-19 10:18:13.914954,e7700bfe09398493f78e52edf1ddbc6d6810b876091191...,21C_2013
...,...,...,...,...,...,...,...,...
7656,0.035971,0.010072,0.018225,ZTS,2019,2024-04-08 20:51:57.585591,92040795118f658b219746f8661ff2c5a1fbc74316b9fc...,ZTS_2019
7657,0.037718,0.010155,0.017892,ZTS,2020,2024-04-08 21:06:30.153159,fa0eff1a067f98d73f5dffde21f7b65419e9999fe5361a...,ZTS_2020
7658,0.043478,0.023411,0.016722,ZTS,2021,2024-04-08 21:14:41.421702,d9139f2832f60f7f311290cc9d08df46c28e33580b2314...,ZTS_2021
7659,0.053088,0.026034,0.018377,ZTS,2022,2024-04-08 21:22:27.348887,a7b98d7c14cba964cd871ad51625c7ff91385540a9eada...,ZTS_2022


In [None]:
# Instantiate the two pipeline components used by the analysis loop:
# the report scraper first, then the scoring model.
scraper, model = SECScraper(), ScoringModel()

In [None]:
# Smoke-test switch read by the analysis loop: when True only the first 50 sentences
# of each report are passed to the model; when False every extracted sentence is scored.
limit = False

## Select Tickers

In [None]:
# Build a "<ticker>_<year>" key on both tables so filings can be matched to scores.
# `year` is cast to str first: adding a numeric column to a string column raises
# TypeError, and the cast changes nothing when the column already holds text.
filings["ticker_years"] = filings["ticker"] + "_" + filings["year"].astype(str)
scores["ticker_years"] = scores["ticker"] + "_" + scores["year"].astype(str)

# Filings whose key has no counterpart in the scores table still need to be analysed.
# isin() accepts the Series directly, so no intermediate Python list is needed.
not_analyzed = filings[~filings["ticker_years"].isin(scores["ticker_years"])]
not_analyzed_tickers = list(not_analyzed["ticker"].unique())

In [None]:
# Tickers to score in this session.
# A non-empty `ticker_override` wins; set it to [] to fall back to the first ticker
# that has filings without scores. (Previously the automatic selection was assigned
# and then unconditionally overwritten on the next line, leaving it as dead code.)
ticker_override = ["MCD"]
tickers = ticker_override if ticker_override else not_analyzed_tickers[:1]

## Run Analysis

In [None]:
# Score every filing of each selected ticker and collect one result record per report.
session_scores = []

for ticker in tickers:
    # Materialise this ticker's filings once so tqdm can be told how many rows to
    # expect; iterrows() is a generator, so without `total` the bar shows no length/ETA.
    ticker_filings = filings.query("ticker==@ticker")

    for _, filing in tqdm(
        ticker_filings.iterrows(), total=len(ticker_filings), ncols=60, desc=ticker
    ):
        filing_url = filing["url"]
        filing_url_type = filing["url_type"]
        year = filing["year"]

        # Download the report and split it into sentences according to its URL type.
        report = scraper.fetch_report(filing_url)
        extractor = TextExtractor(report)
        sentences = extractor.get_sentences(url_type=filing_url_type)
        # The hash is computed from the full sentence list, before any truncation below.
        text_hash = extractor.create_hash(sentences)

        # With `limit` enabled only the first 50 sentences are scored.
        sample_sentences = sentences[:50] if limit else sentences

        # Attach identifying metadata to the scores returned by the model.
        report_scores = model.calculate_report_scores(sample_sentences)
        report_scores.update(
            {
                "ticker": ticker,
                "year": year,
                "analysis_timestamp": datetime.now(),
                "text_hash": text_hash,
            }
        )
        session_scores.append(report_scores)

In [None]:
# Tabulate the records gathered in this session.
session_output = pd.DataFrame(session_scores)
# Re-read the stored scores and keep the first row per (year, ticker) pair.
prev_scores = connector.fetch_data("scores")
prev_scores = prev_scores.drop_duplicates(subset=["year", "ticker"], keep="first")

In [None]:
# Stack this session's rows underneath the previously stored ones.
# NOTE(review): nothing de-duplicates after this concat, so a (ticker, year) pair that
# was already stored appears twice if it was scored again this session — confirm intended.
new_scores = pd.concat([prev_scores, session_output])

In [None]:
# Order the combined table by ticker and year and renumber the index from zero;
# the bare name at the end displays the result.
new_scores = (
    new_scores
    .sort_values(by=["ticker", "year"])
    .reset_index(drop=True)
)
new_scores

## Store Output

In [None]:
# Persist the combined scores table through the connector under the name "scores".
# NOTE(review): whether store_data replaces or appends to the existing table is not
# visible from this notebook — confirm before re-running.
connector.store_data(new_scores, "scores")

# Annual Reports of DAX Companies

In [None]:
# Root directory that holds the DAX annual-report PDFs (searched recursively below).
# The value used to be left blank, which made this cell a SyntaxError. It is read from
# the DAX_REPORTS_DIR environment variable so no machine-specific path is committed.
DAX_REPORTS_DIR = os.environ.get("DAX_REPORTS_DIR", "")
if not DAX_REPORTS_DIR:
    # Fail here with an actionable message; os.walk("") would silently find nothing.
    raise RuntimeError(
        "DAX_REPORTS_DIR is not set: export DAX_REPORTS_DIR with the directory "
        "containing the '<isin>_<year>.pdf' reports or assign it in this cell."
    )

In [None]:
# Index every PDF found under DAX_REPORTS_DIR (recursively).
# File names must follow "<isin>_<year>.pdf"; a .pdf file that does not is reported
# with its full path instead of failing with a bare unpacking error.
filings_data = []
for root, _dirs, files in os.walk(DAX_REPORTS_DIR):
    for file in files:
        if not file.endswith(".pdf"):
            continue
        filepath = os.path.join(root, file)
        # splitext removes only the trailing extension; str.replace(".pdf", "") would
        # also strip a ".pdf" that happens to occur inside the name.
        stem = os.path.splitext(file)[0]
        try:
            isin, year = stem.split("_")
            year_number = int(year)
        except ValueError as exc:
            raise ValueError(
                f"Unexpected report file name {filepath!r}: expected '<isin>_<year>.pdf'"
            ) from exc
        filings_data.append(
            {"isin": isin, "year": year_number, "isin_years": f"{isin}_{year}", "path": filepath}
        )

# Naming the columns keeps them present even when no PDF was found, so later cells that
# index filings_dax["isin_years"] work on an empty frame instead of raising KeyError.
filings_dax = pd.DataFrame(filings_data, columns=["isin", "year", "isin_years", "path"])
filings_dax

In [None]:
# Previously computed DAX scores, loaded from the CSV file named by the DAX_SCORES_PATH
# environment variable. pd.read_csv() was called without any path, which raises
# TypeError; a missing variable now raises KeyError naming DAX_SCORES_PATH.
scores_dax = pd.read_csv(os.environ["DAX_SCORES_PATH"])
scores_dax

In [None]:
# Fresh instances of the scraper and the scoring model for the DAX section
# (scraper first, then model), replacing any created earlier in the notebook.
scraper, model = SECScraper(), ScoringModel()

In [None]:
# Smoke-test switch read by the DAX analysis loop: when True only the first 50 sentences
# of each report are passed to the model; when False every extracted sentence is scored.
limit = False

## Select ISINs

In [None]:
# Reports whose "<isin>_<year>" key is absent from the stored DAX scores.
not_analyzed = filings_dax.loc[
    ~filings_dax["isin_years"].isin(scores_dax["isin_years"].tolist())
]
# Distinct ISINs among those reports, in order of first appearance.
# (The variable keeps its "tickers" name, but the values are ISINs.)
not_analyzed_tickers = list(not_analyzed["isin"].drop_duplicates())

In [None]:
# ISINs to score in this session: the first one that still has unscored reports.
isins = not_analyzed_tickers[:1]
# Display the selection itself. The cell used to end with `isin`, which is not defined
# here and only resolved to a leftover loop variable from the file-indexing cell.
isins

## Run Analysis

In [None]:
# Score every report of each selected ISIN and collect one result record per report.
session_scores = []

# Iterate over the `isins` selection. The loop used to filter on a stale `isin` variable
# and label the progress bar with `ticker` from the SEC section, so it depended on
# leftover kernel state and ignored the selection made in the previous cell.
for isin in isins:
    # Materialise this ISIN's reports once so tqdm can be given the row count.
    isin_filings = filings_dax.query("isin==@isin")

    for _, row in tqdm(
        isin_filings.iterrows(), total=len(isin_filings), ncols=60, desc=isin
    ):
        path = row["path"]
        year = row["year"]
        isin_years = row["isin_years"]

        extractor = TextExtractor(path)
        # NOTE(review): the method name is spelled "scentences"; left untouched because
        # TextExtractor is defined outside this notebook — confirm the real name.
        sentences = extractor.get_scentences_dax()
        # The hash is computed from the full sentence list, before any truncation below.
        text_hash = extractor.create_hash(sentences)

        # With `limit` enabled only the first 50 sentences are scored.
        sample_sentences = sentences[:50] if limit else sentences

        # Attach identifying metadata to the scores returned by the model.
        report_scores = model.calculate_report_scores(sample_sentences)
        report_scores.update(
            {
                "isin": isin,
                "year": year,
                "isin_years": isin_years,  # the missing comma here was a SyntaxError
                "analysis_timestamp": datetime.now(),
                "text_hash": text_hash,
            }
        )
        session_scores.append(report_scores)

In [None]:
# Tabulate the records gathered in this session.
session_output = pd.DataFrame(session_scores)
# Re-read the stored DAX scores from the CSV named by the DAX_SCORES_PATH environment
# variable; pd.read_csv() without a path raises TypeError.
prev_dax_scores = pd.read_csv(os.environ["DAX_SCORES_PATH"])

In [None]:
# Stack this session's rows underneath the previously stored DAX scores.
# NOTE(review): nothing de-duplicates after this concat, so an (isin, year) pair that
# was already stored appears twice if it was scored again this session — confirm intended.
new_dax_scores = pd.concat([prev_dax_scores, session_output])

In [None]:
# Order the combined table by ISIN and year and renumber the index from zero.
# DAX rows are keyed by "isin" (see the records built in the analysis loop); the sort
# key used to be "ticker", a column those records never contain.
new_dax_scores = (
    new_dax_scores
    .sort_values(by=["isin", "year"])
    .reset_index(drop=True)
)
new_dax_scores

## Store Output

In [None]:
# Write the combined DAX scores to the CSV named by the DAX_SCORES_PATH environment
# variable. to_csv() without a path only returns the CSV text and saves nothing.
# index=False: the index is a plain 0..n-1 counter and would otherwise come back as an
# extra unnamed column every time the file is read again.
new_dax_scores.to_csv(os.environ["DAX_SCORES_PATH"], index=False)