In [1]:
import sys , types

# 1) Overwrite the reported Keras version with a 2.x string when Keras is
#    installed (presumably so downstream version checks accept it - confirm).
try:
    import keras
except ImportError:
    pass
else:
    keras.__version__ = '2.12.0'

# 2) Create a tf_keras alias so Transformers can import it
if 'tf_keras' not in sys.modules:
    try:
        import keras as _k

        # Stand-in module that mirrors every attribute dir() reports on keras.
        # The copy stays inside the try so an ImportError raised while
        # reading an attribute is swallowed exactly like a missing keras.
        alias = types.ModuleType('tf_keras')
        for attr_name in dir(_k):
            setattr(alias, attr_name, getattr(_k, attr_name))
        sys.modules['tf_keras'] = alias
    except ImportError:
        pass



# Setup project root and import path
import os
import sys

# The project root is taken to be the parent of the current working
# directory, so this is only correct when the kernel is started from a
# first-level subfolder of the repository.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Keep the cell idempotent: drop any copy left by a previous run before
# putting the root at the front, so re-running never stacks duplicates
# while the root still takes priority for `src.*` imports.
sys.path[:] = [entry for entry in sys.path if entry != project_root]
sys.path.insert(0, project_root)





In [2]:
# %%
# Download raw data: one 10-K filing per ticker, saved under data/raw
from src.data_loader import DataLoader

# Companies covered by the study
tickers = ["AAPL", "TSLA", "MSFT"]

# NOTE(review): `delay` is presumably a pause between requests - confirm in DataLoader
loader = DataLoader(save_dir="data/raw", delay=0.5)

for ticker in tickers:
    loader.fetch_10k_filings(ticker, count=1)

[+] Ticker: AAPL → CIK: 0000320193
[+] Found 1 10-K filings
[✓] Downloaded: AAPL_10K_2024.txt
[+] Ticker: TSLA → CIK: 0001318605
[+] Found 1 10-K filings
[✓] Downloaded: TSLA_10K_2024.txt
[+] Ticker: MSFT → CIK: 0000789019
[+] Found 1 10-K filings
[✓] Downloaded: MSFT_10K_2024.txt


In [3]:
# Preprocess raw filings: read from data/raw, write to data/processed
from src.preprocessing import Preprocessor

pp = Preprocessor(
    raw_dir="data/raw",
    processed_dir="data/processed",
)
pp.batch_preprocess()

Preprocessing AAPL_10K_2024.txt
Preprocessed: data/processed\AAPL_10K_2024.txt
Preprocessing MSFT_10K_2024.txt
Preprocessed: data/processed\MSFT_10K_2024.txt
Preprocessing TSLA_10K_2024.txt
Preprocessed: data/processed\TSLA_10K_2024.txt


In [4]:

# Build main dataset: processed filings are gathered into a single parquet file
from src.build_dataset import DatasetBuilder

builder = DatasetBuilder(
    processed_dir="data/processed",
    output_file="data/processed/reports.parquet",
)
builder.save()

Dataset saved to: data/processed/reports.parquet


In [5]:

# Enrich with market data (local CSV loader)
import os

import pandas as pd
from src.market_data import LocalMarketDataLoader

# Folder holding one <TICKER>.csv price file per company. Set the
# PRICE_DATA_DIR environment variable to point at your own copy; the fallback
# is the original local folder, so an existing setup resolves to exactly the
# same paths as before.
price_data_dir = os.environ.get(
    "PRICE_DATA_DIR",
    "C:/Users/theod/OneDrive/Bureau/Theo/Master IEF Dauphine/S2/NLP/data",
)

# Map tickers to your own CSV paths
price_files = {
    ticker: f"{price_data_dir}/{ticker}.csv"
    for ticker in ("AAPL", "MSFT", "TSLA")
}

reports = pd.read_parquet("data/processed/reports.parquet")

# The feature-engineering cell reads reports_with_market.parquet from under
# `project_root`, which is the parent of the working directory. Writing to a
# cwd-relative path would leave that cell reading a stale (or missing) file,
# so the output is anchored to the same root and its folder is created.
enriched_path = os.path.join(project_root, "data/processed/reports_with_market.parquet")
os.makedirs(os.path.dirname(enriched_path), exist_ok=True)

market_loader = LocalMarketDataLoader(price_files, output_file=enriched_path)
enriched = market_loader.enrich(reports)
market_loader.save(enriched)

[✓] Saved enriched data to data/processed/reports_with_market.parquet


In [6]:
# Compute linguistic features
#
# Dependencies: textstat and spacy (presumably used by FeatureEngineer -
# confirm). If they are missing, install them into the kernel environment
# first, e.g. `%pip install textstat spacy`.
from src.features import FeatureEngineer

# Both the input and the output live under the project root's data/processed folder
features_input = os.path.join(project_root, "data/processed/reports_with_market.parquet")
features_output = os.path.join(project_root, "data/processed/reports_features.parquet")

fe = FeatureEngineer(input_file=features_input, output_file=features_output)
fe.save()




Saved features to c:\Users\theod\OneDrive\Documents\GitHub\nlp-financial-reports\data/processed/reports_features.parquet


In [9]:

# Generate TF-IDF features for SVM
import pandas as pd
from src.vectorization import TfidfFeatureExtractor

# Load the engineered feature table from under the project root
features_path = os.path.join(project_root, "data/processed/reports_features.parquet")
df_feats = pd.read_parquet(features_path)

# Only the Risk Factors section (Item 1A) is vectorized
texts = df_feats['item1a']

# NOTE(review): pca_components=0 presumably disables dimensionality reduction - confirm
tfidf_extractor = TfidfFeatureExtractor(
    max_features=5000,
    pca_components=0,
    output_file="data/processed/tfidf_features.parquet",
)
tfidf_matrix = tfidf_extractor.fit_transform(texts)
tfidf_extractor.save(tfidf_matrix)

[✓] Saved TF-IDF features to data/processed/tfidf_features.parquet


Found existing installation: keras 3.10.0
Uninstalling keras-3.10.0:
  Successfully uninstalled keras-3.10.0
Collecting tf-keras
  Downloading tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow<2.20,>=2.19 (from tf-keras)
  Downloading tensorflow-2.19.0-cp312-cp312-win_amd64.whl.metadata (4.1 kB)
Collecting keras>=3.5.0 (from tensorflow<2.20,>=2.19->tf-keras)
  Using cached keras-3.10.0-py3-none-any.whl.metadata (6.0 kB)
Downloading tf_keras-2.19.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 1.7/1.7 MB 18.8 MB/s eta 0:00:00
Downloading tensorflow-2.19.0-cp312-cp312-win_amd64.whl (376.0 MB)
   ---------------------------------------- 0.0/376.0 MB ? eta -:--:--
   ---------------------------------------- 1.8/376.0 MB 10.1 MB/s eta 0:00:38
    --------------------------------------- 4.7/376.0 MB 11.9 MB/s eta 0:00:32
    --------------------------------------- 8.4/376.0 MB 

ERROR: Could not install packages due to an OSError: [WinError 5] Accès refusé: 'c:\\Users\\theod\\anaconda3\\Lib\\site-packages\\tensorflow\\compiler\\mlir\\lite\\python\\_pywrap_converter_api.pyd'
Consider using the `--user` option or check the permissions.



In [11]:

# Generate embedding features for neural models
from src.vectorization import EmbeddingFeatureExtractor

# Encode the same `texts` series with the all-MiniLM-L6-v2 model and persist the result
embed_extractor = EmbeddingFeatureExtractor(
    model_name='all-MiniLM-L6-v2',
    output_file='data/processed/embedding_features.parquet',
)
embeddings = embed_extractor.transform(texts)
embed_extractor.save(embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[✓] Saved embedding features to data/processed/embedding_features.parquet
