In [2]:
from my_package.load import FootballNewsLoader, FinancialNewsLoader
from my_package.document_chunker import DocumentChunker
from my_package.embed import Embedder

# Upper bound on how many loaded documents are chunked and embedded per run;
# only the first LIMIT documents are used (see the slice in Step 2).
LIMIT = 100

# Step 1: Load data
# The loader is pointed at the local 'data/football' directory.
loader = FootballNewsLoader(directory_path='data/football')
documents = loader.load()

# Step 2: Chunk documents
# chunker.chunk() returns an iterable of chunks for one document; collect the
# chunks of every selected document into a single flat list.
chunker = DocumentChunker()
all_chunks = []
for document in documents[:LIMIT]:
    chunks = chunker.chunk(document)
    all_chunks.extend(chunks)

# Step 3: Embed chunks
embedder = Embedder()
embedded_chunks = embedder.embed(all_chunks)

# Further processing or saving embedded_chunks
print(f"Processed {len(embedded_chunks)} chunks.")

# Show one sample chunk as a sanity check. Guarded so that an empty data
# directory (or LIMIT = 0) reports the problem instead of raising IndexError.
# NOTE(review): the recorded output shows an embedding on all_chunks[0], so
# embed() appears to populate the chunks in place - confirm against Embedder.
if all_chunks:
    print(all_chunks[0])
else:
    print("No chunks were produced; check the data directory and LIMIT.")

Processed 1376 chunks.
Chunk(id=b45da1fe16507a1b9033dbad46463c6c05fa74e6f099e9d7a82993c99e378a96_0, doc_id=b45da1fe16507a1b9033dbad46463c6c05fa74e6f099e9d7a82993c99e378a96, text=Roman Abramovich has..., embedding=[0.019449638202786446, 0.007399196736514568, 0.0320294015109539, 0.043133825063705444, -0.0119040347635746])


### Todos

[P1]
- [x] Process all files in football folder

- [x] Implement the financial dataset loader

- [x] Add a real embedder


[P2] 
- [x] Data cleaning

- [ ] Add publish time to the document
    Plan: publish time is another critical feature that would be useful for training and testing. We can add the publish time to the document data structure and drop any document that does not have one, which would also help clean up the data further.

- [ ] Improve chunker
    Plan: chunking the document with overlapping windows would allow for better search results. 

In [3]:
# Same load -> chunk -> embed pipeline, run on the financial news dataset.
# Relies on LIMIT and the imports from the earlier cell, so that cell must be
# executed first.

# Step 1: Load data
# The loader is pointed at the local 'data/financial' directory.
loader = FinancialNewsLoader(directory_path='data/financial')
documents = loader.load()

# Step 2: Chunk documents
# chunker.chunk() returns an iterable of chunks for one document; collect the
# chunks of the first LIMIT documents into a single flat list.
chunker = DocumentChunker()
all_chunks = []
for document in documents[:LIMIT]:
    chunks = chunker.chunk(document)
    all_chunks.extend(chunks)

# Step 3: Embed chunks
embedder = Embedder()
embedded_chunks = embedder.embed(all_chunks)

# Further processing or saving embedded_chunks
print(f"Processed {len(embedded_chunks)} chunks.")

# Show one sample chunk as a sanity check. Guarded so that an empty data
# directory (or LIMIT = 0) reports the problem instead of raising IndexError.
if all_chunks:
    print(all_chunks[0])
else:
    print("No chunks were produced; check the data directory and LIMIT.")

Processed 1116 chunks.
Chunk(id=4e41266ca1707a052245161948413f057982c0b2_0, doc_id=4e41266ca1707a052245161948413f057982c0b2, text=March 27(Reuters) - ..., embedding=[0.029279867187142372, -0.0169009268283844, 0.017760103568434715, 0.0007397688459604979, -0.022756900638341904])
