In [9]:
from my_package.load import FootballNewsLoader, FinancialNewsLoader
from my_package.document_chunker import DocumentChunker
from my_package.embed import Embedder

# Maximum number of loaded documents that are chunked and embedded per run.
# The financial-news cell further down reads this constant as well.
LIMIT = 3

# Step 1: Load data
loader = FootballNewsLoader(directory_path='data/football')
documents = loader.load()

# Step 2: Chunk documents (only the first LIMIT documents)
chunker = DocumentChunker()
all_chunks = []
for document in documents[:LIMIT]:
    chunks = chunker.chunk(document)
    all_chunks.extend(chunks)

# Step 3: Embed chunks
embedder = Embedder()
embedded_chunks = embedder.embed(all_chunks)

# Further processing or saving embedded_chunks
print(f"Processed {len(embedded_chunks)} chunks.")

# Show one sample chunk and one sample document. Either collection can be
# empty (nothing loaded, or nothing chunked), so index only when there is
# something to show instead of letting the cell end in an IndexError.
if all_chunks:
    print(all_chunks[0])
else:
    print("No chunks were produced.")
if len(documents) > 0:
    print(documents[0])
else:
    print("No documents were loaded.")

Processed 48 chunks.
Chunk(id=0_0, doc_id=0, text=Roman Abramovich has..., embedding=[0.019449638202786446, 0.007399196736514568, 0.0320294015109539, 0.043133825063705444, -0.0119040347635746])
Document(id=0, dataset=football, title=Explained: How Chels..., text=Roman Abramovich has...)


### Todos

[P1]
- [x] Process all files in football folder

- [x] Implement the financial dataset loader

- [x] Add a real embedder


[P2] 
- [ ] Data cleaning

- [ ] Add publish time to the document

- [ ] Improve chunker

In [10]:
# Step 1: Load data
loader = FinancialNewsLoader(directory_path='data/financial')
documents = loader.load()

# Step 2: Chunk documents (only the first LIMIT documents; LIMIT is set in
# the football cell earlier in the notebook)
chunker = DocumentChunker()
all_chunks = []
for document in documents[:LIMIT]:
    chunks = chunker.chunk(document)
    all_chunks.extend(chunks)

# Step 3: Embed chunks
embedder = Embedder()
embedded_chunks = embedder.embed(all_chunks)

# Further processing or saving embedded_chunks
print(f"Processed {len(embedded_chunks)} chunks.")

# Show one sample chunk and one sample document. Either collection can be
# empty (nothing loaded, or nothing chunked), so index only when there is
# something to show instead of letting the cell end in an IndexError.
if all_chunks:
    print(all_chunks[0])
else:
    print("No chunks were produced.")
if len(documents) > 0:
    print(documents[0])
else:
    print("No documents were loaded.")

Processed 13 chunks.
Chunk(id=4e41266ca1707a052245161948413f057982c0b2_0, doc_id=4e41266ca1707a052245161948413f057982c0b2, text=March 27(Reuters) - ..., embedding=[0.029279867187142372, -0.0169009268283844, 0.017760103568434715, 0.0007397688459604979, -0.022756900638341904])
Document(id=4e41266ca1707a052245161948413f057982c0b2, dataset=financial, title=BRIEF-AU Optronics t..., text=March 27(Reuters) - ...)


In [8]:
from openai import OpenAI
# Module-level OpenAI client that get_embedding sends its requests through.
# NOTE(review): presumably picks up the API key from the environment — confirm.
client = OpenAI()


def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The string to embed. Newline characters are replaced with
            spaces before the request is made.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first entry in the response's
        ``data`` list.
    """
    flattened = text.replace("\n", " ")
    response = client.embeddings.create(input=[flattened], model=model)
    first_item = response.data[0]
    return first_item.embedding

# Smoke test: embed a short string and print how many values come back.
emb = get_embedding("Hello, world!")
vector_length = len(emb)
print(vector_length)

1536
