In [13]:
import pandas as pd
df =pd.read_csv('output.csv')
df.head()

Unnamed: 0,Title,Content,Date,full_page_content
0,Peterborough hold talks with Williams over man...,"Date: 2025-10-26, Content: Latest from Sky Spo...",2025-10-26,Title: Peterborough hold talks with Williams o...
1,"Emery: Elliott has to prove, that is why I lef...","Date: 2025-10-26, Content: Aston Villa boss ha...",2025-10-26,"Title: Emery: Elliott has to prove, that is wh..."
2,'West Ham to stick with Nuno',"Date: 2025-10-25, Content: West Ham intend to ...",2025-10-25,Title: 'West Ham to stick with Nuno'\n\nDate: ...
3,Malacia travels with squad to face Brighton,"Date: 2025-10-25, Content: Tyrell Malacia is p...",2025-10-25,Title: Malacia travels with squad to face Brig...
4,Moyes coy on Toney links,"Date: 2025-10-25, Content: David Moyes says he...",2025-10-25,Title: Moyes coy on Toney links\n\nDate: 2025-...


In [14]:
# convert data types - date first
import datetime
df['Date'] = pd.to_datetime(df['Date']).dt.strftime("%d-%m-%y")

In [15]:
df.head()

Unnamed: 0,Title,Content,Date,full_page_content
0,Peterborough hold talks with Williams over man...,"Date: 2025-10-26, Content: Latest from Sky Spo...",26-10-25,Title: Peterborough hold talks with Williams o...
1,"Emery: Elliott has to prove, that is why I lef...","Date: 2025-10-26, Content: Aston Villa boss ha...",26-10-25,"Title: Emery: Elliott has to prove, that is wh..."
2,'West Ham to stick with Nuno',"Date: 2025-10-25, Content: West Ham intend to ...",25-10-25,Title: 'West Ham to stick with Nuno'\n\nDate: ...
3,Malacia travels with squad to face Brighton,"Date: 2025-10-25, Content: Tyrell Malacia is p...",25-10-25,Title: Malacia travels with squad to face Brig...
4,Moyes coy on Toney links,"Date: 2025-10-25, Content: David Moyes says he...",25-10-25,Title: Moyes coy on Toney links\n\nDate: 2025-...


### Data Loader - Load the Data

In [17]:
# %pip show langchain_community

In [18]:
# %pip install -r .\requirment.txt 

In [33]:
from langchain_community.document_loaders import DataFrameLoader

loader = DataFrameLoader(df, page_content_column='full_page_content')

documents = loader.load()

In [None]:
print(f'Total number of doucment: {len(documents)}')
print(documents[0].metadata)

Total number of doucment: 3678
{'Title': 'Peterborough hold talks with Williams over manager role', 'Content': "Date: 2025-10-26, Content: Latest from Sky Sports News' Dharmesh Sheth and Mark McAdam: Peterborough United have held talks with Luke Williams over their vacant managerial post. The League One side are searching for a new manager after parting ways with Darren Ferguson on Saturday, following their 2-1 home defeat to Blackpool. Williams has been out of work since leaving Swansea City in February.", 'Date': '26-10-25'}


In [None]:
avg_len = sum(len(d.page_content) for d in documents) / len(documents)
count = sum(len(d.page_content) > 1000 for d in documents)
avg_len, count

(634.8920609026645, 506)

### Splitting the Text

In [34]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def text_splitter(documents, chunk_size = 1000, chunk_overlap=200):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        separators=["\n\n", "\n", " ", ""]
    )
    
    split_docs = splitter.split_documents(documents)
    print(f'Split {len(documents)} Documents into {len(split_docs)} chunks')
    
    return split_docs

split_docs = text_splitter(documents)

Split 3678 Documents into 4640 chunks


### Embedding and VectoreStore

In [47]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

class EmbeddingModel:
    def __init__(self, model_name: str = 'all-miniLM-L6-v2'):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f'Loaded embedding model: {self.model_name}, Embeddng dimension: {
                self.model.get_sentence_embedding_dimension()}')
            
        except Exception as e:
            print(f'Error loading embedding model: {e}')
            raise e
    
    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Embedding model is not Loaded....")
        embeddings = self.model.encode(texts, show_progress_bar=False)
        print(f'Generated embedding with shape: {embeddings.shape}')
        return embeddings


In [None]:
embeding_model = EmbeddingModel()
embedded_docs = [embeding_model.get_embeddings(doc.page_content) for doc in split_docs]

ChromaDB

In [48]:
import os
class VectorStore:
	def __init__(self, collection_name: str='transfer_rumors', persist_dir: str = '../data/chromadb'):
		self.collection_name = collection_name
		self.persist_dir = persist_dir
		self.client = None
		self.collection = None
		self._connect()
		
	def _connect(self):
		try:
			os.makedirs(self.persist_dir, exist_ok=True)
			self.client = chromadb.PersistentClient(self.persist_dir)
			
			# Get or create collection
			self.collection = self.client.get_or_create_collection(
				name = self.collection_name,
				metadata={"desc": "Embedding model for tranfer rumors RAG app"}
			)
			print(f'Vector Store initialized with collection: {self.collection_name}')
	
		except Exception as e:
			print(f'Error initilaizing vector Store: {e}')
			raise

	def add_embeddings(self, documents: List[Any], embeddings: np.ndarray):
		try:
			if not self.collection:
				raise ValueError("Vector Store not initialized...")
			
			ids = []
			metadatas =[]
			embedding_list = []
			document_list= []
			
			for i, (doc,embedding) in enumerate(zip(documents, embeddings)):
				id = f'doc_{(uuid.uuid4().hex[:8])}_{i}'
				ids.append(id)
				
				metadata = doc.metadata
				metadata['content_length'] = len(doc.page_content)
				metadatas.append(metadata)
				
				embedding_list.append(embedding.tolist())
				document_list.append(doc.page_content)
				
			self.collection.add(
				ids = ids,
				documents = document_list,
				embeddings = embedding_list,
				metadatas = metadatas
			)
			print(f'Successfully added {len(ids)} collection to vector store.')

		except Exception as e:
			print(f'Error adding embeddings to vector store: {e}')
			raise




In [49]:
# Create instance of VectorStore
vc = VectorStore()
vc.add_embeddings(split_docs, embedded_docs)

Vector Store initialized with collection: transfer_rumors
Successfully added 4640 collection to vector store.


### Retreival Pipeline from VectorStore

In [50]:
class RAGRetrever:
    def __init__(self, vector_store: VectorStore, embedding_model: EmbeddingModel):
        self.vector_store = vector_store
        self.embedding_model = embedding_model
        
    def retrieve(self, query: str, top_k: int=5, score_threshold: float= 0.0) -> List[Dict[str, Any]]:
        print(f'Retreiving top {top_k} documents for query: {query}')
        
        # Generate embeddig for quert
        query_embeddings = self.embedding_model.get_embeddings([query])[0]
        
        # Search in Vector Store
        try:
            
            results = self.vector_store.collection.query(
                query_embeddings= query_embeddings.tolist(),
                n_results= top_k
            )
            
            # Process results
            retreived_docs = []
            
            if results['documents'] and results['documents'][0]:
                for i in range(len(results['documents'][0])):
                    score = results['distances'][0][i]
                    if score >= score_threshold:
                        doc_info = {
                            'id': results['ids'][0][i],
                            'content': results['documents'][0][i],
                            'metadata': results['metadatas'][0][i],
                            'score': score
                        }
                        retreived_docs.append(doc_info)
                        print(f'Retrieved {len(retreived_docs)} documents from vector store.')
                    else:
                        print('No documents retrieved')
            return retreived_docs
        except Exception as e:
            print(f'Error fetching results from vector store: {e}')
            raise
            
        

In [51]:
rag = RAGRetrever(vector_store=vc, embedding_model=embeding_model)
retrive_res = rag.retrieve(query="whats the latesr news about William Saliba", top_k=5)

Retreiving top 5 documents for query: whats the latesr news about William Saliba


Batches: 100%|██████████████████████████████████| 1/1 [00:00<00:00, 107.54it/s]

Generated embedding with shape: (1, 384)
Retrieved 1 documents from vector store.
Retrieved 2 documents from vector store.
Retrieved 3 documents from vector store.
Retrieved 4 documents from vector store.
Retrieved 5 documents from vector store.





In [64]:
import os
from dotenv import load_dotenv

load_dotenv(dotenv_path='C:/Users/Sahan/OneDrive/Data Science - Kiel/Semester 2/SMA/transfer_rumor_rag/.env')
# print(os.getenv("grog_key"))

True

In [63]:
from langchain_groq import ChatGroq
# Set environment variables from Python (don't use shell 'export' in a notebook cell)



grog_api_key = os.getenv("grog_key")
if not grog_api_key:
    raise ValueError("Grog API key not found in the .env file")

llm = ChatGroq(api_key=grog_api_key, model="llama-3.1-8b-instant", temperature=0.1, max_tokens=500)

# RAG function to generate output with context
def genrate_response(query:str, retriever:RAGRetrever, llm, top_k=3):
    # Retrieved relevant docs
    retrieved_docs = retriever.retrieve(query, top_k)
    context = "\n\n".join([doc['content'] for doc in retrieved_docs]) if retrieved_docs else ""
    if not context:
        return "No relevant context found for the query"
    
    prompt = f"""
        You are an AI assistant specialized in explaining the latest Football(Soccer) transfer news about a player or a Team.

        ---CONTEXT---
        {context}
        ---END OF CONTEXT---

        ---QUESTION---
        {query}
        ---ANSWER---
        Instructions:
        - Summarize the key points from the context in plain language.
        - Avoid repeating raw markdown, tables, or code.
        - Find the latest transfer news based on the date
        - Present the answer as short paragraphs or numbered points.
        - Be concise and clear, suitable for a human reader.
        
        """

    response = llm.invoke(prompt.format(context = context, query = query))
    
    return response.content

In [61]:
answer = genrate_response("Explain the current trasnfer situation of Willim Saliba", rag, llm, top_k=3)

Retreiving top 3 documents for query: Explain the current trasnfer situation of Willim Saliba


Batches: 100%|███████████████████████████████████| 1/1 [00:00<00:00, 62.50it/s]

Generated embedding with shape: (1, 384)
Retrieved 1 documents from vector store.
Retrieved 2 documents from vector store.
Retrieved 3 documents from vector store.





In [62]:
print(answer)

**William Saliba's Transfer Situation**

William Saliba, a 24-year-old French defender, has recently signed a new long-term contract with Arsenal. Here are the key points about his transfer situation:

**New Contract**

- William Saliba has signed a five-year deal with Arsenal, keeping him at the club until the summer of 2030.
- This new contract is a significant development in his transfer situation, securing his future with the Gunners.

**Manager's Comments**

- Arsenal manager Mikel Arteta has expressed his admiration for Saliba, praising his character, commitment, and attitude.
- Arteta believes Saliba has grown significantly since joining the club and has a strong connection with the supporters and staff.

**Previous Developments**

- In July, Saliba hinted at the possibility of signing a new contract, saying "hopefully" and mentioning a good conversation with his agent and the club.
- His centre-back partner Gabriel had signed a new deal earlier in the summer, which might have i