In [7]:
import os
from typing import List
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class SimpleRAG:
    def __init__(self, documents: List[str], llm=None):
        """
        Initialize SimpleRAG with a list of documents and optional LLM
        
        Args:
            documents (List[str]): List of reference documents
            llm: Language model (defaults to ChatGPT if not provided)
        """
        self.documents = documents
        
        # Use TF-IDF for document similarity
        self.vectorizer = TfidfVectorizer()
        self.doc_vectors = self.vectorizer.fit_transform(documents)
        
        # Initialize language model
        if llm is None:
            llm = ChatOpenAI(
                openai_api_key=os.getenv('OPENAI_API_KEY'),
                model='gpt-3.5-turbo',
                temperature=0.7
            )
        
        # Create prompt template for retrieval-augmented generation
        prompt = PromptTemplate(
            input_variables=['context', 'query'],
            template="""Based on the following context documents, answer the query:
            
Context:
{context}

Query: {query}

Provide a comprehensive and accurate answer using the context. If the context doesn't contain sufficient information, 
state that clearly and provide what information you can based on the available context."""
        )
        
        # Create LLM chain
        self.chain = LLMChain(llm=llm, prompt=prompt)
    
    def retrieve_relevant_docs(self, query: str, top_k: int = 2) -> List[str]:
        """
        Retrieve most relevant documents using cosine similarity
        
        Args:
            query (str): Input query
            top_k (int): Number of top documents to retrieve
        
        Returns:
            List of most relevant documents
        """
        # Vectorize the query
        query_vector = self.vectorizer.transform([query])
        
        # Calculate cosine similarities
        similarities = cosine_similarity(query_vector, self.doc_vectors)[0]
        
        # Get indices of top-k similar documents
        top_indices = similarities.argsort()[-top_k:][::-1]
        
        return [self.documents[i] for i in top_indices]
    
    def generate_response(self, query: str) -> str:
        """
        Generate response using retrieval-augmented generation
        
        Args:
            query (str): Input query
        
        Returns:
            Generated response
        """
        # Retrieve relevant documents
        relevant_docs = self.retrieve_relevant_docs(query)
        
        # Combine retrieved documents into context
        context = "\n\n".join(relevant_docs)
        
        # Generate response
        response = self.chain.run(context=context, query=query)
        
        return response

def main():
    # Example documents
    documents = [
        "Python is a high-level programming language known for its simplicity and versatility.",
        "Machine learning is a subset of artificial intelligence focusing on pattern recognition and predictive modeling.",
        "Artificial intelligence aims to create intelligent machines that can simulate human-like thinking and decision-making.",
        "Data science combines statistics, computer science, and domain expertise to extract insights from data.",
        "Neural networks are a key technology in deep learning, inspired by the human brain's neural structure."
    ]
    
    # Initialize SimpleRAG
    rag = SimpleRAG(documents)
    
    # Example queries
    queries = [
        "What is Python?",
        "Explain machine learning",
        "Tell me about artificial intelligence"
    ]
    
    # Run queries
    for query in queries:
        print(f"\nQuery: {query}")
        response = rag.generate_response(query)
        print("Response:", response)

if __name__ == '__main__':
    main()

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable