# In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import os

# Step 1: Load the titles dataset and pass it through the EDA step.
# NOTE: the CSV path is an absolute, machine-specific location; adjust it when
# running this anywhere else.
df = pd.read_csv('D:/movie recommendation/titles.csv')
# perform_eda is not defined in the visible code; it must already exist in the
# session (e.g. from an earlier notebook cell). Its return value replaces df.
df = perform_eda(df)

# Step 2: Initialize LangChain with ChromaDB and Sentence Transformers
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

def _is_missing(value):
    """Return True when *value* is a scalar null (None, NaN, NaT, pd.NA)."""
    # The is_scalar guard matters: pd.isna returns an array (not a bool) for
    # list-likes, which would make the truth test ambiguous.
    return pd.api.types.is_scalar(value) and pd.isna(value)


def _field(row, column, default):
    """Return row[column], or *default* when the column is absent or null."""
    value = row.get(column, default)
    return default if _is_missing(value) else value


def _movie_row_to_document(idx, row):
    """Build the LangChain Document (text + metadata) for one DataFrame row."""
    title = _field(row, 'title', 'Unknown title')
    description = _field(row, 'description', 'No description available')
    imdb_score = _field(row, 'imdb_score', 'Unknown')
    release_year = _field(row, 'release_year', 'Unknown')

    genres = _field(row, 'genres', 'Unknown genres')
    if isinstance(genres, list):
        # Already-parsed genre lists are flattened to a readable string;
        # str() keeps the join safe if an element is not a string.
        genres = ', '.join(str(genre) for genre in genres)

    # Combine features into a single text string for embedding
    combined_text = (
        f"{description} | Genres: {genres} | IMDb Score: {imdb_score} "
        f"| Release Year: {release_year}"
    )

    return Document(
        page_content=combined_text,
        metadata={
            'title': title,
            'genres': genres,
            'imdb_score': imdb_score,
            'release_year': release_year
        },
        id=str(idx)  # Use the DataFrame index label as the document id
    )


def initialize_langchain_with_chroma(df, batch_size=5461):
    """
    Build a Chroma vector store from a movie DataFrame.

    Each row becomes one Document whose text combines the description,
    genres, IMDb score and release year; the title and the same fields are
    stored as metadata. Columns that are absent, and cells that are null
    (None/NaN), fall back to readable placeholders instead of leaking "nan"
    into the embedded text or metadata.

    Args:
        df: DataFrame read via ``df.iterrows()``. The columns 'title',
            'description', 'genres', 'imdb_score' and 'release_year' are
            used when present.
        batch_size: Maximum number of documents sent to the vector store per
            ``add_documents`` call. The default of 5461 is presumably chosen
            to stay within Chroma's per-call batch limit -- confirm against
            the installed chromadb version.

    Returns:
        The populated ``Chroma`` vector store.
    """
    # Create embedding model
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Initialize Chroma vector store with LangChain
    chroma_db = Chroma(embedding_function=embedding_model)

    # Add movie data to ChromaDB in batches so a single call never carries
    # more than batch_size documents.
    documents = []
    for idx, row in df.iterrows():
        documents.append(_movie_row_to_document(idx, row))

        # When the batch size reaches the limit, flush to the vector store
        if len(documents) >= batch_size:
            chroma_db.add_documents(documents)
            documents = []  # Reset for the next batch

    # Add any remaining documents
    if documents:
        chroma_db.add_documents(documents)

    return chroma_db

# Build the Chroma vector store from the prepared DataFrame. The result is kept
# in the module-level name chroma_db, which initialize_llm_and_retrievalqa reads.
chroma_db = initialize_langchain_with_chroma(df)


# Step 3: Set up the LLM and RetrievalQA Chain
def initialize_llm_and_retrievalqa(vector_store=None, hf_token=None):
    """
    Set up the LLM for response generation and the RetrievalQA chain.

    Args:
        vector_store: Vector store to retrieve from. Defaults to the
            module-level ``chroma_db`` when omitted.
        hf_token: Hugging Face API token. When omitted, the token is read
            from the ``HUGGINGFACEHUB_API_TOKEN`` environment variable.

    Returns:
        A ``RetrievalQA`` chain that retrieves the 5 most similar documents
        and also returns them alongside the generated answer.

    Raises:
        RuntimeError: If no API token is supplied or configured.
    """
    # Credentials must never live in source control: take the token from the
    # caller or the environment. Any token previously committed in this file
    # should be treated as leaked and revoked.
    token = hf_token or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    if not token:
        raise RuntimeError(
            "No Hugging Face API token found. Set the HUGGINGFACEHUB_API_TOKEN "
            "environment variable or pass hf_token explicitly."
        )

    # Resolved after the token check so a missing credential is reported first.
    store = chroma_db if vector_store is None else vector_store

    model_name = "google/gemma-2-2b-it"

    # Initialize the Hugging Face LLM with LangChain
    llm = HuggingFaceHub(repo_id=model_name, huggingfacehub_api_token=token)

    # Use the LangChain RetrievalQA chain to generate answers
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=store.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
        return_source_documents=True
    )
    return qa_chain

# Build the RetrievalQA chain once at module level; the example query below
# passes it to generate_response.
qa_chain = initialize_llm_and_retrievalqa()

# Step 4: Generate Response
def generate_response(user_input, qa_chain):
    """
    Ask the RetrievalQA chain a question and return just the answer text.

    The chain is invoked with ``{"query": user_input}``. Only the value under
    the 'result' key of the returned mapping is handed back; any other keys
    in the chain output are ignored. If 'result' is absent, a fixed fallback
    message is returned instead.
    """
    chain_inputs = {"query": user_input}
    chain_output = qa_chain.invoke(chain_inputs)

    # Pull out the generated answer, falling back when the key is missing
    return chain_output.get('result', 'No answer found.')


# Example run: send one sample query through the chain and print the answer.
user_input = "I want a horror movie"
response = generate_response(user_input, qa_chain)
# print() is given two arguments, so its default separator adds a space after
# the newline and before the answer text.
print("Generated response:\n", response)
