### Load the libraries

In [7]:
import os
import pinecone
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain_pinecone import Pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings

### Defining the env variables

In [8]:
load_dotenv()
KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")
PINECONE_API_ENV=os.getenv("PINECONE_API_ENV")
INDEXNAME=os.getenv("INDEXNAME")

### Initialising the llm 

In [9]:
llm =  ChatOpenAI(openai_api_key=KEY,model_name='gpt-4o-mini')

### Loading the data to get the embeddings

In [10]:
def load_csv(data_dir):
    documents = []
    csv_files = [f for f in os.listdir(data_dir) if f.endswith('.csv')]
    for csv_file in csv_files:
        csv_path = os.path.join(data_dir, csv_file)
        csv_loader = CSVLoader(csv_path)
        csv_documents = csv_loader.load()
        documents.extend(csv_documents)
    return documents

### Fetching the csv files from the data dir | create a folder named data 

In [13]:
file_path = "data"
extracted_data = load_csv(file_path)

### Creating splits in data to create the chunks for embeddings

In [14]:
# Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks
text_chunks = text_split(extracted_data)
print(len(text_chunks))

14634


### Calling the OpenAI embeddings

In [15]:
embedding = OpenAIEmbeddings()

  embedding = OpenAIEmbeddings()


### Testing the embeddings

In [16]:
query = embedding.embed_query("Hello World")
print(len(query))

1536


### Connecting to the Pinecone DB

In [17]:
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
index_name = INDEXNAME
# Connect to the index
index = pc.Index(index_name)
index

<pinecone.data.index.Index at 0x12ebf9810>

### Generate and upsert the embeddings

In [18]:
'''
# Generate and upsert embeddings with metadata
vectors = []
for i, chunk in enumerate(text_chunks):
    metadata = {
                "source": chunk.metadata.get("source", ""),
                "text": chunk.page_content
            }
    vector = {
        "id": str(i),
        "values": embedding.embed_documents([chunk.page_content])[0],  # embed the chunk's content
        "metadata": metadata
    }   
    vectors.append(vector)

# Function to process documents in batches
def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

# Batch size (make sure it's within the limit)
batch_size = 100

# Upsert vectors in batches
for i, batch_vectors in enumerate(batch(vectors, batch_size)):
    try:
        index.upsert(vectors=batch_vectors)
        print(f"Successfully upserted batch {i+1}")
    except Exception as e:
        print(f"Error in batch {i+1}: {e}")

'''
# No need to run the embeddings as I have already created it in the pinecone so we just need to query on it 

'\n# Generate and upsert embeddings with metadata\nvectors = []\nfor i, chunk in enumerate(text_chunks):\n    metadata = {\n                "source": chunk.metadata.get("source", ""),\n                "text": chunk.page_content\n            }\n    vector = {\n        "id": str(i),\n        "values": embedding.embed_documents([chunk.page_content])[0],  # embed the chunk\'s content\n        "metadata": metadata\n    }   \n    vectors.append(vector)\n\n# Function to process documents in batches\ndef batch(iterable, n=1):\n    l = len(iterable)\n    for ndx in range(0, l, n):\n        yield iterable[ndx:min(ndx + n, l)]\n\n# Batch size (make sure it\'s within the limit)\nbatch_size = 100\n\n# Upsert vectors in batches\nfor i, batch_vectors in enumerate(batch(vectors, batch_size)):\n    try:\n        index.upsert(vectors=batch_vectors)\n        print(f"Successfully upserted batch {i+1}")\n    except Exception as e:\n        print(f"Error in batch {i+1}: {e}")\n\n'

### Using Pinecone wrapper via langchin 

In [19]:
# Initialize Langchain Pinecone wrapper
docsearch = Pinecone(index=index, embedding=embedding, text_key="text")

  docsearch = Pinecone(index=index, embedding=embedding, text_key="text")


### Test the similar embeddings

In [20]:
query = embedding.embed_query("Hello World")
print(len(query))

1536


### Create a prompt for my llm 

In [21]:
#  Behavious for the llm to respond 
prompt_template = """
        Analyze the following product reviews and provide business insights. Focus on:
        1. Overall sentiment and key themes
        2. Common customer pain points
        3. Positive aspects highlighted by customers
        4. Potential areas for improvement
        5. Business recommendations

        Context: {context}
        Question: {question} 

        Provide a detailed analysis with specific examples from the reviews.
        """

### Setting the parameters in the required format 

In [22]:
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context","question"])
chain_type_kwargs = {"prompt": PROMPT}

### Retreival QA chain initialising (RAG)

In [23]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={'k':3}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)

### Testing the RAG model 

In [24]:
user_input = "Best computer for college students?" # USER 
result = qa({'query': user_input})
result['result']

  result = qa({'query': user_input})


'Based on the provided product reviews, here\'s a detailed analysis focusing on key insights related to overall sentiment, common pain points, positive aspects, areas for improvement, and business recommendations.\n\n### 1. Overall Sentiment and Key Themes\nThe overall sentiment appears to be **positive**, with users expressing satisfaction with the product. Common themes include:\n\n- **Ease of Use**: Multiple reviewers noted that the laptops are user-friendly, suitable for middle schoolers and beginners.\n- **Performance**: The performance is described as adequate for basic tasks like web browsing and running applications.\n- **Portability**: The laptops are also highlighted for being lightweight and sleek.\n\n### 2. Common Customer Pain Points\nThere are few notable pain points mentioned across the reviews:\n\n- **Speed Limitations**: Although the speed is generally described as "average" or "decent," some users mentioned that restarting the laptop daily is necessary to maintain per

### Find the contexts passed on to the llm

In [25]:
result

{'query': 'Best computer for college students?',
 'result': 'Based on the provided product reviews, here\'s a detailed analysis focusing on key insights related to overall sentiment, common pain points, positive aspects, areas for improvement, and business recommendations.\n\n### 1. Overall Sentiment and Key Themes\nThe overall sentiment appears to be **positive**, with users expressing satisfaction with the product. Common themes include:\n\n- **Ease of Use**: Multiple reviewers noted that the laptops are user-friendly, suitable for middle schoolers and beginners.\n- **Performance**: The performance is described as adequate for basic tasks like web browsing and running applications.\n- **Portability**: The laptops are also highlighted for being lightweight and sleek.\n\n### 2. Common Customer Pain Points\nThere are few notable pain points mentioned across the reviews:\n\n- **Speed Limitations**: Although the speed is generally described as "average" or "decent," some users mentioned t

#### Now we can update the prompt, we dont have to run the embeddings as I wil push the embeddings and u can access it via the pinecone api keys

## Thank You