In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
pip install sentence-transformers pinecone-client langchain langchain-community langchain-pinecone bitsandbytes accelerate

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting langchain
  Downloading langchain-0.2.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.2.12-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-pinecone
  Downloading langchain_pinecone-0.1.3-py3-none-any.whl.metadata (1.7 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.0.3-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting langchain-core<0.3.0,>=0.2.32 (from langchain)
  Downloading lan

In [4]:
import os
import torch
from typing import List, Dict
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
import textwrap
import random

class ProdecoAssistant:
    def __init__(self, model_name: str, pinecone_api_key: str, pinecone_env: str, index_name: str, hf_token: str):
        self.model_name = model_name
        self.pinecone_api_key = pinecone_api_key
        self.pinecone_env = pinecone_env
        self.index_name = index_name

        # Set up Pinecone
        os.environ["PINECONE_API_KEY"] = pinecone_api_key
        os.environ["PINECONE_ENVIRONMENT"] = pinecone_env
        pc = Pinecone(api_key=pinecone_api_key, environment=pinecone_env)
        self.index = pc.Index(index_name)

        # Set up embedding model
        self.embed_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Set up LLM
        self.quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=self.quantization_config,
            device_map="auto",
            use_auth_token=hf_token
        )

        self.pipe = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            torch_dtype=torch.float16,
            device_map="auto",
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            repetition_penalty=1.15
        )

        # Create LangChain LLM
        self.llm = HuggingFacePipeline(pipeline=self.pipe)

        # Set up memory
        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

        self.default_system_prompt = """You are an AI sales assistant for Prodeco electronics, an electronics store.
        Use the following context provided to answer customer queries about electronics products.
        Be concise, specific, and helpful. Don't talk too much, answer briefly in 2-3 lines. If you don't have enough information, ask for clarification."""

    def get_embedding(self, text: str) -> List[float]:
        """Generate embedding for the given text."""
        with torch.no_grad():
            return self.embed_model.encode(text).tolist()

    def get_prompt(self, instruction: str, context: str, new_system_prompt: str = None) -> str:
        B_INST, E_INST = "[INST]", "[/INST]"
        B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
        if new_system_prompt is None:
            new_system_prompt = self.default_system_prompt
        SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
        prompt_template = f"{B_INST}{SYSTEM_PROMPT}Context:\n{context}\n\nCustomer: {instruction}\n\nAssistant:{E_INST}"
        print(f"Generated prompt:\n{prompt_template}")
        return prompt_template

    def get_relevant_context(self, query: str, top_k: int = 3) -> List[Dict]:
        """Retrieve relevant context from Pinecone index."""
        query_embedding = self.get_embedding(query)
        results = self.index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
        contexts = [match['metadata'] for match in results['matches']]
        print(f"Retrieved contexts: {contexts}")
        return contexts

    def format_context(self, contexts: List[Dict]) -> str:
        """Format retrieved contexts into a string."""
        formatted_contexts = []
        for ctx in contexts:
            formatted_contexts.append(
                f"Product: {ctx['name']}\n"
                f"Price: ${ctx['price']}\n"
                f"Description: {ctx['description']}\n"
                f"Best for: {ctx['best_for']}\n"
            )
        formatted_context = "\n\n".join(formatted_contexts)
        print(f"Formatted context:\n{formatted_context}")
        return formatted_context

    def generate_response(self, query: str, context: str) -> str:
        """Generate a response using the model."""
        prompt = self.get_prompt(query, context)

        with torch.no_grad():
            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to('cuda')
            output = self.model.generate(
                input_ids,
                max_length=512,
                temperature=0.7,
                top_p=0.95,
                repetition_penalty=1.15,
                do_sample=True,
                num_return_sequences=1,
                pad_token_id=self.tokenizer.eos_token_id
            )

        response = self.tokenizer.decode(output[0], skip_special_tokens=True)
        response = response.split("Assistant:")[-1].strip()
        print(f"Generated response:\n{response}")
        return response

    def chat(self, query: str) -> str:
        """Process a query and return a response using the RAG pipeline."""
        try:
            contexts = self.get_relevant_context(query)
            formatted_context = self.format_context(contexts)
            response = self.generate_response(query, formatted_context)

            # Simple check for low-quality responses
            if len(response) < 20 or "I don't know" in response.lower():
                return self.get_fallback_response(query)

            return response
        except Exception as e:
            print(f"Error processing query: {e}")
            return self.get_fallback_response(query)

    def get_fallback_response(self, query: str) -> str:
        """Provide a fallback response when the model fails to generate a good one."""
        fallback_responses = [
            "I apologize, but I couldn't find specific information about that. Could you please provide more details?",
            "I'm sorry, I'm not able to answer that question right now. Is there something else I can help you with regarding our electronics products?",
            "That's an interesting question! While I don't have a specific answer, I'd be happy to help you find information about our available electronics. What type of product are you interested in?"
        ]
        response = random.choice(fallback_responses)
        print(f"Fallback response:\n{response}")
        return response

    def parse_text(self, text: str):
        wrapped_text = textwrap.fill(text, width=100)
        print(wrapped_text + '\n\n')

# Usage
if __name__ == "__main__":
    PINECONE_API_KEY = "bb29df9e-1917-4a6f-a6d8-1fa41ca5a6f2"
    PINECONE_ENV = 'us-east-1'
    INDEX_NAME = "electra"
    MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
    HF_TOKEN = "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"

    assistant = ProdecoAssistant(MODEL_NAME, PINECONE_API_KEY, PINECONE_ENV, INDEX_NAME, HF_TOKEN)

    queries = [
        "What's a good laptop for video editing?",
        "Recommend any budget headphones",
        "What's your return policy?",
        "I'm looking for a drone."
    ]

    for query in queries:
        print(f"\nQuery: {query}")
        response = assistant.chat(query)
        assistant.parse_text(response)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

  warn_deprecated(



Query: What's a good laptop for video editing?


The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Retrieved contexts: [{'best_for': 'Gamers and professionals who need a powerful yet portable laptop', 'description': '14-inch QHD 120Hz display, AMD Ryzen 9, 16GB RAM, 1TB SSD, NVIDIA GeForce RTX 3060', 'name': 'ASUS ROG Zephyrus G14', 'price': 1649.99}, {'best_for': 'Hardcore gamers and content creators who need desktop-level performance in a laptop', 'description': '15.6-inch QHD 240Hz display, Intel Core i7, 16GB RAM, 1TB SSD, NVIDIA GeForce RTX 3070 Ti', 'name': 'Razer Blade 15', 'price': 2499.99}, {'best_for': 'Students, professionals who need a lightweight and powerful laptop for everyday tasks', 'description': '13.6-inch Liquid Retina display, Apple M2 chip, 8GB RAM, 256GB SSD', 'name': 'MacBook Air M2', 'price': 1199.0}]
Formatted context:
Product: ASUS ROG Zephyrus G14
Price: $1649.99
Description: 14-inch QHD 120Hz display, AMD Ryzen 9, 16GB RAM, 1TB SSD, NVIDIA GeForce RTX 3060
Best for: Gamers and professionals who need a powerful yet portable laptop


Product: Razer Blade 1

In [5]:
print("Welcome to prodeco.")
while True:
        query = input("How can i help? ")
        if query.lower() in ["exit", "quit"]:
            break
        response = assistant.chat(query)
        assistant.parse_text(response)

Welcome to prodeco.
How can i help? How many estimated products do you sell?
Retrieved contexts: [{'best_for': 'Apple enthusiasts, photographers, and those who want the latest smartphone technology', 'description': '6.1-inch Super Retina XDR display, A16 Bionic chip, 128GB storage, Triple-camera system', 'name': 'iPhone 14 Pro', 'price': 999.0}, {'best_for': 'Those wanting a large 4K TV with smart features at an affordable price', 'description': '55-inch 4K UHD display, HDR, Roku smart platform, Voice control', 'name': 'TCL 55', 'price': 319.99}, {'best_for': 'Avid readers who want a dedicated e-reader with a paper-like display', 'description': "'6.8' display, 300 ppi, Adjustable warm light, Waterproof, 10 weeks of battery life", 'name': 'Kindle Paperwhite', 'price': 139.99}]
Formatted context:
Product: iPhone 14 Pro
Price: $999.0
Description: 6.1-inch Super Retina XDR display, A16 Bionic chip, 128GB storage, Triple-camera system
Best for: Apple enthusiasts, photographers, and those wh