# Installing required dependencies

In [1]:
!pip install transformers sentence-transformers chromadb datasets accelerate faiss-cpu torch huggingface_hub python-dotenv

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.23.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_

In [None]:
import os
from dotenv import load_dotenv

# Pull the settings stored in the local .env file into the process environment
load_dotenv(dotenv_path=".env")

# Loading Data

In [2]:
from datasets import load_dataset
from huggingface_hub import login

# Hugging Face token taken from the environment (None when HF_TOKEN is unset)
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token)

# Fetch the customer-support tweets dataset and keep its "train" split
ds = load_dataset(
    "MohammadOthman/mo-customer-support-tweets-945k"
)["train"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.40k [00:00<?, ?B/s]

preprocessed_data.json:   0%|          | 0.00/222M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/945278 [00:00<?, ? examples/s]

## Initialize ChromaDB storage

In [3]:
import chromadb


# ChromaDB client backed by persistent storage rooted at ./chroma_db
chroma_client = chromadb.PersistentClient(
    path="./chroma_db",
)

# Open the "customer_support" collection, creating it if it does not exist yet
collection = chroma_client.get_or_create_collection(
    name="customer_support",
)

## Initializing Embedding Model

In [4]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding checkpoint used to vectorize the text
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(model_name_or_path=embedding_model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Adding Data to the ChromaDB Collection

In [6]:
# Embed and insert the dataset in batches. Encoding and adding one row at a
# time means one model forward pass and one database write per row, which is
# needlessly slow for a dataset of this size.
batch_size = 1024

for start in range(0, len(ds), batch_size):
    # Slicing a datasets.Dataset yields a dict mapping column name -> list of values
    batch = ds[start:start + batch_size]
    queries = batch["input"]
    responses = batch["output"]

    # One encode() call per batch; convert to nested lists for ChromaDB
    embeddings = embedding_model.encode(queries).tolist()

    collection.add(
        # Same IDs as before: the row's position in the dataset, as a string
        ids=[str(start + offset) for offset in range(len(queries))],
        embeddings=embeddings,  # Vector representations
        metadatas=[
            {"input": query_text, "output": response_text}
            for query_text, response_text in zip(queries, responses)
        ],
    )

print("ChromaDB populated with customer support queries!")



ChromaDB populated with customer support queries!


## Load LLaMA 2 Model

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# LLaMA 2 chat checkpoint; pick a different one to match your resources
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Tokenizer and model weights are both loaded from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",          # automatic device placement
    torch_dtype=torch.float16,  # half-precision weights
)

print("LLaMA 2 model loaded successfully!")

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

LLaMA 2 model loaded successfully!


In [17]:
# Cache to store the last CACHE_SIZE queries and responses (oldest first)
CACHE_SIZE = 5
cache = []

def create_response(user_query, n_results=5, max_new_tokens=256):
    """Answer a support query using retrieved examples and recent conversation.

    The query is embedded, the most similar stored query/response pairs are
    fetched from the ChromaDB collection, and a prompt combining those pairs
    with the cached recent interactions is passed to the language model.

    Args:
        user_query: The user's question as a string.
        n_results: Number of similar stored pairs to retrieve for context.
        max_new_tokens: Upper bound on the number of tokens generated for the
            answer (the prompt length does not count against it).

    Returns:
        The generated answer text, stripped of surrounding whitespace.

    Side effects:
        Appends the query/response pair to the module-level ``cache`` and
        trims it to the ``CACHE_SIZE`` most recent entries.
    """
    # Step 1: Encode the user query
    query_embedding = embedding_model.encode(user_query).tolist()

    # Step 2: Retrieve the most relevant query-response pairs
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
    )

    # Step 3: Format retrieved context (results are nested per query; we sent one)
    retrieved_context = "\n".join(
        f"User: {res['input']}\nSupport: {res['output']}"
        for res in results["metadatas"][0]
    )

    # Construct the cache context from the most recent to the oldest
    cache_context = "\n".join(
        f"User: {item['query']}\nSupport: {item['response']}"
        for item in reversed(cache)
    )

    # Step 4: Construct prompt for LLaMA 2
    prompt = f"""
    You are a helpful and accurate customer support AI.

    Your primary goal is to assist users by providing clear, concise, and correct answers based on the information available.
    Leverage the following sources to generate your response:
    1. Retrieved past responses from similar queries.
    2. Recent user interactions and responses.
    3. Context from the previous conversation, if relevant.

    Guidelines:
    - Only provide a direct answer to the new query. Avoid repeating the prompt or giving unnecessary explanations.
    - Base your response solely on the information available. Do not use internal knowledge or make assumptions.
    - If the answer is unknown, clearly state that you do not know and suggest relevant questions that the user can ask.
    - If the user asks for clarification or a follow-up on a previous response, make use of the last message to maintain continuity and context.


    Recent Interactions:
    {cache_context}

    Retrieved Context:
    {retrieved_context}

    New User Query: {user_query}
    Support AI Response: """

    # Step 5: Generate response with LLaMA 2
    # Send inputs to wherever the model lives instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # max_new_tokens bounds only the answer; max_length would also count the
    # prompt, so a long prompt could leave no room for generation.
    output_tokens = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Decode only the newly generated tokens. Removing the prompt by string
    # replacement is unreliable because decoding may not reproduce it exactly.
    response = tokenizer.decode(
        output_tokens[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Add the current query and response to the cache
    cache.append({"query": user_query, "response": response})

    # Ensure the cache does not exceed CACHE_SIZE items (drops the oldest entries)
    del cache[:-CACHE_SIZE]

    return response

In [18]:
# user_query = "I ordered a laptop, but it arrived with a broken screen. What should I do?"
# response = create_response(user_query)
# print(response)

Sorry to hear that your laptop arrived with a broken screen. Can you please provide the order number and confirm the shipping details? We will assist you with the replacement or repair of the screen.


In [21]:
# user_query = "I don't understand why you need my order details. what you willdo after that with it?"
# response = create_response(user_query)
# print(response)

Hi! Thank you for reaching out to us. We need your order details to assist you with your query. We will use this information to verify your order and provide you with the appropriate next steps. If you have any questions or concerns, please feel free to ask.


In [22]:
!pip install flask flask-ngrok

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [23]:
pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [25]:
from pyngrok import ngrok
# Local port the tunnel forwards to
port_no = 5000

# os.getenv returns None when the variable is unset; fail fast with a clear
# message instead of passing None on to pyngrok.
ngrok_token = os.getenv("NGROK_AUTH_TOKEN")
if not ngrok_token:
    raise RuntimeError(
        "NGROK_AUTH_TOKEN is not set; add it to .env or the environment before opening a tunnel."
    )
ngrok.set_auth_token(ngrok_token)

# Open a tunnel to the local port and keep its public URL
public_url = ngrok.connect(port_no).public_url

In [26]:
# Display the public URL of the ngrok tunnel
public_url

'https://5128-34-19-2-115.ngrok-free.app'

In [27]:
from flask import Flask, request, jsonify
from flask_ngrok import run_with_ngrok  # Allows Flask to work on Colab

app = Flask(__name__)
run_with_ngrok(app)  # Start ngrok to expose the server

@app.route('/generate_response', methods=['POST'])
def generate_response():
    """Handle POST /generate_response.

    Expects a JSON object body with a string "query" field.

    Returns:
        200 with {"response": <answer>} on success, or 400 with
        {"error": <message>} when the body is not a JSON object, lacks
        "query", or "query" is not a non-empty string.
    """
    # silent=True yields None for a non-JSON or malformed body instead of
    # aborting with a framework-generated error page, so every client error
    # gets the same JSON error shape below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or "query" not in data:
        return jsonify({"error": "Missing 'query' parameter"}), 400  # Return error if 'query' is missing

    print(data)

    user_query = data["query"]
    # Reject values that cannot be embedded as text before calling the model
    if not isinstance(user_query, str) or not user_query.strip():
        return jsonify({"error": "'query' must be a non-empty string"}), 400

    response = create_response(user_query)
    return jsonify({"response": response})

if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


 * Running on http://5128-34-19-2-115.ngrok-free.app
 * Traffic stats available on http://127.0.0.1:4040
{'query': 'I need help resetting my password.'}


INFO:werkzeug:127.0.0.1 - - [29/Mar/2025 20:32:08] "POST /generate_response HTTP/1.1" 200 -


{'query': 'I didn’t receive the reset link.'}


INFO:werkzeug:127.0.0.1 - - [29/Mar/2025 20:33:00] "POST /generate_response HTTP/1.1" 200 -


{'query': 'My cat chewed my phone charger. Is this covered under warranty?'}


INFO:werkzeug:127.0.0.1 - - [29/Mar/2025 20:33:58] "POST /generate_response HTTP/1.1" 200 -


{'query': 'Why did you suggest contacting support?'}


INFO:werkzeug:127.0.0.1 - - [29/Mar/2025 20:34:33] "POST /generate_response HTTP/1.1" 200 -
