#Installing required dependencies

In [None]:
!pip install transformers sentence-transformers chromadb datasets accelerate faiss-cpu torch huggingface_hub python-dotenv

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.23.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_

In [None]:
from dotenv import load_dotenv
import os
# Load environment variables from .env file
load_dotenv()

huggingface_token = os.getenv("HF_TOKEN")
ngrok_token = os.getenv("NGROK_TOKEN")

# Loading Data

In [2]:
from datasets import load_dataset

# Load dataset
ds = load_dataset("MohammadOthman/mo-customer-support-tweets-945k")["train"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.40k [00:00<?, ?B/s]

preprocessed_data.json:   0%|          | 0.00/222M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/945278 [00:00<?, ? examples/s]

## Initialize ChromaDB storage

In [3]:
import chromadb

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path="./chroma_db")  # Persistent storage

# Create a collection
collection = chroma_client.get_or_create_collection(name="customer_support")

## Initializing Embedding Model

In [4]:
from sentence_transformers import SentenceTransformer

embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(embedding_model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Adding Data to the ChromaDB Collection

In [None]:

for i, data in enumerate(ds):
    query_text = data["input"]
    response_text = data["output"]
    embedding = embedding_model.encode(query_text).tolist()  # Convert to list for ChromaDB

    collection.add(
        ids=[str(i)],  # Unique ID for each entry
        embeddings=[embedding],  # Vector representation
        metadatas=[{"input": query_text, "output": response_text}]
    )

print("ChromaDB populated with customer support queries!")

# Connect with Hugging Face for importing pre-tranied Model.

In [None]:
from huggingface_hub import login
login(token=huggingface_token)

## Load LLAMA2 Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load LLaMA 2 model & tokenizer
model_name = "meta-llama/Llama-2-7b-chat-hf"  # Change this based on your resources
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

print("LLaMA 2 model loaded successfully!")

In [10]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [11]:
import csv
import os
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score

# Log file path
log_file = 'response_logs.csv'

# Check if the log file exists, if not, create it with headers
if not os.path.exists(log_file):
    with open(log_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Query', 'Generated Response', 'Retrieved Query', 'Retrieved Response', 'BLEU Score', 'BERT Score'])


def calculate_bleu(reference, candidate):
    return sentence_bleu([reference.split()], candidate.split())


def calculate_bert(reference, candidate):
    P, R, F1 = score([candidate], [reference], model_type='bert-base-uncased')
    return F1.mean().item()


def log_response(query, response, retrieved_documents):
    with open(log_file, mode='a', newline='') as file:
        writer = csv.writer(file)
        for doc in retrieved_documents:
            writer.writerow([query, response, doc['query'], doc['response'], doc['bleu_score'], doc['bert_score']])


In [12]:
# Cache to store the last 5 queries and responses
cache = []

def create_response(user_query):
    # Step 1: Encode the user query
    query_embedding = embedding_model.encode(user_query).tolist()

    # Step 2: Retrieve the most relevant query-response pairs
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=5
    )

    # Step 3: Format retrieved context
    retrieved_documents = []
    for res in results['metadatas'][0]:
        retrieved_query = res['input']
        retrieved_response = res['output']
        retrieved_documents.append({
            'query': retrieved_query,
            'response': retrieved_response
        })

    global cache
    cache_context = '\n'.join([f"User: {item['query']}\nSupport: {item['response']}" for item in reversed(cache)])

    # Step 4: Construct prompt for LLaMA 2
    prompt = f'''
    You are a helpful and accurate customer support AI.

    Your primary goal is to assist users by providing clear, concise, and correct answers based on the information available.
    Leverage the following sources to generate your response:
    1. Retrieved past responses from similar queries.
    2. Recent user interactions and responses.
    3. Context from the previous conversation, if relevant.

    Guidelines:
    - Please try to give short and concise answers.
    - Only provide a direct answer to the new query. Avoid repeating the prompt or giving unnecessary explanations.
    - Base your response solely on the information available. Do not use internal knowledge or make assumptions.
    - If the answer is unknown, clearly state that you do not know and suggest relevant questions that the user can ask.
    - If the user asks for clarification or a follow-up on a previous response, make use of the last message to maintain continuity and context.
    - If the user asks for explanation why you arrived at the last response, use the cache_context and tell how you arrived at that response.

    Recent Interactions:
    {cache_context}

    Retrieved Context:
    {retrieved_documents}

    New User Query: {user_query}
    Support AI Response: '''

    # Step 5: Generate response with LLaMA 2
    inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
    # Dynamically set max_length
    max_length = inputs['input_ids'].shape[1] + 100
    output_tokens = model.generate(**inputs, max_length=max_length)
    response = tokenizer.decode(output_tokens[0], skip_special_tokens=True).replace(prompt, '').strip()

    # Calculate BLEU and BERT scores for retrieved documents
    for doc in retrieved_documents:
        bleu_score = calculate_bleu(doc['response'], response)
        bert_score = calculate_bert(doc['response'], response)
        doc['bleu_score'] = bleu_score
        doc['bert_score'] = bert_score

    # Log the response
    log_response(user_query, response, retrieved_documents)

    # Add to cache
    cache.append({'query': user_query, 'response': response})
    if len(cache) > 5:
        cache.pop(0)

    return {
        'response': response,
        'retrieved_documents': retrieved_documents
    }


In [13]:
user_query = "i booked my flight using delta amex card. Checking in now amp was being charged for baggage"
response = create_response(user_query)
print(response)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

{'response': "Hi there! 😊 I see you're checking in for your flight and being charged for baggage. Can you please provide your booking reference number so I can assist you better? 😊", 'retrieved_documents': [{'query': 'i booked my flight using delta amex card. Checking in now amp was being charged for baggage', 'response': 'Glad to check. Pls, message your confirmation number for assistance.', 'bleu_score': 1.016127520387007e-231, 'bert_score': 0.48350024223327637}, {'query': 'Despite flying w/ same from EWR, forced MIA to check guitar amp charged 25. Not gate, Before security. Flt not full. WHY?', 'response': 'Your guitar must fit underneath the seat in front of you or in an overhead bin. Please see details here', 'bleu_score': 1.016127520387007e-231, 'bert_score': 0.4194357693195343}, {'query': 'pay for baggage to go in the hold and no bags arrive because the plane hold is “too small” what a absolute joke!! delta airlines', 'response': 'Oh no. Sorry to hear that. Please message your b

In [15]:
user_query = "I don't understand why you need my order details. what you willdo after that with it?"
response = create_response(user_query)
print(response)

{'response': 'Thank you for reaching out to us, [User Name]. We apologize for any confusion regarding the order details. After receiving your order details, we will be able to assist you with the issue you are experiencing. Our team will review your order and provide you with the best solution possible. Please provide us with your order number, and we will get back to you as soon as possible. If you have any further questions or concerns, please feel free to ask.', 'retrieved_documents': [{'query': 'hey I need help regarding my order.', 'response': 'we will certainly help you with your concern, Adit. Could you please elaborate on the issue?', 'bleu_score': 3.679859816803807e-155, 'bert_score': 0.556242823600769}, {'query': 'Thanks, I’ve emailed with the order number and details.', 'response': 'Email responses are eected within 12 hours. For a quicker resolution, please use the phone or chat option. Good Game', 'bleu_score': 8.588198359462805e-232, 'bert_score': 0.46737197041511536}, {'

In [14]:
user_query = "is the worst ISP I’ve ever had"
response = create_response(user_query)
print(response)

{'response': '😊 Hi there! 😊 Can you please provide your booking reference number so I can assist you better? 😊', 'retrieved_documents': [{'query': 'is the worst ISP I’ve ever had', 'response': 'What did we do to make you feel this way and how can we fix things between us?', 'bleu_score': 1.0377133938315695e-231, 'bert_score': 0.38984888792037964}, {'query': 'So my ISP Really screwed me over, so I switched to', 'response': 'Hi, I am glad to hear you switched to Xfinity! Let me know if you need any assistance!', 'bleu_score': 1.1484186507842885e-231, 'bert_score': 0.48820188641548157}, {'query': 'is the worst customer service', 'response': 'I would love the chance to review the account and provide assistance.', 'bleu_score': 1.0377133938315695e-231, 'bert_score': 0.43950068950653076}, {'query': 'My internet SUCKS!!!! And their is ALWAYS something wrong with my bill. Lol. it is crazy. I could have went to MetroPCS for this', 'response': 'Hey King, Thank you for the feedback. We would like

In [33]:
user_query = "i booked my flight using delta amex card. Checking in now amp was being charged for baggage"
response = create_response(user_query)
print(response)

{'response': "Hi there! I apologize for the inconvenience you're experiencing. Could you please provide more context or details about the issue you're facing? What store is this regarding? I'll do my best to assist you.", 'retrieved_documents': [{'query': 'i booked my flight using delta amex card. Checking in now amp was being charged for baggage', 'response': 'Glad to check. Pls, message your confirmation number for assistance.', 'bleu_score': 8.972141065609098e-232, 'bert_score': 0.4267297089099884}, {'query': 'Also double charged for checked bags on flight out amp told nothing could be done at time, try calling after return flight. Disappointing', 'response': 'Ty, if you were double charged for baggage, please follow and direct message your ticket number and tell us which flight it was.', 'bleu_score': 9.929306298309508e-232, 'bert_score': 0.4354952871799469}, {'query': 'check in online w/ 2 bags permitted at online check in. Get to airport rude agents decide policy is different amp

In [22]:
!pip install flask flask-ngrok

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [23]:
pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [None]:
from pyngrok import ngrok
port_no = 5000
ngrok.set_auth_token(ngrok_token)
public_url =  ngrok.connect(port_no).public_url

In [26]:
public_url

'https://5128-34-19-2-115.ngrok-free.app'

In [27]:
from flask import Flask, request, jsonify
from flask_ngrok import run_with_ngrok  # Allows Flask to work on Colab

app = Flask(__name__)
run_with_ngrok(app)  # Start ngrok to expose the server

@app.route('/generate_response', methods=['POST'])
def generate_response():
    data = request.json
    if not data or "query" not in data:
        return jsonify({"error": "Missing 'query' parameter"}), 400  # Return error if 'query' is missing
    else:
      print(data)
    user_query = data["query"]
    response = create_response(user_query)
    return jsonify({"response": response})

if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


 * Running on http://5128-34-19-2-115.ngrok-free.app
 * Traffic stats available on http://127.0.0.1:4040
{'query': 'I need help resetting my password.'}


INFO:werkzeug:127.0.0.1 - - [29/Mar/2025 20:32:08] "POST /generate_response HTTP/1.1" 200 -


{'query': 'I didn’t receive the reset link.'}


INFO:werkzeug:127.0.0.1 - - [29/Mar/2025 20:33:00] "POST /generate_response HTTP/1.1" 200 -


{'query': 'My cat chewed my phone charger. Is this covered under warranty?'}


INFO:werkzeug:127.0.0.1 - - [29/Mar/2025 20:33:58] "POST /generate_response HTTP/1.1" 200 -


{'query': 'Why did you suggest contacting support?'}


INFO:werkzeug:127.0.0.1 - - [29/Mar/2025 20:34:33] "POST /generate_response HTTP/1.1" 200 -
