In [1]:
import json
from pathlib import Path 
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import hashlib
from openai import OpenAI
from dotenv import find_dotenv, load_dotenv
import pandas as pd
import asyncio

import logging
from functools import partial
from typing import List, Dict
import numpy as np
# Load variables from the .env file located by find_dotenv() into the process
# environment before the OpenAI client is constructed (presumably so the client
# can pick up its API key from the environment -- confirm against your .env).
load_dotenv(find_dotenv())
# Shared OpenAI client used by llm() further down.
client = OpenAI()

In [3]:
# Client for the Elasticsearch node expected at localhost:9200.
es_client = Elasticsearch(['http://localhost:9200'])
# Displaying info() doubles as a connectivity check: the cell output shows the
# node/cluster metadata returned by the server.
es_client.info()

ObjectApiResponse({'name': '858de63ad72c', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'BWEYwsaOR5avAVRmBaF96w', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [4]:
# Sentence-embedding model used both for indexing documents and for encoding
# queries. The index mapping defined later declares 384 dims for the stored
# vectors, so a different model must match (or the mapping must be updated).
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



In [5]:
# Load the raw vague/actual pairs from <parent of cwd>/data/initial_data.json.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
data_path = Path.cwd().parent / 'data' / 'initial_data.json'
with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [6]:
# Preview only the first few records; displaying the whole list floods the
# notebook output.
data[:5]

[{'vague': 'We are diving into the cloud and creating some virtual magic, so grab your surfboards.',
  'actual': 'We are migrating our services to a cloud infrastructure to improve flexibility and scalability.'},
 {'vague': 'Lets tinker under the hood and fine-tune those APIs before our users start crying foul.',
  'actual': 'We need to troubleshoot and optimize our application programming interfaces to avoid user complaints.'},
 {'vague': 'We need to switch gears and kick it into hyperdrive or we are going to be stuck in traffic.',
  'actual': 'We need to accelerate our development process to meet project deadlines.'},
 {'vague': 'We are in a bit of a pickle with our bandwidth hogs, so we better do some spring cleaning.',
  'actual': 'We need to analyze and optimize our network usage to improve performance.'},
 {'vague': 'Lets put the pedal to the metal and make sure our software is not just sitting on the shelf collecting dust.',
  'actual': 'We need to ensure our software is activel

In [7]:
# Encode every vague statement in a single batched call instead of one
# model.encode() call per document.
vague_embeddings = model.encode([doc['vague'] for doc in data])

for doc, embedding in zip(data, vague_embeddings):
    # Store as a plain list so the record stays JSON-serialisable.
    doc['vague_embedding'] = embedding.tolist()
    # Deterministic id: md5 of the concatenated vague + actual text. The scheme
    # is kept unchanged because the ground-truth file refers to documents by id.
    concatenated_fields = doc['vague'] + doc['actual']
    doc['id'] = hashlib.md5(concatenated_fields.encode()).hexdigest()


In [8]:
# Persist the enriched records next to the input file. At this point each
# record carries 'id' and 'vague_embedding' in addition to the original fields,
# so the embeddings are written out as well.
output_path = Path().cwd().parents[0] / 'data' / 'initial_data_w_id.json'
with output_path.open('w') as out_file:
    out_file.write(json.dumps(data))

In [9]:
# Index definition: one shard, no replicas. 'vague' and 'actual' are text
# fields, 'id' is a keyword, and 'vague_embedding' is an indexed 384-dim dense
# vector compared with cosine similarity (the field queried by the kNN search).
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "id": {"type": "keyword"},
            "vague": {"type": "text"},
            "actual": {"type": "text"},
            "vague_embedding": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine",
            },
        },
    },
}

In [10]:
# Name of the index used by every indexing and search call in this notebook.
index_name = 'vague-actual'
# Drop any previous copy of the index so this cell can be re-run from scratch;
# ignore_unavailable=True is passed so a missing index is not treated as an error.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
# Recreate the index with the settings and mappings from index_settings.
es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vague-actual'})

In [11]:
# Index the records one at a time, using each record's precomputed 'id' as the
# Elasticsearch document id; tqdm renders a progress bar over the loop.
for record in tqdm(data):
    es_client.index(index=index_name, id=record['id'], body=record)

  0%|          | 0/815 [00:00<?, ?it/s]

In [12]:
# Example vague statement used to smoke-test retrieval in the next cells.
query = 'Can we pour some holy water on our deployment pipeline?'

In [None]:
# Embed the example query with the same model that produced the stored
# document embeddings.
v_q= model.encode(query)

In [13]:
def elastic_search_knn(vector, k=5, num_candidates=100):
    """Run a kNN search on the ``vague_embedding`` field of ``index_name``.

    Args:
        vector: Query embedding. Either a plain list of floats or any object
            exposing ``tolist()`` (such as the array returned by
            ``model.encode``).
        k: Number of nearest neighbours to return. Defaults to 5, the value
            that was previously hard-coded.
        num_candidates: Size of the candidate pool the kNN search considers.
            Defaults to 100, the value that was previously hard-coded.

    Returns:
        A list with the ``_source`` of every hit (restricted to the 'vague',
        'actual' and 'id' fields), in the order Elasticsearch returned them.
    """
    # Normalise array-like input to a plain list so the request body does not
    # rely on the client's serializer understanding array types.
    if hasattr(vector, 'tolist'):
        vector = vector.tolist()

    search_query = {
        'knn': {
            'field': 'vague_embedding',
            'query_vector': vector,
            'num_candidates': num_candidates,
            'k': k,
        },
        # Only fetch the fields callers use; the stored embedding is left out.
        '_source': ['vague', 'actual', 'id'],
    }
    es_results = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [None]:
# Retrieve the nearest stored documents for the example query embedding and
# display them as the cell output.
res = elastic_search_knn(v_q)
res

In [None]:
def build_prompt(query, search_results):
    """Build the LLM prompt for translating a vague statement.

    Args:
        query: The vague statement or question to translate.
        search_results: Iterable of dicts with 'vague' and 'actual' keys (as
            returned by ``elastic_search_knn``) used as translation examples.

    Returns:
        The prompt string: instructions, the VAGUE statement, then a CONTEXT
        section listing each retrieved vague/actual pair separated by a blank
        line.
    """
    # The template is written flush-left so the source indentation does not
    # leak into the text sent to the model.
    prompt_template = (
        "You are a translator from vague boss language into an everyday language. Translate\n"
        "the VAGUE statement or question based on the CONTEXT. Provide a clear and concise answer.\n"
        "VAGUE: {vague}\n"
        "\n"
        "CONTEXT: {context}"
    )

    context = "\n\n".join(
        f"vague: {doc['vague']}\nactual: {doc['actual']}" for doc in search_results
    )
    return prompt_template.format(vague=query, context=context)

In [None]:
def llm(prompt, gpt_model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text to send.
        gpt_model: Name of the chat model to call.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    chat_messages = [{'role': 'user', 'content': prompt}]
    completion = client.chat.completions.create(model=gpt_model, messages=chat_messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
def rag(vague, gpt_model='gpt-4o-mini'):
    """Translate a vague statement via retrieve-then-generate.

    Encodes ``vague`` with the embedding model, fetches the nearest stored
    examples with ``elastic_search_knn``, builds a prompt from them and
    returns the answer produced by ``llm``.

    Args:
        vague: The vague statement or question to translate.
        gpt_model: Chat model name forwarded to ``llm``.

    Returns:
        The answer returned by ``llm`` for the built prompt.
    """
    retrieved = elastic_search_knn(model.encode(vague))
    return llm(build_prompt(vague, search_results=retrieved), gpt_model=gpt_model)

In [None]:
# End-to-end smoke test of the pipeline (encode -> kNN search -> prompt -> LLM)
# on a sample vague statement; the answer is shown as the cell output.
rag('Let us rock and roll this deployment')

# Evaluating retrieval

In [20]:
# Ground-truth queries for retrieval evaluation. The evaluation loop below
# reads the 'vague' (query text) and 'doc_id' (expected document id) columns.
df_ground_truth = pd.read_csv('../data/ground_truth_data.csv')

In [21]:
# One dict per CSV row, which is convenient to iterate over in the evaluation loop.
ground_truth = df_ground_truth.to_dict(orient='records')

In [28]:
# Project-local metrics module (provides hit_rate and mrr, used below).
import importlib
import text_retrieval_metrics
# Re-import the module so edits made to it on disk are picked up without
# restarting the kernel.
importlib.reload(text_retrieval_metrics)

<module 'text_retrieval_metrics' from '/home/taras/my_code_for_courses/llm-zoomcamp/Project/data_utils/text_retrieval_metrics.py'>

In [23]:
# For every ground-truth query, run the kNN search and record, per returned
# document, whether it is the expected one. relevance_total ends up as one list
# of booleans per query, in result order.
relevance_total = []
for entry in tqdm(ground_truth):
    expected_id = entry['doc_id']
    query_vector = model.encode(entry['vague'])
    hits = elastic_search_knn(vector=query_vector)
    relevance_total.append([hit['id'] == expected_id for hit in hits])

  0%|          | 0/4075 [00:00<?, ?it/s]

In [29]:
# Hit rate over all ground-truth queries. The implementation lives in
# text_retrieval_metrics (not shown here) -- presumably the fraction of queries
# whose expected document appears among the returned hits; confirm in the module.
text_retrieval_metrics.hit_rate(relevance_total)

0.8768098159509202

In [30]:
# MRR over all ground-truth queries. The implementation lives in
# text_retrieval_metrics (not shown here) -- presumably mean reciprocal rank of
# the expected document within the returned hits; confirm in the module.
text_retrieval_metrics.mrr(relevance_total)

0.8103517382413096