In [1]:
# Bibliotecas
from langchain_qdrant import Qdrant
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DataFrameLoader
from qdrant_client import QdrantClient
import pandas as pd

In [5]:
# Carrega JSON do disco
json_path  = './dfs/data_test.json'
collection = 'cpp'

df = pd.read_json(json_path)  # Read JSON
df = df.fillna('')                # Fill empty fields
df = df.astype(str)               # Only string allowed
print("Shape:", df.shape)         # Shape of the df
df.head()                         # Show first four

Shape: (2354, 8)


Unnamed: 0,postId,postTypeId,title,body,tagName,creationDate,score,viewCount
0,25,1,How to use the C socket API in C++ on z/OS,I'm having issues getting the C sockets API to...,c++,20080801,176,16412
1,264,1,BerkeleyDB Concurrency,What's the optimal level of concurrency that ...,c++,20080801,38,2899
2,330,1,Should I use nested classes in this case?,I am working on a collection of classes used f...,c++,20080802,58,5019
3,601,1,Robust Random Number Generation,"I'm looking for a performant, reasonably robus...",c++,20080803,42,2145
4,609,1,Build for Windows NT 4.0 using Visual Studio 2...,An MFC application that I'm trying to migrate ...,c++,20080803,21,4505


In [6]:
# Converte JSON para Langchain
loader = DataFrameLoader(df, 'body')
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=550, chunk_overlap=55)
texts = text_splitter.split_documents(documents)

In [7]:
# Carrega modelo embedding na GPU (pode usar 'cpu' no lugar de 'cuda')
model_name="BAAI/bge-m3"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)
embeddings

HuggingFaceBgeEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='BAAI/bge-m3', cache_folder=None, model_kwargs={'device': 'cuda'}, encode_kwargs={'normalize_embeddings': True}, query_instruction='Represent this question for searching relevant passages: ', embed_instruction='', show_progress=False)

In [8]:
# Cria a coleção (inicie o docker antes com  "docker run -p 6333:6333 -p 6334:6334 qdrant/qdrant"
# Uma vez carregada, não precisa novamnete, desde que você persista os dados no container
url = "http://localhost:6334"
client = QdrantClient(url=url)

Qdrant.from_documents(
    texts,
    embeddings,
    url=url,
    prefer_grpc=True,
    collection_name=collection,
    force_recreate=False
)
print("Dados carregados com sucesso no banco de dados de vetores!")



Dados carregados com sucesso no banco de dados de vetores!


In [6]:
# Reconfigura o Qdrant para usar a porta 6333
url = "http://localhost:6333"
client = QdrantClient(url=url)

db = Qdrant(client=client, embeddings=embeddings, collection_name=collection)

  db = Qdrant(client=client, embeddings=embeddings, collection_name="alana")


In [14]:
# Teste de similaridade usando texto
# query = "How to implement a HTTP client."
query = "What are the key benefits, challenges, and best practices associated with migrating from C++11 to C++13, as well as performing source code rejuvenation?"

docs = db.similarity_search_with_score(query=query, k=3)
for i in docs:
    doc, score = i
    print({"score": score, "content": doc.page_content, "metadata": doc.metadata },"\n")

{'score': 0.649009, 'content': 'of other programs rely on its behaviour not changing, so wholesale rewriting is pretty much not an option.  Note 2: the source is nearly 20 years old, and has perhaps 30% code churn (lines modified + added / previous total lines) per year. It is heavily maintained and extended, in other words. Thus, one of the goals would be to increase mantainability.  [For the sake of the question, assume that translation into C++ is mandatory, and that leaving it in C is not an option. The point of adding this condition is to weed out the "leave it in C"', 'metadata': {'creationDate': '20081014', 'postId': '199627', 'postTypeId': '1', 'score': '46', 'tagName': 'c++', 'title': 'Converting C source to C++', 'viewCount': '42406', '_id': 'c3ecdd09-d495-455d-9cf4-90eeb9abc165', '_collection_name': 'alana'}} 

{'score': 0.63355523, 'content': 'I have been working as a native C++ programmer for last few years. Now we are starting a new project from the scratch. So what is yo

In [15]:
# Teste de similaridade usando vetor
embedding_vector = embeddings.embed_query(query)
docs =  db.similarity_search_by_vector(embedding_vector, k=3)
for i in docs:
    doc = i
    print({"score": score, "content": doc.page_content, "metadata": doc.metadata},"\n")
print("Embedded query:",embedding_vector[:50])

{'score': 0.6163168, 'content': 'of other programs rely on its behaviour not changing, so wholesale rewriting is pretty much not an option.  Note 2: the source is nearly 20 years old, and has perhaps 30% code churn (lines modified + added / previous total lines) per year. It is heavily maintained and extended, in other words. Thus, one of the goals would be to increase mantainability.  [For the sake of the question, assume that translation into C++ is mandatory, and that leaving it in C is not an option. The point of adding this condition is to weed out the "leave it in C"', 'metadata': {'creationDate': '20081014', 'postId': '199627', 'postTypeId': '1', 'score': '46', 'tagName': 'c++', 'title': 'Converting C source to C++', 'viewCount': '42406', '_id': 'c3ecdd09-d495-455d-9cf4-90eeb9abc165', '_collection_name': 'alana'}} 

{'score': 0.6163168, 'content': 'I have been working as a native C++ programmer for last few years. Now we are starting a new project from the scratch. So what is yo

In [16]:
# Teste usando retriever 
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
retriever.invoke(query)

[Document(metadata={'creationDate': '20081014', 'postId': '199627', 'postTypeId': '1', 'score': '46', 'tagName': 'c++', 'title': 'Converting C source to C++', 'viewCount': '42406', '_id': 'c3ecdd09-d495-455d-9cf4-90eeb9abc165', '_collection_name': 'alana'}, page_content='of other programs rely on its behaviour not changing, so wholesale rewriting is pretty much not an option.  Note 2: the source is nearly 20 years old, and has perhaps 30% code churn (lines modified + added / previous total lines) per year. It is heavily maintained and extended, in other words. Thus, one of the goals would be to increase mantainability.  [For the sake of the question, assume that translation into C++ is mandatory, and that leaving it in C is not an option. The point of adding this condition is to weed out the "leave it in C"'),
 Document(metadata={'creationDate': '20080930', 'postId': '152436', 'postTypeId': '1', 'score': '12', 'tagName': 'c++', 'title': 'Do you recommend Native C++ to C++\\CLI shift?',

In [17]:
# Teste de envio do contexto para LLM
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

llm = ChatOllama(model="llama3")

system_prompt = ("""
You are an experienced C++ developer and a frequent contributor to Stack Overflow. 
Your task is to provide a comprehensive, well-structured, and insightful answer to the question below based on the provided context.

### Context Information:
{context}

### Question:
What are the key benefits, challenges, and best practices associated with migrating between C++ versions, as well as performing source code rejuvenation?

### Instructions:
1. Use only the context provided to support your answer, ensuring relevance and clarity.
2. Be specific and provide actionable, real-world examples when possible.
3. Answer each aspect of the question: benefits, challenges, and best practices.
4. Structure your answer clearly, using bullet points or numbered lists.
5. If the context does not provide enough information, state 'I don't know'.

### Response Format:
--------------------------------------------------------------
**Benefits:**
1. [Provide a benefit of migration here.]
2. [Provide another benefit of migration here.]
3. [Provide a third benefit of migration here.]

**Challenges:**
1. [Provide a challenge of migration here.]
2. [Provide another challenge of migration here.]
3. [Provide a third challenge of migration here.]

**Best Practices for Source Code Rejuvenation:**
1. [Provide a practice for source code rejuvenation here.]
2. [Provide another practice for source code rejuvenation here.]
3. [Provide a third practice for source code rejuvenation here.]

--------------------------------------------------------------
Ensure that the response is concise, actionable, and well-formatted. 
                 

""")

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

chain = prompt | llm | StrOutputParser()
question_answer_chain = create_stuff_documents_chain(llm, prompt)

rag_chain = create_retrieval_chain(retriever, question_answer_chain)

response = rag_chain.invoke({"input": "What are the key benefits, challenges, and best practices associated with migrating from C++11 to C++13, as well as performing source code rejuvenation??"})
print(response["answer"])

  llm = ChatOllama(model="llama3")


**Benefits:**

1. **Improved Code Readability**: C++13 introduces several language features that improve code readability, such as auto type deduction, generic lambdas, and improved constexpr support.
2. **Better Error Messages**: The new `[[nodiscard]]` attribute helps compilers generate more informative error messages when a function is not used or has a non-void return type.
3. **Enhanced Concurrency Support**: C++13 introduces the `<atomic>` header for atomic operations and the `<thread>` header for concurrent programming, making it easier to write thread-safe code.

**Challenges:**

1. **Compatibility Issues**: When migrating from C++11 to C++13, you may encounter compatibility issues with third-party libraries or code that relies on specific language features.
2. **Code Rewrite Effort**: Updating your code to take advantage of new language features can require significant effort and rewriting of existing code.
3. **Debugging Complexity**: With the introduction of new language fea

In [18]:
# Metadado da resposta (incluindo de onde vieram os posts)
response

{'input': 'What are the key benefits, challenges, and best practices associated with migrating from C++11 to C++13, as well as performing source code rejuvenation??',
 'context': [Document(metadata={'creationDate': '20081014', 'postId': '199627', 'postTypeId': '1', 'score': '46', 'tagName': 'c++', 'title': 'Converting C source to C++', 'viewCount': '42406', '_id': 'c3ecdd09-d495-455d-9cf4-90eeb9abc165', '_collection_name': 'alana'}, page_content='of other programs rely on its behaviour not changing, so wholesale rewriting is pretty much not an option.  Note 2: the source is nearly 20 years old, and has perhaps 30% code churn (lines modified + added / previous total lines) per year. It is heavily maintained and extended, in other words. Thus, one of the goals would be to increase mantainability.  [For the sake of the question, assume that translation into C++ is mandatory, and that leaving it in C is not an option. The point of adding this condition is to weed out the "leave it in C"'),