In [1]:
# Instala dependências
! pip install -qr requirements.txt

# Carrega bibliotecas
from langchain_qdrant import Qdrant
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DataFrameLoader
from qdrant_client import QdrantClient
import pandas as pd

# Cosmetic pandas tweak: widen the text display so printed frames wrap less
pd.set_option('display.width', 1000)

# Load the model used to generate embeddings
model_name = 'BAAI/bge-m3'
model_kwargs = {'device': 'cuda'}  # NOTE(review): needs a CUDA-capable GPU; use 'cpu' otherwise
encode_kwargs = {'normalize_embeddings': True}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Qdrant
# docker run -p 6333:6333 -p 6334:6334 qdrant/qdrant
#
# `url` must point at the REST/HTTP port (6333). Port 6334 is Qdrant's gRPC
# port: a client built from a URL ending in :6334 without prefer_grpc=True
# would send HTTP requests to the gRPC endpoint. Callers that pass
# prefer_grpc=True together with this URL still reach gRPC, because
# qdrant-client's `grpc_port` defaults to 6334 independently of the URL port.
url = 'http://localhost:6333'
client = QdrantClient(url=url)

In [2]:
# Load the C++ dataset
collection = 'cpp'
json_path = f'./dfs/data_{collection}.json'

# Read the JSON dump, replace missing values with '' and cast every column
# to str (the original note says only strings are allowed downstream).
df = pd.read_json(json_path).fillna('').astype(str)
print("Shape:", df.shape)
print(df.head())  # head() defaults to the first five rows

# Convert the DataFrame into LangChain documents ('body' is passed as the
# page-content column), then split them into chunks with a 10% overlap.
loader = DataFrameLoader(df, 'body')
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=550,
    chunk_overlap=55,
)
texts = text_splitter.split_documents(documents)

# Embed the chunks and insert them into the Qdrant collection.
# NOTE(review): with force_recreate=False an existing collection is kept, so
# re-running this cell presumably appends the same chunks again - confirm
# before doing a full re-run.
Qdrant.from_documents(
    texts,
    embeddings,
    url=url,
    prefer_grpc=True,
    collection_name=collection,
    force_recreate=False,
)
print(f'Coleção {collection} carregada com sucesso no Qdrant!')

Shape: (38595, 8)
  postId postTypeId                                              title                                               body tagName creationDate score viewCount
0     25          1         How to use the C socket API in C++ on z/OS  I'm having issues getting the C sockets API to...     c++     20080801   176     16412
1    264          1                             BerkeleyDB Concurrency   What's the optimal level of concurrency that ...     c++     20080801    38      2899
2    330          1          Should I use nested classes in this case?  I am working on a collection of classes used f...     c++     20080802    58      5019
3    601          1                    Robust Random Number Generation  I'm looking for a performant, reasonably robus...     c++     20080803    42      2145
4    609          1  Build for Windows NT 4.0 using Visual Studio 2...  An MFC application that I'm trying to migrate ...     c++     20080803    21      4505




Coleção cpp carregada com sucesso no Qdrant!


In [3]:
# Load the Java dataset
collection = 'java'
json_path = f'./dfs/data_{collection}.json'

# Read the JSON dump, replace missing values with '' and cast every column
# to str (the original note says only strings are allowed downstream).
df = pd.read_json(json_path).fillna('').astype(str)
print("Shape:", df.shape)
print(df.head())  # head() defaults to the first five rows

# Convert the DataFrame into LangChain documents ('body' is passed as the
# page-content column), then split them into chunks with a 10% overlap.
loader = DataFrameLoader(df, 'body')
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=550,
    chunk_overlap=55,
)
texts = text_splitter.split_documents(documents)

# Embed the chunks and insert them into the Qdrant collection.
# NOTE(review): with force_recreate=False an existing collection is kept, so
# re-running this cell presumably appends the same chunks again - confirm
# before doing a full re-run.
Qdrant.from_documents(
    texts,
    embeddings,
    url=url,
    prefer_grpc=True,
    collection_name=collection,
    force_recreate=False,
)
print(f'Coleção {collection} carregada com sucesso no Qdrant!')

Shape: (66526, 8)
  postId postTypeId                                              title                                               body tagName creationDate score viewCount
0    123          1        Java lib or app to convert CSV to XML file?  Is there an existing application or library in...    java     20080801   121     81172
1    126          1  How would you access Object properties from wi...  What is the "purist" or "correct" way to acces...    java     20080801   106     26730
3    564          1  What is the difference between an int and an I...  I was reading More Joel on Software when I cam...    java     20080802   271    249050
4   2092          1   How to get started writing a code coverage tool?  Looking for books or other references that dis...    java     20080805    30      3647




Coleção java carregada com sucesso no Qdrant!


In [4]:
# Load the JavaScript dataset
collection = 'javascript'
json_path = f'./dfs/data_{collection}.json'

# Read the JSON dump, replace missing values with '' and cast every column
# to str (the original note says only strings are allowed downstream).
df = pd.read_json(json_path).fillna('').astype(str)
print("Shape:", df.shape)
print(df.head())  # head() defaults to the first five rows

# Convert the DataFrame into LangChain documents ('body' is passed as the
# page-content column), then split them into chunks with a 10% overlap.
loader = DataFrameLoader(df, 'body')
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=550,
    chunk_overlap=55,
)
texts = text_splitter.split_documents(documents)

# Embed the chunks and insert them into the Qdrant collection.
# NOTE(review): with force_recreate=False an existing collection is kept, so
# re-running this cell presumably appends the same chunks again - confirm
# before doing a full re-run.
Qdrant.from_documents(
    texts,
    embeddings,
    url=url,
    prefer_grpc=True,
    collection_name=collection,
    force_recreate=False,
)
print(f'Coleção {collection} carregada com sucesso no Qdrant!')

Shape: (64935, 8)
  postId postTypeId                                              title                                               body     tagName creationDate score viewCount
0    845          1  How to detect which one of the defined font wa...  Suppose I have the following CSS rule in my pa...  javascript     20080803   157     55224
1   1401          1              ASP.Net Custom Client-Side Validation  I have a custom validation function in JavaScr...  javascript     20080804    38      3694
2   1873          1  Triple Quotes? How do I delimit a databound Ja...  How do I delimit a Javascript data-bound strin...  javascript     20080805    47     23470
3   2914          1  How can I detect if a browser is blocking a po...  Occasionally, I've come across a webpage that ...  javascript     20080805   165    143178
4   3224          1  How can I make the browser see CSS and Javascr...  CSS and Javascript files don't change very oft...  javascript     20080806    64      7885




Coleção javascript carregada com sucesso no Qdrant!


In [5]:
# Load the Python dataset
collection = 'python'
json_path = f'./dfs/data_{collection}.json'

# Read the JSON dump, replace missing values with '' and cast every column
# to str (the original note says only strings are allowed downstream).
df = pd.read_json(json_path).fillna('').astype(str)
print("Shape:", df.shape)
print(df.head())  # head() defaults to the first five rows

# Convert the DataFrame into LangChain documents ('body' is passed as the
# page-content column), then split them into chunks with a 10% overlap.
loader = DataFrameLoader(df, 'body')
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=550,
    chunk_overlap=55,
)
texts = text_splitter.split_documents(documents)

# Embed the chunks and insert them into the Qdrant collection.
# NOTE(review): with force_recreate=False an existing collection is kept, so
# re-running this cell presumably appends the same chunks again - confirm
# before doing a full re-run.
Qdrant.from_documents(
    texts,
    embeddings,
    url=url,
    prefer_grpc=True,
    collection_name=collection,
    force_recreate=False,
)
print(f'Coleção {collection} carregada com sucesso no Qdrant!')

Shape: (9696, 8)
  postId postTypeId                                              title                                               body     tagName creationDate score viewCount
0   8692          1                        How to use XPath in Python?  What are the libraries that support XPath? Is ...  python-2.x     20080812   258    382119
1  34611          1     Toolkit Options for 2D Python Game Programming  What are some toolkits for developing 2D games...  python-3.x     20080829    13      6154
2  36932          1           How can I represent an 'Enum' in Python?  I'm mainly a C# developer, but I'm currently w...  python-3.x     20080831  1141   1285129
3  91205          1  Will everything in the standard library treat ...  I'm a little confused about how the standard l...  python-3.x     20080918    12       489
4  94935          1  What is the difference between range and xrang...  Apparently xrange is faster but I have no idea...  python-2.x     20080918   848    491148




Coleção python carregada com sucesso no Qdrant!
