In [24]:
import pprint
import pandas as pd

from pathlib import Path

from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.schema import Document

# Load the cleaned CSV; the code below reads its 'textContent' and 'id' columns.
df = pd.read_csv("Data/cleaned_output1.csv")

# Build one Document per CSV row. axis=1 is required here: DataFrame.apply
# defaults to axis=0, which passes each *column* to the lambda, so
# row['textContent'] would raise a KeyError instead of reading the cell value.
documents = df.apply(
    lambda row: Document(
        page_content=row['textContent'],
        metadata={'source': row['id']},
    ),
    axis=1,
).tolist()  # plain list of Documents rather than a pandas Series

# This "documents" collection can be joined (combined) with other ones.
# loader = TextLoader("../Data/state of the union.txt")
# documents = loader.load()
# csv_loader = CSVLoader()
# documents = csv_loader.load("../Data/clean_output1.csv")

# Chunk the loaded documents (chunk_size=500, chunk_overlap=100).
text_splitter = CharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
docs = text_splitter.split_documents(documents)

# Sentence-transformers embedding model, forced onto the CPU.
model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
model_kwargs = dict(device="cpu")
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
)

# Open the persisted Chroma store when its directory is already on disk,
# otherwise build it from the freshly split chunks.
chroma_db_filepath = Path("./chroma_db")
persist_dir = str(chroma_db_filepath)
if not chroma_db_filepath.exists():
    db = Chroma.from_documents(docs, embeddings, persist_directory=persist_dir)
else:
    # NOTE(review): in this branch `docs` is not passed to the store, so an
    # existing ./chroma_db built from different input is reused as-is.
    # Presumably the directory must be deleted to re-index -- confirm.
    db = Chroma(embedding_function=embeddings, persist_directory=persist_dir)

# Ask the vector store for the top 4 hits together with their scores.
sim = db.similarity_search_with_score(
    "What did the president say about Ketanji Brown Jackson?",
    k=4,
)

# Flatten each (document, score) pair into (score, source, text) and order
# the tuples by ascending score.
results = sorted(
    ((score, doc.metadata["source"], doc.page_content) for doc, score in sim),
    key=lambda item: item[0],
)

pprint.pprint(results)

Downloading (…)9e268/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)f2cd19e268/README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading (…)cd19e268/config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)9e268/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading (…)d19e268/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

[(10.45965223073447,
  '../Data/state of the union.txt',
  'One of the most serious constitutional responsibilities a President has is '
  'nominating someone to serve on the United States Supreme Court. \n'
  '\n'
  'And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge '
  'Ketanji Brown Jackson. One of our nation’s top legal minds, who will '
  'continue Justice Breyer’s legacy of excellence.'),
 (11.86820177173809,
  '../Data/state of the union.txt',
  'He will never extinguish their love of freedom. He will never weaken the '
  'resolve of the free world. \n'
  '\n'
  'We meet tonight in an America that has lived through two of the hardest '
  'years this nation has ever faced. \n'
  '\n'
  'The pandemic has been punishing. \n'
  '\n'
  'And so many families are living paycheck to paycheck, struggling to keep up '
  'with the rising cost of food, gas, housing, and so much more. \n'
  '\n'
  'I understand.'),
 (12.239472097286281,
  '../Data/state of the union.

In [23]:
!pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence-transformers-2.2.2.tar.gz (85 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting torchvision (from sentence-transformers)
  Using cached torchvision-0.16.0-cp39-cp39-win_amd64.whl (1.3 MB)
Collecting sentencepiece (from sentence-transformers)
  Using cached sentencepiece-0.1.99-cp39-cp39-win_amd64.whl (977 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers<5.0.0,>=4.6.0->sentence-transformers)
  Downloading tokenizers-0.13.3-cp39-cp39-win_amd64.whl (3.5 MB)
                                              0.0/3.5 MB ? eta -:--:--
                                              0.1/3.5 MB 1.7 MB/s eta 0:00:03
     --                                       0.2/3.5 MB 2.1 MB/s eta 0:00:02
     -----                                    0.5/3.5 MB 4.0 MB/s eta 0:00:01
     -------------                            1.1/3.5 MB 6.6 MB/s eta 0:00:01
     ----------


[notice] A new release of pip is available: 23.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
