### Import Libraries

In [29]:
from encoder.encoder import Encoder
from transcriptor.whisperx import WhisperX
from langchain.vectorstores.pgvector import PGVector
from langchain.docstore.document import Document
import utils
import os
from dotenv import load_dotenv

# Load the database connection settings from the project's env file.
# override=True lets the file win over variables that already exist in the
# process environment: the connection cell reads generic names such as
# USERNAME and HOST, which an operating system or shell may predefine
# (e.g. USERNAME on Windows). Without override, load_dotenv keeps the
# pre-existing value and the file's setting would be silently ignored.
load_dotenv("env/connection.env", override=True)

# Selects the data/<name>.mp4 / data/<name>.wav media files and is also used
# as the name of the vector-store collection.
COLLECTION_NAME = 'inflation' # or 'rag'

### Convert Video into Audio file

In [None]:
# Hand the selected video to the project's WAV converter.
# NOTE(review): presumably this writes data/<COLLECTION_NAME>.wav, which the
# transcription cell reads next — confirm in utils.convert_to_wav.
video_path = f'data/{COLLECTION_NAME}.mp4'
utils.convert_to_wav(video_path)

### Transcribe audio using WhisperX and create documents to be stored in Postgres

In [31]:
# Transcribe the WAV file that belongs to the selected collection.
whisperx = WhisperX()
audio_path = f'data/{COLLECTION_NAME}.wav'
transcription = whisperx.transcribe(audio_path)

# Build one Document per transcribed segment; its text carries the segment's
# start/end values followed by the transcribed text, ready to store in Postgres.
docs = []
for segment in transcription['segments']:
    segment_text = f'start {segment["start"]} - end {segment["end"]}: {segment["text"]}'
    docs.append(Document(page_content=segment_text))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Create connection settings

In [32]:
# Build the PGVector connection string from the database settings found in the
# process environment (loaded from env/connection.env by the first cell).
_DB_ENV_VARS = ("DRIVER", "HOST", "PORT", "DATABASE", "USERNAME", "PASSWORD")
db_settings = {name: os.getenv(name) for name in _DB_ENV_VARS}

# os.getenv returns None for an unset variable. Passing None on would only
# surface later as a hard-to-read connection failure, so stop here and name
# the settings that are missing. Empty strings are allowed through unchanged.
missing_settings = [name for name, value in db_settings.items() if value is None]
if missing_settings:
    raise RuntimeError(
        "Missing database settings in the environment: "
        + ", ".join(missing_settings)
        + " (expected in env/connection.env)"
    )

CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver=db_settings["DRIVER"],
    host=db_settings["HOST"],
    port=db_settings["PORT"],
    database=db_settings["DATABASE"],
    user=db_settings["USERNAME"],
    password=db_settings["PASSWORD"],
)

### Insert embeddings into Postgres

In [33]:
# Embed the transcript documents with the project's encoder and store them in
# the collection named by COLLECTION_NAME; `db` is the handle the query cells use.
# NOTE(review): re-running this cell presumably inserts the same documents
# again — confirm, and clear the collection first if duplicates matter.
embedding_model = Encoder().encoder
db = PGVector.from_documents(
    documents=docs,
    embedding=embedding_model,
    connection_string=CONNECTION_STRING,
    collection_name=COLLECTION_NAME,
)

### Query Database

#### Inflation use case with Portuguese audio

In [28]:
# Ask the same question in Portuguese and in English against `db`.
# NOTE(review): `db` points at the collection chosen by COLLECTION_NAME; this
# section presumably expects COLLECTION_NAME = 'inflation' — confirm.
similar_docs_pt = db.similarity_search("banco central europeu e taxas de juro")
similar_docs_en = db.similarity_search("european central bank and interest rates")

# Show the first two hits of each query, one stored segment per line.
print("Query PT: 'banco central europeu e taxas de juro' | Top 2 results:")
print(*(doc.page_content for doc in similar_docs_pt[:2]), sep='\n')
print("Query EN: 'european central bank and interest rates' | Top 2 results:")
print(*(doc.page_content for doc in similar_docs_en[:2]), sep='\n')

Query PT: 'banco central europeu e taxas de juro' | Top 2 results:
start 34.701 - end 45.0:  O abrendamento da subida de presos no euro dá um sinal forte ao banco central europeu para que não retomo o aumento das taxas de juros.
start 1.361 - end 20.88:  Em flação na zona euro-recoa para 2,4% em novembro, faça os 2,9% de outubro sendo o valor mais baixo desde julho de 2021, de acordo com a estimativa rápida do Eurostat divulgada esta quinta-feira.
Query EN: 'european central bank and interest rates' | Top 2 results:
start 1.361 - end 20.88:  Em flação na zona euro-recoa para 2,4% em novembro, faça os 2,9% de outubro sendo o valor mais baixo desde julho de 2021, de acordo com a estimativa rápida do Eurostat divulgada esta quinta-feira.
start 34.701 - end 45.0:  O abrendamento da subida de presos no euro dá um sinal forte ao banco central europeu para que não retomo o aumento das taxas de juros.


#### RAG use case with English audio

In [37]:
# Ask for the same concept in Portuguese and in English against `db`.
# NOTE(review): `db` is whatever collection the insert cell created for
# COLLECTION_NAME; this section presumably expects COLLECTION_NAME = 'rag'
# (English audio) — confirm before trusting a top-to-bottom run.
similar_docs_pt = db.similarity_search("Geração Aumentada de Recuperação")
similar_docs_en = db.similarity_search("Retrieval Augmented Generation")

# Show the first four hits of each query, one stored segment per line.
print("Query PT: 'Geração Aumentada de Recuperação' | Top 4 results:")
print(*(doc.page_content for doc in similar_docs_pt[:4]), sep='\n')
print("Query EN: 'Retrieval Augmented Generation' | Top 4 results:")
print(*(doc.page_content for doc in similar_docs_en[:4]), sep='\n')

Query PT: 'Geração Aumentada de Recuperação' | Top 4 results:
start 4.681 - end 7.776:  Retrieval augmented generation, R-A-G,
start 85.66 - end 89.476:  Retrievers were developed to solve the problem of question answering, QA.
start 0.101 - end 2.657:  What is retrieval augmented generation?
start 74.96 - end 80.977:  by providing more accurate and up-to-date information through the documents provided by the retriever.
Query EN: 'Retrieval Augmented Generation' | Top 4 results:
start 94.881 - end 97.22:  what is retrieval augmented generation?
start 0.101 - end 2.657:  What is retrieval augmented generation?
start 4.681 - end 7.776:  Retrieval augmented generation, R-A-G,
start 85.66 - end 89.476:  Retrievers were developed to solve the problem of question answering, QA.
