### Import Libraries

In [69]:
from encoder.encoder import Encoder
from transcriptor.whisperx import WhisperX
from langchain.vectorstores.pgvector import PGVector
from langchain.docstore.document import Document
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import utils
import os
from dotenv import load_dotenv

# Load the settings stored in env/connection.env into the process environment;
# the connection cell later reads DRIVER, HOST, PORT, DATABASE, USERNAME and
# PASSWORD through os.getenv.
# NOTE(review): by default load_dotenv does not override variables that are
# already set, and USERNAME is often predefined by the OS (e.g. on Windows) —
# confirm the intended database user is the one actually picked up.
load_dotenv("env/connection.env")

# Selects the dataset: used as the base name of the media files under data/
# and as the PGVector collection name.
COLLECTION_NAME = "pt"  # or 'en'

### Convert Video into Audio file

In [None]:
# Extract the audio track from the selected video.
# Presumably writes data/{COLLECTION_NAME}.wav next to the .mp4, since that is
# the file the transcription cell reads — TODO confirm in utils.convert_to_wav.
utils.convert_to_wav(f"data/{COLLECTION_NAME}.mp4")

### Transcribe audio using WhisperX and create documents to be stored in Postgres

In [None]:
# WhisperX is too slow on CPU, so the (large) Whisper model is used instead.
whisperx = WhisperX(model_name="whisper")
transcription = whisperx.transcribe(
    f"data/{COLLECTION_NAME}.wav"
)

# One Document per transcribed segment; the start/end timestamps are embedded
# in the text so that every search hit can be located in the recording.
docs = [
    Document(
        page_content=f'start {segment["start"]} - end {segment["end"]}: {segment["text"]}'
    )
    for segment in transcription["segments"]
]

### Create connection settings

In [72]:
# Maps each connection parameter expected by PGVector to the environment
# variable that supplies it (loaded from env/connection.env in the first cell).
_DB_ENV_VARS = {
    "driver": "DRIVER",
    "host": "HOST",
    "port": "PORT",
    "database": "DATABASE",
    "user": "USERNAME",
    "password": "PASSWORD",
}

# Fail fast with a readable message: an unset variable would otherwise be
# formatted into the URL as the literal text "None" and only surface later as
# an obscure connection error.
_missing_vars = [var for var in _DB_ENV_VARS.values() if os.getenv(var) is None]
if _missing_vars:
    raise RuntimeError(
        "Missing database settings in the environment: "
        + ", ".join(_missing_vars)
        + " (expected in env/connection.env)"
    )

# Values are passed through unchanged (as strings), exactly as os.getenv
# returned them before, so the resulting connection string is identical
# whenever all variables are set.
CONNECTION_STRING = PGVector.connection_string_from_db_params(
    **{param: os.environ[var] for param, var in _DB_ENV_VARS.items()}
)

### Insert embeddings into Postgres

In [73]:
# Embed the transcript documents with the project encoder and store them in
# the Postgres collection named after the selected dataset.
db = PGVector.from_documents(
    documents=docs,
    embedding=Encoder().encoder,
    connection_string=CONNECTION_STRING,
    collection_name=COLLECTION_NAME,
    # Wipes the records previously stored for this collection before
    # inserting; convenient while testing.
    pre_delete_collection=True,
)

Collection not found


### Query Database

#### Huub use case with Portuguese audio

In [68]:
# Ask the same question in Portuguese and in English to compare what the
# vector store returns for each language (4 nearest segments per query).
similar_docs_pt = db.similarity_search("marcas e investimentos", k=4)
similar_docs_en = db.similarity_search("brands and investments", k=4)

# Show the hits of each query, one transcript segment per line.
print("Query PT: 'marcas e investimentos' | Top 4 results:")
print("\n".join(doc.page_content for doc in similar_docs_pt))
print("Query EN: 'brands and investments' | Top 4 results:")
print("\n".join(doc.page_content for doc in similar_docs_en))

Query PT: 'marcas e investimentos' | Top 4 results:
start 81.401 - end 85.26:  uma marca baseada em Berlim, portanto ao invés de esperarmos que esse investimento chegasse,
start 111.902 - end 117.26:  para maturar o produto, para investir em tecnologia e para investir no business development.
start 88.58 - end 93.039:  Estas duas rondas de investimento que nós já fizemos com uma capacidade também diferente
start 28.6 - end 32.64:  Portanto, damos toda a componente de infraestrutura logística que uma marca precisa.
Query EN: 'brands and investments' | Top 4 results:
start 81.401 - end 85.26:  uma marca baseada em Berlim, portanto ao invés de esperarmos que esse investimento chegasse,
start 105.26 - end 111.24:  Portanto esta ronda de investimento para nós significou um incremento daquilo que é a nossa capacidade
start 111.902 - end 117.26:  para maturar o produto, para investir em tecnologia e para investir no business development.
start 88.58 - end 93.039:  Estas duas rondas de investi

#### Probabilistic Deep Learning use case with English audio

In [110]:
# Same question asked in Portuguese and in English, 8 nearest segments each.
# NOTE(review): this section targets the English recording, so `db` presumably
# has to be built with COLLECTION_NAME = "en" before running it — confirm.
similar_docs_pt = db.similarity_search("modelos de aprendizagem profunda", k=8)
similar_docs_en = db.similarity_search("deep learning models", k=8)

# For the Portuguese query only the LAST returned hit is shown ([-1], i.e. the
# 8th when 8 results come back); for the English query the first hit is shown.
print("Query PT: 'modelos de aprendizagem profunda' | Top 8 result:")
print(similar_docs_pt[-1].page_content)
print("Query EN: 'deep learning models' | Top 1 result:")
print(similar_docs_en[0].page_content)

Query PT: 'modelos de aprendizagem profunda' | Top 8 result:
start 45.28 - end 51.9:  when we are using deep learning models, usually we are relying on maximum likelihood estimation
Query EN: 'deep learning models' | Top 1 result:
start 45.28 - end 51.9:  when we are using deep learning models, usually we are relying on maximum likelihood estimation


In [111]:
# Ask the same question in Portuguese and in English to compare what the
# vector store returns for each language (4 nearest segments per query).
similar_docs_pt = db.similarity_search("distribuição normal", k=4)
similar_docs_en = db.similarity_search("normal distribution", k=4)

# Show the hits of each query, one transcript segment per line.
print("Query PT: 'distribuição normal' | Top 4 results:")
print("\n".join(doc.page_content for doc in similar_docs_pt))
print("Query EN: 'normal distribution' | Top 4 results:")
print("\n".join(doc.page_content for doc in similar_docs_en))

Query PT: 'distribuição normal' | Top 4 results:
start 61.102 - end 64.598: Well, when we calculated from a normal standard, normal distribution,
start 19.601 - end 24.739:  normal distribution that is exactly as we we've built it before, univariate normal distribution.
start 0.201 - end 2.228:  the mean of this normal distribution.
start 73.421 - end 74.545:  this normal distribution.
Query EN: 'normal distribution' | Top 4 results:
start 61.102 - end 64.598: Well, when we calculated from a normal standard, normal distribution,
start 0.201 - end 2.228:  the mean of this normal distribution.
start 73.421 - end 74.545:  this normal distribution.
start 19.601 - end 24.739:  normal distribution that is exactly as we we've built it before, univariate normal distribution.
