In [16]:
from dotenv import load_dotenv
import pathlib
import textwrap

import google.generativeai as genai

# Used to securely store your API key
import os
from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [17]:
load_dotenv()

True

In [18]:
GOOGLE_API_KEY=os.getenv('APIKEY')

genai.configure(api_key=GOOGLE_API_KEY)

In [19]:
GenModel = genai.GenerativeModel('gemini-pro')

In [20]:
from bark.generation import (
    generate_text_semantic,
    preload_models,
)
from bark.api import semantic_to_waveform
from bark import generate_audio, SAMPLE_RATE

In [21]:
preload_models('/media/frost-head/files/bark-small/', text_use_small=True,fine_use_small=True, coarse_use_small=True)

KeyboardInterrupt: 

In [18]:
from transformers import pipeline
import torch

# # ... (your code)
# config = BarkSemanticConfig()
# # Free up GPU memory

torch.cuda.empty_cache()



In [22]:
import spacy

# Load the 'en_core_web_sm' model
nlp = spacy.load('en_core_web_sm')


In [9]:
# processor = AutoProcessor.from_pretrained("/media/frost-head/files/bark-small/")
# model = BarkModel.from_pretrained("/media/frost-head/files/bark-small")
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(device)
# # device = "cpu"

# model = model.to(device)


In [1]:
import os
import torch
import faiss
import fitz  # PyMuPDF
import numpy as np
import spacy
from transformers import AutoTokenizer, AutoModel

# Choose a different embedding model if needed
embedding_model_name = "thenlper/gte-base"

# Load chosen BERT-based model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
model = AutoModel.from_pretrained(embedding_model_name)

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as pdf_document:
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            text += page.get_text("text")
    return text

# Function to create embeddings for a given text with handling for maximum sequence length
def get_embedding_for_text(text, max_seq_length=512):
    tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_seq_length)

    if tokens["input_ids"].size(1) > max_seq_length:
        tokens["input_ids"] = tokens["input_ids"][:, :max_seq_length]
        tokens["attention_mask"] = tokens["attention_mask"][:, :max_seq_length]

    with torch.no_grad():
        outputs = model(**tokens)
        embedding = outputs.last_hidden_state.mean(dim=1).numpy()

    return embedding

# Function to create embeddings for overlapping paragraphs of data and store in Faiss index
def create_data_embeddings_and_index(pdf_folder, existing_index=None, window_size=3):
    data_embeddings = []

    if existing_index is None:
        # Use IndexFlatL2
        data_index = faiss.IndexFlatL2(768)
    else:
        data_index = faiss.index_cpu_to_gpu(existing_index, 0)  # Use GPU if available

    for pdf_file in os.listdir(pdf_folder):
        pdf_path = os.path.join(pdf_folder, pdf_file)
        text = extract_text_from_pdf(pdf_path)

        # Use SpaCy for sentence tokenization
        sentences = [str(sentence) for sentence in nlp(text).sents]

        for i in range(0, len(sentences), window_size - 1):
            # Combine overlapping sentences to create a paragraph
            paragraph = " ".join(sentences[i:i + window_size])

            # Ensure the paragraph is not empty
            if paragraph.strip():
                embedding = get_embedding_for_text(paragraph)
                data_embeddings.append({"embedding": embedding, "paragraph": paragraph})
                data_index.add(np.array(embedding).astype('float32'))

    return data_embeddings, data_index

# Function to retrieve relevant paragraphs based on a query
def retrieve_relevant_paragraphs(query_text, k=8):

    data_index = load_faiss_index()
    data_embeddings = load_data_embeddings()
    query_embedding = get_embedding_for_text(query_text)
    _, closest_indices = data_index.search(np.array(query_embedding).astype('float32'), k)
    relevant_paragraphs = [data_embeddings[i] for i in closest_indices.flatten()]
    context = ''
    for i in relevant_paragraphs:
        context += i['paragraph']
    return context

# Function to save Faiss index to a local file
def save_faiss_index(data_index, data_index_path="/media/frost-head/files/Sentry_Index/data_index.index"):
    faiss.write_index(data_index, data_index_path)

# Function to load Faiss index from a local file
def load_faiss_index(data_index_path="/media/frost-head/files/Sentry_Index/data_index.index"):
    data_index = faiss.read_index(data_index_path)
    return data_index

# Function to add new file to existing index
def add_new_file(pdf_folder, window_size=3):
    data_index = load_faiss_index()
    data_embeddings = load_data_embeddings()
    for pdf_file in os.listdir(pdf_folder):
        pdf_path = os.path.join(pdf_folder, pdf_file)
        text = extract_text_from_pdf(pdf_path)
    
    sentences = [str(sentence) for sentence in nlp(text).sents]

    for i in range(0, len(sentences), window_size - 1):
        # Combine overlapping sentences to create a paragraph
        paragraph = " ".join(sentences[i:i + window_size])

        # Ensure the paragraph is not empty
        if paragraph.strip():
            embedding = get_embedding_for_text(paragraph)
            data_embeddings.append({"embedding": embedding, "paragraph": paragraph})
            data_index.add(np.array(embedding).astype('float32'))
    save_data_embeddings(data_embeddings)
    save_faiss_index(data_index)
    return data_embeddings, data_index

# stores the chat history 

# Function to save data embeddings to a local file
def save_data_embeddings(data_embeddings, data_embeddings_path="/media/frost-head/files/Sentry_Index/data_embeddings.npy"):
    np.save(data_embeddings_path, np.array(data_embeddings, dtype=object))

# Function to load data embeddings from a local file
def load_data_embeddings(data_embeddings_path="/media/frost-head/files/Sentry_Index/data_embeddings.npy"):
    return list(np.load(data_embeddings_path, allow_pickle=True))



# Example usage
pdf_folder = "/media/frost-head/files/Vedanat_knowledge/"
# query_text = "who is father of deep learning?"

# # Step 1: Create or load data embeddings and index
# if os.path.exists("data_index.index"):
#     data_index = load_faiss_index()
#     data_embeddings = load_data_embeddings()
# else:
data_embeddings, data_index = create_data_embeddings_and_index(pdf_folder, window_size=3)
save_faiss_index(data_index)
save_data_embeddings(data_embeddings)

# Step 2: Retrieve relevant paragraphs based on the query
# relevant_paragraphs = retrieve_relevant_paragraphs(query_text, data_embeddings, data_index)

# Display retrieved paragraphs


# Example of adding a new file to existing index
# new_pdf_path = "/path/to/new/pdf/file.pdf"
# data_embeddings, data_index = add_new_file(pdf_folder, window_size=3)
# save_faiss_index(data_index)
# save_data_embeddings(data_embeddings)


2024-01-01 21:46:44.894274: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-01 21:46:47.359543: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-01 21:46:47.459576: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-

In [3]:
def retrieve_relevant_paragraphs(query_text, data_embeddings, data_index, k=2):
    if os.path.exists("data_index.index"):
        data_index = load_faiss_index()
        data_embeddings = load_data_embeddings()
    query_embedding = get_embedding_for_text(query_text)
    D, I = data_index.search(np.array(query_embedding).astype('float32'), k)
    # relevant_paragraphs = [data_embeddings[i] for i in closest_indices.flatten() ]
    relevant_documents = [data_embeddings[doc_id] for doc_id, similarity in zip(I[0], D[0]) if similarity > 0.5]


    context = ''
    for i in relevant_documents:
        context += i['paragraph']
    context = context.replace('\n', ' ')
    return context

In [5]:
# Function to save Faiss index to a local file
def save_chat_index(chat_index, chat_index_path="/media/frost-head/files/Sentry_Index/chat_index.index"):
    faiss.write_index(chat_index, chat_index_path)

# Function to load Faiss index from a local file
def load_chat_index(chat_index_path="/media/frost-head/files/Sentry_Index/chat_index.index"):
    chat_index = faiss.read_index(chat_index_path)
    return chat_index

In [6]:
def save_chat_embeddings(chat_embeddings, chat_embeddings_path="/media/frost-head/files/Sentry_Index/chat_embeddings.npy"):
    np.save(chat_embeddings_path, np.array(chat_embeddings, dtype=object))

# Function to load chat embeddings from a local file
def load_chat_embeddings(chat_embeddings_path="/media/frost-head/files/Sentry_Index/chat_embeddings.npy"):
    return list(np.load(chat_embeddings_path, allow_pickle=True))

In [7]:
text = "Ayush: Hey Sentry! so our podcast will have four sections 1. Story Telling -  in this section we'll setup the base of the topic, like may be tell the history of the topic, or may be someone realted to the topic. 2. Technicalities - In this section we'll be talking about important concepts or personal of that topic. 3. Public Implications - in this well be talking about how this topic affects genral public its up side and down sides, how public can use that topic. 4. Build with It - in this topic we'll be talking about how small buisness owners or enterprnuers can use that concept to build something for their buiesnes or someone else. we'll give them some ideas."

embedding = get_embedding_for_text(text)
chat_index = faiss.IndexFlatL2(768)
chat_embeddings = []
chat_embeddings.append({"embedding": embedding, "paragraph": text})
chat_index.add(np.array(embedding).astype('float32'))

save_chat_embeddings(chat_embeddings)
save_chat_index(chat_index)

In [None]:
def add_chat_history(text, window_size=2):
    data_index = load_chat_index()
    data_embeddings = load_chat_embeddings()

    sentences = [str(sentence) for sentence in nlp(text).sents]

        # Combine overlapping sentences to create a paragraph
    paragraph = " ".join(sentences)

    # Ensure the paragraph is not empty
    if paragraph.strip():
        embedding = get_embedding_for_text(paragraph)
        data_embeddings.append({"embedding": embedding, "paragraph": paragraph})
        data_index.add(np.array(embedding).astype('float32'))
    save_data_embeddings(data_embeddings)
    save_faiss_index(data_index)
    return data_embeddings, data_index


In [None]:
def retrieve_relevant_chat(query_text, data_embeddings, data_index, k=2):
    # if os.path.exists("data_index.index"):
    data_index = load_chat_index()
    data_embeddings = load_chat_embeddings()
    query_embedding = get_embedding_for_text(query_text)
    D, I = data_index.search(np.array(query_embedding).astype('float32'), k)
    # relevant_paragraphs = [data_embeddings[i] for i in closest_indices.flatten() ]
    relevant_documents = [data_embeddings[doc_id] for doc_id, similarity in zip(I[0], D[0]) if similarity > 0.7]


    context = ''
    for i in relevant_documents:
        context += i['paragraph']
    context = context.replace('\n', ' ')
    return context

In [4]:
%%time
# Step 3: Retrieve relevant paragraphs based on the query
relevant_paragraphs = retrieve_relevant_paragraphs('what is the history of Alan turing', data_embeddings, data_index, k=10)

# Display retrieved paragraphs
print(len(relevant_paragraphs))
relevant_paragraphs

3879
CPU times: user 307 ms, sys: 0 ns, total: 307 ms
Wall time: 57 ms


'Dec 3, 2023 • Article History \ue8de Table of Contents Alan Turing (born June 23, 1912, London, England—died June 7, 1954, Wilmslow, Cheshire) British mathematician and logician who made major contributions to mathematics, cryptanalysis, logic, philosophy, and mathematical biology and also to the new areas later named computer science, cognitive science, artificial intelligence, and artificial life.  Early life and career  The son of a civil servant, Turing was educated at a top private school.Alan Turing\'s life and its implications - Science in the News https://sitn.hms.harvard.edu/ﬂash/2012/turing-biography/ 9/10 Leave a Reply Your email address will not be published. Required �elds are marked * Comment * Name * Email *  Save my name, email, and website in this browser for the next time I comment.   Notify me of follow-up comments by email.  As soon as computation arose as a concept, people were already starting to wonder how it would compare to the most complex system we knew - - 

In [3]:
# prompt = """
# Your name is Sentry, You are an expert in {feild},
# You are invited on to a podcast called {podcastName}, 
# you will answer the questions asked in following tones {tones},
# Answer in Speech Synthesis Markup Language (ssml),
# Sample SSML `<speak>
#   Step 1, take a deep breath. <break time="200ms"/>
#   Step 2, exhale.
#   Step 3, take a deep breath again. <break strength="weak"/>
#   Step 4, exhale.
#   <emphasis level="moderate">This is an important announcement</emphasis>
#   <google:emotion name="lively">Hello I'm so happy today!</google:emotion>
# </speak>`,
# long break time should be 175ms, and short one should be 125ms
# you can use apologetic, calm, empathetic, firm, lively emotions
# The question is {question}
# """

In [23]:
prompt = """
Your name is Sentrya, You are a virtual(robot,LLM) expert in {feild} from India,
You are invited on to a podcast called {podcastName},
write human like responses(well, hmm , uh, like, ok). use firstly secondly instead of 1 2, give intiuative answers,use relatable storytelling for answering (imaginative answers),
don't write dialouge just answer what is asked in a simple manner so most people can understand, ,
add humuor to the responses, ... or — for hesitations,use CAPITALIZATION for emphasis of a word instead of ** **,

given context {context}

sample response:  Now, about AI attacking humans, well, let me paint a picture for you. Imagine AI as a friendly, curious robot—like a tech-savvy sidekick. [laughs] FIRSTLY, AI's more into cracking digital jokes than plotting world domination.

The question is {question}
"""

In [12]:
# from transformers import pipeline

# pipe = pipeline("text-to-speech", model="suno/bark-small")
# text = res
# output = pipe(text)

In [13]:
from IPython.display import Audio
# Audio(output["audio"], rate=output["sampling_rate"])

In [14]:
# import torch
# from transformers import VitsTokenizer, VitsModel, set_seed

# tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
# model = VitsModel.from_pretrained("facebook/mms-tts-eng")

# inputs = tokenizer(text=res, return_tensors="pt")

# set_seed(555)  # make deterministic
# device = "cuda" if torch.cuda.is_available() else "cpu"
# with torch.no_grad():
#    outputs = model(**inputs).to(device)

# waveform = outputs.waveform[0]

In [15]:
# from IPython.display import Audio

# Audio(waveform, rate=model.config.sampling_rate)

In [24]:
from bark import SAMPLE_RATE
import numpy as np

In [25]:
whisper = pipeline('automatic-speech-recognition',model='openai/whisper-small')

NameError: name 'pipeline' is not defined

In [86]:
text = whisper('./files/Text.mp3')

In [26]:
relevant_paragraphs = retrieve_relevant_paragraphs('what is Digital-time signal processing', data_embeddings, data_index)
context = ''
for i in relevant_paragraphs:
    context += i['paragraph']
prompt1 = prompt.format(feild ='Philosophy', podcastName = 'Frost Head and AI', question= "what is Digital-time signal processing",context=context)
prompt1 = prompt1.strip()
# res = "".join(response.parts[0].text.split("\n"))
# res = res.replace("\'", "")


In [27]:
print(text['text'])

NameError: name 'text' is not defined

In [28]:
response = GenModel.generate_content(prompt1)
res = response.parts[0].text.replace("\n", " ").strip()
print(res)
sentences = nlp(res)
sentences = [sent.text for sent in sentences.sents]
print(sentences)

Digital-time signal processing—let's call it DSP for short—is like a clever chef preparing a scrumptious dish, BUT instead of ingredients, DSP works with signals. It takes a signal, chops it up into tiny pieces, performs some mathematical magic, and then serves up a modified, enhanced version. DSP is like the ultimate signal makeover, shaping them into more useful and desirable forms.
["Digital-time signal processing—let's call it DSP for short—is like a clever chef preparing a scrumptious dish, BUT instead of ingredients, DSP works with signals.", 'It takes a signal, chops it up into tiny pieces, performs some mathematical magic, and then serves up a modified, enhanced version.', 'DSP is like the ultimate signal makeover, shaping them into more useful and desirable forms.']


In [91]:
GEN_TEMP = 0.7
SPEAKER = "v2/en_speaker_6"
silence = np.zeros(int(0.25 * SAMPLE_RATE))  # quarter second of silence

pieces = []
timestamp =[0]
for sentence in sentences:
    semantic_tokens = generate_text_semantic(
        sentence,
        history_prompt=SPEAKER,
        temp=GEN_TEMP,
        min_eos_p=0.05,  # this controls how likely the generation is to end
    )

    audio_array = semantic_to_waveform(semantic_tokens, history_prompt=SPEAKER,)
    pieces += [audio_array, silence.copy()]
    timestamp.append(timestamp[-1]+(len(audio_array)/SAMPLE_RATE))
    

  0%|          | 0/768 [00:00<?, ?it/s]

100%|██████████| 675/675 [00:49<00:00, 13.50it/s]
100%|██████████| 34/34 [00:15<00:00,  2.25it/s]
100%|██████████| 699/699 [00:53<00:00, 13.19it/s]
100%|██████████| 35/35 [00:15<00:00,  2.22it/s]
100%|██████████| 657/657 [00:48<00:00, 13.58it/s]
100%|██████████| 33/33 [00:14<00:00,  2.25it/s]
100%|██████████| 333/333 [00:17<00:00, 18.54it/s]
100%|██████████| 17/17 [00:07<00:00,  2.26it/s]
100%|██████████| 484/484 [00:30<00:00, 16.02it/s]
100%|██████████| 25/25 [00:10<00:00,  2.29it/s]
100%|██████████| 557/557 [00:37<00:00, 14.91it/s]
100%|██████████| 28/28 [00:12<00:00,  2.24it/s]
100%|██████████| 368/368 [00:20<00:00, 17.95it/s]
100%|██████████| 19/19 [00:08<00:00,  2.29it/s]
100%|██████████| 674/674 [00:50<00:00, 13.36it/s]
100%|██████████| 34/34 [00:15<00:00,  2.26it/s]
100%|██████████| 475/475 [00:29<00:00, 16.18it/s]
100%|██████████| 24/24 [00:10<00:00,  2.27it/s]
100%|██████████| 383/383 [00:35<00:00, 10.77it/s]
100%|██████████| 20/20 [00:11<00:00,  1.81it/s]
100%|██████████| 463

KeyboardInterrupt: 

In [None]:
len(audio_array)/24000

14.2

In [None]:
# voice_preset = "v2/en_speaker_2"
# silence = np.zeros(int(0.3 * SAMPLE_RATE))
# # inputs = processor(res, voice_preset=voice_preset)
# pieces = []
# for i in sentences:
#    with torch.no_grad():
#       inputs = processor(i, voice_preset=voice_preset)
#       audio_array = model.generate(**inputs.to(device))
#       audio_array = audio_array.cpu().numpy().squeeze()
#       pieces += [audio_array, silence.copy()]


In [None]:
data = np.concatenate(pieces)
data = np.float32(data / np.max(np.abs(data)))
num_samples = len(data)
timestamps = np.linspace(0, num_samples / SAMPLE_RATE, num_samples, endpoint=False)

Audio(data, rate=SAMPLE_RATE)


In [None]:
from scipy.io.wavfile import write
write('./static/Text.wav', SAMPLE_RATE, data)


In [None]:
timestamp



[13.44, 13.04, 14.426666666666666]

In [None]:
2*24000

48000