<a href="https://colab.research.google.com/github/PyAshishMhatre/DigitalMarketing-Algorithms-Project/blob/main/QA_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# YouTube video transcription

In [None]:
# Installing Libraries
!pip install pytube -q
!pip install git+https://github.com/openai/whisper.git -

In [4]:
# Importing modules for transcriptions

from pytube import YouTube
import whisper
from tqdm import tqdm

In [62]:
# Load the Whisper 'tiny' model once; the same instance is passed to every transcription call
model = whisper.load_model('tiny')

# Function to download video and get transcriptions
def get_transcriptions(url, model, output_dir=".", filename=None):
    """Download a video's audio track, transcribe it and save the text.

    Each video gets its own transcript file (named after the YouTube video id
    unless ``filename`` is given), so transcribing several URLs in a loop does
    not overwrite a single shared ``test.txt``.

    Args:
        url: URL of the YouTube video.
        model: Object exposing ``transcribe(path)`` that returns a mapping with
            a ``"text"`` entry (e.g. a loaded Whisper model).
        output_dir: Directory the transcript is written to; created if missing.
        filename: Transcript file name. Defaults to ``"<video id>.txt"``.

    Returns:
        Path of the transcript file that was written.

    Raises:
        ValueError: If the video has no audio-only stream.
    """
    import os
    import tempfile

    yt_video = YouTube(url)
    audio_stream = yt_video.streams.filter(only_audio=True).first()
    if audio_stream is None:
        raise ValueError(f"No audio-only stream available for {url}")

    if filename is None:
        # The video id is unique per video and safe to use as a file name.
        filename = f"{yt_video.video_id}.txt"

    os.makedirs(output_dir, exist_ok=True)
    transcript_path = os.path.join(output_dir, filename)

    # The audio is only needed while transcribing, so keep it in a temporary
    # directory instead of reusing (and leaving behind) a shared "test.mp4".
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = audio_stream.download(output_path=tmp_dir, filename="audio.mp4")
        output = model.transcribe(audio_path)

    # Explicit UTF-8 so non-ASCII transcript text is written regardless of locale.
    with open(transcript_path, "w", encoding="utf-8") as file:
        file.write(output["text"])
    return transcript_path

In [66]:
# YouTube videos to transcribe
video_urls = [
    "https://www.youtube.com/watch?v=blbvVUxD41Q&ab_channel=Locust%26WildHoney",
    "https://www.youtube.com/watch?v=9hktZEc3Vhs&ab_channel=STYLEDBYNATE",
    "https://www.youtube.com/watch?v=C9nVeYwS_8E&ab_channel=Men%27sFashionFiles",
    "https://www.youtube.com/watch?v=7fJcrPjAa1I&ab_channel=UniqloReviews",
    "https://www.youtube.com/watch?v=YtjHtPySBAA&ab_channel=HarryHas",
]

# Transcribe the videos one after another, with a progress bar
for url in tqdm(video_urls):
    get_transcriptions(url, model)

100%|██████████| 1/1 [01:19<00:00, 79.16s/it]


# Developing a QA system using retrieval augmentation and prompt engineering with LangChain and ChromaDB (vector database)

In [None]:
!pip install langchain
!pip install openai
!pip install chromadb

In [216]:
# Import langchain modules 

from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.document_loaders import TextLoader
from typing import List
from langchain.chains import RetrievalQA
from langchain.schema import Document
import os
from langchain.chains.question_answering import load_qa_chain

In [217]:
# Setup OpenAI
# Keep a key that is already configured and prompt for one only when it is
# missing, so the secret is never stored in the notebook and a valid key is
# never replaced by an empty string.
from getpass import getpass

if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass("OpenAI API key: ")

In [218]:
# Collect every transcript text file found (recursively) under ./Document
loader = DirectoryLoader(
    './Document',
    glob="**/*.txt",
    loader_cls=TextLoader,
    show_progress=True,
)

In [219]:
# Load the matched transcript files using load()
docs = loader.load()

100%|██████████| 9/9 [00:00<00:00, 5367.37it/s]


In [220]:
# Check how many documents were loaded
len(docs)

9

In [231]:
# Split the text into small chunks for more efficient context retrieval
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=100,
)
texts = text_splitter.split_documents(docs)

In [232]:
# Check the number of chunks produced by the text splitter

len(texts)

771

In [233]:
# Embed every chunk with OpenAI embeddings and index the vectors in Chroma
embeddings = OpenAIEmbeddings()
vectordb = Chroma.from_documents(documents=texts, embedding=embeddings)



In [243]:
# The API key is taken from the OPENAI_API_KEY environment variable (the same
# source OpenAIEmbeddings() relies on) rather than being hardcoded here.
# NOTE: a key that has been committed to a notebook must be treated as leaked
# and revoked.
llm = OpenAI(temperature=0.3)

In [235]:
# Question used for retrieval and passed to the prompt
query = "What is the quality of tshirt?"

In [236]:
# Retrieve the 8 chunks most similar to the query
docsearch = vectordb.similarity_search(query=query, k=8)

In [237]:
# Display the retrieved chunks
docsearch

[Document(page_content="t-shirts have been very, very durable in the wash so that's definitely something. A worth mentioning", metadata={'source': 'Document/test.txt'}),
 Document(page_content="improved on the T-shirt, but it wasn't back to the standards that it used to be before they changed", metadata={'source': 'Document/Is UNIQLO Still The Best For Affordable Basics?.txt'}),
 Document(page_content="improved on the T-shirt, but it wasn't back to the standards that it used to be before they changed", metadata={'source': 'Document/test4.txt'}),
 Document(page_content="they're any good, just want to mention some points about these shirts and my experience with them.", metadata={'source': 'Document/Are Uniqlo shirts any good? Owner’s review.txt'}),
 Document(page_content='still sell the best quality affordable basics, or are you better off spending your money elsewhere?', metadata={'source': 'Document/Is UNIQLO Still The Best For Affordable Basics?.txt'}),
 Document(page_content='still 

In [251]:
from langchain.prompts import PromptTemplate

# Prompt text with placeholders for the retrieved context and the question
prompt_template = """ Prompt: Use the following pieces of context to answer the question at the end and Answer as if your a salesman of the company

{context}

Question: {question}

Answer:"""

PROMPT = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

In [244]:
from langchain.chains.llm import LLMChain

# Chain that formats PROMPT and sends it to the LLM
chain = LLMChain(prompt=PROMPT, llm=llm)

In [252]:
# Chain used to answer the question: fills PROMPT and queries the LLM
question_chain = LLMChain(
    prompt=PROMPT,
    llm=llm,
)

In [256]:
# Pass only the text of the retrieved chunks as context; formatting the
# Document objects themselves would put their Python repr (including metadata)
# into the prompt.
context_text = "\n\n".join(doc.page_content for doc in docsearch)
question_chain.run({'context': context_text, 'question': query})

' Our t-shirts are very durable in the wash and have improved from their previous standards. They are very soft and the material quality is very good.'