<a href="https://colab.research.google.com/github/PyAshishMhatre/DigitalMarketing-Algorithms-Project/blob/main/QA_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# YouTube video transcription

In [None]:
# Installing Libraries
!pip install pytube -q
!pip install git+https://github.com/openai/whisper.git -

In [4]:
# Importing modules for transcriptions

from pytube import YouTube
import whisper
from tqdm import tqdm

In [62]:
model = whisper.load_model('tiny')

# Function to download video and get transcriptions
def get_transcriptions(url, model, audio_filename="test.mp4", transcript_filename=None):
    """Download the audio of a YouTube video and save its transcript to a text file.

    Args:
        url: URL of the YouTube video.
        model: Loaded whisper model; its ``transcribe`` method is called on the
            downloaded audio file.
        audio_filename: Name of the downloaded audio file. It is a scratch file
            and is overwritten on every call.
        transcript_filename: Name of the text file to write. When omitted it
            defaults to ``"<video id>.txt"`` so that transcribing several
            videos in a loop no longer overwrites the previous transcript.

    Returns:
        The name of the transcript file that was written.

    Raises:
        ValueError: If the video has no audio-only stream.
    """
    yt_video = YouTube(url)
    stream = yt_video.streams.filter(only_audio=True).first()
    if stream is None:
        # first() yields None when the filter matched nothing; fail with a
        # clear message instead of an AttributeError on the next line.
        raise ValueError(f"No audio-only stream found for {url}")
    stream.download(filename=audio_filename)
    output = model.transcribe(audio_filename)

    if transcript_filename is None:
        # One file per video: the video id is unique and filesystem-safe.
        transcript_filename = f"{yt_video.video_id}.txt"

    # Explicit UTF-8 so non-ASCII characters in the transcript are written
    # correctly regardless of the platform's default encoding.
    with open(transcript_filename, "w", encoding="utf-8") as file:
        file.write(output["text"])
    return transcript_filename

In [66]:
# YouTube videos to transcribe
video_urls = [
    "https://www.youtube.com/watch?v=blbvVUxD41Q&ab_channel=Locust%26WildHoney",
    "https://www.youtube.com/watch?v=9hktZEc3Vhs&ab_channel=STYLEDBYNATE",
    "https://www.youtube.com/watch?v=C9nVeYwS_8E&ab_channel=Men%27sFashionFiles",
    "https://www.youtube.com/watch?v=7fJcrPjAa1I&ab_channel=UniqloReviews",
    "https://www.youtube.com/watch?v=YtjHtPySBAA&ab_channel=HarryHas",
]

# Transcribe each video in turn, with a progress bar over the list
for video_url in tqdm(video_urls):
    get_transcriptions(video_url, model)

100%|██████████| 1/1 [01:19<00:00, 79.16s/it]


# Developing a QA system using retrieval augmentation and prompt engineering with LangChain and ChromaDB (vector database)

In [12]:
# Install the packages for the QA section. langchain is imported directly in
# the cells below; openai, chromadb and tiktoken are presumably required by the
# langchain components used there — verify.
# NOTE(review): versions are unpinned; the recorded output of this cell shows
# tiktoken 0.3.3 being installed — consider pinning for reproducibility.
!pip install langchain
!pip install openai
!pip install chromadb
!pip install tiktoken

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tiktoken
  Downloading tiktoken-0.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.3.3


In [3]:
# Import langchain modules 

from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.document_loaders import TextLoader
from typing import List
from langchain.chains import RetrievalQA
from langchain.schema import Document
import os
from langchain.chains.question_answering import load_qa_chain

In [4]:
# Setup OpenAI
# SECURITY: the API key must never be hardcoded in the notebook, because it
# ends up in version control. Any key that was committed here has to be
# treated as leaked and revoked.
# The key is taken from the environment if already set; otherwise it is
# prompted for without echoing it to the cell output.
from getpass import getpass

if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass("Enter your OpenAI API key: ")

In [6]:
# Set up a loader for every .txt transcript found (recursively) under ./Documents

loader = DirectoryLoader(
    './Documents',
    glob="**/*.txt",
    loader_cls=TextLoader,
    show_progress=True,
)

In [7]:
# Read the matched files into memory using the loader's load() method
docs = loader.load()

100%|██████████| 9/9 [00:00<00:00, 4467.31it/s]


In [8]:
# Sanity check: number of documents returned by the loader
len(docs)

9

In [9]:
# Split the documents into small, non-overlapping chunks for more efficient context retrieval

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=100,
)
texts = text_splitter.split_documents(docs)

In [10]:
# Sanity check: number of chunks produced by the text splitter

len(texts)

771

In [13]:
# Embed the chunks with OpenAI embeddings and index them in a Chroma vector store

embeddings = OpenAIEmbeddings()
vectordb = Chroma.from_documents(documents=texts, embedding=embeddings)



In [14]:
# LLM used by the chains. The API key is read from the OPENAI_API_KEY
# environment variable instead of being hardcoded: a key committed to the
# notebook is leaked and must be revoked.
llm = OpenAI(temperature=0.3, openai_api_key=os.environ['OPENAI_API_KEY'])

In [15]:
# Question used both for the similarity search and as the chain's "question" input
query = "What is the quality of tshirt?"

In [16]:
# Retrieve the 8 chunks from the vector store that are most similar to the query
docsearch = vectordb.similarity_search(query, k=8)

In [17]:
# Display the retrieved chunks for inspection
docsearch

[Document(page_content="t-shirts have been very, very durable in the wash so that's definitely something. A worth mentioning", metadata={'source': 'Documents/test1.txt'}),
 Document(page_content="improved on the T-shirt, but it wasn't back to the standards that it used to be before they changed", metadata={'source': 'Documents/test4.txt'}),
 Document(page_content="improved on the T-shirt, but it wasn't back to the standards that it used to be before they changed", metadata={'source': 'Documents/Is UNIQLO Still The Best For Affordable Basics_.txt'}),
 Document(page_content="they're any good, just want to mention some points about these shirts and my experience with them.", metadata={'source': 'Documents/Are Uniqlo shirts any good_ Owner’s review.txt'}),
 Document(page_content='still sell the best quality affordable basics, or are you better off spending your money elsewhere?', metadata={'source': 'Documents/test4.txt'}),
 Document(page_content='still sell the best quality affordable bas

In [18]:
from langchain.prompts import PromptTemplate

# First-stage prompt: answer the question from the supplied context, in a salesman's voice.
prompt_template = """ Prompt: Use the following pieces of context to answer the question at the end and Answer as if your a salesman of the company

{context}

Question: {question}

Answer:"""
PROMPT = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

In [19]:
# Second-stage prompt: rewrite a statement in the voice of a customer care representative.
template = """Here is a statement:
        {statement}
        Make the statement sound like it is coming from a customer care representative at ABC clothing company.\n\n"""
prompt_template = PromptTemplate(
    template=template,
    input_variables=["statement"],
)

In [20]:
from langchain.chains.llm import LLMChain

# First link: runs the QA prompt and stores the model's answer under 'statement'.
question_chain = LLMChain(
    prompt=PROMPT,
    llm=llm,
    output_key='statement',
)


In [24]:
from langchain.chains import SequentialChain

# Second link: rewrites 'statement' with the customer-care prompt and stores it under 'response'.
assumptions_chain = LLMChain(
    prompt=prompt_template,
    llm=llm,
    output_key='response',
)

# Run both links in order; both the intermediate answer and the rewrite are returned.
overall_chain = SequentialChain(
    input_variables=['context', "question"],
    chains=[question_chain, assumptions_chain],
    output_variables=["statement", "response"],
    verbose=False,
)

In [26]:
# Pass the chain plain text rather than the list of Document objects: a list
# would be interpolated into the prompt as its repr, including the
# "Document(page_content=..., metadata=...)" wrapper and file paths.
# Identical chunks retrieved from duplicate files are collapsed, keeping
# first-seen order, so the same sentence is not fed to the model twice.
context_text = "\n\n".join(dict.fromkeys(doc.page_content for doc in docsearch))
overall_chain({'context': context_text, 'question': query})['response']

'At ABC Clothing Company, we take pride in the quality of our t-shirts. Our customers have praised their durability in the wash and their softness. We are continuously striving to improve the quality of our t-shirts, and we are confident that they will soon be back to the standards they used to be. We are proud of the material quality and the fit of our t-shirts, especially for the price.'