# 3.1 Installation

In [None]:
# Pip installation LangChain and Hugginface API
!pip install langchain
!pip install huggingface_hub

# Pip installation of additional needed libraries
!pip install sentence_transformers
!pip install faiss-cpu
!pip install unstructured

# To download the transcript of a youtube video
!pip install youtube_transcript_api

# 3.2 Environment Setup

In [1]:
import os
import requests
# Keep a token that is already exported in the environment; only fall back to
# the placeholder (which must be replaced with a real token) when none is set.
# Prefer exporting the token outside the notebook over pasting it here.
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "YOUR API TOKEN")

# 4.1 Loading of documents as a learning basis

In [2]:
import requests
from langchain.document_loaders import TextLoader

## 4.1.1 TextLoader from a local file

In [3]:
def loadTXTFileFromLocal(local_file_name="local_text_file.txt"):
    """Load a UTF-8 text file from the current working directory.

    Args:
        local_file_name: Name of the file, resolved relative to ``./``.

    Returns:
        Whatever ``TextLoader(...).load()`` returns for that file.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    file_path = './' + local_file_name

    # Fail early with a clear error. The file is deliberately NOT read and
    # written back: that round trip modified the source file (timestamp,
    # line endings) without changing what gets loaded.
    if not os.path.isfile(file_path):
        raise FileNotFoundError(file_path)

    # Decode explicitly as UTF-8 instead of relying on the platform default.
    loader = TextLoader(file_path, encoding='utf-8')
    return loader.load()

## 4.1.2 TextLoader from URL (GitHub)

In [4]:
def loadTXTFileFromURL(text_file_url="https://raw.githubusercontent.com/vashAI/AnsweringQuestionsWithHuggingFaceAndLLM/main/url_text_file.txt", timeout=30):
    """Download a text file and load it with TextLoader.

    The response body is saved to ``url_text_file.txt`` in the current
    working directory and then loaded from there.

    Args:
        text_file_url: URL of the text file to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Whatever ``TextLoader(...).load()`` returns for the saved file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    output_file_name = "url_text_file.txt"

    # Without a timeout the request can hang indefinitely.
    response = requests.get(text_file_url, timeout=timeout)
    # Stop here on 4xx/5xx so an error page is never saved and indexed.
    response.raise_for_status()

    with open(output_file_name, "w", encoding='utf-8') as file:
        file.write(response.text)

    # Read back with the same encoding the file was written in.
    loader = TextLoader('./' + output_file_name, encoding='utf-8')
    return loader.load()

## 4.1.3 PDFLoader

In [5]:
from langchain.document_loaders import UnstructuredPDFLoader 

In [6]:
def loadPDFFromLocal(pdf_file_path="./Eurovision_Song_Contest_2023.pdf"):
    """Load a PDF from ``pdf_file_path`` via UnstructuredPDFLoader.

    Returns whatever the loader's ``load()`` call produces.
    """
    return UnstructuredPDFLoader(pdf_file_path).load()

## 4.1.4 WebsiteLoader

In [7]:
from langchain.document_loaders import UnstructuredURLLoader

In [8]:
def loadTextFromWebsite(url="https://saturncloud.io/blog/breaking-the-data-barrier-how-zero-shot-one-shot-and-few-shot-learning-are-transforming-machine-learning/"):
    """Load the content of a single web page via UnstructuredURLLoader.

    The URL is handed to the loader as a one-element list; the result of the
    loader's ``load()`` call is returned unchanged.
    """
    return UnstructuredURLLoader(urls=[url]).load()

## 4.1.5 VideoLoader

In [9]:
from youtube_transcript_api import YouTubeTranscriptApi

In [10]:
def loadTextFromYoutubeVideo(youtube_video_id="eg9qDjws_bU"):
    """Fetch a YouTube transcript and load it with TextLoader.

    The transcript entries are joined with single spaces, saved to
    ``youtube_transcript.txt`` in the current working directory, and loaded
    from there.

    Args:
        youtube_video_id: ID of the video whose transcript is fetched.

    Returns:
        Whatever ``TextLoader(...).load()`` returns for the saved transcript.
    """
    # Newer youtube_transcript_api releases replace the static
    # ``get_transcript`` with an instance-level ``fetch``; support both so an
    # unpinned install keeps working.
    if hasattr(YouTubeTranscriptApi, "get_transcript"):
        transcript = YouTubeTranscriptApi.get_transcript(youtube_video_id)
    else:
        transcript = YouTubeTranscriptApi().fetch(youtube_video_id).to_raw_data()

    # Build the text in one pass instead of repeated string concatenation.
    transcript_text = " ".join(entry['text'] for entry in transcript)

    youtube_local_txt_file = "youtube_transcript.txt"
    with open('./' + youtube_local_txt_file, "w", encoding='utf-8') as file:
        file.write(transcript_text)

    # Read back with the same encoding the file was written in.
    loader = TextLoader('./' + youtube_local_txt_file, encoding='utf-8')
    return loader.load()

# 4.2 Split the documents into chunks (important because LLMs cannot accept overly long inputs)

In [11]:
from langchain.text_splitter import CharacterTextSplitter

In [12]:
def splitDocument(loaded_docs, chunk_size=1000, chunk_overlap=10):
    """Split loaded documents into chunks with CharacterTextSplitter.

    Args:
        loaded_docs: Documents as returned by one of the loader functions.
        chunk_size: Target chunk size passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        The chunked documents produced by ``split_documents``.
    """
    # NOTE(review): CharacterTextSplitter splits on its separator only, so text
    # without that separator may yield chunks larger than chunk_size - confirm
    # this is acceptable for single-line inputs such as transcripts.
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(loaded_docs)

# 4.3 Convert the documents into embeddings and store them

In [13]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [14]:
def createEmbeddings(chunked_docs):
    """Embed the chunks and index them in a FAISS vector store.

    A default-configured HuggingFaceEmbeddings instance is used as the
    embedder; the resulting vector store is returned.
    """
    return FAISS.from_documents(chunked_docs, HuggingFaceEmbeddings())

# 4.4 Use those embeddings to feed the LLM and answer questions

In [15]:
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub

In [16]:
def loadLLMModel(repo_id="declare-lab/flan-alpaca-large", temperature=0, max_length=512, chain_type="stuff"):
    """Create a question-answering chain backed by a Hugging Face Hub model.

    Args:
        repo_id: Hub repository of the model to use.
        temperature: Sampling temperature forwarded in ``model_kwargs``.
        max_length: Maximum generation length forwarded in ``model_kwargs``.
        chain_type: Chain type handed to ``load_qa_chain``.

    Returns:
        The chain returned by ``load_qa_chain``.
    """
    llm = HuggingFaceHub(
        repo_id=repo_id,
        model_kwargs={"temperature": temperature, "max_length": max_length},
    )
    return load_qa_chain(llm, chain_type=chain_type)

def askQuestions(vector_store, chain, question):
    """Answer ``question`` from the documents most similar to it.

    The vector store is searched with the question, and the hits are passed
    to the chain together with the question. The chain's response is returned.
    """
    context_docs = vector_store.similarity_search(question)
    return chain.run(input_documents=context_docs, question=question)

In [17]:
# Build the QA chain once; the same `chain` object is passed to every askQuestions call below.
chain = loadLLMModel()

## 4.4.1 Test with local file

In [18]:
# Pipeline for the local text file: load -> split into chunks -> embed into a vector store.
LOCAL_loaded_docs = loadTXTFileFromLocal()
LOCAL_chunked_docs = splitDocument(LOCAL_loaded_docs)
LOCAL_vector_store = createEmbeddings(LOCAL_chunked_docs)

In [19]:
# Ask a question against the local-file vector store and print the chain's answer.
LOCAL_response = askQuestions(LOCAL_vector_store, chain, "Explain me how ChatGPT and Plugin are empowering Citizen Data Scientists?")
print(LOCAL_response)

ChatGPT and plugins are helping Citizen Data Scientists by providing them with the tools they need to analyze and interpret data. By enabling them to use natural language, they are able to ask questions and get answers in plain English, without knowing complex programming languages or statistical techniques. Additionally, ChatGPT is a personal expert who is always available to help them turn their idea into reality.


## 4.4.2 Test with file from URL

In [20]:
# Pipeline for the downloaded text file: load -> split into chunks -> embed into a vector store.
URL_loaded_docs = loadTXTFileFromURL()
URL_chunked_docs = splitDocument(URL_loaded_docs)
URL_vector_store = createEmbeddings(URL_chunked_docs)

In [21]:
# Ask a question against the URL-file vector store and print the chain's answer.
URL_response = askQuestions(URL_vector_store, chain, "What are 5 examples of chatgpt and plugin applications?")
print(URL_response)

1. Data visualization and analysis using ChatGPT and Plugins 2. Content creation and summarization 3. Personalized learning and skill development 4. Collaboration and knowledge sharing


## 4.4.3 Test with PDF from local path

In [22]:
# Pipeline for the local PDF: load -> split into chunks -> embed into a vector store.
PDF_loaded_docs = loadPDFFromLocal()
PDF_chunked_docs = splitDocument(PDF_loaded_docs)
PDF_vector_store = createEmbeddings(PDF_chunked_docs)

In [23]:
# Ask a question against the PDF vector store and print the chain's answer.
PDF_response = askQuestions(PDF_vector_store, chain, "Who is the Winner of 2023 Eurovision Songcontest?")
print(PDF_response)

Sweden is the winner of 2023 Eurovision Songcontest.


## 4.4.4 Test with WEBSITE

In [24]:
# Pipeline for the web page: load -> split into chunks -> embed into a vector store.
WEBSITE_loaded_docs = loadTextFromWebsite()
WEBSITE_chunked_docs = splitDocument(WEBSITE_loaded_docs)
WEBSITE_vector_store = createEmbeddings(WEBSITE_chunked_docs)

In [25]:
# Ask a question against the website vector store and print the chain's answer.
WEBSITE_response = askQuestions(WEBSITE_vector_store, chain, "What is Zero-shot learning?")
print(WEBSITE_response)

Zero-Shot Learning is the concept of training a model to classify objects it has never seen before. The core idea is to exploit the existing knowledge of another model to obtain meaningful representations of new classes.


## 4.4.5 Test with text from video

In [26]:
# Pipeline for the YouTube transcript: load -> split into chunks -> embed into a vector store.
VIDEO_loaded_docs = loadTextFromYoutubeVideo()
VIDEO_chunked_docs = splitDocument(VIDEO_loaded_docs)
VIDEO_vector_store = createEmbeddings(VIDEO_chunked_docs)

In [27]:
# Ask a question against the transcript vector store and print the chain's answer.
VIDEO_response = askQuestions(VIDEO_vector_store, chain, "What was Elon explaining in the video?")
print(VIDEO_response)

Elon was explaining that AI is much more dangerous than nuclear warheads and that it could be used to make incredibly effective propaganda.
