In [40]:
!pip -q install langchain openai tiktoken PyPDF2 faiss-cpu
!pip -q install python-dotenv

In [41]:
import os
from dotenv import load_dotenv

# Load the variables declared in the .env file into the process environment
load_dotenv()

# The API key can now be read from the environment (empty string when it is not set)
api_key = os.getenv("OPENAI_API_KEY", "")

In [3]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS 

### Reading the PDF

In [19]:
# Open the source PDF (path is relative to the notebook's working directory)
doc_reader = PdfReader('./pdf/Victor_Hugo.pdf')

In [20]:
# Read the text of every page and concatenate it into a single string, raw_text.
# Pages for which extract_text() returns nothing (None or '') are skipped.
page_texts = []
for page in doc_reader.pages:
    text = page.extract_text()
    if text:
        page_texts.append(text)
# Join once at the end instead of growing the string page by page
raw_text = ''.join(page_texts)

In [21]:
# Total number of characters extracted from the PDF
len(raw_text)

74300

In [22]:
# Preview the first 100 characters as a sanity check on the extraction
raw_text[:100]

'Victor Hugo\nPortrait by Étienne Carjat, 1876\nBorn Victor-Marie Hugo\n26 February 1802\nBesançon, Franc'

### Text splitter

This takes the text and splits it into chunks. The chunk size is measured in characters, not tokens.

In [23]:
# Break raw_text into smaller, overlapping chunks so each one can be indexed separately
text_splitter = CharacterTextSplitter(
    separator="\n",       # split on line breaks
    chunk_size=1000,      # measured with length_function, i.e. in characters
    chunk_overlap=200,    # overlap kept between neighbouring chunks
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

In [24]:
# Number of chunks produced by the splitter
len(texts)

93

In [25]:
# Inspect one chunk (index 25) to see what the splitter produced
texts[25]

'the Third Republic was proclaimed that Hugo finally returned to his homeland in 1870, where he was\npromptly elected to the National Assembly and the Senate.Communards defending a barricade\non the Rue de RivoliHe was in Paris during the siege by the Prussian Army in 1870 , famously eating animals given to him by\nthe Paris Zoo. As the siege continued, and food became ever more scarce, he wrote in his diary that he was\nreduced to "eating the unknow n".[38]\nDuring the Paris Commune—the revolutionary gove rnment that\ntook power on 18 March 1871 and was toppled on 28 May—V ictor\nHugo was harshly critical of the atrocities committed on both sides.\nOn 9 April, he wrote in his diary, "In short, this Commune is as\nidiotic as the National Assembly is ferocious. From both sides,\nfolly."[39] Yet he made a point of offering his support to members of\nthe Commune subjected to brutal repression. He had been in\nBrussels since 22 March 1871 when in the 27 May issue of the'

### Making the embeddings


In [26]:
# Create the OpenAI embeddings object used to vectorise the chunks.
# No API key is passed here — presumably it is picked up from the OPENAI_API_KEY
# environment variable loaded from .env earlier; TODO confirm.
embeddings = OpenAIEmbeddings()

In [27]:
# Build the FAISS vector store from the text chunks, using the embeddings object above
docsearch = FAISS.from_texts(texts, embeddings)

In [28]:
# Show only the embedding model name. The full repr of the embeddings object
# includes openai_api_key in clear text, so it must never be left in a saved output.
docsearch.embedding_function.model

'text-embedding-ada-002'

In [30]:
# Retrieve the chunks of the index that are most similar to the question
query = "where is born victor hugo ?"
docs = docsearch.similarity_search(query)

In [31]:
# Number of chunks returned by the similarity search
len(docs)

4

In [32]:
# First document in the search results
docs[0]

Document(page_content='Victor Hugo\nPortrait by Étienne Carjat, 1876\nBorn Victor-Marie Hugo\n26 February 1802\nBesançon, Franche-\nComté, France\nDied 22 May 1885\n(aged 83)\nParis, France\nResting\nplacePanthéon, Paris\nOccupationsPoet,novelist,\ndramatist,essayist,\npolitician\nPolitical\npartyParty of Order\n(1848–1851)\nIndependent liberal\n(1871)\nRepublican Union\n(1876–1885)\nSpouse Adèle Foucher\n(m. 1822; died 1868)\nChildren 5Victor Hugo\nVictor-Marie Hugo (French: [vikt ɔ ʁ ma ʁi y ɡo] ⓘ; 26\nFebruary 1802 – 22 May 1885) , sometimes nicknamed the Ocean\nMan, was a French Romantic writer and politician. During a\nliterary career that spanned more than sixty years, he wrote in a\nvariety of genres and forms.\nHis most famous works are the nove ls The Hunchbac k of Notre-\nDame (1831) and Les Misérables (1862). In France, Hugo is\nrenowned for his poetry collections, such as Les Contemplations\n(The Contemplations) and La Légende des siècles (The Legend of\nthe Ages). Hugo was

## Plain QA Chain

In [33]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [39]:
# "stuff" chain type: all the retrieved documents are stuffed into the prompt at once
llm = OpenAI()
chain = load_qa_chain(llm, chain_type="stuff")

In [35]:
# Display the prompt template used by the chain's LLM step;
# {context} and {question} are its placeholders (see the output)
chain.llm_chain.prompt.template

"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"

In [42]:
# Retrieve the chunks most similar to the question, then run the QA chain on
# those chunks together with the question; the cell displays the chain's answer
query = "say me a major moment of the life of victor hugo"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

" One major moment in Victor Hugo's life was his death on May 22, 1885. He was a renowned writer and political figure, and his death was widely mourned and commemorated in France. The city of Paris even changed the name of a street in his honor."