In [None]:
pip install sentence-transformers

In [None]:
pip install InstructorEmbedding

In [None]:
pip install torch

In [None]:
pip install text_generation

In [None]:
pip install huggingface_hub

In [None]:
from langchain_core.messages import AIMessage,HumanMessage
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from InstructorEmbedding import INSTRUCTOR
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceEndpoint
from dotenv import load_dotenv
load_dotenv()

In [None]:
import os

# Folder that holds the PDF files to index. It can be overridden with the PDF_DIR
# environment variable (for instance from the .env file read by load_dotenv());
# when PDF_DIR is not set, the original hard-coded location is used.
pdf_dir = os.environ.get("PDF_DIR", r'C:\Users\sylv_\Desktop\Test_AI_Chat')

# Load every PDF matched by the glob pattern in that folder with PyPDFLoader.
loader = DirectoryLoader(pdf_dir, glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [None]:
# Spot-check one loaded document. NOTE(review): index 150 is hard-coded, so this
# presumably fails with IndexError when fewer than 151 documents were loaded.
print (documents[150])

In [None]:
# Split the loaded documents into chunks, using the splitter's default settings.
text_splitter= RecursiveCharacterTextSplitter()
chunks_docs= text_splitter.split_documents(documents)

In [None]:
# Compare how many documents were loaded with how many chunks the splitter produced.
print(len(documents))
len(chunks_docs)

In [None]:
# Index the chunks in a Chroma vector store, embedding them with HuggingFaceEmbeddings
# built with its default arguments (no model name is given here).
vec_store= Chroma.from_documents(chunks_docs, HuggingFaceEmbeddings())

In [None]:
retriever = vec_store.as_retriever()

# Create the chain that answers questions from the retrieved chunks.
# HuggingFaceEndpoint has to be told which model to call: built without arguments it
# has no endpoint to send requests to. The endpoint below targets the same
# Mistral-7B-Instruct model that the load_qa_chain example in this notebook uses;
# change the URL to use another model.
qa_chain_instrucEmb = RetrievalQA.from_chain_type(
    llm=HuggingFaceEndpoint(
        endpoint_url="https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2",
        task="text-generation",
    ),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [None]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its existing line breaks.

    The text is cut at every newline, each piece is wrapped on its own with
    ``textwrap.fill``, and the wrapped pieces are joined back with newlines.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )
        
def process_llm_response(llm_response):
    """Print a chain response: the wrapped answer, then one source per line.

    Reads ``llm_response['result']`` for the answer text and
    ``llm_response["source_documents"]`` for the documents; for each document
    its ``metadata['source']`` entry is printed.
    """
    answer = llm_response['result']
    print(wrap_text_preserve_newlines(answer))

    print('\nSources:')
    source_docs = llm_response["source_documents"]
    for doc in source_docs:
        print(doc.metadata['source'])

In [None]:
# Question used to probe the vector store. The stray leading apostrophe that was
# inside the string has been removed so it is no longer part of the text that is searched for.
query = "who are the authors of Distinct differences in rates of oxygen consumption and ATP synthesis?"
docs = vec_store.similarity_search(query)

In [None]:
# Show the text of the first chunk returned by the similarity search, wrapped for readability.
print(wrap_text_preserve_newlines(str(docs[0].page_content)))

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub

In [None]:
# Build a "stuff" question-answering chain on top of the Mistral-7B-Instruct model
# (temperature 0.4) served through HuggingFaceHub.
llm=HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.2", model_kwargs={"temperature":0.4})
chain = load_qa_chain(llm, chain_type="stuff")

In [None]:
# Retrieve the chunks most similar to the question and let the QA chain answer from them.
query = "How to cite the paper called Distinct differences in rates of oxygen consumption and ATP synthesis?" 
docs = vec_store.similarity_search(query)
chain.run(input_documents=docs, question=query)

In [None]:
from langchain_community.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from dotenv import load_dotenv

load_dotenv()

def generate_pet_name(animal_type, couleur):
    """Ask the Mixtral-8x7B-Instruct model for five pet-name suggestions.

    Args:
        animal_type: Text inserted for ``{animal_type}`` in the French prompt.
        couleur: Text inserted for ``{couleur}`` in the French prompt.

    Returns:
        Whatever ``LLMChain.invoke`` returns for the filled-in prompt; the chain
        is configured with ``output_key="pet_name"``.
    """
    name_prompt = PromptTemplate(
        input_variables=['animal_type', 'couleur'],
        template=" J'ai {animal_type} qui est de couleur {couleur} et je veux lui donner un nom cool. Donne moi une liste de 5 noms cools pour mon animal.",
    )
    mixtral = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs={"temperature": 0.5},
    )
    chain = LLMChain(llm=mixtral, prompt=name_prompt, output_key="pet_name")
    return chain.invoke({'animal_type': animal_type, 'couleur': couleur})

# Demo call: print the chain output for a brown dog ("un chien", "marron").
if __name__ == "__main__":
    print(generate_pet_name("un chien", "marron"))

In [None]:
pip install -U langchain-mistralai

In [None]:
pip install youtube-transcript-api

In [None]:
from langchain_community.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import HuggingFaceHub
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.vectorstores import Chroma
from dotenv import load_dotenv

load_dotenv()

In [None]:
embeddings= HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2")

In [None]:
def create_vector_db_from_youtube_url(video_url:str)-> Chroma:
    """Build a Chroma vector store from the transcript of a YouTube video.

    The transcript is loaded with ``YoutubeLoader``, split into chunks with a
    default ``RecursiveCharacterTextSplitter`` and embedded with the notebook-level
    ``embeddings`` object.

    Args:
        video_url: URL of the YouTube video whose transcript is indexed.

    Returns:
        A ``Chroma`` store that contains only the chunks of this transcript.
    """
    import uuid  # local import so this cell does not depend on an edit to the imports cell

    transcript = YoutubeLoader.from_youtube_url(video_url).load()
    docs = RecursiveCharacterTextSplitter().split_documents(transcript)

    # Without an explicit collection name every Chroma.from_documents call in the
    # kernel writes into the same default collection, so calling this function twice
    # (or after indexing other documents) would mix or duplicate chunks. A fresh
    # name per call keeps each store isolated.
    collection_name = f"youtube-{uuid.uuid4().hex}"
    return Chroma.from_documents(docs, embeddings, collection_name=collection_name)

In [None]:
# Video used by the YouTube examples in this notebook.
video_url= "https://www.youtube.com/watch?v=lG7Uxts9SXs"
# NOTE(review): the returned store is not assigned to a variable here; it is only
# shown as the cell output.
create_vector_db_from_youtube_url(video_url)

In [None]:
def get_response_from_query(db, query,k=25):
    """Answer ``query`` from the ``k`` chunks of ``db`` most similar to it.

    The page contents of the retrieved chunks are joined with single spaces and
    inserted, together with the question, into a prompt sent to the
    Mixtral-8x7B-Instruct model (temperature 0.2).

    Args:
        db: Vector store exposing ``similarity_search(query, k=...)``.
        query: The question to answer.
        k: Number of chunks to retrieve.

    Returns:
        The chain output with every newline replaced by a space.
    """
    # Mixtral can handle 32k tokens, so the largest usable k is roughly
    # (number of tokens) / (chunk size).
    matches = db.similarity_search(query, k=k)
    transcript_excerpt = " ".join(match.page_content for match in matches)

    answer_prompt = PromptTemplate(
        input_variables=["question", "docs"],
        template="""
        You are a helpful assistant that that can answer questions about youtube videos 
        based on the video's transcript.
        
        Answer the following question: {question}
        By searching the following video transcript: {docs}
        
        Only use the factual information from the transcript to answer the question.
        
        If you feel like you don't have enough information to answer the question, say "I don't know".
        
        Your answers should be verbose and detailed.
        """,
    )
    mixtral = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs={"temperature": 0.2},
    )

    answer = LLMChain(llm=mixtral, prompt=answer_prompt).run(
        question=query, docs=transcript_excerpt
    )
    # Flatten the answer onto a single line.
    return answer.replace("\n", " ")

In [None]:
# End-to-end check: index the video transcript, then ask one question using the
# 6 most similar chunks instead of the default 25.
test_db=create_vector_db_from_youtube_url(video_url)
test_query=" What is the purpose of this talk"
print(get_response_from_query(test_db, test_query,k=6))

In [None]:
# Notebook-level handle on the Mixtral-8x7B-Instruct model (temperature 0.2).
llm= HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", model_kwargs={"temperature":0.2})

In [21]:
import streamlit as st
from langchain_core.messages import AIMessage,HumanMessage
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder
from langchain.chains import create_history_aware_retriever,create_retrieval_chain
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
from langchain_community.llms import HuggingFaceHub
from langchain_community.embeddings import HuggingFaceEmbeddings

load_dotenv()

def get_vectorstore_from_URL(url):
    """Build a Chroma vector store from the content of a web page.

    The page is loaded with ``WebBaseLoader``, split into chunks of 1000
    characters with an overlap of 100, and embedded with the
    ``sentence-transformers/all-MiniLM-l6-v2`` model.

    Args:
        url: Address of the page to index.

    Returns:
        A ``Chroma`` store that contains only the chunks of this page.
    """
    import uuid  # local import so this cell does not depend on an edit to the imports cell

    # Get the page text as documents.
    documents = WebBaseLoader(url).load()

    # Split the documents into overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks_docs = text_splitter.split_documents(documents)

    # Create the vector store. Without an explicit collection name every
    # Chroma.from_documents call in the kernel writes into the same default
    # collection, so chunks indexed earlier (other pages, PDFs, transcripts) would
    # leak into this store. A fresh name per call keeps it isolated.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2")
    return Chroma.from_documents(
        chunks_docs,
        embeddings,
        collection_name=f"web-{uuid.uuid4().hex}",
    )
    
def get_context_retriever_chain(vectorstore):
    """Create a history-aware retriever on top of ``vectorstore``.

    The prompt asks the Mixtral-8x7B-Instruct model (temperature 0.2) to turn the
    conversation into a search query. It now contains a ``chat_history``
    placeholder, so the conversation the prompt refers to ("Given the above
    conversation") is actually shown to the model when a history is supplied.
    Per LangChain's documentation of ``create_history_aware_retriever``, when the
    chain is invoked without ``chat_history`` the raw ``input`` is passed straight
    to the retriever, so callers that only send ``{"input": ...}`` keep working.

    Args:
        vectorstore: Vector store exposing ``as_retriever()``.

    Returns:
        The chain built by ``create_history_aware_retriever``.
    """
    llm = HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", model_kwargs={"temperature":0.2})
    retriever = vectorstore.as_retriever()
    prompt = ChatPromptTemplate.from_messages([
        # Earlier turns of the conversation, inserted before the latest user input.
        MessagesPlaceholder(variable_name="chat_history"),
        ("human","{input}"),
        ("human","Given the above conversation, generate a search query to look up in order to get informations relevant to the conversation."),
    ])

    return create_history_aware_retriever(llm, retriever, prompt)

def get_conversational_rag_chain(retriever_chain):
    """Combine ``retriever_chain`` with an answering step into one retrieval chain.

    The answering step stuffs the retrieved documents into the ``{context}`` slot
    of a system message, appends the user's ``{input}`` and sends the prompt to
    the Mixtral-8x7B-Instruct model (temperature 0.2).

    Args:
        retriever_chain: The retriever used to fetch the context documents.

    Returns:
        The chain built by ``create_retrieval_chain``.
    """
    answer_prompt = ChatPromptTemplate.from_messages([
        ("system","Answer the user's questions based on the below context:\n\n{context}"),
        ("user","{input}"),
    ])
    mixtral = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs={"temperature": 0.2},
    )
    combine_docs_chain = create_stuff_documents_chain(mixtral, answer_prompt)

    return create_retrieval_chain(retriever_chain, combine_docs_chain)


def extract_response(text):
    """Return the part of ``text`` that follows the first "Assistant:" marker.

    When the marker does not occur in ``text``, a fixed French fallback message
    is returned instead.
    """
    marker = "Assistant:"
    _, found, tail = text.partition(marker)

    # partition() yields an empty separator when the marker is absent.
    if not found:
        return "Pas de réponse trouvée après 'Assistant:'"
    return tail

def get_response(user_in):
    """Answer ``user_in`` with the conversational RAG chain.

    A retriever chain and a RAG chain are built on every call from the
    notebook-level ``vec_store``; the chain's ``"answer"`` entry is then reduced
    to the text that follows "Assistant:" by ``extract_response``.
    """
    rag_chain = get_conversational_rag_chain(get_context_retriever_chain(vec_store))
    result = rag_chain.invoke({"input": user_in})
    return extract_response(result["answer"])

In [23]:
# Index a Medium article about LangGraph; get_response() reads this global vec_store.
vec_store= get_vectorstore_from_URL("https://medium.com/@cplog/introduction-to-langgraph-a-beginners-guide-14f9be027141")
# NOTE(review): get_response() builds its own retriever and RAG chains on each call,
# so the two chains created below are not the ones it uses.
retriever_chain= get_context_retriever_chain(vec_store)
conversation_rag_chain = get_conversational_rag_chain(retriever_chain)
    
# Ask one question about the indexed article.
get_response("What is langgraph?")

' LangGraph is a library built on top of LangChain, designed to add cyclic computational capabilities to your LLM (Large Language Model) applications. It extends the LangChain library, allowing you to coordinate multiple chains or actors across multiple steps of computation in a cyclic manner. This enables more complex, agent-like behaviors where you can call an LLM in a loop.'

In [1]:
# Empty question/answer dictionary. NOTE(review): no code visible in this notebook
# reads this global; Running_Test has a parameter of the same name.
dico_QA={}

# Built-in quiz, used as the default argument of Running_Test: question -> expected answer.
default_QA={
    "What is the capital of France?": "Paris",
    "Who wrote 'Romeo and Juliet'?": "William Shakespeare",
    "What is the tallest mountain in the world?": "Mount Everest",
    "What is the chemical symbol for water?": "H2O",
    "Who painted the Mona Lisa?": "Leonardo da Vinci",
    "What is the currency of Japan?": "Yen",
    "What is the powerhouse of the cell?": "Mitochondria",
    "What is the boiling point of water in Celsius?": "100",
    "Who is the first president of the United States?": "George Washington",
    "What is the largest ocean on Earth?": "Pacific Ocean"
}

def obtain_listes(dictionnaire):
    """Split a mapping into two parallel lists.

    Returns:
        A ``(keys, values)`` tuple of lists, both in the iteration order of
        ``dictionnaire.items()``.
    """
    pairs = list(dictionnaire.items())
    return [cle for cle, _ in pairs], [valeur for _, valeur in pairs]

def Running_Test(dico_QA=default_QA):
    """Run an interactive quiz on the console and print the final score.

    Every question of ``dico_QA`` is asked with ``input()``. An answer counts as
    correct when it equals the expected answer, ignoring case and surrounding
    whitespace.

    Args:
        dico_QA: Mapping of question -> expected answer. Expected answers that are
            not strings (e.g. numbers in a generated quiz) are compared through
            their ``str()`` form.

    Returns:
        None. Feedback and the final score are printed.
    """
    nbr_of_Question = len(dico_QA)
    if nbr_of_Question == 0:
        # Nothing to ask; this also avoids dividing by zero for the percentage.
        print("No questions to ask.")
        return

    score = 0
    for question, expected in dico_QA.items():
        answer = input(f"{question}\n")
        # Strip both sides so a stray space does not turn a right answer wrong,
        # and go through str() so non-string expected answers do not crash .lower().
        if answer.strip().lower() == str(expected).strip().lower():
            print("Correct!\n")
            score += 1
        else:
            print("Incorrect!\n")

    percent = (score / nbr_of_Question) * 100
    print(f"You have {score}/{nbr_of_Question} ({percent}%) of correct answers!!!")

In [2]:
# Run the quiz with its default question set (default_QA).
Running_Test()

What is the capital of France?
 Paris


Correct!



Who wrote 'Romeo and Juliet'?
 William Shakespeare


Correct!



What is the tallest mountain in the world?
 fjds


Incorrect!



What is the chemical symbol for water?
 fnhf


Incorrect!



Who painted the Mona Lisa?
 fgjsnfn


Incorrect!



What is the currency of Japan?
 gjifko


Incorrect!



What is the powerhouse of the cell?
 gkig


Incorrect!



What is the boiling point of water in Celsius?
 gfg


Incorrect!



Who is the first president of the United States?
 gl,s


Incorrect!



What is the largest ocean on Earth?
 Pacific ocean


Correct!

You have 3/10 (30.0%) of correct answers!!!


In [5]:
from langchain_community.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from dotenv import load_dotenv
import ast
load_dotenv()


Type_of_questions=["Closed(True or False) question", "Recall question"]
list_Level= ["beginner","intermediate","expert"]

def generate_dico_QA(number_of_questions, domain, level):
    """Ask the Mixtral-8x7B-Instruct model for a quiz and parse it into a dict.

    The model is prompted to answer with a Python dictionary whose keys are the
    questions and whose values are the answers. The text between the first "{"
    and the last "}" of the response is parsed with ``ast.literal_eval``.

    Args:
        number_of_questions: How many questions to request.
        domain: Subject of the questions.
        level: Difficulty level named in the prompt.

    Returns:
        The parsed question -> answer dictionary, or an empty dict when the
        response contains no dictionary that can be parsed (an error message is
        printed in that case).
    """
    repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    llm = HuggingFaceHub(repo_id=repo_id, model_kwargs={"temperature":0.3})

    prompt_template_name = PromptTemplate(
        input_variables=['number_of_questions','domain','level'],
        template=" You are a helpful assistant who can develop questions to test knowledge. Generate {number_of_questions} questions relating to the domain of {domain} and for a {level} level.The answers don't exceed three word.You will format the questions as the keys and the answers as the values of a python dictionary.")
    name_chain = LLMChain(llm=llm, prompt=prompt_template_name)
    response = name_chain.run({'number_of_questions': number_of_questions, 'domain':domain, 'level': level})

    # Locate the outermost {...} span in the model output.
    index_debut = response.find("{")
    index_fin = response.rfind("}") + 1
    if index_debut == -1 or index_fin <= index_debut:
        print("Erreur lors de l'extraction du dictionnaire :", "no {...} literal found in the model response")
        return {}

    dictionnaire_str = response[index_debut:index_fin]
    try:
        dictionnaire = ast.literal_eval(dictionnaire_str)
    except (SyntaxError, ValueError, TypeError) as e:
        # Previously the function fell through to `return dictionnaire` with the
        # name unbound, which raised UnboundLocalError instead of reporting the problem.
        print("Erreur lors de l'extraction du dictionnaire :", e)
        return {}

    if not isinstance(dictionnaire, dict):
        # A brace-delimited literal can also be a set, e.g. "{1, 2}".
        print("Erreur lors de l'extraction du dictionnaire :", "the parsed literal is not a dictionary")
        return {}
    return dictionnaire

# Generate a 4-question, intermediate-level chemistry quiz and display it.
dico=generate_dico_QA("4", "chemistry", "intermediate")
dico

{'What is the formula for water?': 'H2O',
 'What is the charge of a chlorine ion?': '-1',
 'What is the atomic number of oxygen?': '8',
 'What is the name of the compound NaCl?': 'table salt'}