In [None]:
# Shell out to curl and fetch ipinfo.io.
# NOTE(review): presumably a check of the runtime's public IP / region — confirm it is still needed.
!curl ipinfo.io

In [None]:
!pip -q install langchain langchain-community langchain-google-genai faiss-cpu sentence-transformers pypdf python-docx langchain-google-genai

In [10]:
import os
from pypdf import PdfReader
from langchain.document_loaders import PyPDFLoader # alternativa nel reader

from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory
from langchain_google_genai import ChatGoogleGenerativeAI

In [11]:
import os
from getpass import getpass

from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Never hard-code the key in the notebook: reuse GOOGLE_API_KEY when the
# environment already provides it, otherwise ask for it interactively
# (getpass does not echo the typed value).
if not os.environ.get('GOOGLE_API_KEY'):
    os.environ['GOOGLE_API_KEY'] = getpass('GOOGLE_API_KEY: ')

# Embedding model used to build the vector index.
gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Embeddings

In [None]:
!unzip /content/docx.zip

In [12]:
def get_gext(document):
    """Return the text of every paragraph in ``document``, one per line.

    ``document`` must expose a ``paragraphs`` iterable whose items have a
    ``text`` attribute; the texts are joined with a single newline.
    """
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

def split_into_paragraphs(text):
    """Split ``text`` on blank lines and return the non-empty pieces.

    A separator is a newline, optional whitespace, then another newline.
    Each piece is stripped of surrounding whitespace, and pieces that are
    empty after stripping are dropped.
    """
    stripped_chunks = (chunk.strip() for chunk in re.split(r'\n\s*\n', text))
    return [chunk for chunk in stripped_chunks if chunk]

In [14]:
import re
from docx import Document as word_doc
from langchain.docstore.document import Document

# Folder holding the extracted Word files.
DOCX_DIR = "./docx/"

# One langchain Document per paragraph, across all Word files.
documents = []

# sorted() makes the document order (and therefore the index) reproducible;
# os.listdir returns entries in arbitrary order.
for doc_name in sorted(os.listdir(DOCX_DIR)):
    doc_path = os.path.join(DOCX_DIR, doc_name)
    # Skip the "pdfs" entry as before, and any other entry that is not a
    # regular file, instead of handing it to the Word parser.
    if doc_name == "pdfs" or not os.path.isfile(doc_path):
        continue

    full_text = get_gext(word_doc(doc_path))
    # extend() appends in place rather than rebuilding the list each time.
    documents.extend(
        Document(page_content=para, metadata={"source": "local"})
        for para in split_into_paragraphs(full_text)
    )

In [15]:
# Number of documents embedded per FAISS.from_documents call.
# NOTE(review): presumably chosen to keep each embedding request small — confirm.
EMBED_BATCH_SIZE = 50

# Fail early with a clear message instead of an AttributeError on None below.
if not documents:
    raise ValueError("No documents to index: the FAISS store cannot be built from an empty list.")

# Build one small index per batch and merge them into a single store.
db_faiss = None
for start in range(0, len(documents), EMBED_BATCH_SIZE):
    batch_index = FAISS.from_documents(
        documents[start:start + EMBED_BATCH_SIZE], gemini_embeddings
    )
    if db_faiss is None:
        db_faiss = batch_index
    else:
        db_faiss.merge_from(batch_index)

# Save the merged index to disk.
db_faiss.save_local('./faiss_db')

In [111]:
# Retriever over the FAISS index configured with k=20, exercised once
# with a sample question.
retriever = db_faiss.as_retriever(
    search_kwargs=dict(k=20),
)
retrieved = retriever.invoke("dove ha la sede Assoproma ASD??")

In [None]:
# Show each retrieved chunk followed by a blank line. A plain loop avoids
# building a throw-away list of None values that the notebook would echo
# as the cell's output.
for doc in retrieved:
    print(doc.page_content + "\n")

# LLM con chain personalizzata

In [None]:
# Mixtral instruct model accessed through the Hugging Face Hub wrapper
# (other repo id noted by the author: tiiuae/falcon-7b-instruct).
# The token is taken from the HUGGINGFACEHUB_API_TOKEN environment variable so
# a real secret never has to be pasted into the notebook; the previous
# placeholder is kept as the fallback to preserve the earlier behaviour.
llm_mistral = HuggingFaceHub(
    repo_id='mistralai/Mixtral-8x7B-Instruct-v0.1',
    #model_kwargs={"temperature": temperature, "max_length": max_length},
    huggingfacehub_api_token=os.environ.get('HUGGINGFACEHUB_API_TOKEN', 'your-token-here'),
)

In [187]:
from langchain_core.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import RunnablePassthrough, RunnableBranch, RunnableParallel
from langchain_core.output_parsers.string import StrOutputParser

In [188]:
# Gemini chat model that answers the questions in the RAG chain.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    temperature=0.3,
    top_p=0.5,
)

In [189]:
# Prompt (in Italian) telling the model to answer the user question given
# between <q> and </q> using the information given between <ctx> and </ctx>.
# The backslash right after the opening quotes and the one ending the first
# sentence line are line continuations inside the literal, so no newline is
# inserted at those points.
prompt_template = '''\
Rispondi alla domanda dell'utente riportata tra <q> e </q> \
utilizzando le informazioni riportate tra <ctx> e </ctx>.
<q>{question}</q>
<ctx>{context}</ctx>
'''
# Disabled prompt extension for conversation history, kept verbatim below so
# it can be pasted back into the template. In English: "and the conversation
# history (given between <conv> and <\conv>)" followed by a
# <conv>{chat_history}<\conv> section.
# NOTE(review): "<\conv>" looks like a typo for "</conv>" — confirm before re-enabling.
#e lo storico della conversazione (riportato tra <conv> e <\conv>)
#<conv>
#{chat_history}
#<\conv>

# Template with the {question} and {context} placeholders used by the chain.
prompt = PromptTemplate.from_template(prompt_template)

In [None]:
def extract_text(x):
  """Flatten the retrieved documents into one context string for the prompt.

  Args:
    x: Mapping with a 'context' entry (an iterable of objects exposing
      ``page_content``) and a 'question' entry.

  Returns:
    A dict with 'context' (the page contents joined by blank lines; an
    empty string when nothing was retrieved) and 'question' (passed
    through unchanged).
  """
  # Separate chunks with a blank line so the last word of one paragraph is
  # not glued to the first word of the next.
  context_text = "\n\n".join(doc.page_content for doc in x['context'])
  return {"context": context_text, "question": x['question']}

# Run both branches on the incoming question: the retriever produces
# "context", while the question itself is forwarded as-is under "question".
retr_chain = RunnableParallel(
  {
    "context": retriever,
    "question": RunnablePassthrough(),
  }
)

def print_p(x):
  """Print ``x`` and return it unchanged.

  It can be placed between two chain steps to show the value flowing
  through without altering it.
  """
  print(x)
  return x

# Full pipeline: retrieve -> flatten context -> fill the prompt ->
# print the filled prompt -> LLM -> plain string.
rag_chain = (
  retr_chain
  | extract_text
  | prompt
  | print_p
  | llm
  | StrOutputParser()
)

# Sample question sent through the chain.
question = "dove si trova Assoproma ASD?"
response = rag_chain.invoke(question)

In [None]:
# Print the answer string returned by the chain.
print(response)