<a href="https://colab.research.google.com/github/taipei1/nlp/blob/main/RAGipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install langchain




In [3]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.11-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.0-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB

In [4]:
!pip install langchain-huggingface

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_huggingface-0.1.2-py3-none-any.whl (21 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.1.2


In [5]:
!pip install langchain-groq

Collecting langchain-groq
  Downloading langchain_groq-0.2.1-py3-none-any.whl.metadata (2.9 kB)
Collecting groq<1,>=0.4.1 (from langchain-groq)
  Downloading groq-0.13.0-py3-none-any.whl.metadata (13 kB)
Downloading langchain_groq-0.2.1-py3-none-any.whl (14 kB)
Downloading groq-0.13.0-py3-none-any.whl (108 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.8/108.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq, langchain-groq
Successfully installed groq-0.13.0 langchain-groq-0.2.1


In [6]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [7]:
!pip install spacy



In [8]:
!pip install sentence-transformers



# Для разговора с моделью

In [None]:
import os
import getpass
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import SpacyTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_community.vectorstores import FAISS

class RAGSystem:
    """Retrieval-augmented question answering over a single text file.

    Construction loads the file, splits it into chunks, embeds the chunks into
    a FAISS index and wires the resulting retriever to a Groq chat model.
    If any of these steps fails, the error is printed and ``rag_chain`` is
    left as ``None``; ``ask_question`` then refuses to answer.
    """

    def __init__(self, file_path='info.txt', model_name="sentence-transformers/all-mpnet-base-v2",
                 groq_model="llama-3.3-70b-versatile"):
        """Store the configuration and build the RAG chain.

        Args:
            file_path: Path of the UTF-8 text file used as the knowledge base.
            model_name: Name of the HuggingFace embedding model.
            groq_model: Name of the Groq chat model.
        """
        self.file_path = file_path
        self.system_prompt = """
        You are a trained model that helps to give an answer from a file with weakly structured data.
        You must give answers based on the context of the file.
        If the answer is present in the context, specify that you answer according to the data.
        If the answer is not in the context, answer as you think is correct, but note that the answer is given according to information from the network.
        The file may contain tables.
        The beginning of the table is marked: # Text from tables:
        The separator between cells in a row is ===
        The first row is the column headers.
        """
        self.embeddings_model = model_name
        self.groq_model = groq_model
        self.rag_chain = self._create_rag_system()

    def _ensure_api_key(self):
        """Prompt for the Groq API key only when GROQ_API_KEY is not already set."""
        # Re-using an existing key keeps re-runs non-interactive and avoids
        # overwriting a key that was configured outside of this class.
        if not os.environ.get("GROQ_API_KEY"):
            os.environ["GROQ_API_KEY"] = getpass.getpass("Введите ваш Groq API ключ: ")

    def _load_chunks(self):
        """Load the source file and split it into overlapping chunks.

        Raises:
            ValueError: If the file produced no chunks to index.
        """
        documents = TextLoader(self.file_path, encoding='utf-8').load()
        # Chunk length is measured with len(), i.e. in characters.
        splitter = SpacyTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = splitter.split_documents(documents)
        if not chunks:
            # Fail with a readable message instead of an obscure indexing error.
            raise ValueError(f"Файл '{self.file_path}' не содержит текста для индексации.")
        return chunks

    def _build_retriever(self, chunks):
        """Embed the chunks into a FAISS index and return a retriever over it."""
        embeddings = HuggingFaceEmbeddings(model_name=self.embeddings_model)
        return FAISS.from_documents(chunks, embeddings).as_retriever()

    def _build_prompt(self):
        """Build the chat prompt: system instructions plus context/question message."""
        return ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(self.system_prompt),
            HumanMessagePromptTemplate.from_template(
                "Контекст: {context}\n\nВопрос: {question}"
            )
        ])

    @staticmethod
    def _format_docs(docs):
        """Join the text of the retrieved documents into one context string."""
        return "\n\n".join(doc.page_content for doc in docs)

    def _create_rag_system(self):
        """Assemble the retrieval + generation chain.

        Returns:
            The runnable chain, or ``None`` if any step failed (the error is
            printed rather than raised).
        """
        try:
            chunks = self._load_chunks()
            retriever = self._build_retriever(chunks)

            self._ensure_api_key()
            llm = ChatGroq(model=self.groq_model)

            # The incoming question is used twice: to retrieve context and,
            # unchanged, as the question in the prompt.
            return (
                {"context": retriever | self._format_docs, "question": RunnablePassthrough()}
                | self._build_prompt()
                | llm
                | StrOutputParser()
            )
        except Exception as e:
            print(f"Ошибка при создании RAG системы: {e}")
            return None

    def ask_question(self, question):
        """Answer a question using the RAG chain.

        Args:
            question: The user's question as a non-blank string.

        Returns:
            The chain's response, or ``None`` if the system is not
            initialized, the question is blank, or the chain raised (the
            reason is printed in each case).
        """
        if self.rag_chain is None:
            print("RAG система не была инициализирована.")
            return None

        # Do not spend an API call on an empty question.
        if not isinstance(question, str) or not question.strip():
            print("Вопрос не должен быть пустым.")
            return None

        try:
            return self.rag_chain.invoke(question)
        except Exception as e:
            print(f"Ошибка при получении ответа: {e}")
            return None

def main():
    """Run an interactive question/answer loop against a RAGSystem.

    The loop ends when the user types ``exit`` (case-insensitive) or when the
    input stream is closed. Blank input is ignored. Ctrl+C does not end the
    loop; it only prints a reminder of how to exit.
    """
    print("Терминальное приложение для взаимодействия с RAG системой")
    print("Для выхода введите 'exit'")

    try:
        rag_system = RAGSystem()
    except Exception as e:
        print(f"Ошибка при инициализации системы: {e}")
        return

    # RAGSystem reports construction failures by leaving rag_chain as None
    # instead of raising, so the except above does not cover them.
    if rag_system.rag_chain is None:
        print("RAG система не была инициализирована.")
        return

    while True:
        try:
            # Read the user's question
            user_question = input("\nВведите ваш вопрос: ").strip()

            # Nothing to ask: prompt again without calling the model
            if not user_question:
                continue

            # Exit command
            if user_question.lower() == 'exit':
                print("Завершение работы...")
                break

            # Get and show the answer
            response = rag_system.ask_question(user_question)
            if response:
                print("\nОтвет:")
                print(response)

        except EOFError:
            # A closed input stream raises EOFError on every input() call;
            # without this branch the generic handler below would spin forever.
            print("\nЗавершение работы...")
            break
        except KeyboardInterrupt:
            print("\nПрервано пользователем. Для выхода введите 'exit'.")
        except Exception as e:
            print(f"Произошла ошибка: {e}")

# Start the interactive session only when this code is executed as the main
# program, not when it is imported.
if __name__ == "__main__":
    main()

# Для записи ответов с целью дальнейшего анализа.

In [None]:
import os
import getpass
import pandas as pd
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import SpacyTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_community.vectorstores import FAISS

class RAGSystem:
    """Retrieval-augmented question answering that also records every answer.

    Construction loads the file, splits it into chunks, embeds the chunks into
    a FAISS index and wires the resulting retriever to a Groq chat model.
    If any of these steps fails, the error is printed and ``rag_chain`` is
    left as ``None``; ``ask_question`` then refuses to answer.

    Successful question/answer pairs are collected and exposed as the
    ``results_df`` DataFrame with columns ``Question`` and ``Answer``.
    """

    # Column order of the results table.
    _RESULT_COLUMNS = ('Question', 'Answer')

    def __init__(self, file_path='info.txt', model_name="sentence-transformers/all-mpnet-base-v2",
                 groq_model="llama-3.3-70b-versatile"):
        """Store the configuration and build the RAG chain.

        Args:
            file_path: Path of the UTF-8 text file used as the knowledge base.
            model_name: Name of the HuggingFace embedding model.
            groq_model: Name of the Groq chat model.
        """
        self.file_path = file_path
        self.system_prompt = """
        You are a trained model that helps to give an answer from a file with weakly structured data.
        You must give answers based on the context of the file.
        If the answer is present in the context, specify that you answer according to the data.
        If the answer is not in the context, answer as you think is correct, but note that the answer is given according to information from the network.
        The file may contain tables.
        The beginning of the table is marked: # Text from tables:
        The separator between cells in a row is ===
        The first row is the column headers.
        """
        self.embeddings_model = model_name
        self.groq_model = groq_model
        # Answers are accumulated as plain records; the DataFrame is built on
        # demand so the table is not re-copied on every question.
        self._records = []
        self.rag_chain = self._create_rag_system()

    @property
    def results_df(self):
        """DataFrame of all recorded question/answer pairs (built on access)."""
        return pd.DataFrame(self._records, columns=list(self._RESULT_COLUMNS))

    @results_df.setter
    def results_df(self, frame):
        """Replace the recorded pairs with the rows of ``frame``."""
        self._records = frame.to_dict("records")

    def _ensure_api_key(self):
        """Prompt for the Groq API key only when GROQ_API_KEY is not already set."""
        # Re-using an existing key keeps re-runs non-interactive and avoids
        # overwriting a key that was configured outside of this class.
        if not os.environ.get("GROQ_API_KEY"):
            os.environ["GROQ_API_KEY"] = getpass.getpass("Введите ваш Groq API ключ: ")

    def _load_chunks(self):
        """Load the source file and split it into overlapping chunks.

        Raises:
            ValueError: If the file produced no chunks to index.
        """
        documents = TextLoader(self.file_path, encoding='utf-8').load()
        # Chunk length is measured with len(), i.e. in characters.
        splitter = SpacyTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = splitter.split_documents(documents)
        if not chunks:
            # Fail with a readable message instead of an obscure indexing error.
            raise ValueError(f"Файл '{self.file_path}' не содержит текста для индексации.")
        return chunks

    def _build_retriever(self, chunks):
        """Embed the chunks into a FAISS index and return a retriever over it."""
        embeddings = HuggingFaceEmbeddings(model_name=self.embeddings_model)
        return FAISS.from_documents(chunks, embeddings).as_retriever()

    def _build_prompt(self):
        """Build the chat prompt: system instructions plus context/question message."""
        return ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(self.system_prompt),
            HumanMessagePromptTemplate.from_template(
                "Контекст: {context}\n\nВопрос: {question}"
            )
        ])

    @staticmethod
    def _format_docs(docs):
        """Join the text of the retrieved documents into one context string."""
        return "\n\n".join(doc.page_content for doc in docs)

    def _create_rag_system(self):
        """Assemble the retrieval + generation chain.

        Returns:
            The runnable chain, or ``None`` if any step failed (the error is
            printed rather than raised).
        """
        try:
            chunks = self._load_chunks()
            retriever = self._build_retriever(chunks)

            self._ensure_api_key()
            llm = ChatGroq(model=self.groq_model)

            # The incoming question is used twice: to retrieve context and,
            # unchanged, as the question in the prompt.
            return (
                {"context": retriever | self._format_docs, "question": RunnablePassthrough()}
                | self._build_prompt()
                | llm
                | StrOutputParser()
            )
        except Exception as e:
            print(f"Ошибка при создании RAG системы: {e}")
            return None

    def ask_question(self, question):
        """Answer a question and record the question/answer pair.

        Args:
            question: The user's question as a non-blank string.

        Returns:
            The chain's response, or ``None`` if the system is not
            initialized, the question is blank, or the chain raised (the
            reason is printed in each case). Only successful answers are
            recorded.
        """
        if self.rag_chain is None:
            print("RAG система не была инициализирована.")
            return None

        # Do not spend an API call (or a results row) on an empty question.
        if not isinstance(question, str) or not question.strip():
            print("Вопрос не должен быть пустым.")
            return None

        try:
            response = self.rag_chain.invoke(question)
            self._records.append({'Question': question, 'Answer': response})
            return response
        except Exception as e:
            print(f"Ошибка при получении ответа: {e}")
            return None

def main():
    """Run an interactive question/answer loop and save the answers to CSV.

    The loop ends when the user types ``exit`` (case-insensitive), presses
    Ctrl+C, or the input stream is closed. In every case the collected
    results are written to ``rag_results.csv`` first; a failure to write is
    reported but never prevents the loop from ending. Blank input is ignored.
    """
    results_path = 'rag_results.csv'

    print("Терминальное приложение для взаимодействия с RAG системой")
    print("Для выхода введите 'exit'")

    try:
        rag_system = RAGSystem()
    except Exception as e:
        print(f"Ошибка при инициализации системы: {e}")
        return

    # RAGSystem reports construction failures by leaving rag_chain as None
    # instead of raising, so the except above does not cover them.
    if rag_system.rag_chain is None:
        print("RAG система не была инициализирована.")
        return

    def save_results():
        """Write the results to CSV; print the error and return False on failure."""
        try:
            rag_system.results_df.to_csv(results_path, index=False)
        except Exception as e:
            print(f"Ошибка при сохранении результатов: {e}")
            return False
        return True

    while True:
        try:
            # Read the user's question
            user_question = input("\nВведите ваш вопрос: ").strip()

            # Nothing to ask: prompt again without calling the model
            if not user_question:
                continue

            # Exit command: save first, but leave even if saving failed
            if user_question.lower() == 'exit':
                if save_results():
                    print(f"Результаты сохранены в '{results_path}'")
                print("Завершение работы...")
                break

            # Get and show the answer together with everything recorded so far
            response = rag_system.ask_question(user_question)
            if response:
                print("\nОтвет:")
                print(response)
                print("\nТекущие результаты:")
                print(rag_system.results_df)

        except KeyboardInterrupt:
            # Save whatever was collected before stopping
            if save_results():
                print(f"\nПрервано пользователем. Результаты сохранены в '{results_path}'.")
            break
        except EOFError:
            # A closed input stream raises EOFError on every input() call;
            # without this branch the generic handler below would spin forever.
            if save_results():
                print(f"\nРезультаты сохранены в '{results_path}'")
            print("Завершение работы...")
            break
        except Exception as e:
            print(f"Произошла ошибка: {e}")

# Start the interactive session only when this code is executed as the main
# program, not when it is imported.
if __name__ == "__main__":
    main()