In [1]:
from dotenv import load_dotenv
from tqdm import tqdm

load_dotenv()

True

In [2]:
from langchain_openai import ChatOpenAI

model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-1106")

# Парсинг DOCX документа

In [3]:
def filter_tasknames_from_str(s: str) -> list[str]:
    import re

    pattern = r"\w+_\w+_\d+\.\d+\.\d+"
    matches = re.findall(pattern, s)
    return sorted(set(matches))


def text_preprocess(text: str) -> str:
    """Deletes extra newlines and adds hints for model"""
    import re

    res = re.sub(r"\n+", "\n", text)
    res = res.replace("Эталон.", "Решение:")
    return res


def split_text_by_tasknames(text: str, tasknames: list[str]) -> list[str]:
    """Splits whole document text to list of strings, each one contatining context about one task"""
    res = []
    for i in range(len(tasknames) - 1):
        res.append(text[text.find(tasknames[i]) : text.find(tasknames[i + 1])])

    res.append(text[text.find(tasknames[-1]) :])
    return res

In [107]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.prompts import ChatPromptTemplate
from operator import itemgetter

# Main working part

In [108]:
funcs = [
    {
        "name": "task_info",
        "description": "Analyze context and return task description and solution if there is not enough information in context, return string 'ERROR'",
        "parameters": {
            "type": "object",
            "properties": {
                "name": {
                    "type": "string",
                    "description": "Name of the task. Example: home_beginner_2.5.3",
                },
                "description": {
                    "type": "string",
                    "description": "Description of the task.  Example: 'Напишите программу, которая считывает два целых числа и выводит их сумму'. If there is not enough information in context, return string 'ERROR'.",
                },
                "solution": {
                    "type": "string",
                    "description": "Solution of task written Python (any python code located in task description, with input and print). Example: 'n=int(input())\nm=int(input())\nprint(n+m)'. If there is not enough information in context, return string 'ERROR'",
                },
            },
            "required": ["description", "solution"],
        },
    },
    {
        "name": "solution_check",
        "description": "Check whether solution provided for the task is correct ",
        "parameters": {
            "type": "object",
            "properties": {
                "verificated": {
                    "type": "boolean",
                    "description": "Is solution correct or not for this task?",
                }
            },
            "required": ["verificated"],
        },
    },
    {
        "name": "input_data_info",
        "description": "From the task description give the format of input data",
        "parameters": {
            "type": "object",
            "properties": {
                "input": {
                    "type": "string",
                    "description": "detailed information about input",
                }
            },
            "required": ["input"],
        },
    },
]

In [74]:
from langchain_community.document_loaders import Docx2txtLoader

document_content = Docx2txtLoader("./home_beginner_2.docx").load()[0].page_content

In [75]:
tasknames = filter_tasknames_from_str(document_content)
text = text_preprocess(document_content)
splitted_texts = split_text_by_tasknames(text, tasknames)

In [93]:
prompt_for_parser = ChatPromptTemplate.from_template(
    """Return information about provided context. Information should be based ONLY on the following context (if there is not enough information in context, return string  "ERROR").:

Context: {context}

Question: {question}

(if there is not enough information in context, return string  "ERROR")
"""
)

chain_parsing = (
    prompt_for_parser
    | model.bind(function_call={"name": "task_info"}, functions=funcs)
    | JsonOutputFunctionsParser()
)

In [94]:
prompt_for_verification = ChatPromptTemplate.from_template(
    """Is this solution correct for the following task?

Solution: {solution}

Task: {description}
"""
)

verification_chain = (
    prompt_for_verification
    | model.bind(function_call={"name": "solution_check"}, functions=funcs)
    | JsonOutputFunctionsParser()
    | itemgetter("verificated")
)

In [109]:
prompt_for_input = ChatPromptTemplate.from_template(
    """From the description of the code task, give the description of the input format

Task description: {description}

Input format:"""
)

input_data_chain = (
    prompt_for_input
    | model.bind(function_call={"name": "input_data_info"}, functions=funcs)
    | JsonOutputFunctionsParser()
    | itemgetter("input")
)

In [110]:
main_chain = chain_parsing | {
    "name": itemgetter("name"),
    "description": itemgetter("description"),
    "solution": itemgetter("solution"),
    "verificated": verification_chain,
    "input": input_data_chain,
}



task_data = []
for i in tqdm(range(len(tasknames))):
    try:
        task_data.append(
            main_chain.invoke(
                {
                    "context": splitted_texts[i],
                    "question": f"Tell me about {tasknames[i]} and return it's solution. RETURN ERROR if there are no code problem description or solution",
                }
            )
        )
    except Exception as e:
        print(e)
        task_data.append(
            {
                "name": tasknames[i],
                "description": "ERROR",
                "solution": "ERROR",
                "verified": False,
            }
        )
task_data


# task_data = c.batch(
#     [
#         {
#             "context": splitted_texts[i],
#             "question": f"Tell me about {tasknames[i]} and return it's solution. RETURN ERROR if there are no code problem description or solution",
#         }
#         for i in range(len(tasknames))
#     ]
# )

 89%|████████▉ | 16/18 [01:52<00:47, 23.60s/it]

Could not parse function call data: Expecting ',' delimiter: line 1 column 457 (char 456)


100%|██████████| 18/18 [02:00<00:00,  6.70s/it]


[{'name': 'home_beginner_2.1.1',
  'description': 'Напишите программу, которая вводит строку и ищет ее в строке \ns = "I Do Love Python! Python is Cool!!!"\nЕсли искомая строка найдена, программа выводит на экран «найдено». В противном случае выводится «не найдено». Используйте один неполный условный оператор if.',
  'solution': 's = "I Do Love Python! Python is Cool!!!"\nc = input()\nres = "не найдено"\nif c in s:\n    res = "найдено"\nprint(res)',
  'verificated': True,
  'input': 'Строка для поиска'},
 {'name': 'home_beginner_2.1.2',
  'description': 'Напишите программу, которая вводит два целых числа и выводит результат \n«делится», если первое число делится на второе, и «не делится» в противном случае. Используйте один неполный условный оператор if.',
  'solution': 'n = int(input())\nm = int(input())\nres = "не делится"\nif n % m == 0:\n    res = "делится"\nprint(res)',
  'verificated': True,
  'input': 'Два целых числа'},
 {'name': 'home_beginner_2.1.3',
  'description': 'Напишит

In [88]:
task_data

[]

# Tests

In [95]:
c = (
    prompt_for_parser
    | model.bind(function_call={"name": "task_info"}, functions=funcs)
    | JsonOutputFunctionsParser()
    | RunnableParallel(
        verificated=(
            prompt_for_verification
            | model.bind(function_call={"name": "solution_check"}, functions=funcs)
            | JsonOutputFunctionsParser()
            | itemgetter("verificated")
        ),
        name=itemgetter("name"),
        description=itemgetter("description"),
        solution=itemgetter("solution"),
    )
)

result = c.batch(
    [
        {
            "context": splitted_texts[i],
            "question": f"Tell me about {tasknames[i]} and return it's solution. RETURN ERROR if there are no code problem description or solution",
        }
        for i in range(len(tasknames))
    ]
)
# c.get_graph().print_ascii()

OutputParserException: Could not parse function call data: Expecting ',' delimiter: line 1 column 457 (char 456)