In [1]:
import os
import requests
import json
from dotenv import load_dotenv


# Load environment variables from a local .env file (python-dotenv); the recorded
# output below shows the call returned True in this run.
load_dotenv()

True

In [2]:
from parser.file.bulk import SimpleDirectoryReader
from parser.open_ai_func import call_openai_api, get_user_permission
from parser.schema.base import Document
from parser.token_func import group_split


def metadata_from_filename(title):
    """Wrap a title in the metadata mapping attached to a loaded document.

    This function is handed to ``SimpleDirectoryReader`` as its ``file_metadata``
    callback elsewhere in this notebook.

    Args:
        title: Value to store under the ``'title'`` key.

    Returns:
        A new dict with the single key ``'title'`` mapped to ``title``.
    """
    metadata = dict(title=title)
    return metadata


# --- Notebook configuration -------------------------------------------------
# Input folders to scan. Alternative sample input: ["./inputs/iai_sample_notebook/"]
directory = ["./inputs/iai_doc/"]

# Token bounds and token-check flag forwarded to `group_split` further down.
min_tokens, max_tokens = 250, 2000
token_check = True

In [3]:
from collections import defaultdict


# Derive one output folder name per input directory. When the same base name
# occurs more than once, later occurrences get a numeric suffix ("name_2", ...).
# NOTE: `dir_path` is deliberately kept as the loop variable -- later cells read
# its value after the loop has finished.
folder_names = []
folder_counts = defaultdict(int)
for dir_path in directory:
    # normpath strips the trailing slash so basename yields the last component.
    folder_name = os.path.basename(os.path.normpath(dir_path))
    folder_counts[folder_name] += 1
    occurrence = folder_counts[folder_name]
    if occurrence > 1:
        folder_name = f"{folder_name}_{occurrence}"
    folder_names.append(folder_name)

In [4]:
# `dir_path` is the loop variable left over from the folder-name loop above, i.e. the
# last entry of `directory`. Later cells use it as the reader's input directory.
dir_path

'./inputs/iai_doc/'

In [5]:
# Show the de-duplicated folder names computed from `directory`.
folder_names

['iai_doc']

In [6]:
from langchain.text_splitter import MarkdownTextSplitter


# The reader is used here only to obtain its list of input files (`input_files`),
# restricted to .rst/.md; each file is then read and chunked manually with the
# Markdown splitter.
# NOTE: `dir_path` is the variable leaked from the folder-name loop, so only the
# last entry of `directory` is scanned here.
doc_reader = SimpleDirectoryReader(input_dir=dir_path, input_files=None, recursive=True,
                                 required_exts=[".rst", ".md"], num_files_limit=None,
                                 exclude_hidden=True, file_metadata=metadata_from_filename)
markdown_splitter = MarkdownTextSplitter(chunk_size=256, chunk_overlap=0)

file_lst = doc_reader.input_files
docs = []
for file_ii in file_lst:
    # Read with an explicit encoding: open()'s default is locale-dependent, so
    # without this the same files can fail to decode (or decode incorrectly)
    # on platforms whose default encoding is not UTF-8.
    with open(file_ii, "r", encoding="utf-8") as f:
        md_text = f.read()
    docs_file = markdown_splitter.create_documents([md_text])
    # Drop fragments of 5 characters or fewer.
    docs_file = [d for d in docs_file if len(d.page_content) > 5]
    # Tag every remaining chunk with the bare file name it came from.
    for d in docs_file:
        d.metadata.update({"title": file_ii.name})
    docs.extend(docs_file)

input_files
None


In [7]:
# Total number of chunks kept across all input files.
len(docs)

761

In [8]:
# Peek at the first five chunks (page content plus the 'title' metadata).
docs[:5]

[Document(page_content='# Errors\nThe Kittn API uses the following error codes:', metadata={'title': '_errors.md'}),
 Document(page_content='Error Code | Meaning\n---------- | -------\n400 | Bad Request -- Your request is invalid.\n401 | Unauthorized -- Your API key is wrong.\n403 | Forbidden -- The kitten requested is hidden for administrators only.', metadata={'title': '_errors.md'}),
 Document(page_content="404 | Not Found -- The specified kitten could not be found.\n405 | Method Not Allowed -- You tried to access a kitten with an invalid method.\n406 | Not Acceptable -- You requested a format that isn't json.", metadata={'title': '_errors.md'}),
 Document(page_content="410 | Gone -- The kitten requested has been removed from our servers.\n418 | I'm a teapot.\n429 | Too Many Requests -- You're requesting too many kittens! Slow down!\n500 | Internal Server Error -- We had a problem with our server. Try again later.", metadata={'title': '_errors.md'}),
 Document(page_content="503 | Se

In [None]:
# Print the text of every chunk, each followed by a 20-dash rule, so the chunk
# boundaries can be inspected by eye.
separator = "-" * 20
for chunk in docs:
    print(chunk.page_content)
    print(separator)

In [9]:
# Re-create the directory reader over `dir_path` with the same arguments as the
# reader built for the manual splitting step. It is used below for its
# file extractors and `load_data()`.
doc_reader = SimpleDirectoryReader(
    input_dir=dir_path,
    input_files=None,
    recursive=True,
    required_exts=[".rst", ".md"],
    num_files_limit=None,
    exclude_hidden=True,
    file_metadata=metadata_from_filename,
)

input_files
None


In [10]:
# Fetch the extractor the reader has registered for ".md" files.
# NOTE(review): the variable name `parser` matches the local `parser` package used in
# the imports above; the `from parser... import` statements do not bind that name, so
# nothing visible here is overwritten, but a more specific name would read better.
parser = doc_reader.file_extractor[".md"]
# Presumably performs one-time setup required before parsing -- confirm in the parser class.
parser.init_parser()

In [11]:
# Spot-check the Markdown parser on the third discovered file. The recorded output
# is a list of 2-tuples that looks like (section header, section text) -- confirm.
parser.parse_tups(file_lst[2])

[('Running AWS Batch jobs with the SDK',
  '\nThis tutorial assumes that you have configured the AWS Batch requirements, such as roles and permissions, as described in . \n\nFederated learning models are trained through sessions. You define the parameters required to train a federated model, including data and model configurations, in a session. Additional session parameters are required when using AWS Batch. \n\nUse the `integrateai_batch_client.ipynb` notebook to follow along and test the examples shown below by filling in your own variables as required.\n\nPrepare your model configuration and data schema. See  for details on this process. \n\nDefine your training session as usual. The session definition is passed to the batch through the task group that also contains the tasks for the batch.\n\n```python\ntraining_session = client.create_fl_session(\n    name="Testing notebook",\n    description="I am testing a batch job through a notebook",\n    min_num_clients=2,\n    num_rounds=2

In [12]:
# Load the input files through the reader; the result is a list of Document objects
# (see the preview in the next cell) that is later passed to `group_split`.
raw_docs = doc_reader.load_data()

In [13]:
# Peek at the first five loaded documents (text plus 'title' in extra_info).
raw_docs[:5]

[Document(text="\n\nErrors\nThe Kittn API uses the following error codes:\n\n\nError Code | Meaning\n---------- | -------\n400 | Bad Request -- Your request is invalid.\n401 | Unauthorized -- Your API key is wrong.\n403 | Forbidden -- The kitten requested is hidden for administrators only.\n404 | Not Found -- The specified kitten could not be found.\n405 | Method Not Allowed -- You tried to access a kitten with an invalid method.\n406 | Not Acceptable -- You requested a format that isn't json.\n410 | Gone -- The kitten requested has been removed from our servers.\n418 | I'm a teapot.\n429 | Too Many Requests -- You're requesting too many kittens! Slow down!\n500 | Internal Server Error -- We had a problem with our server. Try again later.\n503 | Service Unavailable -- We're temporarily offline for maintenance. Please try again later.\n\n", doc_id=None, embedding=None, extra_info={'title': 'inputs/iai_doc/_errors.md'}),
 Document(text='\n\nAWS Batch and Fargate Manual Setup\n\n', doc_id

In [14]:
# Re-chunk the loaded documents so each piece fits within the LLM context limit,
# using the token bounds and token-check flag set in the configuration cell.
# The recorded output shows it groups small documents and separates large ones.
processed_docs = group_split(
    documents=raw_docs,
    min_tokens=min_tokens,
    max_tokens=max_tokens,
    token_check=token_check,
)

Grouping small documents
Separating large documents


In [15]:
# Peek at the first five documents after grouping/splitting.
processed_docs[:5]

[Document(text="\n\nErrors\nThe Kittn API uses the following error codes:\n\n\nError Code | Meaning\n---------- | -------\n400 | Bad Request -- Your request is invalid.\n401 | Unauthorized -- Your API key is wrong.\n403 | Forbidden -- The kitten requested is hidden for administrators only.\n404 | Not Found -- The specified kitten could not be found.\n405 | Method Not Allowed -- You tried to access a kitten with an invalid method.\n406 | Not Acceptable -- You requested a format that isn't json.\n410 | Gone -- The kitten requested has been removed from our servers.\n418 | I'm a teapot.\n429 | Too Many Requests -- You're requesting too many kittens! Slow down!\n500 | Internal Server Error -- We had a problem with our server. Try again later.\n503 | Service Unavailable -- We're temporarily offline for maintenance. Please try again later.\n\n", doc_id=None, embedding=None, extra_info={'title': 'inputs/iai_doc/_errors.md'}),
 Document(text='\n\nAWS Batch and Fargate Manual Setup\n\n', doc_id