In [3]:
# settings.py
import os
import pandas as pd
from os.path import join, dirname
from dotenv import load_dotenv

# Locate and load the development env file that sits in ../config relative to
# this code. `__file__` only exists when the code runs as a script or module;
# a notebook kernel normally does not define it, so fall back to the current
# working directory there instead of failing with NameError.
_base_dir = dirname(__file__) if '__file__' in globals() else os.getcwd()
dotenv_path = join(_base_dir, '../config/dev.env')
load_dotenv(dotenv_path)

import warnings
import openai
# Install a blanket filter that silences every warning category.
warnings.filterwarnings(action='ignore', category=Warning)

## Document Loading

In [12]:
from langchain.document_loaders import CSVLoader
from langchain.vectorstores import DocArrayInMemorySearch
from IPython.display import display, Markdown

In [5]:
# Relative path of the CSV file that will be loaded.
file = './data/dummy_data.csv'
# Loader for that CSV; it is handed to the index creator in a later cell.
loader = CSVLoader(file_path=file)

In [6]:
from langchain.indexes import VectorstoreIndexCreator

In [9]:
# Build a queryable index from the CSV loader, using DocArrayInMemorySearch as
# the vector store class.
# NOTE(review): presumably this calls an embedding service using credentials
# loaded from the env file in the first cell — confirm.
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch
).from_loaders([loader])

In [10]:
# Prompt sent to the index. Adjacent string literals are concatenated by the
# parser, so the explicit trailing space on the first piece keeps "$100000"
# and "in" apart (a backslash continuation inside a single literal drops the
# line break without inserting a space, yielding "$100000in").
query = (
    "Please list all Men who make over $100000 "
    "in a table in markdown and summarize each one."
)

In [13]:
# Ask the index to answer the prompt held in `query`.
# NOTE(review): the recorded answer below lists incomes of 100000 and 60000,
# neither of which is over $100000 — treat the model output as unverified.
response = index.query(query)
# Render the answer as Markdown so that tables in it display as tables.
display(Markdown(response))



| Name | Age | Gender | Income |
| ---- | --- | ------ | ------ |
| Nick | 66  | M      | 100000 |

Nick is a 66 year old man with an income of $100000.

| Name | Age | Gender | Income |
| ---- | --- | ------ | ------ |
| David | 128  | M      | 60000 |

David is a 128 year old man with an income of $60000.

## Document Splitting
### Types of splitters
- CharacterTextSplitter()
- Markdown
- Token
- Sentence
- Recursive
- Language
- NLTKTextSplitter
- SpacyText

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [15]:
# Shared size / overlap settings passed to both splitters built in the next cell.
chunk_size, chunk_overlap = 26, 4

In [16]:
# One splitter of each kind, both configured from the shared chunk_size /
# chunk_overlap values. Note that both names are rebound with different
# settings in a later cell before any text is split.
c_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
r_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

### Recursive splitting 
- RecursiveCharacterTextSplitter is recommended for generic text

In [18]:
# Sample passage used by the splitting demos below. It is assembled from
# adjacent string literals, one per source line of the passage. The last two
# pieces intentionally join without a space ("space.and"), so the text and its
# length match the recorded outputs in this notebook.
some_text = (
    "When writing documents, writers will use document structure to group content. "
    "This can convey to the reader, which idea's are related. For example, closely related ideas "
    "are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  "
    "Paragraphs are often delimited with a carriage return or two carriage returns. "
    'Carriage returns are the "backslash n" you see embedded in this string. '
    "Sentences have a period at the end, but also, have a space."
    "and words are separated by space."
)
# Displayed as the cell result: total number of characters in the passage.
len(some_text)

496

In [19]:
# Rebind both splitters with a 450-character budget and no overlap.
# The recursive splitter walks its separator list in order: paragraph break,
# line break, single space, then the empty string.
r_splitter = RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=0, separators=["\n\n", "\n", " ", ""])
# The character splitter only ever splits on a single space.
c_splitter = CharacterTextSplitter(chunk_size=450, chunk_overlap=0, separator=' ')

In [20]:
# Split the sample passage with the character splitter (single-space
# separator, chunk_size=450); the passage is 496 characters long, so it does
# not fit in one chunk.
c_splitter.split_text(some_text)

['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

In [21]:
# Same passage through the recursive splitter, whose separator list starts
# with the paragraph break "\n\n".
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.",
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [22]:
# Add a sentence-level separator between the line breaks and the single space.
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped literal period; in a plain string "\. " is an invalid escape
# sequence that Python warns about.
# As the recorded output shows, with this pattern the matched ". " ends up at
# the start of the following chunk rather than at the end of the sentence.
# NOTE(review): this relies on separators being treated as regular
# expressions by the installed LangChain version — confirm when upgrading.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"\. ", " ", ""]
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related",
 '. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns',
 '. Carriage returns are the "backslash n" you see embedded in this string',
 '. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [23]:
# Sentence-level separator expressed as a look-behind: split at the position
# just after ". " without consuming it, so (per the recorded output) each
# period stays at the end of its own sentence.
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped literal period; in a plain string it is an invalid escape sequence
# that Python warns about.
# NOTE(review): this relies on separators being treated as regular
# expressions by the installed LangChain version — confirm when upgrading.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""]
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related.",
 'For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns.',
 'Carriage returns are the "backslash n" you see embedded in this string.',
 'Sentences have a period at the end, but also, have a space.and words are separated by space.']

## Token Splitting

In [24]:
from langchain.text_splitter import TokenTextSplitter

In [25]:
# Token-based splitter with the smallest chunk size and no overlap, so the
# split result shows the individual pieces the text is broken into.
text_splitter = TokenTextSplitter(
    chunk_size=1,
    chunk_overlap=0,
)

In [26]:
# Toy input for the token splitter: two short words plus one longer made-up word.
text1 = 'foo bar bazzyfoo'

In [27]:
# Show how the toy text is broken up; in the recorded output "bazzyfoo" comes
# back as several sub-word pieces rather than as one word.
text_splitter.split_text(text1)

['foo', ' bar', ' b', 'az', 'zy', 'foo']