# Document Loaders
- I/O
    - Input: files
    - Output: text string
- Test Files
    - VisionLLM.pdf
    - Random .txt file
    - one .docx with one IELTS Writing Example inside

In [1]:
texts = []

## Loading Files

### Load Files in a Directory

In [1]:
from langchain.document_loaders import DirectoryLoader
import time

In [2]:
directory_path = './test_files_directory'

directory_loader = DirectoryLoader(
    path = directory_path,
    glob = "**/[!.]*",
)

In [3]:
# Test on one .txt one VisionLLM Paper -> 35.1s
start = time.time()
docs_from_directory = directory_loader.load()
end = time.time()
print(end-start)

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with another strategy.
Falling back to partitioning with ocr_only.


36.846291303634644


In [5]:
print(len(str(docs_from_directory[1])))

78041


In [6]:
'aligns the definitions of vision-centric tasks with the ' in str(docs_from_directory[1])

True

In [9]:
for doc in docs_from_directory:
    texts.append(str(doc))

In [12]:
with open('1.txt', 'w') as f:
    f.write(str(docs_from_directory[1]))

### Load Content from Web Link
- Using Unstructured Warpped by Langchain
- Using requests & beautifulsoup4

In [2]:
web_url = ['https://en.wikipedia.org/wiki/Pok%C3%A9mon']

#### Using Langchain Unstructured
- One Wikipedia Link above, takes 0.57s
- A lot of \n\n in here (might because of \n\n)

In [14]:
from langchain.document_loaders import UnstructuredURLLoader
import time

In [15]:
url_loader = UnstructuredURLLoader(
    urls = web_url,
    
)

In [16]:
start = time.time()
url_data = url_loader.load()
end = time.time()
print(end-start)

1.5237431526184082


In [17]:
with open('1.txt', 'w') as f:
    f.write(str(url_data[0]))

In [18]:
print(type(url_data[0]))

<class 'langchain.schema.Document'>


In [19]:
'The franchise began as Pocket Monsters: Red and Green' in str(url_data[0])

False

#### Using requests + beautifulsoup4
- takes 0.61 but with filter

In [3]:
import requests
from bs4 import BeautifulSoup
import time

def get_text_content_from_link(url):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the desired elements and extract their text content
        text_content = ''
        for element in soup.find_all('p'):  # Example: Extract text from all <p> tags
            text_content += element.get_text() + '\n'

        return text_content

    else:
        print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
        return None

In [4]:
start = time.time()
text_web = ''
for i in range(len(web_url)):
    try:
        web_content = get_text_content_from_link(web_url[i])
        lines = web_content.split('\n')
        for line in lines:
            if len(line) > 100:
                web_content += line + '\n'
        text_web += ''.join(['',f'[this text is from url{i}]', '\n', web_content])
        print(i)
    except:
        pass
end = time.time()
print(end-start)

0
0.5877127647399902


In [5]:
'The franchise began as Pocket Monsters: Red and Green' in text_web

True

In [5]:
texts.append(text_web)

In [21]:
with open('1.txt', 'w') as f:
    f.write(text_web)

### Load .pdf File
- Using PyPDF2 -> much faster than the Langchain

In [6]:
import PyPDF2
import time

In [7]:
# It opens the file in binary mode, extract the text from each page
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ""
        for i in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[i]
            text += page.extract_text()
        return text

In [8]:
start = time.time()
pdf_path = './test_files_directory/VisionLLM - Large Language Model is also an open-Ended Decoder for Vision-Centric Tasks.pdf'
pdf_text = extract_text_from_pdf(pdf_path)
end = time.time()
print(end-start)

0.3291749954223633


In [9]:
texts.append(''.join(['[from pdf0]\n', pdf_text]))

In [11]:
'aligns the definitions of vision-centric' in pdf_text

True

In [23]:
with open('1.txt', 'w') as f:
    f.write(pdf_text)

### Load .doc Files
- Takes 0.005s

In [10]:
import docx
import time

In [11]:
def extract_text_from_doc(file_path):
    doc = docx.Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text
    return text

In [12]:
doc_file_path = './test_files_directory/test.docx'

start = time.time()
text_docx = extract_text_from_doc(doc_file_path)
end = time.time()
print(end - start)

0.0028154850006103516


In [13]:
'Nowadays, rural-urban migration has become an irreversible trend globally' in text_docx

True

In [14]:
texts.append(''.join(['[from docx0]\n', text_docx]))

## Save those text to .txt for next step

In [15]:
print(len(texts))

3


In [16]:
texts

 '[from pdf0]\nVisionLLM: Large Language Model is also\nan Open-Ended Decoder for Vision-Centric Tasks\nWenhai Wang∗1, Zhe Chen∗2,1, Xiaokang Chen∗3,1, Jiannan Wu∗4,1, Xizhou Zhu5,1\nGang Zeng3, Ping Luo4,1, Tong Lu2, Jie Zhou6, Yu Qiao1, Jifeng Dai†6,1\n1OpenGVLab, Shanghai AI Laboratory2Nanjing University3Peking University\n4The University of HongKong5SenseTime Research6Tsinghua University\nCode: https://github.com/OpenGVLab/VisionLLM\nDemo: https://github.com/OpenGVLab/InternGPT\nAbstract\nLarge language models (LLMs) have notably accelerated progress towards artificial\ngeneral intelligence (AGI), with their impressive zero-shot capacity for user-tailored\ntasks, endowing them with immense potential across a range of applications.\nHowever, in the field of computer vision, despite the availability of numerous\npowerful vision foundation models (VFMs), they are still restricted to tasks in a\npre-defined form, struggling to match the open-ended task capabilities of LLMs.\nIn this wo

In [17]:
with open('text.txt', 'w') as f:
    f.write('++-------------------++\n'.join(texts))