In [None]:
# !pip install langchain openai tiktoken chromadb pypdf sentence_transformers InstructorEmbedding faiss-cpu

# PDF

In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

root_dir = "D:/Papers"

# 加载目录下全部 PDF
# loader = DirectoryLoader(f'{root_dir}/', glob="./*.pdf", loader_cls=PyPDFLoader)
# documents = loader.load()

# 单个PDF
filename = "2201.11903_CoT"
loader = PyPDFLoader(f"{root_dir}/{filename}.pdf")
documents = loader.load()

# HTML

In [None]:
from langchain.document_loaders import BSHTMLLoader, DirectoryLoader

root_dir = "UE5.2_Docs\Animating Characters and Objects"
loader = DirectoryLoader(f'{root_dir}/', glob="./*.html", loader_cls=BSHTMLLoader, loader_kwargs={"open_encoding": 'utf-8'})

documents = loader.load()
documents[0]

In [26]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Markdown - UE

In [2]:
import re
import logging
from typing import Dict, List, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)


class UEDocs_HTMLLoader(BaseLoader):
    """Loader that uses beautiful soup to parse Unreal HTML files."""

    def __init__(
        self,
        file_path: str,
        open_encoding: Union[str, None] = None,
        bs_kwargs: Union[dict, None] = None,
        get_text_separator: str = "",
    ) -> None:
        """Initialise with path, and optionally, file encoding to use, and any kwargs
        to pass to the BeautifulSoup object.

        Args:
            file_path: The path to the file to load.
            open_encoding: The encoding to use when opening the file.
            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
            get_text_separator: The separator to use when calling get_text on the soup.
        """
        try:
            import bs4  # noqa:F401
        except ImportError:
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            )
        try:
            import markdownify  # noqa:F401
        except ImportError:
            raise ImportError(
                "markdownify package not found, please install it with "
                "`pip install markdownify`"
            )

        self.file_path = file_path
        self.open_encoding = open_encoding
        if bs_kwargs is None:
            bs_kwargs = {"features": "lxml"}
        self.bs_kwargs = bs_kwargs
        self.get_text_separator = get_text_separator

    def load(self) -> List[Document]:
        """Load HTML document into document objects."""
        from bs4 import BeautifulSoup
        import markdownify

        with open(self.file_path, "r", encoding=self.open_encoding) as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)

        content_div = soup.find('div', id='contentContainer')
        md = markdownify.markdownify(content_div.encode_contents(), heading_style='atx')

        cleaned_text = re.sub(r'^\s+', '', str(md)) # 去掉开头空白
        cleaned_text = re.sub(r'\n{2,}', r'\n\n', cleaned_text) # 限制连续换行
        cleaned_text = re.sub(r'\n +', r'\n', cleaned_text) # 去除行首空格
        cleaned_text = re.sub(r' +', r' ', cleaned_text) # 去除重复空格

        if soup.title:
            title = str(soup.title.string)
        else:
            title = ""

        metadata: Dict[str, Union[str, None]] = {
            "source": self.file_path,
            "title": title,
        }
        return [Document(page_content=cleaned_text, metadata=metadata)]

# loader = UEDocs_HTMLLoader('UE5.2_Docs\Animating Characters and Objects\index.html')
# documents = loader.load()
# documents[0]

In [3]:
from langchain.document_loaders import DirectoryLoader

root_dir = "UE5.2_Docs\Animating Characters and Objects"
loader = DirectoryLoader(f'{root_dir}/', glob="./*.html", loader_cls=UEDocs_HTMLLoader, loader_kwargs={"open_encoding": 'utf-8'})

documents = loader.load()
len(documents), documents[0]



(6,
 Document(page_content="![](./../../Images/animating-characters-and-objects/BannerImage.png) \n\n# Animating Characters and Objects\n\n## Explore Unreal Engine's animation tools and editors for working with 2D and 3D characters and objects.\n\n\n\nOn this page\n\n\n* [Skeletal Mesh Animation](#skeletalmeshanimation)\n* [Sequencer](#sequencer)\n* [Control Rig](#controlrig)\n* [Paper 2D](#paper2d)\n\nYou can use **Unreal Engine**'s suite of powerful animation tools and editors to create character and object runtime animation systems, rendered cinematic content, and author new animation content directly in the engine.\n\n## Skeletal Mesh Animation\n\nWith the [Skeletal Mesh Animation System](../skeletal-mesh-animation-system-in-unreal-engine), you can create robust animation systems for characters and objects within Unreal Engine. After importing a skinned mesh object as a [Skeletal Mesh asset](../skeletal-mesh-assets-in-unreal-engine), you can manage its properties and build logic to

In [4]:
from langchain.text_splitter import MarkdownTextSplitter

text_splitter = MarkdownTextSplitter(chunk_size=2000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# embeddings

* 中文：**shibing624/text2vec-base-chinese** / **shibing624/text2vec-base-chinese-paraphrase** / **moka-ai/m3e-base**
* 英文：**hkunlp/instructor-xl**l

In [5]:
len(texts), texts[0]

(62,
 Document(page_content="![](./../../Images/animating-characters-and-objects/BannerImage.png) \n\n# Animating Characters and Objects\n\n## Explore Unreal Engine's animation tools and editors for working with 2D and 3D characters and objects.\n\n\n\nOn this page\n\n\n* [Skeletal Mesh Animation](#skeletalmeshanimation)\n* [Sequencer](#sequencer)\n* [Control Rig](#controlrig)\n* [Paper 2D](#paper2d)\n\nYou can use **Unreal Engine**'s suite of powerful animation tools and editors to create character and object runtime animation systems, rendered cinematic content, and author new animation content directly in the engine.\n\n## Skeletal Mesh Animation\n\nWith the [Skeletal Mesh Animation System](../skeletal-mesh-animation-system-in-unreal-engine), you can create robust animation systems for characters and objects within Unreal Engine. After importing a skinned mesh object as a [Skeletal Mesh asset](../skeletal-mesh-assets-in-unreal-engine), you can manage its properties and build logic t

In [1]:
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings

# 英文
embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl", model_kwargs={"device": "cuda"})

# 中文
# embeddings = HuggingFaceEmbeddings(model_name="shibing624/text2vec-base-chinese", model_kwargs={"device": "cuda"})
# embeddings = HuggingFaceEmbeddings(model_name="moka-ai/m3e-base", model_kwargs={"device": "cuda"})

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


In [2]:
from langchain.vectorstores import FAISS

# add
# db = FAISS.from_documents(texts, embeddings)
# db.save_local("faiss_index")

# load
db = FAISS.load_local("faiss_index", embeddings)

In [76]:
from langchain.vectorstores import Chroma

# add
# db = Chroma.from_documents(texts, embeddings, persist_directory="./chroma_db")

# load
db = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)

In [3]:
query = "How to Animation Skeletal Characters object"
docs = db.similarity_search(query, k=10)
docs[:3]

[Document(page_content="![](./../../Images/animating-characters-and-objects/BannerImage.png) \n\n# Animating Characters and Objects\n\n## Explore Unreal Engine's animation tools and editors for working with 2D and 3D characters and objects.\n\n\n\nOn this page\n\n\n* [Skeletal Mesh Animation](#skeletalmeshanimation)\n* [Sequencer](#sequencer)\n* [Control Rig](#controlrig)\n* [Paper 2D](#paper2d)\n\nYou can use **Unreal Engine**'s suite of powerful animation tools and editors to create character and object runtime animation systems, rendered cinematic content, and author new animation content directly in the engine.\n\n## Skeletal Mesh Animation\n\nWith the [Skeletal Mesh Animation System](../skeletal-mesh-animation-system-in-unreal-engine), you can create robust animation systems for characters and objects within Unreal Engine. After importing a skinned mesh object as a [Skeletal Mesh asset](../skeletal-mesh-assets-in-unreal-engine), you can manage its properties and build logic to run

# Real World Embeddings

### Metadata

This dataset is a mirror of the original ArXiv data. Because the full dataset is rather large (1.1TB and growing), this dataset provides only a metadata file in the  `json`  format. This file contains an entry for each paper, containing:

-   `id`: ArXiv ID (can be used to access the paper, see below)
-   `submitter`: Who submitted the paper
-   `authors`: Authors of the paper
-   `title`: Title of the paper
-   `comments`: Additional info, such as number of pages and figures
-   `journal-ref`: Information about the journal the paper was published in
-   `doi`: [[https://www.doi.org](Digital](https://www.doi.org]%28digital/)  Object Identifier)
-   `abstract`: The abstract of the paper
-   `categories`: Categories / tags in the ArXiv system
-   `versions`: A version history

You can access each paper directly on  [ArXiv](https://arxiv.org/)  using these links:

-   `https://arxiv.org/abs/{id}`: Page for this paper including its abstract and further links
-   `https://arxiv.org/pdf/{id}`: Direct link to download the PDFdownload the PDF

In [11]:
import pandas as pd
from tqdm.autonotebook import tqdm

# from https://www.kaggle.com/datasets/Cornell-University/arxiv
filename = "./arxiv_datasets/arxiv-metadata-oai-snapshot.json"
fullsize = 2_292_057
batchsize = int(2_292_057/100)+1

# df = pd.read_json(filename, lines=True)
df = pd.concat([chunk for chunk in tqdm(pd.read_json(filename, chunksize=batchsize, lines=True), desc='Loading', total=int(fullsize/batchsize)+1)])
df

Loading:   0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2292052,supr-con/9608008,Ruslan Prozorov,"R. Prozorov, M. Konczykowski, B. Schmidt, Y. Y...",On the origin of the irreversibility line in t...,"19 pages, LaTex, 6 PostScript figures; Author'...",,10.1103/PhysRevB.54.15530,,supr-con cond-mat.supr-con,,We report on measurements of the angular dep...,"[{'version': 'v1', 'created': 'Mon, 26 Aug 199...",2009-10-30,"[[Prozorov, R., ], [Konczykowski, M., ], [Schm..."
2292053,supr-con/9609001,Durga P. Choudhury,"Durga P. Choudhury, Balam A. Willemsen, John S...",Nonlinear Response of HTSC Thin Film Microwave...,"4 pages, LaTeX type, Uses IEEE style files, 60...",,10.1109/77.620744,,supr-con cond-mat.supr-con,,The non-linear microwave surface impedance o...,"[{'version': 'v1', 'created': 'Sat, 31 Aug 199...",2016-11-18,"[[Choudhury, Durga P., , Physics Department, N..."
2292054,supr-con/9609002,Durga P. Choudhury,"Balam A. Willemsen, J. S. Derov and S.Sridhar ...",Critical State Flux Penetration and Linear Mic...,"20 pages, LaTeX type, Uses REVTeX style files,...",,10.1103/PhysRevB.56.11989,,supr-con cond-mat.supr-con,,The vortex contribution to the dc field (H) ...,"[{'version': 'v1', 'created': 'Tue, 3 Sep 1996...",2009-10-30,"[[Willemsen, Balam A., , Physics Department,\n..."
2292055,supr-con/9609003,Hasegawa Yasumasa,Yasumasa Hasegawa (Himeji Institute of Technol...,Density of States and NMR Relaxation Rate in A...,"7 pages, 4 PostScript Figures, LaTeX, to appea...",,10.1143/JPSJ.65.3131,,supr-con cond-mat.supr-con,,We show that the density of states in an ani...,"[{'version': 'v1', 'created': 'Wed, 18 Sep 199...",2009-10-30,"[[Hasegawa, Yasumasa, , Himeji Institute of Te..."


In [16]:
tqdm.pandas()

sdf = df[df['update_date'] > '2023-01-01']
sdf = sdf[sdf.categories.str[:3] == 'cs.']
sdf

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
55706,803.3946,Adam Smith,Shiva Prasad Kasiviswanathan and Adam Smith,On the `Semantics' of Differential Privacy: A ...,"Older version of this paper was titled: ""A Not...","Journal of Privacy and Confidentiality, 6 (1),...",10.29012/jpc.v6i1.634,,cs.CR cs.DB,http://arxiv.org/licenses/nonexclusive-distrib...,"Differential privacy is a definition of ""pri...","[{'version': 'v1', 'created': 'Thu, 27 Mar 200...",2023-01-24,"[[Kasiviswanathan, Shiva Prasad, ], [Smith, Ad..."
104832,901.359,Chunhua Shen,Chunhua Shen and Hanxi Li,On the Dual Formulation of Boosting Algorithms,Fixed typos. 16 pages. Published in IEEE Trans...,,10.1109/TPAMI.2010.47,,cs.LG cs.CV,http://creativecommons.org/licenses/by-nc-nd/4.0/,We study boosting algorithms from a new pers...,"[{'version': 'v1', 'created': 'Fri, 23 Jan 200...",2023-05-30,"[[Shen, Chunhua, ], [Li, Hanxi, ]]"
135819,907.3654,J\'er\^ome Gauthier,"Jerome Gauthier, Laurent Duval and Jean-Christ...",Optimization of Synthesis Oversampled Complex ...,,"IEEE Transactions on Signal Processing, Octobe...",10.1109/TSP.2009.2023947,,cs.IT cs.SY eess.SY math.IT math.OC,http://arxiv.org/licenses/nonexclusive-distrib...,An important issue with oversampled FIR anal...,"[{'version': 'v1', 'created': 'Tue, 21 Jul 200...",2023-01-19,"[[Gauthier, Jerome, ], [Duval, Laurent, ], [Pe..."
169585,1001.4297,Andrew Straw,"Andrew D. Straw, Kristin Branson, Titus R. Neu...",Multi-camera Realtime 3D Tracking of Multiple ...,pdfTeX using libpoppler 3.141592-1.40.3-2.2 (W...,,10.1098/rsif.2010.0230,,cs.CV,http://arxiv.org/licenses/nonexclusive-distrib...,Automated tracking of animal movement allows...,"[{'version': 'v1', 'created': 'Mon, 25 Jan 201...",2023-02-01,"[[Straw, Andrew D., ], [Branson, Kristin, ], [..."
185642,1004.3702,Lizhi Du,Lizhi Du,A Polynomial time Algorithm for Hamilton Cycle...,"16 pages. This time, I add a detailed polynomi...",,,,cs.DS,http://arxiv.org/licenses/nonexclusive-distrib...,Based on the famous Rotation-Extension techn...,"[{'version': 'v1', 'created': 'Mon, 12 Apr 201...",2023-07-11,"[[Du, Lizhi, ]]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1877941,2307.06947,Syed Talal Wasim,"Syed Talal Wasim, Muhammad Uzair Khattak, Muza...",Video-FocalNets: Spatio-Temporal Focal Modulat...,Project page: https://TalalWasim.github.io/Vid...,,,,cs.CV cs.AI,http://creativecommons.org/licenses/by-nc-sa/4.0/,Recent video recognition models utilize Tran...,"[{'version': 'v1', 'created': 'Thu, 13 Jul 202...",2023-07-14,"[[Wasim, Syed Talal, ], [Khattak, Muhammad Uza..."
1877942,2307.06948,Muzammal Naseer,"Muhammad Uzair Khattak, Syed Talal Wasim, Muza...",Self-regulating Prompts: Foundational Model Ad...,Project page: https://muzairkhattak.github.io/...,,,,cs.CV,http://creativecommons.org/licenses/by/4.0/,Prompt learning has emerged as an efficient ...,"[{'version': 'v1', 'created': 'Thu, 13 Jul 202...",2023-07-14,"[[Khattak, Muhammad Uzair, ], [Wasim, Syed Tal..."
1877943,2307.06949,Nataniel Ruiz,"Nataniel Ruiz, Yuanzhen Li, Varun Jampani, Wei...",HyperDreamBooth: HyperNetworks for Fast Person...,project page: https://hyperdreambooth.github.io,,,,cs.CV cs.AI cs.GR cs.LG,http://creativecommons.org/licenses/by/4.0/,Personalization has emerged as a prominent a...,"[{'version': 'v1', 'created': 'Thu, 13 Jul 202...",2023-07-14,"[[Ruiz, Nataniel, ], [Li, Yuanzhen, ], [Jampan..."
2041962,cs/0508048,Olivier Danvy,Malgorzata Biernacka and Dariusz Biernacki and...,An Operational Foundation for Delimited Contin...,39 pages,"Logical Methods in Computer Science, Volume 1,...",10.2168/LMCS-1(2:5)2005,,cs.LO cs.PL,,We present an abstract machine and a reducti...,"[{'version': 'v1', 'created': 'Mon, 8 Aug 2005...",2023-06-27,"[[Biernacka, Malgorzata, ], [Biernacki, Darius..."


In [26]:
from langchain.docstore.document import Document

documents = []
for _, row in sdf.head(20).iterrows():
    text = "# {}\n{}\n".format(row.title, row.abstract)
    row.pop('abstract')
    row.pop('comments')
    row.pop('versions')
    documents.append(Document(page_content=text, metadata=row.to_dict()))

documents[0].metadata

{'id': 803.3946,
 'submitter': 'Adam Smith',
 'authors': 'Shiva Prasad Kasiviswanathan and Adam Smith',
 'title': "On the `Semantics' of Differential Privacy: A Bayesian Formulation",
 'journal-ref': 'Journal of Privacy and Confidentiality, 6 (1), 2014',
 'doi': '10.29012/jpc.v6i1.634',
 'report-no': None,
 'categories': 'cs.CR cs.DB',
 'license': 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/',
 'update_date': '2023-01-24',
 'authors_parsed': [['Kasiviswanathan', 'Shiva Prasad', ''],
  ['Smith', 'Adam', '']]}

In [None]:
from langchain.vectorstores import FAISS

# add
db = FAISS.from_documents(texts, embeddings)
db.save_local("./arxiv_datasets/arxiv_faiss")