Embedding 构建向量

In [1]:
import { TextLoader } from "langchain/document_loaders/fs/text";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { OpenAIEmbeddings } from "@langchain/openai";
import { load } from "dotenv";
// Load configuration via dotenv's `load()` (presumably from a local .env file —
// confirm) and expose it through a local `process.env` shim so settings can be
// read as `process.env.<name>` below.
var env = await load();
var process = { env };

// Read the source text and split it into chunks (chunkSize 100, chunkOverlap 20).
var loader = new TextLoader("data/kong.txt");
var docs = await loader.load();
var splitter = new RecursiveCharacterTextSplitter({
    chunkSize: 100,
    chunkOverlap: 20,
});
var splitDocs = await splitter.splitDocuments(docs);

// Embedding client; only the API base URL is overridden, taken from the loaded env.
var embeddings = new OpenAIEmbeddings({
    configuration: { baseURL: process.env.baseURL },
});

// Print the first chunk, then the embedding vector computed from its text.
var [firstChunk] = splitDocs;
console.log(firstChunk);
var res = await embeddings.embedQuery(firstChunk.pageContent);
console.log(res);


Document {
  pageContent: "鲁镇的酒店的格局，是和别处不同的：都是当街一个曲尺形的大柜台，柜里面预备着热水，可以随时温酒。做工的人，傍午傍晚散了工，每每花四文铜钱，买一碗酒，——这是二十多年前的事，现在每碗要涨到十文，——靠柜外",
  metadata: { source: "data/kong.txt", loc: { lines: { from: 1, to: 1 } } }
}
[
   0.017491372,   0.0005632728,   0.015193834,  -0.021402655,  -0.0067592734,
  -0.010072241,    -0.02246937, -0.0057780333, -0.0074191317,   -0.030196207,
  -0.009265367,    0.021703525,  -0.013046731,  0.0034223737,   -0.013190327,
   0.018393977,   0.0064925947,   0.010291054,   0.027939698,   -0.012766377,
   -0.03142703,    0.010933817,  0.0023607882, -0.0059113726,   -0.026120814,
   0.018489707,    0.020171832,  -0.010680814,   -0.00170093,     0.01595968,
  0.0001302408,   0.0025163507,   -0.02992269,   -0.01595968,   -0.005118175,
  -0.002066758, -0.00013088186, -0.0028599557,   0.016260548,  -0.0056652077,
  0.0004542936,   -0.007535376,  0.0149203185,   0.013839928,   -0.014140797,
   0.019583773,    0.006119929,   0.013491195,  -0.014017714,  -0.0010188485,
   0.00663

MemoryVectorStore 向量存储

In [2]:
import { MemoryVectorStore } from "langchain/vectorstores/memory";
// Create an in-memory vector store backed by the embedding client and add
// every split chunk to it.
var vectorstore = new MemoryVectorStore(embeddings);
await vectorstore.addDocuments(splitDocs);

// For each query, return the two most similar text chunks.
var topK = 2;
var retriever = vectorstore.asRetriever(topK);

var question = '茴香豆是做什么用的';
var res = await retriever.invoke(question);
console.log(res);


[
  Document {
    pageContent: "有喝酒的人便都看着他笑，有的叫道，“孔乙己，你脸上又添上新伤疤了！”他不回答，对柜里说，“温两碗酒，要一碟茴香豆。”便排出九文大钱。他们又故意的高声嚷道，“你一定又偷了人家的东西了！”孔乙己睁大眼睛说",
    metadata: { source: "data/kong.txt", loc: { lines: { from: 7, to: 7 } } }
  },
  Document {
    pageContent: "有几回，邻居孩子听得笑声，也赶热闹，围住了孔乙己。他便给他们一人一颗。孩子吃完豆，仍然不散，眼睛都望着碟子。孔乙己着了慌，伸开五指将碟子罩住，弯腰下去说道，“不多了，我已经不多了。”直起身又看一看豆",
    metadata: { source: "data/kong.txt", loc: { lines: { from: 15, to: 15 } } }
  }
]


In [3]:
// When the question is very terse and contains none of the matching keywords,
// the retriever can return the wrong passages.
var question = '下酒菜一般是什么？';
var res = await retriever.invoke(question);
console.log(res);

[
  Document {
    pageContent: "顾客，多是短衣帮，大抵没有这样阔绰。只有穿长衫的，才踱进店面隔壁的房子里，要酒要菜，慢慢地坐喝。",
    metadata: { source: "data/kong.txt", loc: { lines: { from: 1, to: 1 } } }
  },
  Document {
    pageContent: "有喝酒的人便都看着他笑，有的叫道，“孔乙己，你脸上又添上新伤疤了！”他不回答，对柜里说，“温两碗酒，要一碟茴香豆。”便排出九文大钱。他们又故意的高声嚷道，“你一定又偷了人家的东西了！”孔乙己睁大眼睛说",
    metadata: { source: "data/kong.txt", loc: { lines: { from: 7, to: 7 } } }
  }
]


In [1]:
// Questions that need several layers of semantic understanding before they
// connect to the text are harder to predict.
// NOTE: `retriever` is not defined in this cell; it must already exist from an
// earlier cell in the same kernel session, otherwise this throws ReferenceError.
var question = "孔乙己用什么谋生？";
var res = await retriever.invoke(question);
console.log(res);

ReferenceError: retriever is not defined