In [25]:
%%capture

!pip install elasticsearch==7.14.0
!apt install default-jdk > /dev/null
try:
  import os
  import elasticsearch
  from elasticsearch import Elasticsearch
  import numpy as np
  import pandas as pd
  import sys
  import json
  from ast import literal_eval
  from tqdm import tqdm
  import datetime
  from elasticsearch import helpers

except Exception as e:
  print(f"error: {e}")

# Download & extract Elasticsearch 7.0.0

!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.0.0-linux-x86_64.tar.gz -q
!tar -xzf elasticsearch-7.0.0-linux-x86_64.tar.gz
!chown -R daemon:daemon elasticsearch-7.0.0

# Creating daemon instance of elasticsearch
import os
from subprocess import Popen, PIPE, STDOUT
es_server = Popen(['elasticsearch-7.0.0/bin/elasticsearch'],
                  stdout=PIPE, stderr=STDOUT,
                  preexec_fn=lambda: os.setuid(1)  # as daemon
                 )

# This part is important, since it takes a little amount of time for instance to load
import time
time.sleep(20)


In [26]:
%%bash
# If you get 1 root & 2 daemon process then Elasticsearch instance has started successfully
ps -ef | grep elasticsearch

daemon       957     423  1 22:09 ?        00:00:55 /content/elasticsearch-7.0.0/jdk/bin/java -Xms1g
daemon      1043     957  0 22:09 ?        00:00:00 /content/elasticsearch-7.0.0/modules/x-pack-ml/p
root       15036   15034  0 23:07 ?        00:00:00 grep elasticsearch


In [27]:
# Check if elasticsearch is running
!curl -sX GET "localhost:9200/"

{
  "name" : "6b34e2123210",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "6l1WLjtCRgKgalnkfkwHjg",
  "version" : {
    "number" : "7.0.0",
    "build_flavor" : "default",
    "build_type" : "tar",
    "build_hash" : "b7e28a7",
    "build_date" : "2019-04-05T22:55:32.697037Z",
    "build_snapshot" : false,
    "lucene_version" : "8.0.0",
    "minimum_wire_compatibility_version" : "6.7.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [28]:
es = Elasticsearch(hosts = [{"host":"localhost", "port":9200}])
# Check if python is connected to elasticsearch
if es.ping():
    print("Connected to Elasticsearch")
else:
    print("Connection failed")

Connected to Elasticsearch


In [18]:
query="受持菩萨戒之前需要先受持别解脱戒吗？"

document = """
无著承许受愿心，无需别解脱戒律，
然正受前受七戒，上师询问其违缘，
弟子承诺学处等，以愿行各仪轨受。
无著菩萨承许，受愿菩提心戒时无须先受别解脱戒，而受行菩提心戒时，必须受七种别解脱戒中的任一者，上师会询问是否具违缘等，弟子在上师面前承诺：从今以后愿意受持所需守持的菩萨学处。承诺守持学处以后，按照愿菩提心和行菩提心各自的仪轨进行受持。
无著菩萨的观点：仅仅受愿菩提心戒者，不必先受别解脱戒。但想受行菩提心戒者，在真实受戒前，首先必须受七种别解脱戒。阿底峡尊者在《道灯论》中说："别解脱戒律，恒具七种人，菩萨戒有缘，其余不可受。"受了别解脱戒的人，才有缘受持菩萨戒，其他人不可以受。《大圆满心性休息大车疏》中说，实际按照《道灯论自释》的观点，其他人也可以受，只不过别解脱戒中的一条学处都不能守持的人，没有学菩萨戒的缘分。究竟来讲，龙猛菩萨和无著菩萨的观点无有相违。
受戒方式必须依靠仪轨，以断除厌离轮回及贪执寂灭之心、对远离二边的菩提心生起喜悦之情这三种教言改造自心。
加行：在殊胜对境前供曼茶罗，诚心祈祷，皈依殊胜所依，以殊胜方便积累资粮。
"""

subtitle="""
菩萨戒不同于别解脱戒别解脱戒只有人可以受持人以外的众生则不能得到别解脱戒的戒体而菩萨戒在受戒者种类方面没有什么限制只要对大乘佛法有信心具有菩提心愿意受菩萨戒任何众生都可以受关于受菩萨戒龙树菩萨和无著菩萨的传承与戒条有些不同按照无著菩萨的要求只有在别解脱戒的基础上才能受菩萨戒而龙树菩萨的传承却没有这样的要求无论如何二者的实质内容是一样的无著菩萨所说的别解脱戒的意思不一定是指别解脱戒的真实戒体而是说必须按照别解脱戒的要求去做断除杀盗淫妄酒等等如果一条戒都不能守而随意杀人偷盗就没有办法受持菩萨戒所以在受菩萨戒之前如果有居士戒那就很完整即使没有受居士戒也不成问题因为受菩萨戒的时候同样也受了不杀生不偷盗等戒条所以没有太大差别最理想的次第是先受居士戒然后精进修持菩提心在自认自己有菩提心时再受持菩萨戒在菩萨戒的基础上再受持密宗誓言——灌顶
"""

In [24]:
from elasticsearch import Elasticsearch
import jieba

# 连接到Elasticsearch
es = Elasticsearch(["http://localhost:9200"])

# 创建索引和映射
index_name = "buddhism_texts"
es.indices.create(index=index_name, ignore=400)
es.indices.put_mapping(index=index_name, body={
    "properties": {
        "content": {
            "type": "text",
            # "analyzer": "ik_max_word",
            "analyzer": "standard",
            "search_analyzer": "ik_smart",
        }
    }
})

# 索引文档
def index_document(content, doc_id):
    es.index(index=index_name, id=doc_id, body={"content": content})

# 分词函数
def tokenize(text):
    return list(jieba.cut(text))

# 搜索函数
def search(query):
    response = es.search(index=index_name, body={
        "query": {
            "match": {
                "content": query
            }
        },
        "highlight": {
            "fields": {
                "content": {}
            }
        }
    })
    return response['hits']['hits']

# 索引测试文档
index_document(document, "doc1")
index_document(subtitle, "doc2")

# 执行搜索
query = "受持菩萨戒之前需要先受持别解脱戒吗？"
results = search(query)

# 输出结果
for hit in results:
    print(f"Score: {hit['_score']}")
    print(f"Content: {hit['_source']['content'][:100]}...")
    print("Highlights:", hit.get('highlight', {}).get('content', []))
    print()

# 句子级别评分
sentences = document.split("。") + subtitle.split("。")
for i, sentence in enumerate(sentences):
    index_document(sentence, f"sent{i}")

sentence_results = search(query)
print("Sentence level scoring:")
for hit in sentence_results[:5]:  # 显示前5个结果
    print(f"Score: {hit['_score']}")
    print(f"Sentence: {hit['_source']['content']}")
    print()

# 短语级别评分
phrases = [" ".join(tokens) for tokens in jieba.cut(document + subtitle)]
for i, phrase in enumerate(phrases):
    index_document(phrase, f"phrase{i}")

phrase_results = search(query)
print("Phrase level scoring:")
for hit in phrase_results[:5]:  # 显示前5个结果
    print(f"Score: {hit['_score']}")
    print(f"Phrase: {hit['_source']['content']}")
    print()


Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...


Sentence level scoring:


Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.852 seconds.
DEBUG:jieba:Loading model cost 0.852 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


Phrase level scoring:
Score: 16.248829
Phrase: "受了别解脱戒的人，才有缘受持菩萨戒，其他人不可以受

Score: 15.124655
Phrase: 受 持

Score: 15.124655
Phrase: 受 持

Score: 15.124655
Phrase: 受 持

Score: 15.124655
Phrase: 受 持

