In [1]:
import os
import re
from collections import Counter

from bs4 import BeautifulSoup

In [2]:
def text_strip(input_text):
    """Normalise text for substring matching.

    Args:
        input_text: the string to normalise.

    Returns:
        The lower-cased string with every soft hyphen (U+00AD) removed.
    """
    # The soft hyphen is an invisible hyphenation hint. It is written as an
    # explicit escape so that it cannot be silently lost when the source is
    # edited or copy/pasted.
    return input_text.lower().replace("\u00ad", "")

In [3]:
def parse_html_dom(input_html_content):
    """Parse one HTML document into the plain dict used by the search code.

    Args:
        input_html_content: HTML markup, parsed by BeautifulSoup with the
            "html5lib" parser.

    Returns:
        A dict with:
          - "paragraphs": one entry per <p> tag, each a dict holding "text"
            (the tag text passed through text_strip) and "ids" (a list of
            {"class": ..., "id": ...} dicts, one per <a> inside the paragraph
            that carries both attributes).
          - "class" / "id": attributes of the last <section> tag in the
            document; both keys are absent when there is no <section>.
          - "title": text of the last <h1> tag; absent when there is none.
    """
    soup = BeautifulSoup(input_html_content, "html5lib")

    paragraphs = []
    # find_all is the supported bs4 name; findAll is a deprecated alias.
    for tag in soup.find_all('p'):
        paragraph_entry = {"text": text_strip(tag.text), "ids": []}

        for a in tag.find_all('a'):
            # Only anchors that have both an id and a class are recorded.
            if (a.get('id') is None) or (a.get('class') is None):
                continue
            paragraph_entry["ids"].append({'class': a.get('class'), 'id': a.get('id')})

        paragraphs.append(paragraph_entry)

    document = {"paragraphs": paragraphs}

    # Each iteration overwrites the previous one, so with several <section>
    # or <h1> tags the last one in document order wins.
    for section in soup.find_all('section'):
        document["class"] = section.get('class')
        document["id"] = section.get('id')

    for h1 in soup.find_all('h1'):
        document["title"] = h1.text

    return document

In [4]:
def search_document(query, document):
    """Score one loaded document against a query using substring matching.

    Args:
        query: dict with either a "keyword_list" (list of strings) or an
            "alternatives_list" (list of {"label", "forms"} dicts). When both
            keys are present only "keyword_list" is used.
        document: dict with "file_absolute_path" and a "dom" dict holding
            "paragraphs" (each with "text" and optionally "ids") and
            optionally "id" and "title".

    Returns:
        An empty list when nothing matched, otherwise a one-element list with
        the document-level result. That result embeds one entry per matching
        paragraph under "paragraphs". Every score is the number of hits.
    """
    paragraph_hits = []
    document_catch = []

    for para in document["dom"]["paragraphs"]:
        if "keyword_list" in query:
            # Each keyword found in the paragraph text counts as one hit.
            para_catch = [kw for kw in query["keyword_list"] if kw in para["text"]]
            document_catch.extend(para_catch)
        elif "alternatives_list" in query:
            # Each matching form counts as one hit, recorded with its label.
            para_catch = [
                {"label": alt["label"], "form": form}
                for alt in query["alternatives_list"]
                for form in alt["forms"]
                if form in para["text"]
            ]
            # The document-level list gets its own dict objects.
            document_catch.extend(dict(hit) for hit in para_catch)
        else:
            para_catch = []

        if not para_catch:
            continue

        paragraph_hits.append({
            "doc_reference": document["file_absolute_path"],
            "type": "paragraph",
            "excerpt": para["text"],
            "score": len(para_catch),
            "catch": para_catch,
            "ids": para.get("ids", []),
        })

    if not document_catch:
        return []

    return [{
        "type": "document",
        "doc_reference": document["file_absolute_path"],
        "doc_id": document["dom"].get("id", "none"),
        "doc_title": document["dom"].get("title", "no title available"),
        "score": len(document_catch),
        "catch": document_catch,
        "paragraphs": paragraph_hits,
    }]

In [5]:
def load_documents(rootdir):
    """Recursively load and parse every ``.html`` file below ``rootdir``.

    Args:
        rootdir: directory to walk with os.walk.

    Returns:
        A list with one dict per HTML file, holding:
          - "relative_root": the containing directory with the ``rootdir``
            prefix sliced off,
          - "file_name": the bare file name,
          - "file_absolute_path": ``rootdir``-based path to the file (only
            absolute if ``rootdir`` itself is),
          - "html_content": the file contents as text,
          - "dom": the result of parse_html_dom on those contents.
    """
    documents = []

    for root, _subfolders, files in os.walk(rootdir):
        for filename in files:
            if not filename.endswith(".html"):
                continue

            file_path = os.path.join(root, filename)

            # Decode explicitly as UTF-8 so that reading text with diacritics
            # does not depend on the platform's default locale encoding.
            with open(file_path, encoding="utf-8") as ifile:
                html_content = ifile.read()

            documents.append({
                "relative_root": root[len(rootdir):],
                "file_name": filename,
                "file_absolute_path": file_path,
                "html_content": html_content,
                "dom": parse_html_dom(html_content),
            })

    return documents

In [6]:
def query_keywordlist(kwlist):
    """Build a keyword-list query from raw keyword strings.

    Args:
        kwlist: iterable of keyword strings.

    Returns:
        ``{"keyword_list": [...]}`` where every keyword has been passed
        through text_strip, in the original order.
    """
    normalised_keywords = list(map(text_strip, kwlist))
    return {"keyword_list": normalised_keywords}


In [17]:
def transform_excerpt(excerpt, query):
    """Wrap every query term found in ``excerpt`` in Markdown bold markers.

    Args:
        excerpt: the paragraph text to highlight.
        query: dict with either "keyword_list" (list of strings) or
            "alternatives_list" (list of dicts with a "forms" list). When
            both keys are present only "keyword_list" is used.

    Returns:
        ``excerpt`` with each occurrence of a term replaced by ``**term**``.
        The text is returned unchanged when the query has neither key or
        contains no non-empty term.
    """
    if "keyword_list" in query:
        terms = list(query["keyword_list"])
    elif "alternatives_list" in query:
        terms = [form for altkw in query["alternatives_list"] for form in altkw["forms"]]
    else:
        return excerpt

    # Drop empty terms (an empty pattern would match between every pair of
    # characters) and duplicates (which would be wrapped twice). Longest terms
    # go first so that where two terms overlap the longer one is highlighted.
    unique_terms = sorted({term for term in terms if term}, key=lambda t: (-len(t), t))
    if not unique_terms:
        return excerpt

    # A single pass over the original text: markers that have already been
    # inserted are never re-scanned, so highlights cannot nest.
    pattern = re.compile("|".join(re.escape(term) for term in unique_terms))
    return pattern.sub(lambda match: "**{}**".format(match.group(0)), excerpt)

def result_markdown_formater(query, results):
    """Render search results as a Markdown report.

    Args:
        query: the query dict used for the search ("keyword_list" or
            "alternatives_list"); it drives the per-document match summary
            and the highlighting of excerpts.
        results: list of document-level result dicts, each with "catch" and
            "paragraphs" and optionally "doc_id" and "doc_title". Every
            paragraph entry needs "ids" and "excerpt".

    Returns:
        One Markdown string. For every result it contains a heading with a
        link, a "Matches(n)" line listing each query term (struck through
        when it was not found in the document), and one quoted, highlighted
        excerpt per matching paragraph.
    """
    _output = []

    for result in results:
        # `or` instead of a .get() default: this also covers a doc_id that is
        # present but None or empty, which would crash on .upper(). The same
        # value feeds the URL so a missing key no longer raises KeyError.
        doc_id = result.get("doc_id") or "noid"

        _output.append("#### {doc_id} - {title} [pali]({url})\n".format(
                            doc_id = doc_id.upper(),
                            title = result.get("doc_title","no title in result"),
                            url = "https://suttacentral.net/pi/{}/".format(doc_id)
                        ))
        _output.append("\n")

        _output.append("Matches({}): ".format(len(result["catch"])))

        # List every query term; terms with no hit in this document are
        # rendered with Markdown strikethrough.
        _suboutput = []
        if "keyword_list" in query:
            for kw in query["keyword_list"]:
                if kw in result["catch"]:
                    _suboutput.append("[{}]".format(kw))
                else:
                    _suboutput.append("~~[{}]~~".format(kw))
        elif "alternatives_list" in query:
            _result_labels = {alt["label"] for alt in result["catch"]}
            for altkw in query["alternatives_list"]:
                if altkw["label"] in _result_labels:
                    _suboutput.append("[{}]".format(altkw["label"]))
                else:
                    _suboutput.append("~~[{}]~~".format(altkw["label"]))

        _output.append(", ".join(_suboutput))
        _output.append("\n")
        _output.append("\n")

        for paragraph in result["paragraphs"]:
            # todo: id (the raw list of anchor dicts is printed as-is for now)
            _output.append("**Paragraph {}**\n> ".format(paragraph["ids"]))
            _output.append(transform_excerpt(paragraph["excerpt"], query))
            _output.append("\n\n")

        _output.append("\n")

    return "".join(_output)

In [8]:
# Directory that is walked for .html files.
_rootdir = "/home/jeff/dev/datayana/suttacentral-data/text/pi/su"

# Load and parse everything once; the search cells below reuse _docs.
_docs = load_documents(_rootdir)

# Report the number of loaded files and the summed length of their contents.
print("loaded {} files, total bytes {}".format(
    len(_docs),
    sum(len(entry["html_content"]) for entry in _docs),
))

loaded 5739 files, total bytes 28469442


In [9]:
# Keyword-list search: look for each keyword in every loaded document and
# write the ranked matches to a Markdown report.
_global_results = []

# NOTE(review): some literals below appear to carry invisible soft hyphens;
# query_keywordlist passes every keyword through text_strip before searching.
_query = query_keywordlist([
            "anussavena", "paramparāya", "itikirāya", "piṭaka­sam­padā­nena",
            "takkahetu", "nayahetu", "ākāra­pari­vitak­kena", "diṭṭhi­nij­jhā­nak­khan­tiyā",
            "bhabbarūpatāya", "samaṇo no garū"
    ])

for doc in _docs:
    _global_results.extend(search_document(_query, doc))

# Highest score first; the sort is stable, so ties keep their load order.
_global_results = sorted(_global_results, key=lambda el: el['score'], reverse=True)
print("Results found: {}".format(len(_global_results)))

# Write explicitly as UTF-8: the report contains diacritics that a non-UTF-8
# default locale encoding could fail to encode.
with open("results-kwlist.md", "w", encoding="utf-8") as ofile:
    ofile.write(result_markdown_formater(_query, _global_results))


Results found: 12


In [18]:
# Alternatives search: every label groups several spellings ("forms"); a
# document matches a label when any of its forms occurs in a paragraph.
_global_results = []

_query = {
    "alternatives_list": [
        { "label": "anussavena", "forms": ["anussavena", "anussavā", "itihitihaṃ", "itihītihaṃ"] },
        { "label": "paramparāya", "forms": ["paramparāya"] },
        { "label": "itikirāya", "forms": ["itikirāya", "itikiriyāya"] },
        { "label": "piṭakasampadānena", "forms": ["piṭakasampadānena", "piṭakasampadāya"] },
        { "label": "takkahetu", "forms": ["takkahetu"] },
        { "label": "nayahetu", "forms": ["nayahetu"] },
        { "label": "ākāraparivitakkena", "forms": ["ākāraparivitakkena", "ākāraparivitakkā"] },
        { "label": "diṭṭhinijjhānakkhantiyā", "forms": ["diṭṭhinijjhānakkhantiyā"] },
        { "label": "bhabbarūpatāya", "forms": ["bhabbarūpatāya"] },
        { "label": "samaṇo no garū", "forms": ["samaṇo no garū"] }
    ]
}

for doc in _docs:
    _global_results.extend(search_document(_query, doc))

# Highest score first; the sort is stable, so ties keep their load order.
_global_results = sorted(_global_results, key=lambda el: el['score'], reverse=True)

# Write explicitly as UTF-8: the report contains diacritics that a non-UTF-8
# default locale encoding could fail to encode.
with open("results-altlist.md", "w", encoding="utf-8") as ofile:
    ofile.write(result_markdown_formater(_query, _global_results))