In [1]:
%load_ext autoreload
%autoreload 2

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sc_query import load_documents
from sc_query import search_documentlist
from sc_query import result_markdown_formater
from sc_query import collect_matika_cooccurences_pivot
from sc_query import query_keywordlist
from sc_query import query_regex


## 1. Loading suttas from a clone of the SuttaCentral GitHub data repository

Clone the `suttacentral-data` repository.

```bash
git clone https://github.com/suttacentral/suttacentral-data.git
```

Set the `rootdir` variable accordingly.


In [2]:
# Directory holding the texts inside the local clone of suttacentral-data;
# adjust it to wherever the repository was cloned.
rootdir = "/home/jeff/dev/datayana/suttacentral-data/text/pi/su"

# Load the documents found under rootdir.
docs = load_documents(rootdir)

# Report the corpus size: number of documents and summed length of their
# "html_content" field.
# NOTE(review): len() counts characters if html_content is a str, so the
# "total bytes" label may be approximate — confirm.
print(
    "loaded {} files, total bytes {}".format(
        len(docs),
        sum(len(entry["html_content"]) for entry in docs)
    )
)

loaded 5739 files, total bytes 28469442


## 2. Querying a list of simple keywords

Create a query made of just a list of keywords. The function `query_keywordlist()` constructs the query from a list of keywords.

In [None]:
# Keyword query: one keyword (or short phrase) per entry.
# NOTE(review): some of these keywords appear to contain soft-hyphen
# characters (U+00AD), unlike the forms used in the alternatives query —
# confirm that this is intended.
kwquery = query_keywordlist(
    [
        "anussavena",
        "paramparāya",
        "itikirāya",
        "piṭaka­sam­padā­nena",
        "takkahetu",
        "nayahetu",
        "ākāra­pari­vitak­kena",
        "diṭṭhi­nij­jhā­nak­khan­tiyā",
        "bhabbarūpatāya",
        "samaṇo no garū",
    ]
)

# Run the query over the loaded documents, requesting score-sorted results.
results_kwquery = search_documentlist(kwquery, docs, sort="score")
print("Results found: {}".format(len(results_kwquery)))


Creating a markdown file that shows those results

In [None]:
# Format the keyword-query results as markdown and save them.
# The encoding is explicit because the queried text contains Pali diacritics
# and the platform default encoding is not guaranteed to represent them.
with open("output/test_results-kwlist.md", "w", encoding="utf-8") as ofile:
    ofile.write(result_markdown_formater(kwquery, results_kwquery))

## 3. Querying a list of alternatives

If each word can be found in multiple variants, create a list of alternatives.

Here's how to construct the query:

In [None]:
# Alternatives query: each entry has a display "label" and the list of
# spelling/inflection "forms" that are searched for under that label.
altquery = {
    "alternatives_list": [
        { "label": "anussavena", "forms": ["anussavena", "anussavā", "itihitihaṃ", "itihītihaṃ"] },
        { "label": "paramparāya", "forms": ["paramparāya"] },
        { "label": "itikirāya", "forms": ["itikirāya", "itikiriyāya"] },
        { "label": "piṭakasampadānena", "forms": ["piṭakasampadānena", "piṭakasampadāya"] },
        { "label": "takkahetu", "forms": ["takkahetu"] },
        { "label": "nayahetu", "forms": ["nayahetu"] },
        { "label": "ākāraparivitakkena", "forms": ["ākāraparivitakkena", "ākāraparivitakkā"] },
        { "label": "diṭṭhinijjhānakkhantiyā", "forms": ["diṭṭhinijjhānakkhantiyā"] },
        { "label": "bhabbarūpatāya", "forms": ["bhabbarūpatāya"] },
        { "label": "samaṇo no garū", "forms": ["samaṇo no garū"] }
    ]
}

# Run the query over the loaded documents, requesting score-sorted results.
results_altquery = search_documentlist(altquery, docs, sort="score")
print("Results found: {}".format(len(results_altquery)))

# Save the results as markdown. The encoding is explicit because the queried
# text contains Pali diacritics and the platform default encoding is not
# guaranteed to represent them.
with open("output/test_results-altlist.md", "w", encoding="utf-8") as ofile:
    ofile.write(result_markdown_formater(altquery, results_altquery))

## 4. Showing matika co-occurrences

In [None]:
# Pivot table built from the results of the alternatives query.
df_pivot = collect_matika_cooccurences_pivot(results_altquery)

sns.set(color_codes=True)

# Figure size scales with the pivot: each of its two dimensions plus 2.
rcmd_figsize = tuple(extent + 2 for extent in df_pivot.shape[:2])

# Hierarchically clustered heatmap of the pivot table.
g = sns.clustermap(
    df_pivot,
    method="complete",
    metric="jaccard",
    cmap="YlGnBu",
    figsize=rcmd_figsize,
)

g.savefig("output/img/test_matika_cooccurences.png")

## 5. Test other examples

In [None]:
from sc_query import collect_matika_cooccurences_counter
import pandas as pd

# Second alternatives query. Here several "forms" are truncated stems
# (e.g. "dukkh") rather than full words.
# NOTE(review): this rebinds `altquery` and `results_altquery` from the
# earlier section; re-running earlier cells afterwards will see these values.
altquery = dict(
    alternatives_list=[
        dict(label="dukkha", forms=["dukkh"]),
        dict(label="nirodha", forms=["nirodh"]),
        dict(label="samudaya", forms=["samuday"]),
        dict(label="magga", forms=["magga"]),
        dict(label="anicca", forms=["anicca"]),
        dict(label="anatta", forms=["anatta"]),
        dict(label="rupa", forms=["rūpa", "rūpā"]),
        dict(label="vedana", forms=["vedana", "vedanā"]),
        dict(label="sanna", forms=["saññā", "sañña"]),
        dict(label="sankhara", forms=["sankhārā", "saṅkhāra", "sankhar", "sankhār"]),
        dict(label="vijnana", forms=["viññāṇaṃ", "viññāṇ"]),
    ]
)

# Run the query over the loaded documents, requesting score-sorted results.
results_altquery = search_documentlist(altquery, docs, sort="score")
print("Results found: {}".format(len(results_altquery)))

In [None]:
import math

# Disabled alternative: build the pivot with a per-value transform and a
# "minmax" normalization. Uncomment to experiment (it uses `math`).
#df_pivot = collect_matika_cooccurences_pivot(results_altquery,
#                                             transform=lambda v : 1 / (1+math.log(v)),
#                                             normalize="minmax")

# Pivot built with the function's default options from the current
# `results_altquery`.
df_pivot = collect_matika_cooccurences_pivot(results_altquery)

# Uncomment to display the pivot table.
#df_pivot

In [None]:
import scipy.cluster.hierarchy as hc
import scipy.spatial as sp

sns.set(color_codes=True)

# Figure size scales with the pivot: each of its two dimensions plus 2.
rcmd_figsize = tuple(extent + 2 for extent in df_pivot.shape[:2])

# Single-linkage clustering computed once and shared by rows and columns.
# NOTE(review): squareform() is given the square pivot and is expected to
# return its condensed form; it requires a symmetric matrix with a zero
# diagonal by default — confirm that df_pivot satisfies this.
linkage = hc.linkage(
    sp.distance.squareform(df_pivot),
    method='single',
)

# Clustered heatmap that reuses the precomputed linkage on both axes.
g = sns.clustermap(
    df_pivot,
    row_linkage=linkage,
    col_linkage=linkage,
    figsize=rcmd_figsize,
)

g.savefig("output/img/test_matika_fournobletruths.png")

### Example: locating the suttas

In [51]:
# Regex for the opening location formula: "ekaṃ samayaṃ", then a run of
# characters free of full stops / quotes / question marks (group 1), one of
# the listed verb forms (group 2), and the remainder up to the next full
# stop (group 3).
location_regex = r"ekaṃ samayaṃ ([^\.\'\"\“\”\‘\’\?]+) (viharati|carati|caramāno|addhānamaggappaṭipanno)([^\.]+)(?:[\.])"
# group_map names the captured groups; group 2 (the verb) is left unmapped.
location_query = query_regex(location_regex, group_map={0: '*', 1:'location', 3:'rest'})

# Run the query over the loaded documents, requesting score-sorted results.
full_results_location = search_documentlist(location_query, docs, sort="score")
print("Results found: {}".format(len(full_results_location)))

# Save the results as markdown. The encoding is explicit because the queried
# text contains Pali diacritics and the platform default encoding is not
# guaranteed to represent them.
with open("output/test_locations.md", "w", encoding="utf-8") as ofile:
    ofile.write(result_markdown_formater(location_query, full_results_location))


Results found: 757


In [34]:
# Ids of the documents in the corpus, bucketed by id prefix.
dn_suttas_ids = []
mn_suttas_ids = []
an_suttas_ids = []
sn_suttas_ids = []

for doc in docs:
    # Fall back to "" so that a document without an id is simply not counted
    # instead of raising AttributeError on None.startswith().
    sc_id = doc["dom"].get("id") or ""
    if sc_id.startswith("dn"):
        dn_suttas_ids.append(sc_id)
    elif sc_id.startswith("mn") and not sc_id.startswith("mnd"):
        # ids starting with "mnd" are not counted with "mn"
        mn_suttas_ids.append(sc_id)
    elif sc_id.startswith("an"):
        an_suttas_ids.append(sc_id)
    elif sc_id.startswith("sn") and not sc_id.startswith("snp"):
        # ids starting with "snp" are not counted with "sn"
        sn_suttas_ids.append(sc_id)

print("dn: {}".format(len(dn_suttas_ids)))
print("mn: {}".format(len(mn_suttas_ids)))
print("an: {}".format(len(an_suttas_ids)))
print("sn: {}".format(len(sn_suttas_ids)))


dn: 34
mn: 152
an: 1366
sn: 1819


In [35]:
# Ids of the documents matched by the location query, bucketed by id prefix.
dn_suttas_located, mn_suttas_located = [], []
an_suttas_located, sn_suttas_located = [], []

for doc in full_results_location:
    sc_id = doc.get("doc_id", "")
    if sc_id.startswith("dn"):
        dn_suttas_located.append(sc_id)
    elif sc_id.startswith("an"):
        an_suttas_located.append(sc_id)
    elif sc_id.startswith("mn"):
        # ids starting with "mnd" are not counted with "mn"
        if not sc_id.startswith("mnd"):
            mn_suttas_located.append(sc_id)
    elif sc_id.startswith("sn"):
        # ids starting with "snp" are not counted with "sn"
        if not sc_id.startswith("snp"):
            sn_suttas_located.append(sc_id)

print("dn: {}".format(len(dn_suttas_located)))
print("mn: {}".format(len(mn_suttas_located)))
print("an: {}".format(len(an_suttas_located)))
print("sn: {}".format(len(sn_suttas_located)))

dn: 34
mn: 152
an: 195
sn: 269


In [36]:
# Ids present in the corpus that the location query did not match.
print(set(dn_suttas_ids) - set(dn_suttas_located))
print(set(mn_suttas_ids) - set(mn_suttas_located))
# Disabled: the same check for the "an" ids.
#print(set(an_suttas_ids) - set(an_suttas_located))

set()
set()


### Example: "yaṃ kiñci samuda­ya­dhammaṃ sabbaṃ taṃ nirodhadhamma"

In [None]:
# Keyword query made of the whole formula as a single phrase.
formula_query = query_keywordlist([
        "yaṃ kiñci samuda­ya­dhammaṃ sabbaṃ taṃ nirodhadhamma"
    ])

# Run the query over the loaded documents, requesting score-sorted results.
results_formula = search_documentlist(formula_query, docs, sort="score")
print("Results found: {}".format(len(results_formula)))

# Save the results as markdown. The encoding is explicit because the queried
# text contains Pali diacritics and the platform default encoding is not
# guaranteed to represent them.
with open("output/test_yam-kinci.md", "w", encoding="utf-8") as ofile:
    ofile.write(result_markdown_formater(formula_query, results_formula))

# Cell output: ids of the matching documents.
[doc['doc_id'] for doc in results_formula]

In [None]:
# Keyword query listing the individual pieces and variants of the formula.
formula_query = query_keywordlist([
        "yaṃ kho kiñci",
        "yampi hi kiñci",
        "yā kāci",
        "yaṃ kiñci",
        "ye keci",
        "samudayadhamma",
        "sabbaṃ taṃ",
        "nirodhadhamma"
    ])


# Run the query over the loaded documents, requesting score-sorted results.
results_formula = search_documentlist(formula_query, docs, sort="score")
print("Results found: {}".format(len(results_formula)))

# The three reports below are written with an explicit UTF-8 encoding: the
# queried text contains Pali diacritics and the platform default encoding is
# not guaranteed to represent them.

# Results whose id starts with "dn".
dn_results_formula = [
    doc for doc in results_formula
    if (doc["doc_id"].startswith("dn"))
]
print("DN: {}".format([doc['doc_id'] for doc in dn_results_formula]))
with open("output/test_yam-kinci-DN.md", "w", encoding="utf-8") as ofile:
    ofile.write(result_markdown_formater(formula_query, dn_results_formula))

# Results whose id starts with "mn" but not "mnd".
mn_results_formula = [
    doc for doc in results_formula
    if (doc["doc_id"].startswith("mn") and not doc["doc_id"].startswith("mnd"))
]
print("MN: {}".format([doc['doc_id'] for doc in mn_results_formula]))
with open("output/test_yam-kinci-MN.md", "w", encoding="utf-8") as ofile:
    ofile.write(result_markdown_formater(formula_query, mn_results_formula))

# Results whose id starts with "sn" but not "snp".
sn_results_formula = [
    doc for doc in results_formula
    if (doc["doc_id"].startswith("sn") and not doc["doc_id"].startswith("snp"))
]

# Unlike the DN and MN listings, the SN ids are printed sorted.
print("SN: {}".format(sorted(
    [
        (doc['doc_id'])
        for doc in sn_results_formula
    ]
)))

with open("output/test_yam-kinci-SN.md", "w", encoding="utf-8") as ofile:
    ofile.write(result_markdown_formater(formula_query, sn_results_formula))

In [None]:
# Regex for the formula, assembled from adjacent raw-string pieces (Python
# concatenates them into a single pattern).
formula_reg = (
    r"(ya|yā|ye)"                        # group 1: opening of the pronoun
    r"([^\.\'\"\“\”\‘\’\?\;]+)"          # group 2: run without stops, quotes, '?' or ';'
    r"(kiñci|kāci|keci)"                 # group 3: indefinite particle
    r"([^\.\'\"\“\”\‘\’\?]+)"            # group 4: run without stops, quotes or '?'
    r"sabbaṃ"                            # literal "sabbaṃ"
    r"([^\.\'\"\“\”\‘\’\?]+)"            # group 5: run without stops, quotes or '?'
    r"(?:[\.\'\"\“\”\‘\’\?])?"           # optional closing punctuation, not captured
)

formula_query = query_regex(formula_reg)

# Run the query over the loaded documents, requesting score-sorted results.
full_results_formula = search_documentlist(formula_query, docs, sort="score")
print("Results found: {}".format(len(full_results_formula)))

# Sorted ids of every matching document.
print(
    "Found in {}".format(
        sorted(doc["doc_id"] for doc in full_results_formula)
    )
)

def keep_doc(doc):
    """Tell whether a search result belongs to the DN, MN, SN or AN ids.

    The decision is made on the prefix of ``doc["doc_id"]``: ids starting
    with "dn", "mn", "sn" or "an" are kept, except those starting with
    "mnd" or "snp", which share a prefix with "mn" / "sn" but are excluded.

    The previous version tested "sn" twice; the second test cancelled the
    "snp" exclusion and "an" ids were never kept.

    Args:
        doc: mapping with a string value under the "doc_id" key.

    Returns:
        True if the document should be kept, False otherwise.

    Raises:
        KeyError: if ``doc`` has no "doc_id" key.
    """
    doc_id = doc["doc_id"]
    # Rule out the look-alike prefixes first, then accept the four wanted ones.
    if doc_id.startswith(("mnd", "snp")):
        return False
    return doc_id.startswith(("dn", "mn", "sn", "an"))

# Keep only the results accepted by keep_doc().
results_formula = [doc for doc in full_results_formula if keep_doc(doc)]

# Number of kept results and their sorted ids.
print("DN,MN,SN,AN({}): {}".format(
    len(results_formula),
    sorted([doc["doc_id"] for doc in results_formula])
))

# Disabled debugging aid: print every captured phrase per document.
#for doc in results_formula:
#    for catchphrase in doc["catch"]:
#        print("*** formula in {} :\n{}".format(doc["doc_id"], catchphrase))

# Save the kept results, ordered by id, as markdown. The encoding is explicit
# because the queried text contains Pali diacritics and the platform default
# encoding is not guaranteed to represent them.
with open("output/test_yam-kinci-regex.md", "w", encoding="utf-8") as ofile:
    ofile.write(result_markdown_formater(formula_query, sorted(results_formula, key=lambda d : d['doc_id'])))