In [2]:
import xml.etree.ElementTree as ET
import re
from collections import Counter
import pandas as pd
import os

# Input and output folders, given relative to the current working directory.
raw_dir = "../data/raw"
processed_dir = "../data/processed"
# Full path of the dictionary XML file that is parsed below.
raw_file_path = os.path.join(raw_dir, "dingfubao.xml")

# Parse the XML once; `root` is the element that later cells search for entries.
tree = ET.parse(raw_file_path)
root = tree.getroot()

# Single characters of interest: later cells count the two-character terms
# in which each of these appears as the first or the last character.
target_characters = [
    "佛", "禪", "僧", "梵", "劫", "懺", "釋", "檀", "鉢", "唄", "偈", "塔", "竺", "魔", "衲",
    "恆", "剎", "儭", "嚫", "䞋", "尼", "閻", "曇", "法", "心", "善", "空", "定", "覺"
]

In [3]:
# Collect the two-character terms found in the namespaced XML entries
def extract_terms(root):
    """Return the two-character form texts of all TEI entries below ``root``.

    For every descendant ``entry`` element in the TEI namespace, the first
    direct ``form`` child is looked up. Its text is stripped of surrounding
    whitespace and kept only when exactly two characters remain. Entries
    without a ``form`` child, or whose ``form`` has no text, are skipped.

    Args:
        root: ElementTree element to search.

    Returns:
        List of two-character strings in document order (duplicates kept).
    """
    tei = '{http://www.tei-c.org/ns/1.0}'
    entry_path = f'.//{tei}entry'
    form_tag = f'{tei}form'

    # Lazy pipeline: entry -> first <form> child -> its text -> stripped text
    forms = (entry.find(form_tag) for entry in root.findall(entry_path))
    texts = (form.text for form in forms if form is not None)
    stripped = (text.strip() for text in texts if text is not None)

    return [candidate for candidate in stripped if len(candidate) == 2]

# Collect the two-character terms (stripped <form> texts of the entries)
# from the parsed XML tree; later cells filter this list per character.
terms = extract_terms(root)

In [4]:
# Calculate frequency of a character as prefix or suffix and save the results
def cal_freq(character, terms, output_dir):
    """Count the terms that start or end with ``character`` and save them.

    Two UTF-8 text files are written to ``<output_dir>/prefix_suffix`` (the
    folder is created if missing and existing files are overwritten):
    ``<character>_prefix.txt`` and ``<character>_suffix.txt``, each holding
    the matching terms one per line.

    Args:
        character: Text to look for. It is matched literally, so regex
            metacharacters such as ``.`` or ``+`` carry no special meaning.
        terms: List of term strings (iterated twice).
        output_dir: Base folder under which ``prefix_suffix`` is created.

    Returns:
        Tuple ``(prefix_count, suffix_count, total)`` where ``total`` is the
        sum of the two counts. A term that both starts and ends with
        ``character`` is in both lists and therefore counted twice in
        ``total``.
    """
    # Create the "prefix_suffix" subdirectory inside the output directory
    prefix_suffix_dir = os.path.join(output_dir, "prefix_suffix")
    os.makedirs(prefix_suffix_dir, exist_ok=True)  # Ensure the directory exists

    # Escape the character so it is never interpreted as regex syntax.
    literal = re.escape(character)

    # Both patterns are applied with .match(), which anchors at the start:
    # - prefix: `character` followed by at least one more character;
    # - suffix: exactly one other character followed by `character`.
    pattern_start = re.compile(rf'^{literal}.')
    pattern_end = re.compile(rf'.{literal}$')

    # Filter terms based on the patterns
    start_terms = [term for term in terms if pattern_start.match(term)]
    end_terms = [term for term in terms if pattern_end.match(term)]

    # Define file paths for prefix and suffix results
    prefix_path = os.path.join(prefix_suffix_dir, f"{character}_prefix.txt")
    suffix_path = os.path.join(prefix_suffix_dir, f"{character}_suffix.txt")

    # Save prefix terms to file
    with open(prefix_path, "w", encoding="utf-8") as f:
        f.write("\n".join(start_terms))

    # Save suffix terms to file
    with open(suffix_path, "w", encoding="utf-8") as f:
        f.write("\n".join(end_terms))

    # Return counts for prefix, suffix, and total
    return len(start_terms), len(end_terms), len(start_terms) + len(end_terms)

# Extract terms from the XML document
# (repeats the extract_terms(root) call made in the extraction cell)
terms = extract_terms(root)

# Process each target character: cal_freq writes the prefix/suffix term files.
# The returned counts are rebound on every iteration and not used in this cell.
for char in target_characters:
    prefix_count, suffix_count, total_count = cal_freq(char, terms, processed_dir)

# Print confirmation message with the absolute path of the output folder
print(f"Processed data saved in: {os.path.abspath(os.path.join(processed_dir, 'prefix_suffix'))}")

Processed data saved in: /Users/weilingchen/Desktop/morpho_pilot/data/processed/prefix_suffix


In [16]:
# Build one summary row per target character.
# Note: cal_freq also (re)writes the prefix/suffix term files as a side effect.
results = []

for char in target_characters:
    start_count, end_count, total_count = cal_freq(char, terms, processed_dir)
    # Pair the column names with this character's values, in column order
    results.append(dict(zip(
        ("Character", "Prefix Count", "Suffix Count", "Total Count"),
        (char, start_count, end_count, total_count),
    )))

In [17]:
# Handle special characters with multiple forms:
# the three variants below are additionally reported as one combined row.
special_characters = ["儭", "嚫", "䞋"]
special_label = "儭, 嚫, 䞋"

# Make the cell idempotent: remove a combined row left over from a previous
# run so that re-executing this cell never inserts it twice.
results[:] = [row for row in results if row["Character"] != special_label]

# Sum the prefix/suffix counts of the individual variant rows
special_rows = [row for row in results if row["Character"] in special_characters]
special_start_total = sum(row["Prefix Count"] for row in special_rows)
special_end_total = sum(row["Suffix Count"] for row in special_rows)
special_total = special_start_total + special_end_total

# Insert aggregated special character data directly after the last variant row.
# Skipped when no variant row exists, which would otherwise raise IndexError.
if special_rows:
    insert_index = results.index(special_rows[-1]) + 1
    results.insert(insert_index, {
        "Character": special_label,
        "Prefix Count": special_start_total,
        "Suffix Count": special_end_total,
        "Total Count": special_total
    })

In [18]:
# Book frequency data, keyed by the "Character" label used in `results`.
# The three variant characters only have a value under their combined label;
# the string "NA" marks characters without a number.
book_frequencies = {
    "佛": 79,
    "禪": 75,
    "僧": 37,
    "梵": 45,
    "劫": 14,
    "懺": 5,
    "釋": 21,
    "檀": 16,
    "鉢": 11,
    "唄": 4,
    "偈": 4,
    "塔": 5,
    "竺": 3,
    "魔": 22,
    "衲": 3,
    "恆": 2,
    "剎": 6,
    "儭, 嚫, 䞋": 6,
    "尼": 6,
    "閻": "NA",
    "曇": "NA",
    "法": 423,
    "心": 149,
    "善": 123,
    "空": 76,
    "定": 48,
    "覺": 43,
}

# Attach the book frequency to every row; labels missing from the table get "NA"
for row in results:
    character = row["Character"]
    if character in book_frequencies:
        row["Book Frequency"] = book_frequencies[character]
    else:
        row["Book Frequency"] = "NA"

In [22]:
# Convert results (a list of row dicts) to a DataFrame: one row per entry,
# with the dict keys as columns, then render it in the notebook
df = pd.DataFrame(results)
display(df)

Unnamed: 0,Character,Prefix Count,Suffix Count,Total Count,Book Frequency
0,佛,81,54,135,79.0
1,禪,75,29,104,75.0
2,僧,54,39,93,37.0
3,梵,51,16,67,45.0
4,劫,18,33,51,14.0
5,懺,7,9,16,5.0
6,釋,27,13,40,21.0
7,檀,20,11,31,16.0
8,鉢,20,20,40,11.0
9,唄,5,8,13,4.0


In [28]:
# Rank the rows by "Total Count" (highest first) and keep the first ten
top_10 = df.sort_values("Total Count", ascending=False).iloc[:10]

# Sum of the per-character totals of those ten rows.
# NOTE(review): a term made of two top-10 characters contributes to both
# characters' totals, so this sum may exceed the number of distinct terms.
total_bigrams_count = top_10.loc[:, "Total Count"].sum()

display(top_10)
print(f"前十名單音詞所含有的雙音詞總數: {total_bigrams_count}")

Unnamed: 0,Character,Prefix Count,Suffix Count,Total Count,Book Frequency
24,法,155,110,265,423
25,心,77,96,173,149
0,佛,81,54,135,79
1,禪,75,29,104,75
2,僧,54,39,93,37
27,空,42,45,87,76
29,覺,38,41,79,43
3,梵,51,16,67,45
26,善,36,25,61,123
28,定,30,27,57,48


前十名單音詞所含有的雙音詞總數: 1121
