In [36]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

In [37]:
# Regular expressions used by the per-line cleaning step (process_text).

# A parenthesised "digits / digits" marker such as "(12/345)" or "( 3 / 7 )".
numeric_pattern = r"\(\s*\d+\s*/\s*\d+\s*\)"
# Any single ASCII Latin letter.
english = r"[a-zA-Z]"
# A run of digits together with any whitespace on either side of it.
numbers = r"\s*\d+\s*"
# A run of digits followed by a hyphen (e.g. "3 - "), including the whitespace
# around both the digits and the hyphen.
numering_items = r"\s*\d+\s*[-]\s*"
# A (), [] or {} pair that contains nothing but optional whitespace.
empty_brackets = r"\(\s*\)|\[\s*\]|\{\s*\}"
# Exactly one isolated character: it must not be a bracket or one of the listed
# punctuation marks, must be preceded by whitespace, a caret or an opening
# bracket, and must be followed by whitespace, end of string or a closing
# bracket. The character class does not exclude whitespace itself.
# NOTE(review): `\^` in the look-behind matches a literal "^" character, not the
# start of the string, so a lone character at position 0 is never matched --
# confirm whether start-of-line was intended.
stand_alone=r'(?<=\s|\^|\(|\[|\{)[^\(\)\[\]\{\}\.,،:;؛؟!\-](?=\s|$|\]|\)|\})'

In [38]:
# Citation markers. Every pattern exposes the whole marker (plus the whitespace
# it swallowed) as capture group 1, which is what separate_citations() relies on
# when it re-inserts the match after a line break.

# The word "قَوْلُهُ" ("his saying") with optional whitespace on both sides.
kawloho_pattern = r"(\s*قَوْلُهُ\s*)"
# Fully vocalised forms of the verb "to say"; non-capturing so that it can be
# embedded in qala_pattern without shifting group numbers.
qala_variations = r"(?:قَالَ|قَالَتْ|قُلْت|قَالُوا|قُلْنَا|أَقُولُ)"
# One of the forms above followed (after optional whitespace) by a colon.
qala_pattern = rf"(\s*{qala_variations}\s*:)"
# "سُئِلَ" ("he was asked"), optionally prefixed with "وَ" and optionally wrapped
# in round or square brackets.
soila_pattern = r"(\s*[\[\(]?\s*(?:وَ)?سُئِلَ\s*[\]\)]?)"
# "أَجَابَ" ("he answered"), optionally prefixed with "فَ" and optionally wrapped
# in round or square brackets.
ajaba_pattern = r"(\s*[\[\(]?\s*(?:فَ)?أَجَابَ\s*[\]\)]?)"

In [39]:
def remove_unbalanced_brackets(text):
    """Return ``text`` without the bracket characters that have no partner.

    Round, square and curly brackets are tracked with a single stack. A closing
    bracket is kept only when the most recent still-open bracket is its own
    opening counterpart; otherwise the closing bracket is dropped and the stack
    is left untouched. Opening brackets that are never closed are dropped too.
    All non-bracket characters are preserved in order.
    """
    closer_to_opener = {')': '(', '}': '{', ']': '['}
    opening = frozenset(closer_to_opener.values())

    pending = []    # positions of opening brackets still waiting for a closer
    doomed = set()  # positions of characters to leave out of the result

    for pos, ch in enumerate(text):
        if ch in opening:
            pending.append(pos)
            continue

        expected = closer_to_opener.get(ch)
        if expected is None:
            # Not a bracket at all.
            continue

        if pending and text[pending[-1]] == expected:
            pending.pop()
        else:
            # Stray or mismatched closer: remove it, keep the open bracket pending.
            doomed.add(pos)

    # Whatever is still open at the end never found its closer.
    doomed.update(pending)

    return "".join(ch for pos, ch in enumerate(text) if pos not in doomed)


In [40]:
def clean_punctuation_sequence(text):
    """Collapse a run of punctuation marks and brackets into its first character.

    A run is two or more characters from the set below, optionally separated by
    whitespace; the marks in a run do not have to be the same character.

    NOTE: this name is defined a second time further down in this file with a
    narrower rule (only repeats of the *same* mark are merged). Once every cell
    has run, that later definition is the one bound to the name.
    """
    marks = re.escape(".,:;{}[]()!?'\"/،؛؟")
    run_of_marks = re.compile(rf"([{marks}])(?:\s*[{marks}])+")
    return run_of_marks.sub(r"\1", text)

In [41]:
def separate_citations(citation_pattern, lines):
    """Break each line apart so that every citation marker starts a new entry.

    A newline is inserted in front of every match of ``citation_pattern`` (the
    match itself is kept via its first capture group, so the pattern must define
    group 1). Each line is then split on newlines, the pieces are stripped, and
    the non-empty ones are returned as one flat list in their original order.
    """
    return [
        piece
        for raw_line in lines
        for piece in (
            segment.strip()
            for segment in re.sub(citation_pattern, r"\n\1", raw_line).split('\n')
        )
        if piece
    ]

In [42]:
def remove_slashes(line):
    """Return ``line`` with every forward slash ('/') deleted."""
    return re.sub(r'/', '', line)

In [43]:
def clean_punctuation_sequence(text):
    """Merge repeats of the same punctuation mark into a single occurrence.

    Only a mark followed by one or more copies of *itself* (optionally separated
    by whitespace) is collapsed; the whitespace between the copies is removed as
    well. Sequences of different marks, and brackets, are left untouched.
    """
    marks = re.escape(".,:;!?'\"/،؛؟")
    repeated_mark = re.compile(rf"([{marks}])(?:\s*\1)+")
    return repeated_mark.sub(r"\1", text)

In [44]:
def abbrevs(new_lines):
    """Expand single-character abbreviations in every line.

    A line is rewritten only if it contains at least one stand-alone character:
    a single character that is not a bracket or listed punctuation mark, sits
    right after whitespace, a caret, an opening bracket or a hyphen, and right
    before whitespace, end of string, a closing bracket or a hyphen. For such a
    line, in this order:

    1. the end-of-quotation marker "ا هـ" (surrounded by whitespace) becomes
       " ، انْتَهَى ";
    2. parenthesised groups made up only of stand-alone characters are deleted;
    3. every remaining run of stand-alone characters becomes " الشَّيْخُ ".

    Lines without a stand-alone character are returned unchanged.

    Parameters:
        new_lines: iterable of strings.

    Returns:
        A new list with one (possibly rewritten) string per input line.
    """
    # The trailing look-ahead used to end in `\|-})`, which only matches the
    # literal text "|-}". It now mirrors the look-behind (`\{|-`), so a lone
    # character followed by "}" or "-" is recognised as well.
    # NOTE(review): `\^` matches a literal caret, not start of string -- confirm.
    stand_alone = r'(?<=\s|\^|\(|\[|\{|-)[^\(\)\[\]\{\}\.,،:;؛؟!\-](?=\s|$|\]|\)|\}|-)'

    # Compile once instead of re-parsing the patterns for every line.
    has_stand_alone = re.compile(stand_alone)
    end_of_quote = re.compile(r'\s+ا\s*هـ?\s+')
    bracketed_group = re.compile(fr'\((\s*{stand_alone}\s*)+\)')
    bare_group = re.compile(fr'(\s*{stand_alone}\s*)+')

    updated = []
    for new_line in new_lines:
        if has_stand_alone.search(new_line) is None:
            updated.append(new_line)
            continue

        res = end_of_quote.sub(' ، انْتَهَى ', new_line)
        res = bracketed_group.sub('', res)
        res = bare_group.sub(' الشَّيْخُ ', res)  # candidate: المصدر
        updated.append(res)

    return updated


In [45]:
def separate_citations(citation_pattern, lines):
    """Split lines so that each match of ``citation_pattern`` opens a new entry.

    For every input line a newline is placed before each match (the matched
    text is re-emitted through capture group 1, which the pattern must define),
    the line is cut at the newlines, and each stripped, non-empty fragment is
    appended to the result. Order is preserved.

    NOTE: an identical definition of this function appears earlier in this
    file; this one simply rebinds the name to the same behaviour.
    """
    marker = re.compile(citation_pattern)
    final_lines = []

    for line in lines:
        fragments = marker.sub(r"\n\1", line).split('\n')
        stripped = (fragment.strip() for fragment in fragments)
        final_lines.extend(fragment for fragment in stripped if fragment)

    return final_lines

In [46]:
def process_text(lines):
    """Clean raw text lines and return them as a new list.

    Each line goes through these steps, in order:

    1. numbered-item prefixes ("3 - ") and bare numbers are replaced by a space;
    2. "(n/m)" markers, ASCII letters and empty bracket pairs are removed;
    3. Latin "," and ";" become their Arabic counterparts;
    4. the "ا هـ" marker and single-character abbreviations are rewritten;
    5. slashes are removed, repeated punctuation is collapsed and unbalanced
       brackets are dropped;
    6. whitespace runs are squeezed to one space and the line is stripped.

    Parameters:
        lines: iterable of strings.

    Returns:
        A list with one cleaned string per input line (possibly empty strings).
    """
    new_lines = []
    for line in lines:
        # `numering_items` and `numbers` also consume the whitespace around the
        # digits, so substituting '' would glue the neighbouring words together
        # ("word 12 word" -> "wordword"). Substitute a space instead; surplus
        # whitespace is normalised at the end of the loop body.
        res = re.sub(numering_items, ' ', line)
        res = re.sub(numeric_pattern, '', res)
        res = re.sub(english, '', res)
        res = re.sub(numbers, ' ', res)
        res = re.sub(empty_brackets, '', res)

        # Normalise Latin punctuation to the Arabic forms.
        res = re.sub(',', '،', res)
        res = re.sub(';', '؛', res)

        # NOTE(review): abbrevs() in this file maps "ا هـ" to ' ، انْتَهَى ' and
        # deletes parenthesised abbreviations, whereas the two replacements here
        # are the other way round -- confirm which variant is intended.
        res = re.sub(r'\s+ا\s*هـ?\s+', ' ، ', res)
        res = re.sub(fr'\((\s*{stand_alone}\s*)+\)', ' ، انْتَهَى ', res)
        res = re.sub(fr'(\s*{stand_alone}\s*)+', ' الشَّيْخُ ', res)

        # Slashes go before punctuation collapsing so that marks which were only
        # separated by a slash can still be merged. This used to be done twice
        # (inline and through remove_slashes); once is enough because none of
        # the later steps can introduce a slash.
        res = remove_slashes(res)
        res = clean_punctuation_sequence(res)
        res = remove_unbalanced_brackets(res)

        res = re.sub(r"\s+", " ", res).strip()
        new_lines.append(res)
    return new_lines

In [47]:
def read_data(file_path='../data/train.txt', max_lines=1000):
    """Read the first lines of a text file and return them cleaned.

    Parameters:
        file_path: path of the text file to read.
        max_lines: maximum number of lines to read from the top of the file
            (default 1000, the limit this function always applied). Pass
            ``None`` to read the whole file.

    Returns:
        The list produced by ``process_text`` for the lines that were read.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    from itertools import islice

    # The encoding is given explicitly so the Arabic text is decoded the same
    # way on every platform instead of depending on the locale default.
    # NOTE(review): assumes the data file is UTF-8 encoded -- confirm.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Stop reading once enough lines were collected rather than loading the
        # entire file and slicing afterwards.
        lines = list(islice(file, max_lines))
    return process_text(lines)

In [48]:
# Load and clean the training text; read_data() only reads the beginning of the
# file, so this is a sample rather than the full corpus.
train_lines = read_data('../data/train.txt')

In [49]:
# Split the cleaned lines at citation markers, one marker family per pass. Each
# pass starts a new entry in front of every match of its pattern and returns a
# flat list of the stripped, non-empty pieces, which feeds the next pass.
lines_step_1 = separate_citations(kawloho_pattern, train_lines)
cleaned_lines = separate_citations(qala_pattern, lines_step_1)

# Same splitting for the question ("was asked") and answer ("answered") markers.
lines_after_soila = separate_citations(soila_pattern, cleaned_lines)
final_fatwa_lines = separate_citations(ajaba_pattern, lines_after_soila)

In [50]:
# Re-balance brackets per entry: cutting a line at a citation marker can leave
# one half of a bracket pair in each of the resulting pieces.
# NOTE: this rebinds `cleaned_lines`, which an earlier cell used for the result
# of the qala_pattern split; after this cell it holds the final entries.
cleaned_lines = [remove_unbalanced_brackets(line) for line in final_fatwa_lines]