# Cleaning the Mahabharata Text

In [17]:
import re

## Reading the File

In [18]:
# Read the raw file explicitly as UTF-8: the text contains Greek letters and
# other non-ASCII characters, so relying on the platform default encoding can
# fail or silently mis-decode it.
with open('originalText.txt', 'r', encoding='utf-8') as file:
    text = file.readlines()

# Keep only the odd-indexed lines (2nd, 4th, ...); even-indexed lines are skipped.
sentences = text[1::2]

# Write only the kept sentences to a separate file. Each line still carries
# its original line ending, so it is written back unchanged.
with open('onlySentences.txt', 'w', encoding='utf-8') as new_file:
    new_file.writelines(sentences)

# Report the file that was actually written, and how many sentences it holds.
print("Sentences have been stored in onlySentences.txt.")
print(len(sentences))

Sentences have been stored in sentences.txt.
12647


### Function to remove Greek letters

In [19]:
def remove_greek(text):
    """Blank out Greek letters and a few typographic symbols.

    Every character of ``text`` that is one of the listed lowercase Greek
    letters, or one of the symbols § † — - : * ϕ, is replaced by a single
    space. All other characters are copied through unchanged.

    Args:
        text: The string to process.

    Returns:
        A string of the same length with the listed characters turned
        into spaces.
    """
    blanked = {
        'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ',
        'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω',
        '§', '†', '—', '-', ':', '*', 'ϕ',
    }
    return ''.join(' ' if ch in blanked else ch for ch in text)

### Function to remove Roman Numerals

In [20]:
def remove_specific_roman_numerals(text):
    """Delete standalone Roman numerals I-XX (and the token 'D') from ``text``.

    A numeral is removed only when it is delimited by word boundaries, so
    numerals embedded in longer words are left alone. Matching is
    case-sensitive. Surrounding whitespace and punctuation are not touched,
    so removed numerals may leave doubled spaces behind.

    Args:
        text: The string to process.

    Returns:
        ``text`` with every listed numeral removed.
    """
    tokens = (
        'I', 'I,', 'II', 'III', 'D', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX',
        'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI', 'XVII', 'XVIII',
        'XIX', 'XX',
    )
    alternation = '|'.join(tokens)
    numeral_re = re.compile(r'\b(?:' + alternation + r')\b')
    return numeral_re.sub('', text)

### Function to remove Unnecessary Digits

In [21]:
def remove_digits_keep_numbers_after_letters(text):
    """Remove digit runs unless they directly follow an ASCII letter.

    A run of digits that starts right after a letter ``a-z``/``A-Z`` is kept
    in full (``"a123"`` stays ``"a123"``). Every other digit run is deleted.

    Args:
        text: The string to process.

    Returns:
        ``text`` with the free-standing digit runs removed.
    """
    # The lookbehind also excludes a preceding digit. Without that, a match
    # could start in the middle of a run that follows a letter, so "a123"
    # would be cut down to "a1" instead of being kept whole.
    pattern = r'(?<![a-zA-Z\d])\d+'
    # Replace matched digit runs with an empty string.
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

### Function to remove Brackets

In [22]:
def remove_brackets(text):
    """Strip bracket characters while keeping whatever they enclose.

    Removes every ``[``, ``]``, ``(``, ``)``, ``{`` and ``}`` from ``text``.
    All other characters, including the text between brackets, are kept.

    Args:
        text: The string to process.

    Returns:
        ``text`` without any bracket characters.
    """
    brackets = frozenset(('[', ']', '(', ')', '{', '}'))
    return ''.join(ch for ch in text if ch not in brackets)

### Function to remove Special Characters

In [23]:
def clean_text(text):
    """Strip markup symbols, numerals and surplus punctuation from ``text``.

    Steps, in order:
      1. remove the symbols | † § — - : and the abbreviation "do.";
      2. remove θ, β, the word "nihataṃ", and empty "()" / "[]" pairs;
      3. remove asterisks;
      4. remove Roman numerals via ``remove_specific_roman_numerals``;
      5. remove digits via ``remove_digits_keep_numbers_after_letters``;
      6. collapse comma runs to one comma and whitespace runs to one space,
         then trim both ends.

    Args:
        text: The string to process.

    Returns:
        The cleaned string.
    """
    # The symbols are matched individually. A pattern starting with `\|†`
    # would only match the literal pair "|†", because the backslash escapes
    # the alternation bar.
    # "do." is matched literally and as a whole word. An unescaped dot would
    # match "do" plus any character and turn "kingdom" into "king".
    cleaned_text = re.sub(r'[|†§—:\-]|\bdo\.', '', text)
    # Same reasoning for θ: match it on its own rather than as the pair "|θ".
    cleaned_text = re.sub(r'θ|β|nihataṃ|\(\)|\[\]', '', cleaned_text)
    cleaned_text = re.sub(r'\*', '', cleaned_text)
    cleaned_text = remove_specific_roman_numerals(cleaned_text)
    cleaned_text = remove_digits_keep_numbers_after_letters(cleaned_text)
    # Collapse runs of commas left behind by the removals above.
    cleaned_text = re.sub(r',+', ',', cleaned_text)
    # Collapse any whitespace run (including newlines) into a single space.
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    # Remove leading and trailing spaces.
    cleaned_text = cleaned_text.strip()
    return cleaned_text

### Function to normalize Commas

In [24]:
def clean_comma(text):
    """Normalize comma-separated text.

    Keeps only alphanumeric characters plus space, comma, ``=`` and ``.``;
    every other character is dropped. The result is split on commas, each
    piece is trimmed, empty pieces are discarded, and the remaining pieces
    are joined with ``", "``. Repeated, leading and trailing commas
    therefore disappear.

    Args:
        text: The string to process.

    Returns:
        The normalized string; empty if nothing but separators remained.
    """
    allowed_punctuation = (' ', ',', '=', '.')
    kept = ''.join(
        ch for ch in text if ch.isalnum() or ch in allowed_punctuation
    )
    # Empty pieces come from doubled, leading or trailing commas.
    pieces = (piece.strip() for piece in kept.split(','))
    return ', '.join(piece for piece in pieces if piece)

### Function to normalize Dots

In [25]:
def clean_dot(text):
    """Normalize dot-separated text.

    Keeps only alphanumeric characters plus space, comma, ``=`` and ``.``;
    every other character is dropped. The result is split on dots, each
    piece is trimmed, empty pieces are discarded, and the remaining pieces
    are joined with ``". "``. Repeated, leading and trailing dots therefore
    disappear.

    Args:
        text: The string to process.

    Returns:
        The normalized string; empty if nothing but separators remained.
    """
    allowed_punctuation = (' ', ',', '=', '.')
    kept = ''.join(
        ch for ch in text if ch.isalnum() or ch in allowed_punctuation
    )
    # Empty pieces come from doubled, leading or trailing dots.
    pieces = (piece.strip() for piece in kept.split('.'))
    return '. '.join(piece for piece in pieces if piece)

### Function to remove Content Inside Brackets

In [26]:
def remove_content_inside_brackets(text):
    """Remove bracketed spans, including the brackets themselves.

    Any of ``(``, ``[``, ``{`` opens a span and any of ``)``, ``]``, ``}``
    closes one; bracket kinds are not matched against each other. Nested
    spans are handled by counting depth. Text after an opening bracket that
    is never closed is dropped up to the end of the string.

    Args:
        text: The string to process.

    Returns:
        ``text`` with all bracket characters and bracketed content removed.
    """
    openers = ('(', '[', '{')
    closers = (')', ']', '}')
    depth = 0
    kept = []
    for ch in text:
        if ch in openers:
            depth += 1
        elif ch in closers:
            # A stray closing bracket must not push the depth below zero,
            # otherwise all following text would be silently discarded.
            if depth > 0:
                depth -= 1
        elif depth == 0:
            kept.append(ch)
    return ''.join(kept)

### Function to remove unnecessary patterns

In [27]:
def remove_dotComma(txt1):
    """Delete every literal ``.,`` pair from ``txt1``.

    Args:
        txt1: The string to process.

    Returns:
        ``txt1`` with each dot-followed-by-comma occurrence removed.
    """
    dot_comma_re = re.compile(r'\.,')
    return dot_comma_re.sub('', txt1)

def remove_alone_dot(text):
    """Replace a dot that has whitespace on both sides with a single space.

    The dot and all whitespace around it are replaced together, so
    ``"a . b"`` becomes ``"a b"``. Dots attached to a word are left alone.

    Args:
        text: The string to process.

    Returns:
        ``text`` with isolated dots removed.
    """
    lone_dot_re = re.compile(r'\s+\.\s+')
    return lone_dot_re.sub(' ', text)

def remove_thday(text):
    """Drop the whitespace-separated tokens ``th`` and ``day,`` from ``text``.

    Each token is checked on its own, so either one is removed wherever it
    appears, not only as the pair "th day,". Because the text is split on
    whitespace and re-joined with single spaces, whitespace runs are also
    collapsed and leading/trailing whitespace is removed.

    Args:
        text: The string to process.

    Returns:
        The remaining tokens joined by single spaces.
    """
    dropped = ("th", "day,")
    return ' '.join(token for token in text.split() if token not in dropped)

def remove_cfdot(text):
    """Drop every whitespace-separated token equal to ``cf.`` from ``text``.

    The match is case-sensitive and applies to whole tokens only. Because
    the text is split on whitespace and re-joined with single spaces,
    whitespace runs are collapsed and the ends are trimmed.

    Args:
        text: The string to process.

    Returns:
        The remaining tokens joined by single spaces.
    """
    return ' '.join(token for token in text.split() if token != "cf.")

def remove_cFdot(text):
    """Drop every whitespace-separated token equal to ``Cf.`` from ``text``.

    The match is case-sensitive (capital "C") and applies to whole tokens
    only. Because the text is split on whitespace and re-joined with single
    spaces, whitespace runs are collapsed and the ends are trimmed.

    Args:
        text: The string to process.

    Returns:
        The remaining tokens joined by single spaces.
    """
    return ' '.join(token for token in text.split() if token != "Cf.")

In [28]:
# Start with empty result lists and show how many sentences were loaded.
cleanTextFile, newTextFile = [], []
print(len(sentences))

12647


In [29]:
# Cleaning stages applied to every sentence, in this order.
cleaning_steps = (
    remove_greek,
    clean_text,
    remove_content_inside_brackets,
    clean_comma,
    remove_dotComma,
    remove_specific_roman_numerals,
    remove_thday,
    remove_cfdot,
    remove_cFdot,
)

cleanTextFile = []
newTextFile = []

print(len(sentences))
for sentence in sentences:
    txt = sentence
    for step in cleaning_steps:
        txt = step(txt)
    cleanTextFile.append(txt)

print(len(cleanTextFile))
# Second pass: run the comma normalization once more over each cleaned line.
newTextFile.extend(clean_comma(line) for line in cleanTextFile)


12647
12647


## Creating clean text without comma

In [30]:
# Write one cleaned sentence per line. The encoding is set explicitly because
# the text contains non-ASCII characters and the platform default encoding may
# not be able to represent them.
with open('cleanedTextFile.txt', 'w', encoding='utf-8') as new_file1:
    new_file1.writelines(sentence + '\n' for sentence in cleanTextFile)


## Creating clean text with comma

In [31]:
# Write one sentence per line from the second comma-normalization pass. The
# encoding is set explicitly because the text contains non-ASCII characters
# and the platform default encoding may not be able to represent them.
with open('cleanedTextFileFinalDONE.txt', 'w', encoding='utf-8') as new_file2:
    new_file2.writelines(sentence + '\n' for sentence in newTextFile)