In [1]:
import os
import json
from bs4 import BeautifulSoup

def extract_text_from_html(html_content):
    """
    Extracts and returns only the text content from an HTML document.
    :param html_content: A string containing the HTML content
    :return: A string with extracted text
    """
    soup = BeautifulSoup(html_content, "html.parser")
    
    # Remove script and style elements
    for script_or_style in soup(["script", "style"]):
        script_or_style.decompose()
    
    # Extract text from <div class="text">
    text_divs = [div.get_text(separator=' ', strip=True) for div in soup.find_all("div", class_="text")]
    
    return text_divs

def process_html_files(folder_path, output_json_path):
    """
    Process all HTML files in a folder and save extracted text into a JSON file.
    :param folder_path: Path to the folder containing HTML files
    :param output_json_path: Path to save the output JSON file
    """
    extracted_data = {}
    
    for filename in os.listdir(folder_path):
        if filename.endswith(".html"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                html_content = file.read()
                extracted_data[filename] = extract_text_from_html(html_content)
    
    with open(output_json_path, "w", encoding="utf-8") as json_file:
        json.dump(extracted_data, json_file, ensure_ascii=False, indent=4)

# Example usage
folder_path = "data/row_data" # Change this to the folder containing HTML files
output_json_path = "data/extracted_text.json" # Change this to the desired output JSON file
process_html_files(folder_path, output_json_path)

In [2]:

def split_json_into_txt(json_path, txt_folder, num_parts=7):
    """
    Split a JSON file into multiple TXT files with equal sizes.
    :param json_path: Path to the JSON file
    :param txt_folder: Folder to save the TXT files
    :param num_parts: Number of TXT files to create
    """
    if not os.path.exists(txt_folder):
        os.makedirs(txt_folder)
    
    with open(json_path, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)
        all_text = []
        
        for key, value in data.items():
            all_text.append(f"{key}:\n")
            if isinstance(value, list):
                all_text.extend(value)
            else:
                all_text.append(str(value))
            all_text.append("\n\n")
        
    with open(os.path.join(txt_folder, f"all.txt"), "w", encoding="utf-8") as txt_file:
            txt_file.write("\n".join(all_text))
            
    total_size = len(all_text)
    chunk_size = total_size // num_parts

    if num_parts == 1:
        txt_path = os.path.join(txt_folder, f"all_russian.txt")
        with open(txt_path, "w", encoding="utf-8") as txt_file:
            txt_file.write("\n".join(all_text))
        return

    for i in range(num_parts):
        start_index = i * chunk_size
        end_index = (i + 1) * chunk_size if i < num_parts - 1 else total_size

        txt_path = os.path.join(txt_folder, f"part_{i+1}.txt")
        with open(txt_path, "w", encoding="utf-8") as txt_file:
            txt_file.write("\n".join(all_text[start_index:end_index]))

# Example usage
json_path = "data/extracted_text.json"  # Change to actual JSON file path
txt_folder = "data/txts"  # Change to output folder path
split_json_into_txt(json_path, txt_folder,num_parts=1)


TODO:

all_russian.txt -> all_translated.txt

With yabdex preferably

In [3]:
translates_txt = 'data/txts/all_translated.txt'
with open(translates_txt, "r", encoding="utf-8") as txt_file:
    f = txt_file.read()

In [4]:
f.count('\n')

30768

In [5]:
import re
def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

In [6]:
with open('data/txts/emoji_free.txt', 'w', encoding="utf-8") as file:
    file.write(remove_emojis(f))

In [7]:
import re

# Input and output file paths
input_file = "data/txts/emoji_free.txt"   # Change this to your file name
output_file = "data/txts/cleaned.txt" # Change if needed

# Read the file and process lines
with open(input_file, "r", encoding="utf-8") as file:
    lines = file.readlines()

# Remove leading spaces and "?" signs
processed_lines = [re.sub(r"^[\s?]+", "", line) for line in lines]

# Write the processed lines to a new file
with open(output_file, "w", encoding="utf-8") as file:
    file.writelines(processed_lines)

print("Processing complete. Check", output_file)


Processing complete. Check data/txts/cleaned.txt


In [8]:
with open('data/txts/cleaned.txt', 'r', encoding="utf-8") as file:
    f = file.read()

In [9]:
f.count('\n')
#Was 30768.....

30609

In [10]:
import re
print('Bakhar occurence:', len([m.start() for m in re.finditer('Bakhar', f)]))
print('Bahar occurence:',len([m.start() for m in re.finditer('Bahar', f)]))

Bakhar occurence: 3
Bahar occurence: 198
