In [1]:
import os

In [2]:
%pwd

'c:\\DataScience\\Projects\\Text_analysis\\research'

In [3]:
# Step up one level from the notebook's own folder so that the relative paths
# used below are resolved from the parent directory (presumably the project
# root, given the `src.` imports further down — confirm).
# NOTE: this is not idempotent; re-running the cell moves up one more level.
os.chdir("../")

In [4]:
%pwd

'c:\\DataScience\\Projects\\Text_analysis'

In [5]:

from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class TextProcessingConfig:
    """Immutable set of filesystem locations consumed by the text-processing step.

    NOTE(review): the fields are annotated as ``Path``, but
    ``ConfigurationManager.get_text_processing_config`` passes the values read
    from the YAML configuration through unconverted, so at runtime they may be
    plain strings — confirm before relying on ``Path`` methods.
    """
    # Directory created by ConfigurationManager before this config is built.
    root_dir: Path
    # Folder whose files are read as raw stop-word lists.
    stop_words_folder_path: Path
    # Folder that receives the cleaned text files.
    processed_text_path: Path
    # Parent folder under which a "StopWords" output directory is created.
    merged_stop_words_path: Path
    # Folder whose files are read as raw extracted text.
    extracted_files_folder_path: Path
    # Not referenced by the code visible in this notebook — purpose to be confirmed.
    destination_folder: Path

In [6]:
from src.text_analysis.constants import *
from src.text_analysis.utils.common import read_yaml_file, create_directories

In [7]:
class ConfigurationManager:
    """Reads the project configuration file and hands out per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        """Load the YAML file at ``config_filepath`` and create the artifacts root.

        Args:
            config_filepath: Location of the YAML configuration file; defaults
                to ``CONFIG_FILE_PATH`` from the project constants.
        """
        self.config = read_yaml_file(config_filepath)
        create_directories([self.config.artifacts_root])

    def get_text_processing_config(self) -> TextProcessingConfig:
        """Build the configuration for the text-processing stage.

        Creates the stage's ``root_dir`` and copies the path settings from the
        ``text_processing`` section of the loaded configuration, unchanged,
        into a ``TextProcessingConfig``.

        Returns:
            TextProcessingConfig: the populated configuration object.
        """
        section = self.config.text_processing
        create_directories([section.root_dir])

        # Field names are identical in the YAML section and in the dataclass,
        # so each value is looked up by name, in declaration order.
        field_names = (
            "root_dir",
            "stop_words_folder_path",
            "processed_text_path",
            "merged_stop_words_path",
            "extracted_files_folder_path",
            "destination_folder",
        )
        return TextProcessingConfig(
            **{name: getattr(section, name) for name in field_names}
        )

In [8]:
import re

In [9]:
class TextProcessing:
    """Cleans stop-word lists and extracted text files at the configured paths."""

    def __init__(self, config: TextProcessingConfig):
        """Store the configuration that supplies every input and output path."""
        self.config = config

    def merge_stop_words_files(self):
        """Clean every stop-word file and write it to the ``StopWords`` output folder.

        Each regular file in ``config.stop_words_folder_path`` is read as
        ISO-8859-1 and processed line by line:

        * lines containing a URL (``http://``, ``https://``, ``www.``) or the
          census attribution text are dropped;
        * lines containing ``|`` are split on it, parenthesised text is
          removed from each part, and the first two parts are emitted on
          separate lines;
        * all other lines are kept unchanged.

        The cleaned lines are written to a file of the same name inside
        ``<config.merged_stop_words_path>/StopWords``, one output file per
        input file. Entries that are not regular files are skipped.
        """
        folder_path = self.config.stop_words_folder_path
        print(folder_path)
        output_dir = os.path.join(self.config.merged_stop_words_path, "StopWords")
        os.makedirs(output_dir, exist_ok=True)
        print(output_dir)
        unwanted_pattern = re.compile(r"(http[s]?://|www\.)|Surnames from 1990 census|census\.gov")
        parenthetical_pattern = re.compile(r'\s*\(.*?\)\s*')

        # List the folder once so the printed listing and the processed files agree.
        filenames = os.listdir(folder_path)
        print(filenames)

        for filename in filenames:
            file_path = os.path.join(folder_path, filename)
            if not os.path.isfile(file_path):
                continue

            with open(file_path, "r", encoding="ISO-8859-1") as file:
                lines = file.readlines()

            new_lines = []
            for line in lines:
                if unwanted_pattern.search(line):
                    continue

                if '|' in line:
                    # Split by '|', strip whitespace, and remove text in parentheses
                    parts = [parenthetical_pattern.sub('', part).strip() for part in line.split('|')]
                    # A line containing '|' always yields at least two parts.
                    new_lines.append(parts[0] + '\n' + parts[1] + '\n')
                else:
                    # No '|' separator: keep the line as it is
                    new_lines.append(line)

            # Write inside the loop so every input file gets its own output;
            # writing after the loop would only persist the last file.
            output_path = os.path.join(output_dir, filename)
            with open(output_path, "w", encoding="ISO-8859-1") as file:
                file.writelines(new_lines)

    def process_text(self):
        """Normalise every extracted text file and save it under the same name.

        For each regular file in ``config.extracted_files_folder_path`` (read
        as UTF-8): everything up to and including the first separator line
        (a run of ``=`` followed by a blank line) is discarded when present,
        every character that is neither an ASCII letter nor whitespace is
        removed, the text is lower-cased, and blank lines are dropped. The
        result is written as UTF-8 to ``config.processed_text_path``, which is
        created if it does not exist. Entries that are not regular files are
        skipped.
        """
        folder_path = self.config.extracted_files_folder_path
        output_dir = self.config.processed_text_path
        os.makedirs(output_dir, exist_ok=True)

        content_marker = '==================================================\n\n'

        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            if not os.path.isfile(file_path):
                continue

            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()

            content_start = text.find(content_marker)
            if content_start != -1:
                # Skip the whole marker; derive the offset from the marker
                # itself instead of hard-coding its length.
                text = text[content_start + len(content_marker):]

            text = re.sub(r"[^a-zA-Z\s]", "", text).lower()
            text = "\n".join(line for line in text.splitlines() if line.strip())

            output_path = os.path.join(output_dir, filename)
            with open(output_path, 'w', encoding='utf-8') as file:
                file.write(text)

In [10]:
# Run the text-processing stage end to end: load the configuration, clean the
# stop-word files, then clean the extracted text files. Exceptions are left to
# propagate on their own; wrapping the calls in a try/except that only
# re-raises adds nothing but an extra traceback frame.
config = ConfigurationManager()
text_processing_config = config.get_text_processing_config()
text_processing = TextProcessing(config=text_processing_config)
text_processing.merge_stop_words_files()
text_processing.process_text()

[2024-11-14 18:47:18,987: INFO:common: yaml file: config\config.yaml loaded successfully]
[2024-11-14 18:47:18,987: INFO:common: createD Directory at:artifacts]
[2024-11-14 18:47:18,987: INFO:common: createD Directory at:artifacts/text_processing]
assets/StopWords
artifacts/text_processing\StopWords
['StopWords_Auditor.txt', 'StopWords_Currencies.txt', 'StopWords_DatesandNumbers.txt', 'StopWords_Generic.txt', 'StopWords_GenericLong.txt', 'StopWords_Geographic.txt', 'StopWords_Names.txt']
