In [1]:
import os

In [2]:
%pwd

'c:\\DataScience\\Projects\\Text_analysis\\research'

In [3]:
# Switch the working directory to the parent of the notebook's folder so the
# project-relative imports and paths used by the later cells resolve.
# NOTE(review): the target is relative, so re-running this cell climbs one
# more level every time -- execute it only once per kernel session.
os.chdir("../")

In [4]:
%pwd

'c:\\DataScience\\Projects\\Text_analysis'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class TextAnalysisConfig:
    """Immutable set of file-system locations consumed by the text-analysis stage.

    The values are passed through unchanged from the loaded configuration, so
    they may be plain strings rather than ``Path`` objects at runtime
    (NOTE(review): confirm against ``read_yaml_file``).
    """

    # Working directory of this stage; created before the config is built.
    root_dir: Path
    # Whitespace-separated list of positive words (read as ISO-8859-1).
    positive_words_path: Path
    # Whitespace-separated list of negative words (read as ISO-8859-1).
    negative_words_path: Path
    # Whitespace-separated list of stop words (read as ISO-8859-1).
    merged_stop_words_path: Path
    # Directory of raw files holding the "Title:" and "Source URL:" lines.
    raw_text_path: Path
    # Directory of processed text files that the metrics are computed on.
    processed_text_path: Path
    # Destination of the Excel report.
    output_file_path: Path
    # Not referenced by the code visible in this notebook.
    destination_folder: Path

In [6]:
from src.text_analysis.constants import *
from src.text_analysis.utils.common import read_yaml_file, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project configuration and builds stage-specific config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH):
        """Read the configuration file and create the artifacts root directory.

        Args:
            config_filepath: Location of the configuration file handed to
                ``read_yaml_file``; defaults to ``CONFIG_FILE_PATH``.
        """
        self.config = read_yaml_file(config_filepath)

        create_directories([self.config.artifacts_root])

    def get_text_analysis_config(self) -> TextAnalysisConfig:
        """Build the ``TextAnalysisConfig`` from the ``text_analysis`` section.

        The stage's ``root_dir`` is created as a side effect.

        Returns:
            TextAnalysisConfig: the paths of the ``text_analysis`` section,
            copied field by field.
        """
        section = self.config.text_analysis

        create_directories([section.root_dir])

        return TextAnalysisConfig(
            root_dir=section.root_dir,
            positive_words_path=section.positive_words_path,
            # Each dictionary must come from its own key; wiring the positive
            # path in here would make the negative dictionary a copy of the
            # positive one and leave the negative score permanently at zero.
            negative_words_path=section.negative_words_path,
            merged_stop_words_path=section.merged_stop_words_path,
            raw_text_path=section.raw_text_path,
            processed_text_path=section.processed_text_path,
            output_file_path=section.output_file_path,
            destination_folder=section.destination_folder,
        )

In [8]:
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd

In [9]:
class TextAnalysis:
    def __init__(self, config: TextAnalysisConfig):
        """Keep the configuration whose paths the analysis methods read."""
        self.config = config
        
    def word_count(self,text):
        
        stop_words=set(open(self.config.merged_stop_words_path,encoding="ISO-8859-1").read().split())
        
        words=word_tokenize(text)
        words=[word for word in words if word not in stop_words]
        
        return len(words), words
    
    def total_words_count(self,text):
        """Count the space-separated tokens of ``text`` and their mean length.

        Lines are split on ``"\\n"`` and tokens on a single space, so empty
        tokens produced by consecutive spaces or blank lines are counted too.

        Args:
            text: The text to measure.

        Returns:
            tuple: ``(total_words, avg_word_len)`` -- the number of tokens and
            the average number of characters per token.
        """
        tokens = [token for line in text.split("\n") for token in line.split(" ")]
        total_words = len(tokens)
        total_characters = sum(len(token) for token in tokens)

        # str.split always yields at least one token, so total_words >= 1 and
        # the division is safe. Dividing is what turns the character total
        # into the average this method is named and reported for.
        avg_word_len = total_characters / total_words

        return total_words, avg_word_len
    
    def sentiment_check(self,text):
        
        positive_words_path=self.config.positive_words_path
        negative_words_path=self.config.negative_words_path
        
        positive_words = set(open(positive_words_path,encoding="ISO-8859-1").read().split())
        negative_words = set(open(negative_words_path,encoding="ISO-8859-1").read().split())
        
        positive_score = 0
        negative_score = 0
        total_words=0
        
        for sentence in text.split("\n"):
            for word in sentence.split(" "):
                total_words=total_words+1
                if word in positive_words:
                    positive_score += 1
                elif word in negative_words:
                    negative_score -= 1
                
        polarity_score = (positive_score + negative_score) / ((positive_score - negative_score) + 0.000001)
        subjectivity_score =(positive_score + negative_score)/ (total_words + 0.000001)
        
        return positive_score, negative_score, polarity_score, subjectivity_score
    
    def sentence_analysis(self,text,total_words):
        """Compute two per-line averages, treating each line as a sentence.

        Args:
            text: Text whose lines (split on ``"\\n"``) are the sentences.
            total_words: Word total supplied by the caller.

        Returns:
            tuple: ``(avg_words_sen, avg_sen_len)`` -- the mean number of
            space-separated tokens per line, and ``total_words`` divided by
            the number of lines.
        """
        lines = text.split("\n")
        tokens_per_line = [len(line.split(" ")) for line in lines]

        avg_sen_len = total_words / len(lines)
        avg_words_sen = sum(tokens_per_line) / len(tokens_per_line)
        return avg_words_sen, avg_sen_len
    
    def count_syllables(self,word):
        """Estimate the syllables of ``word`` by counting its vowel groups.

        Each maximal run of the letters ``a, e, i, o, u`` (case-insensitive)
        counts once; one is subtracted when the word ends in ``e``, ``es`` or
        ``ed``; the result is never less than 1.

        Args:
            word: The word to inspect.

        Returns:
            int: The estimated syllable count, at least 1.
        """
        lowered = word.lower()
        vowels = "aeiou"

        # A vowel group starts at every vowel that is not preceded by a vowel.
        vowel_groups = sum(
            1
            for position, letter in enumerate(lowered)
            if letter in vowels
            and (position == 0 or lowered[position - 1] not in vowels)
        )

        # Trailing "e", "es" and "ed" are treated as silent.
        if lowered.endswith(("e", "es", "ed")):
            vowel_groups -= 1

        return max(1, vowel_groups)
    
    # Function to count complex words in text
    def count_complex_words(self,text,avg_sen_len):
        complex_words_count=0
        cnt=0
        for sentence in text.split("\n"):
            # Count complex words based on syllable count
            for word in sentence.split(" "):
                cnt=cnt+1
                if self.count_syllables(word) >= 3:
                    complex_words_count += 1
        
        pct_complex_words=complex_words_count/cnt
        fog_index = 0.4 * (avg_sen_len + pct_complex_words)

        
        return complex_words_count, pct_complex_words, fog_index
    
    def syllable_count_per_word(self,text,total_words):
        syllable_words_count=0
        for sentence in text.split("\n"):
            for word in sentence.split(" "):
                syllable_words_count=syllable_words_count+self.count_syllables(word)

        syllable_words_count=syllable_words_count/total_words
        return syllable_words_count
    
    def pronoun_count(self,text):
        """Count the pronouns in ``text``, case-insensitively.

        Tokens are obtained by splitting each line on single spaces, so a
        token with punctuation attached (e.g. ``"it,"``) does not match.

        Args:
            text: The text to scan.

        Returns:
            int: Number of single-word pronoun tokens plus the number of
            ``"no one"`` occurrences (two adjacent tokens on the same line).
        """
        # A frozenset removes the repeated entries and gives constant-time
        # membership tests.
        single_word_pronouns = frozenset({
            # Personal pronouns (subjective and objective)
            "i", "me", "you", "he", "him", "she", "her", "it", "we", "us", "they", "them",

            # Possessive pronouns
            "my", "mine", "your", "yours", "his", "hers", "its", "our", "ours", "their", "theirs",

            # Reflexive pronouns
            "myself", "yourself", "himself", "herself", "itself", "ourselves", "yourselves", "themselves",

            # Demonstrative pronouns
            "this", "that", "these", "those",

            # Relative and interrogative pronouns
            "who", "whom", "whose", "which", "what",

            # Indefinite pronouns
            "anybody", "anyone", "anything", "each", "everybody", "everyone", "everything",
            "nobody", "nothing", "somebody", "someone", "something",
            "any", "all", "some", "none", "both", "few", "several", "many", "others",
        })
        # "no one" spans two tokens, so it can never equal a single
        # space-separated token and has to be matched as an adjacent pair.
        two_word_pronoun = ("no", "one")

        num_pronoun = 0
        for sentence in text.split("\n"):
            tokens = [token.lower() for token in sentence.split(" ")]
            num_pronoun += sum(1 for token in tokens if token in single_word_pronouns)
            num_pronoun += sum(
                1 for pair in zip(tokens, tokens[1:]) if pair == two_word_pronoun
            )

        return num_pronoun

    def text_analysis(self):
        """Compute every metric for each processed file and write an Excel report.

        For each (processed file, raw file) pair the processed text is scored
        with the other methods of this class, while the raw file is scanned
        for its ``"Title:"`` line (stored as ``URL_ID``) and its
        ``"Source URL:"`` line (stored as ``URL``). One row per pair is
        written to ``self.config.output_file_path``.

        Returns:
            None
        """
        processed_text_path = self.config.processed_text_path
        raw_text_path = self.config.raw_text_path

        # Column order of the report.
        columns = [
            "URL_ID",
            "URL",
            "POSITIVE SCORE",
            "NEGATIVE SCORE",
            "POLARITY SCORE",
            "SUBJECTIVITY SCORE",
            "AVG SENTENCE LENGTH",
            "PERCENTAGE OF COMPLEX WORDS",
            "FOG INDEX",
            "AVG NUMBER OF WORDS PER SENTENCE",
            "COMPLEX WORD COUNT",
            "WORD COUNT",
            "SYLLABLE PER WORD",
            "PERSONAL PRONOUNS",
            "AVG WORD LENGTH",
        ]

        # os.listdir returns entries in arbitrary order, so both listings are
        # sorted to make the positional pairing deterministic.
        # NOTE(review): this still assumes the two directories hold matching
        # files that sort identically -- confirm against the producing stage.
        processed_filenames = sorted(os.listdir(processed_text_path))
        raw_filenames = sorted(os.listdir(raw_text_path))

        # One dict per file; building the frame once from records keeps the
        # columns aligned by construction.
        records = []
        for processed_filename, raw_filename in zip(processed_filenames, raw_filenames):
            raw_file_path = os.path.join(raw_text_path, raw_filename)
            processed_file_path = os.path.join(processed_text_path, processed_filename)
            print(processed_file_path)
            with open(processed_file_path, 'r', encoding='utf-8') as file:
                text = file.read()

            word_count, _ = self.word_count(text)
            total_words, avg_word_len = self.total_words_count(text)
            positive_score, negative_score, polarity_score, subjectivity_score = self.sentiment_check(text)
            avg_words_sen, avg_sen_len = self.sentence_analysis(text, total_words)
            complex_words_count, pct_complex_words, fog_index = self.count_complex_words(text, avg_sen_len)
            syllable_words_count = self.syllable_count_per_word(text, total_words)
            pronouns_count = self.pronoun_count(text)

            # Both stay None when the raw file has no matching line; when a
            # marker occurs more than once the last occurrence wins.
            url_id = None
            url = None
            with open(raw_file_path, "r", encoding="ISO-8859-1") as file:
                for line in file:
                    if line.startswith("Title:"):
                        url_id = line.split("Title:")[1].strip()
                    elif line.startswith("Source URL:"):
                        url = line.split("Source URL:")[1].strip()

            records.append({
                "URL_ID": url_id,
                "URL": url,
                "POSITIVE SCORE": positive_score,
                "NEGATIVE SCORE": negative_score,
                "POLARITY SCORE": polarity_score,
                "SUBJECTIVITY SCORE": subjectivity_score,
                "AVG SENTENCE LENGTH": avg_sen_len,
                "PERCENTAGE OF COMPLEX WORDS": pct_complex_words,
                "FOG INDEX": fog_index,
                # This column takes the per-sentence word average, which is a
                # different value from the sentence length stored above.
                "AVG NUMBER OF WORDS PER SENTENCE": avg_words_sen,
                "COMPLEX WORD COUNT": complex_words_count,
                "WORD COUNT": word_count,
                "SYLLABLE PER WORD": syllable_words_count,
                "PERSONAL PRONOUNS": pronouns_count,
                "AVG WORD LENGTH": avg_word_len,
            })

        # Passing the column list keeps the header row even when no file was
        # processed.
        df = pd.DataFrame(records, columns=columns)

        # Write to Excel file
        df.to_excel(self.config.output_file_path, index=False)

                
    

In [10]:
# Run the text-analysis stage end to end: load the configuration, build the
# analyser and write the report. Any exception propagates with its original
# traceback, so no try/except wrapper is needed.
config = ConfigurationManager()
text_analysis_config = config.get_text_analysis_config()
text_analysis = TextAnalysis(config=text_analysis_config)
text_analysis.text_analysis()

[2024-11-14 19:06:17,784: INFO:common: yaml file: config\config.yaml loaded successfully]
[2024-11-14 19:06:17,787: INFO:common: createD Directory at:artifacts]
[2024-11-14 19:06:17,787: INFO:common: createD Directory at:artifacts/text_analysis]
artifacts/text_processing\Netclan20241017.txt
artifacts/text_processing\Netclan20241018.txt
artifacts/text_processing\Netclan20241019.txt
artifacts/text_processing\Netclan20241020.txt
artifacts/text_processing\Netclan20241021.txt
artifacts/text_processing\Netclan20241022.txt
artifacts/text_processing\Netclan20241023.txt
artifacts/text_processing\Netclan20241024.txt
artifacts/text_processing\Netclan20241025.txt
artifacts/text_processing\Netclan20241026.txt
artifacts/text_processing\Netclan20241027.txt
artifacts/text_processing\Netclan20241028.txt
artifacts/text_processing\Netclan20241029.txt
artifacts/text_processing\Netclan20241030.txt
artifacts/text_processing\Netclan20241031.txt
artifacts/text_processing\Netclan20241032.txt
artifacts/text_pro