In [196]:
import os
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.corpus import cmudict
import pandas as pd

stop_words=set(stopwords.words('english'))
def read_text(file_path):
    with open(file_path,'r',encoding='utf-8')as file:
        text=file.read()
        return text

def clean_text(text):
    words=word_tokenize(text)
    cleaned_words=[word for word in words if word.lower() not in stop_words]
    cleaned_text= ' '.join(cleaned_words)
    return cleaned_text

In [197]:


def sentimental_analyzer(cleaned_txt):
    positive_words = positive_content(cleaned_txt)
    negative_words = negative_content(cleaned_txt)
    positivescore=len(positive_words)
    negativescore=len(negative_words)
    return positivescore,negativescore

def positive_content(cleaned_txt):
    positive_words = []
    analyzer = SentimentIntensityAnalyzer()
    words = cleaned_txt.split()
    for word in words:
        sentiment_scores = analyzer.polarity_scores(word)
        if sentiment_scores['pos'] > sentiment_scores['neg']:
            positive_words.append(word)
    return positive_words

def negative_content(cleaned_txt):
    negative_words = []
    analyzer = SentimentIntensityAnalyzer()
    words = cleaned_txt.split()
    for word in words:
        sentiment_scores = analyzer.polarity_scores(word)
        if sentiment_scores['neg'] > sentiment_scores['pos']:
            negative_words.append(word)
    return negative_words
    

In [198]:
def polarity_score(positivescore, negativescore):
    polarityscore= (positivescore-negativescore)/((positivescore+negativescore)+0.000001)
    return polarityscore

In [199]:
def subjectivity_score(positivescore, negativescore):
    total_number_of_cleanedwords= len(text_file)
    subjectivityscore= (positivescore+negativescore)/total_number_of_cleanedwords
    return subjectivityscore

In [200]:
def sentence_readability(text_file):
    sentence= sent_tokenize(text_file)
    num_sentence=len(sentence)

    words= word_tokenize(text_file)
    num_words=len(words)

    average_sentence_length=num_sentence/num_words
    
    syllable_counter=0

    cmu_dict=cmudict.dict()

    for word in words:
        word=re.sub(r'[^\w\s]','', word.lower())
        if word in cmu_dict:
            syllable_counter+= max([len(list(y for y in x if y[-1].isdigit())) for x in cmu_dict[word]])
            
    percentage_of_complex_words= (syllable_counter/num_words)*100

    fog_index=0.4*(average_sentence_length+percentage_of_complex_words)

    return average_sentence_length, percentage_of_complex_words, fog_index
        
        

In [201]:
def average_words_sentence(text_file):
    sentences= sent_tokenize(text_file)
    sentence_count=len(sentences)

    word_count=0
    for sentence in sentences:
        words=word_tokenize(sentence)
        word_count+=len(words)

    Average_Number_of_Words_Per_Sentence = word_count/sentence_count
    return Average_Number_of_Words_Per_Sentence

In [202]:
def complex_words(textfile):
    words=word_tokenize(textfile)
    complex_word_count=0
    
    cmu_dict=cmudict.dict()

    for word in words:
        word=re.sub(r'[^\w\s]','', word.lower())
        if word in cmu_dict:
            syllable_count= max([len(list(y for y in x if y[-1].isdigit())) for x in cmu_dict[word]])
            if syllable_count>2:
                complex_word_count+=1

    return complex_word_count
    
                
            

In [203]:
def cleaned_word_count(text_file):
    words=word_tokenize(text_file)
    cleaned_words=[]

    for word in words:
        word=re.sub(r'[^\w\s]','', word)
        if word not in stop_words:
            cleaned_words.append(word)
        word_count=len(cleaned_words)
    return word_count

In [204]:
def syllable_word(text_file):
    words=word_tokenize(text_file)
    cmu_dict=cmudict.dict()
    syllables_tot_word=0
    for word in words:
        word=re.sub(r'[^\w\s]','', word.lower())
        if word in cmu_dict:
            syllable_count= max([len(list(y for y in x if y[-1].isdigit())) for x in cmu_dict[word]])
            syllables_tot_word+=syllable_count
        average_syllables_per_word=syllables_tot_word/len(words)
    return average_syllables_per_word
            

In [205]:
def count_pronouns(text_file):
    pronoun_pattern = r'\b(I|we|my|ours|us)\b'
    exclude_pattern = r'\bUS\b'
    pattern_regex= re.compile(pronoun_pattern, flags=re.IGNORECASE)
    exclude_pattern=re.compile(exclude_pattern, flags=re.IGNORECASE)
    my_pronoun=pattern_regex.findall(text_file)
    personal_pronouns = [pronoun for pronoun in my_pronoun if not exclude_pattern.search(pronoun)]
    tot_length_pronoun= len(personal_pronouns)
    return tot_length_pronoun

In [206]:
def word_average(text_file):
    words=word_tokenize(text_file)
    word_length=0
    for word in words:
        word=re.sub(r'[^\w\s]','', word.lower())
        word_length = sum(len(word) for word in words)
    average_word_length=word_length/len(words)
    return average_word_length
        

In [207]:
directory='C:/Users/HP/Desktop/jupyter projects/extracted content'
data=[]
for file_name in os.listdir(directory):
    if file_name.endswith('.txt'):
        file_path=os.path.join(directory, file_name)
        text_file=read_text(file_path)
        cleaned_txt_file= clean_text(text_file)
        positivescore, negativescore= sentimental_analyzer(cleaned_txt_file)
        polarityscore= polarity_score(positivescore, negativescore)
        subjectivityscore= subjectivity_score(positivescore, negativescore)
        average_sentence_length, percentage_of_complex_words, fog_index= sentence_readability(text_file)
        average_words_per_sentence=average_words_sentence(text_file)
        complex_word_count= complex_words(text_file)
        cleaned_words_count=cleaned_word_count(text_file)
        average_syllables_per_word= syllable_word(text_file)
        tot_length_pronoun= count_pronouns(text_file)
        average_word_length=word_average(text_file)

        data.append({
            "URL_ID": file_name,
            "Positive Score": positivescore,
            "Negative Score": negativescore,
            "Polarity Score": polarityscore,
            "Subjectivity Score": subjectivityscore,
            "Average Sentence Length": average_sentence_length,
            "Percentage of Complex Words": percentage_of_complex_words,
            "Fog Index": fog_index,
            "Average Words per Sentence": average_words_per_sentence,
            "Complex Word Count": complex_word_count,
            "Cleaned Words Count": cleaned_words_count,
            "Average Syllables per Word": average_syllables_per_word,
            "Total Pronouns Count": tot_length_pronoun,
            "Average Word Length": average_word_length})
for all_data in data:
    print(all_data)
    print()



df = pd.DataFrame(data)
file_path='C:/Users/HP/Desktop/jupyter projects/output_structure.xlsx'
df.to_excel(file_path, index=False)



    

{'URL_ID': 'blackassign0001 .txt', 'Positive Score': 75, 'Negative Score': 10, 'Polarity Score': 0.7647058733564015, 'Subjectivity Score': 0.012177650429799427, 'Average Sentence Length': 0.0446969696969697, 'Percentage of Complex Words': 131.5909090909091, 'Fog Index': 52.654242424242426, 'Average Words per Sentence': 22.372881355932204, 'Complex Word Count': 141, 'Cleaned Words Count': 843, 'Average Syllables per Word': 1.315909090909091, 'Total Pronouns Count': 11, 'Average Word Length': 4.382575757575758}
{'URL_ID': 'blackassign0002 .txt', 'Positive Score': 94, 'Negative Score': 19, 'Polarity Score': 0.663716808285692, 'Subjectivity Score': 0.011641083753991964, 'Average Sentence Length': 0.03764420157862781, 'Percentage of Complex Words': 152.09471766848816, 'Fog Index': 60.852944748026715, 'Average Words per Sentence': 26.56451612903226, 'Complex Word Count': 321, 'Cleaned Words Count': 1103, 'Average Syllables per Word': 1.5209471766848817, 'Total Pronouns Count': 3, 'Average Wo

PermissionError: [Errno 13] Permission denied: 'C:/Users/HP/Desktop/jupyter projects/output_structure.xlsx'