In [None]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
import re

def _read_word_set(path):
    """Return the set of whitespace-stripped lines in the text file at *path*.

    The file is decoded as ISO-8859-1 and is closed as soon as it has been
    read, so no file handle is left open at module level.
    """
    with open(path, encoding='ISO-8859-1') as handle:
        return {line.strip() for line in handle}


# Union of every line found in the *.txt files of the stop-word directory.
stop_words = set()
stopwords_directory = '/content/drive/MyDrive/StopWords'
for filename in os.listdir(stopwords_directory):
    if filename.endswith('.txt'):
        stop_words.update(_read_word_set(os.path.join(stopwords_directory, filename)))

# Sentiment dictionaries: one entry per line of each file.
# NOTE(review): entries are stored exactly as written in the files (no case
# folding), while calculate_sentiment lower-cases its tokens before lookup --
# confirm the dictionary files use the casing the callers expect.
positive_words = _read_word_set('/content/drive/MyDrive/MasterDictionary/MasterDictionary/positive-words.txt')
negative_words = _read_word_set('/content/drive/MyDrive/MasterDictionary/MasterDictionary/negative-words.txt')

def extract_text_and_headings(html_content):
    """Pull paragraph text and heading texts out of an HTML document.

    Args:
        html_content: HTML markup (str or bytes) to parse.

    Returns:
        A ``(text, headings)`` tuple: ``text`` is the text of every ``<p>``
        element joined with single spaces, and ``headings`` is a list with
        the text of every ``<h1>``..``<h6>`` element in document order.
    """
    document = BeautifulSoup(html_content, 'html.parser')

    # Detach page chrome and embedded media first so that anything nested
    # inside those elements does not contribute to the extracted text.
    unwanted_tags = ['header', 'footer', 'img', 'iframe', 'media']
    for element in document.find_all(unwanted_tags):
        element.extract()

    paragraph_texts = []
    for paragraph in document.find_all('p'):
        paragraph_texts.append(paragraph.get_text())
    text = ' '.join(paragraph_texts)

    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    headings = []
    for heading in document.find_all(heading_tags):
        headings.append(heading.get_text())

    return text, headings

def scrape_url(url, timeout=30):
    """Download *url* and extract its paragraph text and headings.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the whole run indefinitely.

    Returns:
        ``(text, headings)`` as produced by ``extract_text_and_headings`` when
        the server answers with HTTP 200; ``(None, None)`` when the request
        fails (network error, timeout, invalid URL) or returns another status.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # A single unreachable page must not abort processing of the others.
        print(f"Failed to fetch URL: {url} ({exc})")
        return None, None

    if response.status_code != 200:
        print(f"Failed to fetch URL: {url}")
        return None, None

    return extract_text_and_headings(response.content)

def calculate_sentiment(text):
    """Score *text* against the positive and negative word dictionaries.

    The text is lower-cased and tokenized; only alphanumeric tokens that are
    not in ``stop_words`` are scored.

    Returns:
        ``(pos_score, neg_score, polarity, subjectivity)`` where the scores
        are hit counts, ``polarity`` is ``(pos - neg) / (pos + neg)`` and
        ``subjectivity`` is ``(pos + neg) / kept_token_count``. Both ratios
        add a tiny epsilon to the denominator so they never divide by zero.
    """
    tokens = word_tokenize(text.lower())
    kept_tokens = [
        token for token in tokens
        if token.isalnum() and token not in stop_words
    ]

    pos_score = len([token for token in kept_tokens if token in positive_words])
    neg_score = len([token for token in kept_tokens if token in negative_words])

    total_hits = pos_score + neg_score
    polarity = (pos_score - neg_score) / (total_hits + 0.000001)
    subjectivity = total_hits / (len(kept_tokens) + 0.000001)

    return pos_score, neg_score, polarity, subjectivity

def calculate_readability(text):
    """Compute sentence-length and complex-word readability figures for *text*.

    A token counts as "complex" here when it is alphanumeric, longer than two
    characters and not in ``stop_words``.
    NOTE(review): the conventional Gunning fog definition counts words of
    three or more syllables and uses a percentage rather than a fraction;
    the existing definition is kept unchanged -- confirm which is intended.

    Returns:
        ``(avg_sentence_length, percentage_complex_words, fog_index,
        avg_words_per_sentence, complex_word_count)``. The two averages are
        the same rounded tokens-per-sentence value. When the text yields no
        tokens or no sentences every figure is zero instead of raising
        ``ZeroDivisionError``.
    """
    words = word_tokenize(text)
    sentences = sent_tokenize(text)

    # Whitespace-only text (e.g. a page whose <p> tags are all empty) is
    # truthy but tokenizes to nothing; guard the divisions below.
    if not words or not sentences:
        return 0, 0.0, 0.0, 0, 0

    avg_sentence_length = round(len(words) / len(sentences))
    complex_word_count = sum(
        1 for word in words
        if len(word) > 2 and word.isalnum() and word not in stop_words
    )
    percentage_complex_words = complex_word_count / len(words)

    fog_index = round(0.4 * (avg_sentence_length + percentage_complex_words), 4)
    # Reported under a second name by callers; same value as above.
    avg_words_per_sentence = avg_sentence_length

    return avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, complex_word_count

def _count_syllables(word):
    """Estimate the syllables in *word* by counting groups of adjacent vowels.

    One is subtracted when the word ends in a lowercase ``e``, and the result
    is never less than one.
    """
    vowels = 'aeiouAEIOU'
    count = 0
    previous_char = None
    for char in word:
        # Only the first vowel of a run of vowels starts a new syllable.
        if char in vowels and (previous_char is None or previous_char not in vowels):
            count += 1
        previous_char = char
    if word.endswith('e'):
        count -= 1
    return max(count, 1)


def calculate_syllable_per_word(text):
    """Return the estimated syllable total divided by the token count of *text*.

    Syllables are only counted for tokens that, after removing ``.,!?``, are
    alphanumeric, longer than two characters and not in ``stop_words``; the
    divisor is the number of all tokens. Returns ``0.0`` when the text yields
    no tokens instead of raising ``ZeroDivisionError``.
    """
    words = word_tokenize(text)
    if not words:
        return 0.0

    total_syllables = 0
    for word in words:
        word = re.sub(r'[.,!?]', '', word)
        if len(word) > 2 and word.isalnum() and word not in stop_words:
            total_syllables += _count_syllables(word)

    avg_syllables_per_word = total_syllables / len(words)
    return avg_syllables_per_word

# "I" is matched case-sensitively (so the "i" of "i.e." is ignored); the other
# pronouns are matched in any letter case so sentence-initial "We", "My", ...
# are counted. The look-ahead rejects the all-caps token "US", which is far
# more likely to be the country abbreviation than the pronoun.
_PERSONAL_PRONOUN_RE = re.compile(r'\b(?:I|(?!US\b)(?i:we|my|ours|us))\b')


def calculate_personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    Matching is on whole words only. Capitalized forms such as "We" or "My"
    are counted; the all-uppercase token "US" is not.

    Args:
        text: The text to scan.

    Returns:
        The number of pronoun occurrences found in *text*.
    """
    return len(_PERSONAL_PRONOUN_RE.findall(text))

def calculate_avg_word_length(text):
    """Return the mean character length of the tokens of *text*.

    Every token produced by ``word_tokenize`` is included in the average.
    Returns ``0.0`` when the text yields no tokens instead of raising
    ``ZeroDivisionError``.
    """
    words = word_tokenize(text)
    if not words:
        return 0.0

    total_characters = sum(len(word) for word in words)
    avg_word_length = total_characters / len(words)
    return avg_word_length

def main(excel_file='/content/Input.xlsx', output_file='output.xlsx',
         text_files_directory='extracted_text'):
    """Scrape every URL listed in a spreadsheet and write text metrics back.

    For each row the page at ``URL`` is downloaded, its paragraph text is
    saved to ``text_<URL_ID>.txt`` inside *text_files_directory*, and the
    computed scores are stored in the row. Rows whose URL is missing or whose
    page cannot be fetched are left without scores.

    Args:
        excel_file: Spreadsheet to read; must have ``URL_ID`` and ``URL``
            columns.
        output_file: Path of the spreadsheet written with the results.
        text_files_directory: Directory for the extracted text files; it is
            created when absent.
    """
    df = pd.read_excel(excel_file)
    os.makedirs(text_files_directory, exist_ok=True)

    for index, row in df.iterrows():
        url_id = row['URL_ID']
        url = row['URL']

        # Blank spreadsheet cells are read as NaN (a float), which has no
        # .strip(); skip them instead of aborting the run and losing results.
        if not isinstance(url, str) or not url.strip():
            print(f"Skipping URL ID {url_id}: missing or invalid URL.")
            continue

        text, _headings = scrape_url(url.strip())
        if not text:
            continue

        text_filename = os.path.join(text_files_directory, f'text_{url_id}.txt')
        with open(text_filename, 'w', encoding='utf-8') as text_file:
            text_file.write(text)

        pos_score, neg_score, polarity, subjectivity = calculate_sentiment(text)
        (avg_sentence_length, percentage_complex_words, fog_index,
         avg_words_per_sentence, complex_word_count) = calculate_readability(text)
        word_count = len(word_tokenize(text))
        syllable_per_word = calculate_syllable_per_word(text)
        personal_pronouns = calculate_personal_pronouns(text)
        avg_word_length = calculate_avg_word_length(text)

        # Column name -> value, in the order the columns appear in the output.
        metrics = {
            'POSITIVE SCORE': pos_score,
            'NEGATIVE SCORE': neg_score,
            'POLARITY SCORE': polarity,
            'SUBJECTIVITY SCORE': subjectivity,
            'AVG SENTENCE LENGTH': avg_sentence_length,
            'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
            'FOG INDEX': fog_index,
            'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
            'COMPLEX WORD COUNT': complex_word_count,
            'WORD COUNT': word_count,
            'SYLLABLE PER WORD': syllable_per_word,
            'PERSONAL PRONOUNS': personal_pronouns,
            'AVG WORD LENGTH': round(avg_word_length, 4),
        }
        for column, value in metrics.items():
            df.at[index, column] = value

        print(f"Scores calculated and updated for URL ID {url_id}.")
        print(f"Extracted text saved to {text_filename}")

    df.to_excel(output_file, index=False)
    print("Results saved to", output_file)


if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Scores calculated and updated for URL ID 123.0.
Extracted text saved to extracted_text/text_123.0.txt
Scores calculated and updated for URL ID 321.0.
Extracted text saved to extracted_text/text_321.0.txt
Scores calculated and updated for URL ID 2345.0.
Extracted text saved to extracted_text/text_2345.0.txt
Scores calculated and updated for URL ID 4321.0.
Extracted text saved to extracted_text/text_4321.0.txt
Scores calculated and updated for URL ID 432.0.
Extracted text saved to extracted_text/text_432.0.txt
Scores calculated and updated for URL ID 2893.8.
Extracted text saved to extracted_text/text_2893.8.txt
Scores calculated and updated for URL ID 3355.6.
Extracted text saved to extracted_text/text_3355.6.txt
Scores calculated and updated for URL ID 3817.4.
Extracted text saved to extracted_text/text_3817.4.txt
Scores calculated and updated for URL ID 4279.2.
Extracted text saved to extracted_text/text_4279.2.txt
Scores calculated and updated for URL ID 4741.0.
Extracted text saved 