In [10]:
!pip install beautifulsoup4
!pip install nltk



In [4]:
import nltk

# Stop-word list used when filtering complex words.
nltk.download('stopwords')
# Punkt sentence models: sent_tokenize/word_tokenize raise LookupError without
# them. Newer NLTK releases look for 'punkt_tab', older ones for 'punkt', so
# fetch both to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')



In [18]:
#Importing Required Libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob

In [25]:
# Step 2: Define Function to Extract Text from URL
def extract_text_from_url(url, timeout=10):
    """Download a web page and return the text of its <p> elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up (default 10).

    Returns:
        The text of every <p> tag, each followed by a newline (an empty
        string when the page has no <p> tags), or None if fetching or
        parsing failed. Failures are reported with print() and not raised.
    """
    try:
        # Without a timeout a stalled server can block the whole run.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx responses as failures instead of parsing the error page
        # as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Only <p> tags are considered article text.
        return ''.join(paragraph.get_text() + '\n' for paragraph in soup.find_all('p'))
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller
        # skip this URL rather than abort the whole batch.
        print(f"Error extracting text from {url}: {e}")
        return None

In [26]:
# Defining Function to Calculate Text Analysis Variables
def calculate_text_variables(text):
    """Compute sentiment and readability metrics for one piece of text.

    Tokenisation uses NLTK's ``sent_tokenize`` and ``word_tokenize``; every
    token returned by ``word_tokenize`` is counted as a word. A "complex"
    word here is a non-stopword token longer than six characters, and
    syllables are approximated by counting the vowel letters
    (a, e, i, o, u, y) in each token.

    Args:
        text: Text to analyse. ``None``, empty or whitespace-only text is
            accepted and yields all-zero metrics.

    Returns:
        A 13-tuple, in this order: positive score (number of sentences with
        polarity > 0), negative score (number of sentences with polarity < 0),
        polarity, subjectivity, average sentence length, percentage of complex
        words, fog index, average words per sentence, complex word count,
        word count, syllables per word, personal pronoun count,
        average word length.
    """
    no_content = (0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0.0, 0, 0.0)
    # Blank input has nothing to measure; return zeros instead of dividing by zero.
    if not text or not text.strip():
        return no_content

    # Tokenizing text into sentences and words
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    word_count = len(words)
    sentence_count = len(sentences)
    # Every ratio below divides by one of these counts.
    if word_count == 0 or sentence_count == 0:
        return no_content

    # Complex words: not a stopword and longer than six characters
    stopwords_set = set(stopwords.words("english"))
    complex_word_count = sum(
        1 for word in words
        if word.lower() not in stopwords_set and len(word) > 6
    )
    complex_word_percentage = complex_word_count / word_count * 100

    # Average word length, sentence length, vowel-letter count, and Fog index
    avg_word_length = sum(len(word) for word in words) / word_count
    avg_sentence_length = (
        sum(len(word_tokenize(sentence)) for sentence in sentences) / sentence_count
    )
    syllable_count = sum(
        1 for word in words for char in word if char.lower() in 'aeiouy'
    )
    fog_index = 0.4 * (avg_sentence_length + (100 * (complex_word_count / word_count)))

    # Counting personal pronouns.
    # NOTE: matching is case-insensitive, so the token "US" is counted as "us".
    personal_pronouns = {'i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours'}
    personal_pronoun_count = sum(1 for word in words if word.lower() in personal_pronouns)

    # Sentiment analysis using TextBlob: whole-text scores plus one polarity
    # per sentence (computed once and reused for both counts).
    sentiment = TextBlob(text).sentiment
    polarity_score = sentiment.polarity
    subjectivity_score = sentiment.subjectivity
    sentence_polarities = [TextBlob(sentence).sentiment.polarity for sentence in sentences]
    positive_score = sum(1 for polarity in sentence_polarities if polarity > 0)
    negative_score = sum(1 for polarity in sentence_polarities if polarity < 0)

    # Returning computed variables
    return (positive_score, negative_score, polarity_score, subjectivity_score,
            avg_sentence_length, complex_word_percentage, fog_index,
            word_count / sentence_count, complex_word_count, word_count,
            syllable_count / word_count, personal_pronoun_count, avg_word_length)

In [21]:
# Load the input workbook; the extraction cell reads its URL_ID and URL columns.
input_data = pd.read_excel(io="Input.xlsx")

In [28]:
#  Extracting Text from URLs and Perform Text Analysis
# Re-initialised here so re-running the cell never appends duplicate rows.
output_data = []
for _, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    # Extracting article text from URL
    article_text = extract_text_from_url(url)
    # Skip failed fetches (None) and pages with no usable text. Empty <p> tags
    # produce a newline-only string, which is truthy but has nothing to analyse.
    if not article_text or not article_text.strip():
        print(f"Skipping URL_ID {url_id}: no article text extracted from {url}")
        continue
    # Calculating text analysis variables
    text_variables = calculate_text_variables(article_text)
    output_data.append((url_id, *text_variables))

In [23]:
# Build the results table: one row per analysed URL, one labelled column per
# value in each result tuple (URL_ID first, then the metrics in tuple order).
output_columns = [
    'URL_ID',
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH',
]
output_df = pd.DataFrame(output_data, columns=output_columns)

In [24]:
# Save the results table as an Excel workbook, leaving out the row index.
output_df.to_excel(excel_writer="Output.xlsx", index=False)