In [33]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
import re
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
def scrape_article(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            article_title = soup.find("h1").get_text().strip()
            paras = soup.find_all("p")
            article_text = "".join(para.get_text().strip() for para in paras)
            return article_title, article_text
    except Exeption as e:
        print(f"Error while scraping {url}: {e}")
    return None, None

In [3]:
stop_words = set("stopwords")

In [4]:
with open("MasterDictionary/positive-words.txt","r") as file:
    positive_words = set(word.strip() for word in file.readlines() if word.strip() not in stop_words)

In [5]:
with open("MasterDictionary/negative-words.txt","r") as file:
    negative_words = set(word.strip() for word in file.readlines() if word.strip() not in stop_words)

In [6]:
df_input = pd.read_excel("input.xlsx")

In [7]:
df_input.head()

Unnamed: 0,URL_ID,URL
0,123.0,https://insights.blackcoffer.com/rise-of-telem...
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...
4,432.0,https://insights.blackcoffer.com/rise-of-telem...


In [8]:
for index, row in df_input.iterrows():
    url_id = int(row["URL_ID"])
    url = row["URL"]
    article_title, article_text = scrape_article(url)
    if article_text:
        with open(f"{url_id}.txt", "w", encoding= "utf-8") as f:
            f.write(article_text)
        df_input.loc[index, "ARTICLE_TITLE"] = article_title
        df_input.loc[index, "ARTICLE_FILE"] = f"{url_id}.txt"
    else:
        df_input.loc[index, "ARTICLE_TITLE"] = ""
        df_input.loc[index, "ARTICLE_FILE"] = ""

In [9]:
df_input.head()

Unnamed: 0,URL_ID,URL,ARTICLE_TITLE,ARTICLE_FILE
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,Rise of telemedicine and its Impact on Livelih...,123.txt
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,Rise of e-health and its impact on humans by t...,321.txt
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,Rise of e-health and its impact on humans by t...,2345.txt
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,Rise of telemedicine and its Impact on Livelih...,4321.txt
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,Rise of telemedicine and its Impact on Livelih...,432.txt


In [31]:
def sentiment_scores(text):
    blob = TextBlob(text)
    positive_score = sum(1 for word in blob.words if word.lower() in positive_words)
    negative_score = sum(1 for word in blob.words if word.lower() in negative_words)
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(blob.words) + 0.000001)
    return positive_score, negative_score, polarity_score, subjectivity_score

In [23]:
def count_syllables(word):
    vowels = 'aeiouyAEIOUY'
    count = 0
    for i in range(1, len(word)):
        if word[i] in vowels and word[i - 1] not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
    if count == 0:
        count = 1
    return count

In [24]:
def average_word_length(text):
    words = word_tokenize(text)
    return sum(len(word) for word in words) / len(words)

In [25]:
import os

In [54]:
output_data = []

In [55]:
for index, row in df_input.iterrows():
    article_file = row["ARTICLE_FILE"]
    if not article_file:
        continue
    if os.path.exists(article_file):
        with open(article_file, "r", encoding="utf-8") as f:
            article_text = f.read()
    else:
        print(f"File not found: {article_file}")
        continue
    positive_score, negative_score, polarity_score, subjectivity_score = sentiment_scores(article_text)
    sentences = sent_tokenize(article_text)
    word_count = len(word_tokenize(article_text))
    avg_sentence_length = word_count / len(sentences)
    complex_word_count = sum(1 for word in word_tokenize(article_text) if count_syllables(word) > 2)
    percentage_complex_words = complex_word_count / word_count
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_words_per_sentence = word_count / len(sentences)
    syllables_per_word = sum(count_syllables(word) for word in word_tokenize(article_text)) / word_count
    personal_pronouns = len(re.findall(r'\b(I|we|my|ours|us)\b', article_text, re.IGNORECASE))
    avg_word_len = average_word_length(article_text)

    output_data.append([row['URL_ID'], row['URL'], row['ARTICLE_TITLE'], positive_score, negative_score,
                        polarity_score, subjectivity_score, avg_sentence_length, percentage_complex_words,
                        fog_index, avg_words_per_sentence, complex_word_count, word_count, syllables_per_word,
                        personal_pronouns, avg_word_len])

In [57]:
print(output_data[7])

[3817.4, 'https://insights.blackcoffer.com/how-does-marketing-influence-businesses-and-consumers/', 'How does marketing influence businesses and consumers?', 73, 9, 0.780487795359905, 0.043455219902779424, 28.930555555555557, 0.18098895823331734, 11.644617805515551, 28.930555555555557, 377, 2083, 1.6024963994239079, 7, 4.936149783965434]


In [58]:
output_columns = ['URL_ID', 'URL', 'ARTICLE_TITLE', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
                  'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
                  'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD',
                  'PERSONAL PRONOUNS', 'AVG WORD LENGTH']

In [59]:
df_output = pd.DataFrame(output_data, columns = output_columns)

In [60]:
df_output.to_excel("output.xlsx", index=False)