In [23]:
#Importing necessary librariries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import nltk
nltk.download('punkt')
import string
import warnings
import re
import os
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abz\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [24]:
#Creating repository of stopwords, positive and negative words from file provided
def stopwords(home_directory):
    child_paths = ['Auditor', 'Currencies', 'DatesandNumbers','Generic', 'GenericLong', 'Geographic', 'Names']
    words=[]
    for child in child_paths:
        with open(os.path.join(home_directory, "Stopwords", f"StopWords_{child}.txt"), 'r', encoding="utf-8", errors="ignore") as f:
            words.extend([word for line in f for word in line.split() if word not in ['\n', '|']])
    return words

def word_sentiment (home_directory, stopwords):
    child_paths = ['positive-words', 'negative-words']
    words = []
    for child in child_paths:
        text = ""
        with open(os.path.join(home_directory, "MasterDictionary/", f"{child}.txt"), 'r', encoding = "utf-8", errors = "ignore") as f:
            words.append([word for line in f for word in line.split() if word not in stopwords+['\n', '|']])
    return words

In [25]:
#Reading url
def generate_content(url, stopwords):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        text = soup.get_text()
        sentences = nltk.sent_tokenize(text)
        words = nltk.word_tokenize(text)
        words = [word for word in words if word not in string.punctuation]
        words = [word for word in words if word.lower() not in stopwords]
        return sentences, words
    except Exception as e:
        print(f"Error occurred while processing URL: {e}")
        return None, None

In [26]:
#Word analysis
def word_analysis(words):
    plurals = "aeiouAEIOU"
    total_syallable = 0
    complex_words = 0
    for w in words:
        if not w.endswith(('es', 'ed')):
            syallable = sum([1 for c in w if c in plurals])
            if(syallable > 2): complex_words += 1
            total_syallable += syallable
    syllable_per_word = (total_syallable/len(words))
    complex_percent = (complex_words)/len(words)
    fog_index = 0.4 * (syllable_per_word + complex_percent)
    return syllable_per_word,  complex_words, complex_percent, fog_index

#Pronouns in sentence
def pronoun_counter(sentences):
    pronounRegex = re.compile(r'\b(I|we|my|ours|(?-i:us))\b',re.I)
    pronouns = []
    for s in sentences:
        if s: pronouns.extend(re.findall(pronounRegex, s))
    pronouns_count = sum([1 for w in pronouns])
    return pronouns_count

In [27]:
home_directory= os.getcwd()
stopwords= stopwords(home_directory)
word_sentiment= word_sentiment (home_directory, stopwords)
pos_words = word_sentiment[0]
neg_words = word_sentiment[1]
df = pd.read_excel(os.path.join(home_directory, "Output Data Structure.xlsx"))
df.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,,,,,,,,,,,,,
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,,,,,,,,,,,,,
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,,,,,,,,,,,,,
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,,,,,,,,,,,,,
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,,,,,,,,,,,,,


In [37]:
df.columns

Index(['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
       'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
       'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
       'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
       'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'],
      dtype='object')

In [40]:
#Filling the analysis score for every url
for idx, url in enumerate(df['URL']):
    try:
        response = requests.get(url)
        response.raise_for_status()
        sentences, words = generate_content(url,stopwords)
        pos_score = sum([1 for w in words if w in pos_words])
        neg_score = sum([1 for w in words if w in neg_words])
        syllable_per_word , complex_count, complex_percentage, fog_index = word_analysis(words)
        pronouns_count = pronoun_counter(sentences)
        # Updating data
        df.at[idx, "POSITIVE SCORE"] =  pos_score
        df.at[idx, "NEGATIVE SCORE"] = neg_score
        df.at[idx, "POLARITY SCORE"] = (pos_score - neg_score)/(pos_score + neg_score + 0.000001)
        df.at[idx, 'SUBJECTIVITY SCORE'] = (pos_score + neg_score)/(len(words)+0.000001)
        df.at[idx, 'AVG SENTENCE LENGTH'] = len(words)/len(sentences)
        df.at[idx, 'PERCENTAGE OF COMPLEX WORDS'] = complex_percentage
        df.at[idx, 'FOG INDEX'] = fog_index
        df.at[idx, 'AVG NUMBER OF WORDS PER SENTENCE'] = len(words)/len(sentences)
        df.at[idx, 'COMPLEX WORD COUNT'] = complex_count
        df.at[idx, "WORD COUNT"] = len(words)
        df.at[idx, 'SYLLABLE PER WORD'] = syllable_per_word
        df.at[idx, 'PERSONAL PRONOUNS'] = pronouns_count
        df.at[idx, 'AVG WORD LENGTH'] = sum([len(w) for w in words])/len(words)
    except requests.exceptions.HTTPError as err:
        print(f"HTTPError occurred while processing URL: {url}")
        print(f"Error: {err}")
    except Exception as e:
        print(f"Error occurred while processing URL: {url}")
        print(f"Error: {e}")

HTTPError occurred while processing URL: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
Error: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
HTTPError occurred while processing URL: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/
Error: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/
Error occurred while processing URL: nan
Error: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Error occurred while processing URL: nan
Error: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Error occurred while processing URL: nan
Error: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Error occurred while processing URL: nan
Error: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


In [43]:
df.tail()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
https://insights.blackcoffer.com/what-is-the-repercussion-of-the-environment-due-to-the-covid-19-pandemic-situation-2/,,,33.0,56,-0.258427,0.071257,20.145161,0.444355,1.154844,20.145161,555.0,1249.0,2.442754,27.0,7.534828
https://insights.blackcoffer.com/impact-of-covid-19-pandemic-on-office-space-and-co-working-industries/,,,27.0,35,-0.129032,0.056007,22.14,0.420054,1.106775,22.14,465.0,1107.0,2.346883,29.0,7.273713
https://insights.blackcoffer.com/contribution-of-handicrafts-visual-arts-literature-in-the-indian-economy/,,,11.0,2,0.692308,0.014428,40.954545,0.480577,1.204883,40.954545,433.0,901.0,2.531632,22.0,7.598224
https://insights.blackcoffer.com/how-covid-19-is-impacting-payment-preferences/,,,19.0,3,0.727273,0.022044,23.761905,0.426854,1.132265,23.761905,426.0,998.0,2.403808,26.0,7.328657
https://insights.blackcoffer.com/how-will-covid-19-affect-the-world-of-work-2/,,,40.0,55,-0.157895,0.076,29.069767,0.424,1.10912,29.069767,530.0,1250.0,2.3488,25.0,7.4392


In [48]:
df.isna().sum()


URL_ID                              98
URL                                 98
POSITIVE SCORE                       2
NEGATIVE SCORE                       2
POLARITY SCORE                       2
SUBJECTIVITY SCORE                   2
AVG SENTENCE LENGTH                  2
PERCENTAGE OF COMPLEX WORDS          2
FOG INDEX                            2
AVG NUMBER OF WORDS PER SENTENCE     2
COMPLEX WORD COUNT                   2
WORD COUNT                           2
SYLLABLE PER WORD                    2
PERSONAL PRONOUNS                    2
AVG WORD LENGTH                      2
dtype: int64

In [49]:
df.to_csv('Blackcoffer_OUTPUT.csv', index=False)