In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import seaborn as sns
import requests
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import cmudict
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('vader_lexicon')
nltk.download('words')
plt.style.use('ggplot')
nltk.download('cmudict')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tanma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tanma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tanma\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\tanma\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\tanma\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\tanma\AppData\Roaming\nltk_data...
[nltk_data]   Package words is

True

# Extracting all the necessary files:

In [2]:

#extracting stopwords list
def getstopwords(filename):
    
    with open(filename, 'r') as stopwords_file:
        stop_words = stopwords_file.read().splitlines()
    return stop_words

stop_words1 = getstopwords("StopWords_Generic.txt")
stop_words2 = getstopwords("StopWords_GenericLong.txt")
stop_words3 = getstopwords("StopWords_Auditor.txt")
stop_words4 = getstopwords("StopWords_Currencies.txt")
stop_words5 = getstopwords("StopWords_DatesandNumbers.txt")
stop_words6 = getstopwords("StopWords_Geographic.txt")
stop_words7 = getstopwords("StopWords_Names.txt")

stop_wordss = stop_words1+stop_words2+stop_words3+stop_words4+stop_words5+stop_words6+stop_words7


#extracting positive and negetive words list:

with open("positive-words.txt" , 'r') as file:
    pos_words = file.read().splitlines()
with open("negative-words.txt" , 'r') as file:
    neg_words = file.read().splitlines()    
    

# All the required functions:

In [3]:
#function for extracting content of article

def extractcontent(url):
    
    response = requests.get(url)
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')
    title = soup.title.text.strip()
    
    contentt = ''
    article_body = soup.find('div', class_='td-post-content')

    if article_body:
        paragraphs = article_body.find_all('p')
        contentt = '\n'.join([p.text.strip() for p in paragraphs])
    strcont = title+contentt   

    return strcont

#function for removing stopwords

def remove_stopwords(input_text, stopwords_list):
    stopwords_list1 = list(map(str.lower, stopwords_list))
    words = input_text.split()
    filtered_words = [word for word in words if word.lower() not in stopwords_list1]
    return ' '.join(filtered_words)

#function that will perform sentimental analysis

def sentiment_analysis(text, positive_words, negative_words):
    words = text.lower().split()

    positive_count = sum(word in positive_words for word in words)
    negative_count = sum(word in negative_words for word in words)

    total_words = len(words)
    positive_score = positive_count
    negative_score = negative_count
    polarity_score = (positive_score - negative_score)/((positive_score + negative_score)+0.000001)
    subjectivity_score = (positive_score + negative_score) / ((total_words)+0.000001)

    return positive_score , negative_score, polarity_score ,subjectivity_score


#functions that will perform readability analysis
       
    
def count_syllables(word):
    d = cmudict.dict()
    return max([len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]]) if word.lower() in d else 0



def calculate_metrics(text):
    sentences = sent_tokenize(text)
    words = word_tokenize(text)

    average_sentence_length = len(words) / len(sentences) if len(sentences) > 0 else 0

    complex_words = [word for word in words if count_syllables(word) > 2]
    percentage_complex_words = (len(complex_words) / len(words)) * 100 if len(words) > 0 else 0
    fogindex = 0.4 * (average_sentence_length + percentage_complex_words)
    avg_words = average_sentence_length

    return average_sentence_length, percentage_complex_words , fogindex, avg_words , len(complex_words)


#function that will count total words

def count_totalwords(text):
    
    words = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    words = [word.lower() for word in words if word.lower() not in stop_words]

    words = [word for word in words if word not in string.punctuation]
    total_words = len(words)

    return total_words


#functions that will count average syllables

def count_syllables2(word):
    cleaned_word = re.sub(r'(es|ed)$', '', word, flags=re.IGNORECASE)
    vowels = 'aeiouy'
    return sum(1 for char in cleaned_word.lower() if char in vowels)

def calculate_average_syllables(text):
    words = word_tokenize(text)

    syllable_counts = [count_syllables2(word) for word in words]

    total_syllables = sum(syllable_counts)
    total_words = len(words)
    average_syllables_per_word = total_syllables / total_words if total_words > 0 else 0

    return average_syllables_per_word


#function that will count no. of personal pronouns

def count_personal_pronouns(text):
    target_words = ["I", "we", "my", "ours", "us"]

    pattern = r'\b(?:' + '|'.join(re.escape(word) for word in target_words) + r')\b'
    exclude_pattern = r'\bUS\b'

    matches = re.findall(pattern, text, flags=re.IGNORECASE)
    excluded_matches = re.findall(exclude_pattern, text)

    word_count = len(matches) - len(excluded_matches)

    return word_count

#function that will count average word length

def calculate_average_word_length(text):
    words = text.split()
    
    total_characters = sum(len(word) for word in words)
    total_words = len(words)
    average_word_length = total_characters / total_words if total_words > 0 else 0
    
    return average_word_length

 


# Function that will perform text analysis on a URL:

In [4]:
def analysisfunction(url1 , stop_wordsss , pos_wordss , neg_wordss):
    
    content = extractcontent(url1)
    
    filtered_content = remove_stopwords(content,stop_wordsss)
    
    results1 = list(sentiment_analysis(filtered_content, pos_wordss, neg_wordss))
    
    results2 = list(calculate_metrics(content))
    
    results3 = count_totalwords(content)
    
    results4 = calculate_average_syllables(content)
    
    results5 = count_personal_pronouns(content)
    
    results6 = calculate_average_word_length(content)
    
    combine_results = results1+results2
    combine_results.append(results3)
    combine_results.append(results4)
    combine_results.append(results5)
    combine_results.append(results6)
    
    return combine_results
    
    

# Extracting excel file that contains URL_id and URLs

In [25]:
data = pd.read_excel('Output.xlsx')

data.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,,,,,,,,,,,,,
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,,,,,,,,,,,,,
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,,,,,,,,,,,,,
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,,,,,,,,,,,,,
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,,,,,,,,,,,,,


In [26]:
data.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,,,,,,,,,,,,,
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,,,,,,,,,,,,,
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,,,,,,,,,,,,,
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,,,,,,,,,,,,,
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,,,,,,,,,,,,,


# Performing data analysis iteratively on every article

In [27]:
for index, row in data.iterrows():
    
    url = row['URL']

    
    results = list(analysisfunction(url ,stop_wordss , pos_words , neg_words ))

 
    data.at[index, 'POSITIVE SCORE'] = results[0]
    data.at[index, 'NEGATIVE SCORE'] = results[1]
    data.at[index, 'POLARITY SCORE'] = results[2]
    data.at[index, 'SUBJECTIVITY SCORE'] = results[3]
    data.at[index, 'AVG SENTENCE LENGTH'] = results[4]
    data.at[index, 'PERCENTAGE OF COMPLEX WORDS'] = results[5]
    data.at[index, 'FOG INDEX'] = results[6]
    data.at[index, 'AVG NUMBER OF WORDS PER SENTENCE'] = results[7]
    data.at[index, 'COMPLEX WORD COUNT'] = results[8]
    data.at[index, 'WORD COUNT'] = results[9]
    data.at[index, 'SYLLABLE PER WORD'] = results[10]
    data.at[index, 'PERSONAL PRONOUNS'] = results[11]
    data.at[index, 'AVG WORD LENGTH'] = results[12]

# Results

In [28]:
data.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,2.0,1.0,0.333333,0.018987,15.88,10.831234,10.684494,15.88,43.0,192.0,1.63728,3.0,4.678771
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,40.0,23.0,0.269841,0.089872,21.181818,20.294298,16.590446,21.181818,331.0,865.0,1.852238,4.0,5.631095
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,32.0,19.0,0.254902,0.082258,21.803571,27.027027,19.532239,21.803571,330.0,677.0,2.050778,13.0,6.287723
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,29.0,61.0,-0.355556,0.15,23.784314,22.588623,18.549175,23.784314,274.0,677.0,2.003298,5.0,6.125356
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,15.0,7.0,0.363636,0.063768,19.641026,18.407311,15.219335,19.641026,141.0,418.0,1.877285,6.0,5.672012


In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 15 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   URL_ID                            100 non-null    object 
 1   URL                               100 non-null    object 
 2   POSITIVE SCORE                    100 non-null    float64
 3   NEGATIVE SCORE                    100 non-null    float64
 4   POLARITY SCORE                    100 non-null    float64
 5   SUBJECTIVITY SCORE                100 non-null    float64
 6   AVG SENTENCE LENGTH               100 non-null    float64
 7   PERCENTAGE OF COMPLEX WORDS       100 non-null    float64
 8   FOG INDEX                         100 non-null    float64
 9   AVG NUMBER OF WORDS PER SENTENCE  100 non-null    float64
 10  COMPLEX WORD COUNT                100 non-null    float64
 11  WORD COUNT                        100 non-null    float64
 12  SYLLABLE 

In [30]:
data.describe()

Unnamed: 0,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,24.54,18.69,0.197885,0.083823,25.297757,15.859979,16.463094,25.297757,193.22,608.23,1.761719,6.2,5.256267
std,16.113499,15.458445,0.433655,0.035709,19.550462,4.525274,8.099281,19.550462,110.556954,340.303722,0.125629,6.785308,0.382451
min,0.0,0.0,-1.0,0.0,6.0,0.0,2.4,6.0,0.0,4.0,1.517727,0.0,4.643519
25%,14.0,6.0,-0.112403,0.063814,19.89816,12.965052,13.676638,19.89816,114.5,390.25,1.678595,2.0,4.948705
50%,23.0,16.5,0.252451,0.080951,23.226537,15.554705,15.905675,23.226537,192.5,622.0,1.746004,4.0,5.219935
75%,32.25,27.0,0.480124,0.104512,26.734698,18.850403,17.985914,26.734698,265.5,810.75,1.831207,8.0,5.504726
max,73.0,62.0,1.0,0.189189,210.222222,27.865169,89.670284,210.222222,512.0,2275.0,2.101463,37.0,6.443726


# Converting the DataFrame to final Output excel file

In [31]:
dataa = data

In [34]:
dataa.to_excel('Final Output Data.xlsx', index=False)