In [1]:
# Importing required libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import pyphen # for syllable count

In [2]:
# Workbook listing the articles to process; later cells read its
# "URL_ID" and "URL" columns row by row.
input_file = "Input.xlsx"
df = pd.read_excel(io=input_file)

In [3]:
# Data extraction
# Downloads every article listed in `df`, pulls out its title and body text,
# and writes them to "<URL_ID>.txt" in the current working directory.
REQUEST_TIMEOUT = 30           # Seconds to wait on one request before giving up on that URL.
count = 0                      # Number of articles saved successfully.
start_time = time.time()       # To check the time throughout the data extraction process.

# Loop through the URLs
for index, row in df.iterrows():
    # Get the URL and the URL_ID before entering the try block, so the error
    # messages below always describe the row that actually failed (inside the
    # try, a failure here would leave url_id undefined or stale).
    url = row["URL"]
    url_id = row["URL_ID"]

    try:
        # Crawl and parse the web page. Without a timeout, requests waits
        # indefinitely on an unresponsive server and the whole loop stalls.
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # Raise an exception for bad responses
        soup = BeautifulSoup(response.text, "html.parser")

        # Find the elements that contain the article title and text
        title_element = soup.find("h1", class_="entry-title")
        text_element = soup.find("div", class_="td-post-content tagdiv-type")

        # Handle cases where class names are different
        if title_element is None:
            title_element = soup.find("h1", class_="tdb-title-text")

        if text_element is None:
            text_element = soup.find("div", class_="td_block_wrap tdb_single_content tdi_130 td-pb-border-top td_block_template_1 td-post-content tagdiv-type")

        if title_element is not None and text_element is not None:
            # Extract the text content and remove any surrounding whitespace
            title = title_element.get_text().strip()
            text = text_element.get_text().strip()

            # Save the extracted text in a text file with the URL_ID as its file name
            output_file = f"{url_id}.txt"
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(title + "\n")
                f.write(text + "\n")

            count += 1

        else:
            print(f"Could not extract data from {url_id}. Check HTML structure.")

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err} in link {url_id}")
    except Exception as err:
        # Best effort: report the failure (timeouts, connection errors, file
        # errors, ...) and carry on with the remaining URLs.
        print(f"An error occurred: {err} . in link {url_id}")

# Print the total number of successful extractions
print(f"Successfully extracted data from {count} links.")

# Calculate the time taken
end_time = time.time()
time_taken = end_time - start_time

print(f"Time taken: {time_taken:.2f} seconds")

HTTP error occurred: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/ in link 11668.0
HTTP error occurred: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/ in link 17671.4
Successfully extracted data from 112 links.
Time taken: 311.35 seconds


In [4]:
# paths for stopwords, master dict and data extracted
# NOTE(review): absolute, machine-specific path; every other directory is
# derived from it so only this one line needs changing on another machine.
input_dir = "C:/Users/VIVEK/_1_blackcoffer_project"
stopwords_dir = f"{input_dir}/StopWords"
master_dir = f"{input_dir}/MasterDictionary"


def _load_word_set(path):
    """Return the entries of a one-entry-per-line word list as a set.

    Entries are stripped and lower-cased because every lookup against these
    sets is done with lower-cased words; an entry stored in another case or
    with stray whitespace could never match. Blank lines are skipped.

    No encoding is passed to open(), so the platform default is used, exactly
    as before.
    """
    with open(path, 'r') as f:
        return {line.strip().lower() for line in f.read().splitlines() if line.strip()}


# Loading stop words: the union of every .txt file in the stop-word directory
stop_words = set()
for file in os.listdir(stopwords_dir):
    if file.endswith(".txt"):
        stop_words.update(_load_word_set(os.path.join(stopwords_dir, file)))

# loading positive and negative words
positive_words = set()
negative_words = set()

for target_set, dictionary_name in ((positive_words, 'positive-words.txt'),
                                    (negative_words, 'negative-words.txt')):
    dictionary_path = os.path.join(master_dir, dictionary_name)
    if os.path.isfile(dictionary_path):
        target_set.update(_load_word_set(dictionary_path))
    else:
        # A missing dictionary leaves the set empty (as before), which would
        # make the corresponding score 0 for every article, so say so loudly.
        print(f"Warning: {dictionary_name} not found in {master_dir}")


In [5]:
# Function to calculate each variable
def calculate_scores(text):
    """Compute sentiment and readability metrics for one article.

    Relies on the module-level sets `stop_words`, `positive_words` and
    `negative_words`.

    Args:
        text: Raw article text (title and body) as a single string.

    Returns:
        dict mapping each output column name ('POSITIVE SCORE' ...
        'AVG WORD LENGTH') to its value, in the order of the output sheet.
        Every ratio is 0.0 when its denominator is zero (e.g. empty text),
        so the function never raises ZeroDivisionError.
    """
    def ratio(numerator, denominator):
        # Guarded division: empty or stop-word-only articles yield 0.0.
        return numerator / denominator if denominator else 0.0

    # Keep the untouched article: pronouns must be counted before
    # lower-casing and stop-word removal (see section 7).
    raw_text = text

#   1. Sentimental Analysis
    text = ' '.join(word for word in text.lower().split() if word not in stop_words)

    tokens = word_tokenize(text)           # text is already lower-cased above
    positive_score = sum(1 for token in tokens if token in positive_words)
    negative_score = sum(1 for token in tokens if token in negative_words)

    polarity_score = (positive_score - negative_score)/((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score)/(len(tokens) + 0.000001)

#   2. Analysis of Readability
    sentences = nltk.sent_tokenize(text)   # list of sentences
    # NOTE(review): tokens still include the punctuation tokens produced by
    # word_tokenize, so total_words counts them too - confirm this is intended.
    total_words = len(tokens)              # Total words
    avg_sentence_length = ratio(total_words, len(sentences))  # Avg sentence length

#   Syllable counts via pyphen hyphenation. The hyphenator is built once per
#   call and each distinct token is hyphenated once.
    hyphenator = pyphen.Pyphen(lang='en-US')

    def count_syllables(word):
        return len(hyphenator.inserted(word).split('-'))

    syllables = {word: count_syllables(word) for word in set(tokens)}

#     Percentage of Complex Words
#     A complex word has more than two syllables (not more than two
#     characters). Tokens that do not start with a word character are
#     punctuation and are never counted as complex.
    complex_words = [word for word in tokens
                     if re.match(r'\w', word) and syllables[word] > 2]
    percentage_complex_words = ratio(len(complex_words), total_words)  # stored as a fraction

#     Fog Index
    fog_index = 0.4*(avg_sentence_length + percentage_complex_words)

#   3. Average Number of Words Per Sentence
    avg_words_per_sentence = ratio(total_words, len(sentences))

#   4. Complex Word Count
    complex_word_count = len(complex_words)

#   5. Word Count
    cleaned_words = [word for word in tokens if word.lower() not in stop_words and re.match(r'\w', word)]
    word_count = len(cleaned_words)

#   6. Syllable Count Per Word --using pyphen library
    syllable_per_word = ratio(sum(syllables[word] for word in cleaned_words), word_count)

#   7. Personal Pronouns
#   Counted on the original text: after lower-casing and stop-word removal
#   any pronoun that is also a stop word is gone, and the all-caps "US"
#   (treated here as the country abbreviation, so skipped) can no longer be
#   told apart from the pronoun "us".
    pronoun_matches = re.findall(r'\b(I|we|my|ours|us)\b', raw_text, re.IGNORECASE)
    personal_pronouns = sum(1 for match in pronoun_matches if match != 'US')

#   8. Average Word Length
    avg_word_length = ratio(sum(len(word) for word in cleaned_words), word_count)

    return {
        'POSITIVE SCORE' : positive_score,
        'NEGATIVE SCORE' : negative_score,
        'POLARITY SCORE' : polarity_score,
        'SUBJECTIVITY SCORE' : subjectivity_score,
        'AVG SENTENCE LENGTH' : avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS' : percentage_complex_words,
        'FOG INDEX' : fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE' : avg_words_per_sentence,
        'COMPLEX WORD COUNT' : complex_word_count,
        'WORD COUNT' : word_count,
        'SYLLABLE PER WORD' : syllable_per_word,
        'PERSONAL PRONOUNS' : personal_pronouns,
        'AVG WORD LENGTH' : avg_word_length
    }

In [6]:
# To save output in excel file
output_file = 'output Data Structure.xlsx'


def _url_id_key(value):
    """Return a comparable key for a URL_ID from either source.

    IDs from file names are strings (e.g. "11668.0") while IDs read from the
    workbook may be numbers; numeric-looking values are normalised through
    float so both spell the same key. Anything else is compared as stripped
    text.
    """
    try:
        return str(float(value))
    except (TypeError, ValueError):
        return str(value).strip()


# Initializing a list to store computed variables (one dict per article)
data = []

# Calculating variables by iterating through each extracted article file
for filename in os.listdir(input_dir):
    if not filename.endswith(".txt"):
        continue

    filepath = os.path.join(input_dir, filename)
    url_id = os.path.splitext(filename)[0]

    with open(filepath, 'r', encoding='utf-8') as file:
        content = file.read()

    # Keyed by column name so values cannot drift out of step with `columns`
    data.append({'URL_ID': url_id, **calculate_scores(content)})

# Dataframe to store computed variables
columns = ['URL_ID','POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
           'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
           'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
           'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
           'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS',
           'AVG WORD LENGTH']

# Named scores_df (not df) so the input URL table loaded earlier is not
# overwritten and the extraction cell can still be re-run.
scores_df = pd.DataFrame(data, columns=columns)   # Computed data
output_df = pd.read_excel(output_file)            # Output file

# Merging computed data with existing data based on URL_ID.
# DataFrame.update aligns rows on index labels, so both frames are indexed by
# the normalised URL_ID for the update. With the default positional index the
# scores would be written to whichever row shares the same position in the
# directory listing, not to the matching article.
scores_by_id = scores_df.drop(columns='URL_ID')
scores_by_id.index = scores_df['URL_ID'].map(_url_id_key).tolist()

original_index = output_df.index
output_df.index = output_df['URL_ID'].map(_url_id_key).tolist()
output_df.update(scores_by_id)

# Articles that were never extracted (e.g. HTTP 404) keep their empty cells
unscored_ids = output_df.loc[~output_df.index.isin(scores_by_id.index), 'URL_ID'].tolist()
if unscored_ids:
    print(f"No extracted text found for {len(unscored_ids)} URL_ID(s): {unscored_ids}")

output_df.index = original_index   # restore the workbook's own row index

# Saving updated output file
output_df.to_excel('updated_output_file.xlsx', index = False)
