Imported Libraries

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import re
import syllapy



Data Extraction

In [5]:
  
# Load the input workbook; the extraction loop below reads its 'URL_ID' and 'URL' columns.
df = pd.read_excel("input.xlsx")

# Local directory where each extracted article is saved as a text file.
output_dir = "extracted_articles"
os.makedirs(output_dir, exist_ok=True)

def extract_article(url, timeout=30):
    """Download a page and return its headline and main article text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled server would hang the whole extraction run.

    Returns:
        A ``(title, article_text)`` tuple. ``article_text`` is an empty string
        when the page has no ``div`` with class ``td-post-content tagdiv-type``.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status (4xx/5xx).
        ValueError: If the page contains no ``<h1>`` element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 404/500 responses instead of saving an error page as an article.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # The article title is taken from the first <h1> tag.
    heading = soup.find('h1')
    if heading is None:
        raise ValueError(f"No <h1> title found at {url}")
    title = heading.get_text(strip=True)

    # The main article content lives in one specific div.
    article_body = soup.find('div', class_='td-post-content tagdiv-type')
    if article_body is None:
        return title, ""

    # Join text nodes with a space: with no separator, text from adjacent tags
    # is glued together ("end.Next"), which corrupts later word/sentence counts.
    article_text = article_body.get_text(separator=' ', strip=True)
    return title, article_text

# Loop through the DataFrame and process each URL
# Fetch every URL listed in the input sheet and save each article as
# "<URL_ID>.txt" inside output_dir.
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    
    try:
        title, article_text = extract_article(url)
        
        # Save the title, a blank line, and the article text to a UTF-8 text file.
        with open(os.path.join(output_dir, f"{url_id}.txt"), "w", encoding="utf-8") as file:
            file.write(f"{title}\n\n{article_text}")
        
        print(f"Successfully extracted & saved article file for the {url_id}")
    except Exception as e:
        # Report the failure and move on, so one bad page does not stop the run.
        print(f"Failed to extract article for {url_id}. Error: {str(e)}")

print("All articles processed!")

Successfully extracted & saved article file for the bctech2011
Successfully extracted & saved article file for the bctech2012
Successfully extracted & saved article file for the bctech2013
Successfully extracted & saved article file for the bctech2014
Successfully extracted & saved article file for the bctech2015
Successfully extracted & saved article file for the bctech2016
Successfully extracted & saved article file for the bctech2017
Successfully extracted & saved article file for the bctech2018
Successfully extracted & saved article file for the bctech2019
Successfully extracted & saved article file for the bctech2020
Successfully extracted & saved article file for the bctech2021
Successfully extracted & saved article file for the bctech2022
Successfully extracted & saved article file for the bctech2023
Successfully extracted & saved article file for the bctech2024
Successfully extracted & saved article file for the bctech2025
Successfully extracted & saved article file for the bct

Sentiment Analysis:

In [14]:
import spacy
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from nltk.corpus import stopwords
import string
import os

# Load spaCy's English tokenizer
# Build a blank English pipeline and a bare Tokenizer over its vocabulary.
# NOTE(review): no prefix/suffix/infix rules are passed to Tokenizer here, so it
# presumably splits on whitespace only — confirm against the spaCy docs.
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

# NLTK's English stop-word list, held as a set for fast membership tests.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded first.
stop_words = set(stopwords.words('english'))

def cleaned_word_count_spacy(text):
    """Count the alphabetic, non-stop-word tokens in ``text``.

    Each token is lower-cased and stripped of leading/trailing punctuation;
    it is counted when what remains is purely alphabetic and is not in the
    module-level ``stop_words`` set.

    Args:
        text: The raw text to analyse.

    Returns:
        The number of cleaned words (int).
    """
    cleaned_words = []
    for token in tokenizer(text):
        # Strip punctuation BEFORE the alphabetic check. Checking the raw token
        # first rejects every word with attached punctuation ("word," / "end.")
        # and under-counts the text.
        word = token.text.lower().strip(string.punctuation)
        # Reuse the module-level stop_words set rather than rebuilding it per call.
        if word.isalpha() and word not in stop_words:
            cleaned_words.append(word)
    return len(cleaned_words)


def gunning_fog_index(text):
    """Compute Gunning Fog readability figures for ``text``.

    Sentences are the non-empty pieces between runs of ``.``, ``!`` or ``?``;
    words are runs of ``\\w`` characters. A word is "complex" when the vowel-group
    heuristic below gives it three or more syllables.

    Args:
        text: The raw text to analyse.

    Returns:
        A tuple ``(average_sentence_length, complex_word_percentage,
        fog_index, complex_word_count)``. All four are zero when the text
        contains no words or no sentences.
    """
    # Split the text into sentences, discarding empty fragments.
    sentences = [piece.strip() for piece in re.split(r'[.!?]+', text) if piece.strip()]

    # Split the text into words.
    words = re.findall(r'\w+', text)

    # Empty or punctuation-only text would otherwise divide by zero below.
    if not sentences or not words:
        return 0.0, 0.0, 0.0, 0

    def count_syllables(word):
        """Estimate syllables by counting vowel groups, discounting a final 'e'."""
        word = word.lower()
        syllables = 0
        vowels = "aeiouy"
        if word[0] in vowels:
            syllables += 1
        for index in range(1, len(word)):
            # Count only the start of each vowel group.
            if word[index] in vowels and word[index - 1] not in vowels:
                syllables += 1
        if word.endswith("e"):
            syllables -= 1
        # Every word has at least one syllable.
        if syllables == 0:
            syllables += 1
        return syllables

    complex_words = [word for word in words if count_syllables(word) >= 3]

    average_sentence_length = len(words) / len(sentences)

    # Expressed as a percentage (0-100), as the Gunning Fog formula expects; the
    # project's text-analysis notes reportedly omit the x100 factor.
    complex_word_percentage = (len(complex_words) / len(words)) * 100

    fog_index = 0.4 * (average_sentence_length + complex_word_percentage)

    return average_sentence_length, complex_word_percentage, fog_index, len(complex_words)

def _load_word_set(folder_path, include_file):
    """Collect lower-cased, '|'-separated entries from selected files in a folder.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).
        include_file: Callable taking a file name and returning True when that
            file should be read.

    Returns:
        A set of the stripped, lower-cased, non-empty entries found.
    """
    collected = set()
    for filename in os.listdir(folder_path):
        if not include_file(filename):
            continue
        file_path = os.path.join(folder_path, filename)
        # ISO-8859-1 maps every byte to a character, so decoding cannot raise
        # UnicodeDecodeError and no handler for it is needed.
        with open(file_path, 'r', encoding='ISO-8859-1') as file:
            for line in file.read().splitlines():
                # A line may hold several '|'-separated fields; keep each one.
                for entry in line.split('|'):
                    entry = entry.strip().lower()
                    # Blank lines and empty fields would add '' to the set.
                    if entry:
                        collected.add(entry)
    return collected


def load_stopwords(stopwords_folder_path):
    """Load stop words from every ``.txt`` file in ``stopwords_folder_path``.

    Returns:
        A set of lower-cased stop-word entries.
    """
    return _load_word_set(stopwords_folder_path, lambda name: name.endswith('.txt'))


def load_positive_stopwords(stopwords_folder_path):
    """Load the positive-word list from files whose name contains ``positive-words``.

    Returns:
        A set of lower-cased positive words.
    """
    return _load_word_set(stopwords_folder_path, lambda name: "positive-words" in name)


def load_negative_stopwords(stopwords_folder_path):
    """Load the negative-word list from files whose name contains ``negative-words``.

    Returns:
        A set of lower-cased negative words.
    """
    return _load_word_set(stopwords_folder_path, lambda name: "negative-words" in name)

def count_sentiments(unique_words, positive_words_set, negative_words_set):
    """Tally sentiment hits for a sequence of words.

    Every occurrence in ``unique_words`` is checked against both word sets.
    The positive tally is returned as a non-negative number; the negative
    tally is returned as a non-positive number (one is subtracted per hit).

    Returns:
        A ``(positive_count, negative_count)`` tuple.
    """
    hits_pos = 0
    hits_neg = 0
    for token in unique_words:
        # A bool adds as 0 or 1, so each membership test feeds the tally directly.
        hits_pos += token in positive_words_set
        hits_neg += token in negative_words_set
    return hits_pos, -hits_neg

def count_syllables(word):
    """Return the syllable count for ``word`` as reported by ``syllapy.count``."""
    return syllapy.count(word)

def average_syllables_in_text(text):
    """Return the mean number of syllables per word in ``text``.

    Words are runs of ``\\w`` characters; each one is scored with
    ``count_syllables``. Text with no words yields 0.
    """
    tokens = re.findall(r'\b\w+\b', text)

    # Nothing to average: bail out before dividing.
    if len(tokens) == 0:
        return 0

    syllable_total = 0
    for token in tokens:
        syllable_total += count_syllables(token)

    return syllable_total / len(tokens)

def count_total_personal_pronouns(text):
    """Counts the total number of personal pronouns in a given text.

    Matching is case-insensitive on whole words, except that the exact
    all-caps token ``US`` is treated as the country name and not counted.

    Args:
        text: The input text.

    Returns:
        The total count of personal pronouns.
    """
    # NOTE(review): the list also contains the demonstratives "this", "that",
    # "these", "those", which are not personal pronouns — confirm they are
    # meant to be counted.
    pronouns = [
        "i", "you", "he", "she", "it", "we", "they",
        "me", "him", "her", "us", "them",
        "my", "your", "his", "its", "our", "their",
        "myself", "yourself", "himself", "herself", "itself",
        "ourselves", "themselves",
        "this", "that", "these", "those",
    ]
    pattern = r'\b(?:' + '|'.join(pronouns) + r')\b'

    # Find all whole-word matches, case-insensitively.
    matches = re.findall(pattern, text, re.IGNORECASE)

    # Case-insensitive matching also hits the country name "US"; filter that
    # exact spelling out here. A regex lookahead cannot do this under
    # IGNORECASE, because it would also match the pronoun "us".
    return sum(1 for match in matches if match != "US")

def average_word_length(text):
    """Return the mean character length of the words in ``text``.

    Words are runs of ``\\w`` characters. Text with no words yields 0.
    """
    tokens = re.findall(r'\b\w+\b', text)

    # Guard the division: no words means an average of 0.
    if not tokens:
        return 0

    return sum(map(len, tokens)) / len(tokens)

def store_Excel(url_id,var1,var2,var3,var4,var5,var6,var7,var8,var9,var10,var11,var12,var13):
    """Write one article's computed variables into the output workbook.

    Reads 'Output Data Structure.xlsx', fills every column from the third
    onward on the row(s) whose ``URL_ID`` equals ``url_id``, and saves the
    workbook back to the same path.

    Args:
        url_id: Identifier of the row to update.
        var1..var13: Values written, in order, to the columns from the third
            column onward.

    Raises:
        ValueError: If the number of values does not match the number of
            columns to update.
    """
    excel_file_path = 'Output Data Structure.xlsx'

    # Read the existing Excel file into a DataFrame
    df = pd.read_excel(excel_file_path)

    new_variables = [var1,var2,var3,var4,var5,var6,var7,var8,var9,var10,var11,var12,var13]
    columns_to_update = df.columns[2:]  # All columns from the 3rd one onward

    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if len(new_variables) != len(columns_to_update):
        raise ValueError("The number of variables does not match the number of columns to update.")

    row_mask = df['URL_ID'] == url_id
    if not row_mask.any():
        # Without this check a missing URL_ID was silently reported as a success.
        print(f"URL_ID {url_id} not found in {excel_file_path}; nothing was updated")
        return

    # Update the matching row(s)
    df.loc[row_mask, columns_to_update] = new_variables

    # Save the updated DataFrame back to the Excel file
    df.to_excel(excel_file_path, index=False)
    print(f"Data has been updated for URL_ID {url_id} and saved to {excel_file_path}")



def extract_unique_words_from_files(folder_path,stopwords_set,positive_words_set,negative_words_set):
    """Analyse every ``.txt`` article in ``folder_path`` and store its metrics.

    For each text file the function computes sentiment scores, readability
    figures, word/syllable/pronoun statistics, and passes them to
    ``store_Excel`` under the URL_ID derived from the file name.

    Args:
        folder_path: Directory containing the extracted article text files.
        stopwords_set: Lower-cased words to exclude before sentiment scoring.
        positive_words_set: Lower-cased positive dictionary words.
        negative_words_set: Lower-cased negative dictionary words.
    """
    for filename in os.listdir(folder_path):
        # Skip anything that is not an article text file. Previously such entries
        # fell through to the analysis below with empty content.
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            file_content = file.read()

        # Lower-case to normalise, then keep every non-stop-word occurrence
        # (duplicates are retained, so each occurrence counts toward the scores).
        words = re.findall(r'\b\w+\b', file_content.lower())
        word_list = [word for word in words if word not in stopwords_set]

        #1  Derived sentiment variables
        total_counts = len(word_list)

        positive_score, negative_score = count_sentiments(word_list, positive_words_set, negative_words_set)
        # count_sentiments returns the negative tally as a non-positive number.
        negative_score = negative_score * -1

        # The small constant keeps the denominators non-zero.
        Polarity_Score = (positive_score-negative_score)/ ((positive_score + negative_score) + 0.000001)

        Subjectivity_Score = (positive_score+negative_score)/ ((total_counts) + 0.000001)

        #2  Analysis of readability (Gunning Fog Index)
        # A Fog Index of 7-8 is considered fairly easy to read, 12 is roughly the
        # reading level of a high school senior, and 16 or above is very difficult.
        #3  Average number of words per sentence and #4 complex word count come
        #   from the same call.
        Average_Sentence_length, Percentage_complex_words,Fog_index,complex_count = gunning_fog_index(file_content)

        #5  Word count
        spacy_word_count = cleaned_word_count_spacy(file_content)

        #6  Syllables per word (named so it does not shadow count_syllables)
        syllables_per_word = average_syllables_in_text(file_content)

        #8  Personal pronouns
        count_PP = count_total_personal_pronouns(file_content)

        #9  Average word length
        avg_length = average_word_length(file_content)

        url_id = os.path.splitext(filename)[0]

        # Average_Sentence_length is passed twice on purpose: once as the average
        # sentence length and once as the average number of words per sentence.
        store_Excel(url_id,positive_score, negative_score,Polarity_Score,Subjectivity_Score,Average_Sentence_length,Percentage_complex_words,Fog_index,Average_Sentence_length,complex_count,spacy_word_count,syllables_per_word,count_PP,avg_length)

# Input locations: the saved article texts, the stop-word lists, and the folder
# holding the positive/negative word dictionaries (both read from one folder).
folder_path = 'extracted_articles' 
stopwords_folder_path = 'StopWords'  
positive_file_path = 'MasterDictionary'
negative_file_path = 'MasterDictionary'

# Load the three word sets once, up front.
stopwords_set = load_stopwords(stopwords_folder_path)
positive_words_set = load_positive_stopwords(positive_file_path)
negative_words_set = load_negative_stopwords(negative_file_path)


# Analyse every saved article and write its metrics to the output workbook.
extract_unique_words_from_files(folder_path,stopwords_set,positive_words_set,negative_words_set)

Data has been updated for URL_ID bctech2011 and saved to Output Data Structure.xlsx
Data has been updated for URL_ID bctech2012 and saved to Output Data Structure.xlsx
Data has been updated for URL_ID bctech2013 and saved to Output Data Structure.xlsx
Data has been updated for URL_ID bctech2014 and saved to Output Data Structure.xlsx
Data has been updated for URL_ID bctech2015 and saved to Output Data Structure.xlsx
Data has been updated for URL_ID bctech2016 and saved to Output Data Structure.xlsx
Data has been updated for URL_ID bctech2017 and saved to Output Data Structure.xlsx
Data has been updated for URL_ID bctech2018 and saved to Output Data Structure.xlsx
Data has been updated for URL_ID bctech2019 and saved to Output Data Structure.xlsx
Data has been updated for URL_ID bctech2020 and saved to Output Data Structure.xlsx
Data has been updated for URL_ID bctech2021 and saved to Output Data Structure.xlsx
Data has been updated for URL_ID bctech2022 and saved to Output Data Structu