In [6]:
import urllib
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Extraction of Article from the website
def get_article_text(url, timeout=10):
    """Download a web page and return the text of its first ``<article>`` element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The article text, built by stripping every text fragment inside the
        ``<article>`` element and joining the fragments with single spaces.
        ``None`` is returned (after printing a message) when the request
        fails, the status code is not 200, or no ``<article>`` element exists.
    """
    try:
        res = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection problems and timeouts are reported like a bad status code
        # so callers only ever have to handle the ``None`` result.
        print(f"Failed to retrieve the page. Error: {exc}")
        return None

    # Anything other than 200 is treated as a failed download.
    if res.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {res.status_code}")
        return None

    # Parse the HTML and locate the first <article> element.
    soup = BeautifulSoup(res.content, features='html.parser')
    article_element = soup.find("article")
    if not article_element:
        print("Article element not found.")
        return None

    # stripped_strings yields each text fragment without surrounding whitespace.
    return ' '.join(article_element.stripped_strings)

def url_to_dataframe(url):
    """Fetch the article at ``url`` and wrap its text in a one-row DataFrame.

    Returns:
        A DataFrame with a single 'Article' column holding the article text,
        or a completely empty DataFrame (no rows, no columns) when no text
        could be extracted.
    """
    text = get_article_text(url)

    # None and the empty string both mean there is nothing to analyse.
    if not text:
        return pd.DataFrame()

    return pd.DataFrame(data={'Article': [text]})

# Example usage. An alternative article URL to try:
# https://insights.blackcoffer.com/in-future-or-in-upcoming-years-humans-and-machines-are-going-to-work-together-in-every-field-of-work/
url = "https://insights.blackcoffer.com/how-will-covid-19-affect-the-world-of-work-2/"
# Download the article; the following cells read the resulting `df`.
df = url_to_dataframe(url)

# The DataFrame is displayed in the next cell.


In [7]:
from IPython.display import display

# Render the article DataFrame using the notebook's rich display
display(df)




Unnamed: 0,Article
0,Home Blackcoffer How will COVID-19 affect the ...


In [9]:
import os
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize

# Download NLTK data (you need to do this once)
nltk.download('punkt')

# Define the directory paths.
# The project root defaults to the original location but can be overridden
# with the ARTICLE_ANALYZER_ROOT environment variable, so the notebook also
# runs on machines where the repository is checked out somewhere else.
project_root = os.environ.get('ARTICLE_ANALYZER_ROOT', '/root/src/NLP-Based-Article-Analyzer').rstrip('/')
stopwords_directory = project_root + '/StopWords'
master_dictionary_directory = project_root + '/MasterDictionary'

# Function to read the content of all files in a directory and store them in a list
def read_stopwords_from_directory(directory, encodings=('utf-8', 'latin-1', 'cp1252')):
    """Read every regular file in ``directory`` and return the contents as strings.

    Args:
        directory: Folder that contains the stop-word files.
        encodings: Encodings tried in order for each file; the first one that
            decodes the file without a ``UnicodeDecodeError`` is used. Note
            that 'latin-1' can decode any byte sequence, so with the default
            order 'cp1252' is never reached.

    Returns:
        A list with one string per file, ordered by file name. A file that
        none of the given encodings can decode is left out of the list.
    """
    stopword_strings = []
    # Sort the names so the result does not depend on the file system's listing order.
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            # Sub-directories (e.g. notebook checkpoint folders) cannot be read as text.
            continue
        for encoding in encodings:
            try:
                with open(filepath, 'r', encoding=encoding) as file:
                    stopword_strings.append(file.read())
                break  # Stop at the first encoding that works
            except UnicodeDecodeError:
                continue  # Try the next encoding
    return stopword_strings

# Load the raw contents of every stop-word file once; calculate_sentiment_scores
# reads this module-level list when it builds its stop-word collection.
stopword_strings = read_stopwords_from_directory(stopwords_directory)

# Sentiment analysis function
def calculate_sentiment_scores(dataframe):
    """Compute dictionary-based sentiment scores for the first article in ``dataframe``.

    Stop words are built from the module-level ``stopword_strings`` and removed
    from every entry of the 'Article' column; the result is stored in a new
    'Cleaned_Article' column, so ``dataframe`` is modified in place. The
    cleaned text of the first row is tokenized and each token is looked up in
    'positive-words.txt' and 'negative-words.txt' inside the module-level
    ``master_dictionary_directory``.

    Args:
        dataframe: DataFrame with an 'Article' column of text. Only the first
            row is scored.

    Returns:
        dict with the keys 'Positive Score', 'Negative Score',
        'Polarity Score' and 'Subjectivity Score'.
    """
    # Entries of the stop-word files that look like links or start with
    # non-word characters are not stop words and are skipped.
    link_pattern = re.compile(r'https?://[^\s]+|www\.[^\s]+')
    punctuation_pattern = re.compile(r'\W+')

    # A set removes duplicates and makes each membership test constant-time
    # instead of a scan over the whole stop-word list.
    stopword_set = set()
    for stopword_string in stopword_strings:
        for word in stopword_string.lower().split():
            if not link_pattern.match(word) and not punctuation_pattern.match(word):
                stopword_set.add(word)

    def remove_stopwords(text):
        """Drop every whitespace-separated word whose lowercase form is a stop word."""
        return ' '.join(word for word in text.split() if word.lower() not in stopword_set)

    # Apply the remove_stopwords function to the 'Article' column
    dataframe['Cleaned_Article'] = dataframe['Article'].apply(remove_stopwords)

    # Read positive and negative words from the MasterDictionary directory
    with open(os.path.join(master_dictionary_directory, 'positive-words.txt'), 'r', encoding='utf-8') as file:
        positive_words = set(re.findall(r'\b\w+\b', file.read()))

    with open(os.path.join(master_dictionary_directory, 'negative-words.txt'), 'r', encoding='latin-1') as file:
        negative_words = set(re.findall(r'\b\w+\b', file.read()))

    # Tokenize the cleaned text of the first row into words
    df_words = word_tokenize(dataframe['Cleaned_Article'].iloc[0])

    # NOTE(review): tokens are compared with the dictionaries exactly as
    # written (case-sensitive) - confirm this matches the dictionary casing.
    # Each occurrence of a dictionary word counts once.
    positive_score = sum(1 for word in df_words if word in positive_words)
    negative_score = sum(1 for word in df_words if word in negative_words)

    # The small constant keeps both divisions defined when a denominator is zero.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(df_words) + 0.000001)

    return {
        'Positive Score': positive_score,
        'Negative Score': negative_score,
        'Polarity Score': polarity_score,
        'Subjectivity Score': subjectivity_score
    }

# Example usage: download the article again (this rebinds `df`) and score it.
# calculate_sentiment_scores also adds a 'Cleaned_Article' column to `df`.
df = url_to_dataframe(url)
result = calculate_sentiment_scores(df)

# Print each score on its own line as "name: value"
for key, value in result.items():
    print(f"{key}: {value}")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Positive Score: 49
Negative Score: 63
Polarity Score: -0.12499999888392858
Subjectivity Score: 0.13429256578621995


In [10]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

def calculate_sentence_complexity_metrics(dataframe, complex_word_length=6):
    """Compute readability metrics for the 'Article' column of ``dataframe``.

    Args:
        dataframe: DataFrame with an 'Article' column; every entry is
            converted to ``str`` before tokenizing.
        complex_word_length: A token counts as complex when it has more than
            this many characters. Defaults to 6.

    Returns:
        dict with the mean over all rows of:
        'Average Sentence Length' (tokens per sentence),
        'Percentage of Complex Words' (complex tokens divided by all tokens,
        i.e. a fraction between 0 and 1, not multiplied by 100), and
        'Fog Index' (0.4 times the sum of the two values above).
    """
    # Ensure all entries are strings before tokenizing
    articles = dataframe['Article'].apply(str)
    sentence_counts = articles.apply(sent_tokenize).apply(len)
    words = articles.apply(word_tokenize)
    # The token count is needed twice, so compute it once.
    word_counts = words.apply(len)

    # Calculate Average Sentence Length
    average_sentence_length = word_counts / sentence_counts

    # Calculate Percentage of Complex Words (length-based approximation)
    complex_word_counts = words.apply(
        lambda tokens: sum(1 for token in tokens if len(token) > complex_word_length)
    )
    percentage_complex_words = complex_word_counts / word_counts

    # Calculate Fog Index
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    return {
        'Average Sentence Length': average_sentence_length.mean(),
        'Percentage of Complex Words': percentage_complex_words.mean(),
        'Fog Index': fog_index.mean()
    }

# Example usage: compute the readability metrics for the article held in `df`
sentence_complexity_metrics_result = calculate_sentence_complexity_metrics(df)

# Print each metric on its own line as "name: value"
for key, value in sentence_complexity_metrics_result.items():
    print(f"{key}: {value}")

Average Sentence Length: 40.6
Percentage of Complex Words: 0.29064039408866993
Fog Index: 16.356256157635467


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

def calculate_average_words_per_sentence(dataframe):
    """Return the average number of tokens per sentence for the first article.

    Args:
        dataframe: DataFrame with an 'Article' column of text. Only the first
            row is used.

    Returns:
        Number of word tokens divided by the number of sentences of the first
        article, or 0 when the frame has no rows or the text contains no
        sentences.
    """
    articles = dataframe['Article']
    if len(articles) == 0:
        # No article at all: report 0 instead of failing on the first-row lookup.
        return 0

    # Only the first row contributes to the result, so only it is tokenized.
    text = articles.iloc[0]
    sentence_count = len(sent_tokenize(text))
    if sentence_count == 0:
        # Empty text has no sentences; avoid dividing by zero.
        return 0

    # Calculate Average Number of Words Per Sentence
    return len(word_tokenize(text)) / sentence_count

# Example usage: average tokens per sentence for the article held in `df`
average_words_per_sentence_result = calculate_average_words_per_sentence(df)

# Print the result
print("Average Number of Words Per Sentence:", average_words_per_sentence_result)

Average Number of Words Per Sentence: 40.6


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
import syllables
from nltk.tokenize import word_tokenize

def calculate_complex_word_count(dataframe):
    """Count the tokens of the 'Article' column estimated to have more than two syllables.

    Every row of the column is tokenized and the syllable count of each
    token is estimated with ``syllables.estimate``.
    """
    complex_total = 0
    for article in dataframe['Article']:
        for token in word_tokenize(article):
            # A token is complex when its estimated syllable count exceeds two.
            if syllables.estimate(token) > 2:
                complex_total += 1
    return complex_total

# Example usage: count the complex words of the article held in `df`
complex_words_count_result = calculate_complex_word_count(df)

# Print the result
print("Complex Word Count:", complex_words_count_result)


Complex Word Count: 302


In [13]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('stopwords')

def calculate_total_cleaned_words(dataframe):
    """Count the tokens of the 'Article' column that remain after cleaning.

    A token is counted when its lowercase form is not an NLTK English stop
    word and contains at least one alphanumeric character. Tokens made up
    only of punctuation are therefore excluded.

    Args:
        dataframe: DataFrame with an 'Article' column of text; every row is
            tokenized.

    Returns:
        The number of cleaned words as an int.
    """
    stop_words = set(stopwords.words('english'))

    total_cleaned_words = 0
    for text in dataframe['Article']:
        for token in word_tokenize(text):
            lowered = token.lower()
            if lowered in stop_words:
                continue
            # Requiring one alphanumeric character already rejects pure
            # punctuation, so no separate punctuation lookup is needed, and
            # the cleaned strings do not have to be built just to be counted.
            if any(char.isalnum() for char in lowered):
                total_cleaned_words += 1

    return total_cleaned_words

# Example usage: count the cleaned words of the article held in `df`
total_cleaned_words_result = calculate_total_cleaned_words(df)

# Print the result
print("Total Cleaned Words:", total_cleaned_words_result)

Total Cleaned Words: 765


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Function to count syllables in a word
def count_syllables(word):
    """Estimate the number of syllables in ``word`` by counting vowel groups.

    Each run of consecutive vowels (a, e, i, o, u, y; case-insensitive)
    counts as one syllable. When the word ends in "es" or "ed" and more than
    one group was found, one is subtracted for the usually silent ending.

    Args:
        word: The token to analyse.

    Returns:
        The estimated syllable count, never less than 1.
    """
    vowels = set("aeiouy")
    # Lowercase once so the vowel test and the suffix test agree on case.
    lowered = word.lower()

    count = 0
    # A boolean flag is used instead of comparing the previous character with
    # the vowel string: an empty "previous character" is a substring of any
    # string, which made a word's leading vowel go uncounted.
    previous_was_vowel = False
    for char in lowered:
        is_vowel = char in vowels
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel

    if lowered.endswith(("es", "ed")) and count > 1:
        count -= 1  # Adjust for exceptions
    return max(count, 1)  # At least one syllable

def calculate_syllable_counts(dataframe):
    """Compute per-token, total and average syllable counts for the 'Article' column.

    Args:
        dataframe: DataFrame with an 'Article' column of text; every row is
            tokenized and the tokens are pooled.

    Returns:
        A tuple ``(syllables_per_word, total_syllables, average_syllables_per_word)``:
        the list of ``count_syllables`` results in token order, their sum, and
        the sum divided by the number of tokens (0 when there are no tokens).
    """
    # Tokenize every article and pool the tokens into one flat list
    flattened_words = [token for text in dataframe['Article'] for token in word_tokenize(text)]

    # Count syllables per word
    syllables_per_word = [count_syllables(token) for token in flattened_words]

    # Calculate total syllables
    total_syllables = sum(syllables_per_word)

    # Guard against division by zero when the text yields no tokens
    average_syllables_per_word = total_syllables / len(flattened_words) if flattened_words else 0

    return syllables_per_word, total_syllables, average_syllables_per_word

# Example usage: syllable statistics for the article held in `df`
# (the three results are stored but not printed in this cell)
syllables_per_word, total_syllables, average_syllables_per_word = calculate_syllable_counts(df)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
import re

def count_personal_pronouns(dataframe):
    """Count the personal pronouns I, we, my, ours and us in the first article.

    Matching is case-insensitive and restricted to whole words. The
    all-capitals form "US" is excluded because it normally denotes the
    country rather than the pronoun.

    Args:
        dataframe: DataFrame with an 'Article' column of text. Only the first
            row is searched.

    Returns:
        The number of pronoun occurrences as an int.
    """
    # Define the regex pattern for personal pronouns
    personal_pronouns_pattern = re.compile(r'\b(?:I|we|my|ours|us)\b', flags=re.IGNORECASE)

    # Find matches in the text of the first row
    personal_pronouns_matches = personal_pronouns_pattern.findall(dataframe['Article'].iloc[0])

    # Skip "US" written in capitals (country abbreviation, not a pronoun)
    return sum(1 for match in personal_pronouns_matches if match != 'US')

# Example usage: count the personal pronouns of the article held in `df`
personal_pronouns_count_result = count_personal_pronouns(df)

# Print the result
print("Personal Pronouns Count:", personal_pronouns_count_result)

Personal Pronouns Count: 5


In [17]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def calculate_average_word_length(dataframe):
    """Return the mean number of characters per token over the 'Article' column.

    Every row is tokenized and all tokens are pooled. The result is 0 when
    there are no tokens at all.
    """
    character_total = 0
    token_total = 0
    for article in dataframe['Article']:
        for token in word_tokenize(article):
            character_total += len(token)
            token_total += 1

    # No tokens means there is nothing to average.
    if token_total == 0:
        return 0
    return character_total / token_total

# Example usage: average token length for the article held in `df`
average_word_length_result = calculate_average_word_length(df)

# Print the result
print("Average Word Length:", average_word_length_result)


Average Word Length: 4.836030964109782


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
