## Set Up Libraries, Paths, and Download NLTK Data

In [1]:
import os
import nltk

# Paths
# All input data lives under one root folder. It defaults to the location this
# notebook was written against; set the ARTICLE_DATA_DIR environment variable
# to run the notebook against another copy of the data without editing the cell.
DATA_DIR = os.environ.get('ARTICLE_DATA_DIR', r'C:\Users\dell\OneDrive\Sem5\artilce')
ARTICLES_FOLDER_PATH = os.path.join(DATA_DIR, 'articles')
OUTPUT_FILE_PATH = 'Output Data.xlsx'  # relative: written to the working directory
POSITIVE_WORDS_PATH = os.path.join(DATA_DIR, 'MasterDictionary', 'positive-words.txt')
NEGATIVE_WORDS_PATH = os.path.join(DATA_DIR, 'MasterDictionary', 'negative-words.txt')
CUSTOM_STOPWORDS_FOLDER_PATH = os.path.join(DATA_DIR, 'StopWords')

# Make sure the NLTK resources are present: the English stop-word corpus and
# the 'punkt' tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

print("Setup complete.")


Setup complete.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load Stop Words and Custom Stop Words

In [2]:
from nltk.corpus import stopwords

# Load stop words: start from NLTK's built-in English list
stop_words = set(stopwords.words('english'))

# Load custom stop words: merge in every file found in the custom folder
for filename in os.listdir(CUSTOM_STOPWORDS_FOLDER_PATH):
    file_path = os.path.join(CUSTOM_STOPWORDS_FOLDER_PATH, filename)
    if not os.path.isfile(file_path):
        # Sub-folders and other non-file entries cannot be opened for reading
        continue
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        for line in file.read().splitlines():
            # NOTE(review): some entries presumably carry a trailing "| note"
            # annotation -- confirm against the files. A line containing '|'
            # can never equal an alphabetic token, so only the part before it
            # is kept.
            entry = line.split('|', 1)[0].strip().lower()
            # Tokens are looked up as word.lower(), so an entry has to be stored
            # lowercase and without padding or it can never match.
            if entry:
                stop_words.add(entry)

print(f"Total stop words loaded: {len(stop_words)}")


Total stop words loaded: 12948


## Load Positive and Negative Words

In [3]:
# The encoding is given explicitly so the result does not depend on the
# platform's default, and matches how the other text files in this notebook
# are read. NOTE(review): assumes the lexicon files are UTF-8/ASCII -- with
# errors='ignore' any undecodable byte is dropped; confirm against the files.

# Load positive words (one entry per line; blank lines and padding are dropped)
with open(POSITIVE_WORDS_PATH, 'r', encoding='utf-8', errors='ignore') as file:
    positive_words = {line.strip() for line in file.read().splitlines() if line.strip()}

# Load negative words (same format as the positive list)
with open(NEGATIVE_WORDS_PATH, 'r', encoding='utf-8', errors='ignore') as file:
    negative_words = {line.strip() for line in file.read().splitlines() if line.strip()}

print(f"Positive words loaded: {len(positive_words)}")
print(f"Negative words loaded: {len(negative_words)}")


Positive words loaded: 2006
Negative words loaded: 4783


## Define Syllable Count Function

In [4]:
# Function to calculate syllables in a word
def syllable_count(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Every run of consecutive vowels (``a e i o u y``) counts as one syllable,
    one is subtracted when the word ends in "e", and a non-empty word is always
    reported as having at least one syllable.

    Args:
        word: The word to analyse. Case is ignored.

    Returns:
        int: The estimated syllable count; 0 for an empty string.
    """
    word = word.lower()
    if not word:
        # An empty string has no first character to inspect and no syllables.
        return 0

    vowels = "aeiouy"
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a run starts a new syllable
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    # A trailing "e" is treated as silent
    if word.endswith("e"):
        count -= 1

    # Never report fewer than one syllable for a real word
    return max(count, 1)

print("Syllable count function defined.")


Syllable count function defined.


## Initialize Results List

In [5]:
# Empty accumulator that the processing cells append result rows to
results = list()




## Process Each Article

In [6]:
import re

# Process each article: pull the heading and body out of every saved file.
# sorted() makes the order (and therefore which article is processed last)
# deterministic; os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(ARTICLES_FOLDER_PATH)):
    file_path = os.path.join(ARTICLES_FOLDER_PATH, filename)
    # Part of the file name before the first '_' with the 'blackassign' prefix removed
    url_id = filename.split('_')[0].replace('blackassign', '')

    # Only the read needs the file handle; parsing happens after it is closed
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()

    heading_match = re.search(r'Heading: (.*)', content)
    # DOTALL lets the capture run across newlines to the end of the content
    text_match = re.search(r'Article Text:\n(.*)', content, re.DOTALL)

    if not (heading_match and text_match):
        # Report the file rather than dropping it without a trace
        print(f"Skipped {filename}: heading or article text not found.")
        continue

    # Assign only after a successful parse, so `heading` and `article_text`
    # always hold strings and never a match object or None from a skipped file.
    heading = heading_match.group(1)
    article_text = text_match.group(1)

    print(f"Processed article {url_id}: {heading[:50]}...")


Processed article 0001: Rising IT cities and its impact on the economy, en...
Processed article 0002: Rising IT Cities and Their Impact on the Economy, ...
Processed article 0003: Internet Demand’s Evolution, Communication Impact,...
Processed article 0004: Rise of Cybercrime and its Effect in upcoming Futu...
Processed article 0005: OTT platform and its impact on the entertainment i...
Processed article 0006: The rise of the OTT platform and its impact on the...
Processed article 0007: Rise of Cyber Crime and its Effects...
Processed article 0008: Rise of Internet Demand and Its Impact on Communic...
Processed article 0009: Rise of Cybercrime and its Effect by the Year 2040...
Processed article 0010: Rise of Cybercrime and its Effect by the Year 2040...
Processed article 0011: Rise of Internet Demand and its Impact on Communic...
Processed article 0012: Rise of telemedicine and its Impact on Livelihood ...
Processed article 0013: Rise of e-health and its impact on humans by the y...
P

**Note:** two files are reported as `Processed article 0036: Error...` and `Processed article 0049: Error...`.

For these two files the article is not there.

## Clean and Tokenize the Text

In [7]:
from nltk.tokenize import word_tokenize

# Cleaning step shown on a single article's text: tokenize it, then keep only
# purely alphabetic tokens whose lowercase form is not a stop word
words = [
    token
    for token in word_tokenize(article_text)
    if token.isalpha() and token.lower() not in stop_words
]

print(f"Tokenized words: {words[:10]}...")


Tokenized words: ['business', 'close', 'prevent', 'transmission', 'financial', 'concerns', 'job', 'losses', 'human', 'impacts']...


In [8]:
import pandas as pd

# Define the path to the input Excel file.
# The workbook sits in the folder that contains the articles folder, so the
# path is derived from ARTICLES_FOLDER_PATH instead of repeating a
# machine-specific absolute path.
INPUT_EXCEL_PATH = os.path.join(os.path.dirname(ARTICLES_FOLDER_PATH), 'Input.xlsx')

# Read the input Excel file
url_df = pd.read_excel(INPUT_EXCEL_PATH)

# Fail early, with a clear message, if the columns needed to look up an
# article's URL by its URL_ID are absent
missing_columns = {'URL_ID', 'URL'} - set(url_df.columns)
assert not missing_columns, f"Input file is missing columns: {sorted(missing_columns)}"

print(url_df.head())


            URL_ID                                                URL
0  blackassign0001  https://insights.blackcoffer.com/rising-it-cit...
1  blackassign0002  https://insights.blackcoffer.com/rising-it-cit...
2  blackassign0003  https://insights.blackcoffer.com/internet-dema...
3  blackassign0004  https://insights.blackcoffer.com/rise-of-cyber...
4  blackassign0005  https://insights.blackcoffer.com/ott-platform-...


## Calculate Derived Variables

In [10]:
from nltk.tokenize import sent_tokenize
# Process each article and record one row of text metrics per article
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _in_lexicon(word, lexicon):
    """Return True when the word, as written or lowercased, is in the lexicon."""
    return word in lexicon or word.lower() in lexicon


# Start from an empty list so re-running this cell does not append duplicate rows
results = []

# sorted() gives a deterministic row order; os.listdir() order is arbitrary
for filename in sorted(os.listdir(ARTICLES_FOLDER_PATH)):
    file_path = os.path.join(ARTICLES_FOLDER_PATH, filename)
    # Part of the file name before the first '_' with the 'blackassign' prefix removed
    url_id = filename.split('_')[0].replace('blackassign', '')

    # Only the read needs the file handle; everything else runs after it is closed
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()

    heading_match = re.search(r'Heading: (.*)', content)
    # DOTALL lets the capture run across newlines to the end of the content
    text_match = re.search(r'Article Text:\n(.*)', content, re.DOTALL)
    if not (heading_match and text_match):
        # Files without both markers are left out of the results
        continue
    article_text = text_match.group(1)

    # Clean and tokenize the text: alphabetic tokens that are not stop words
    words = word_tokenize(article_text)
    words = [word for word in words if word.isalpha() and word.lower() not in stop_words]
    word_count = len(words)

    # Sentiment scores. The lookup also tries the lowercase form so that a
    # capitalised token (e.g. at the start of a sentence) is still counted,
    # in line with the case-insensitive stop-word filter above.
    positive_score = sum(1 for word in words if _in_lexicon(word, positive_words))
    negative_score = sum(1 for word in words if _in_lexicon(word, negative_words))
    # The small constant keeps the denominators non-zero
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    # Readability metrics. _safe_ratio yields 0.0 instead of raising
    # ZeroDivisionError when an article has no sentences or no remaining words.
    sentences = sent_tokenize(article_text)
    avg_sentence_length = _safe_ratio(word_count, len(sentences))
    # A word counts as complex when it has more than two syllables
    complex_words = [word for word in words if syllable_count(word) > 2]
    complex_word_count = len(complex_words)
    # Stored as a fraction (0..1), not multiplied by 100
    percentage_of_complex_words = _safe_ratio(complex_word_count, word_count)
    fog_index = 0.4 * (avg_sentence_length + percentage_of_complex_words)
    # Same quantity as avg_sentence_length, reported under a second column name
    avg_number_of_words_per_sentence = avg_sentence_length
    syllable_per_word = _safe_ratio(sum(syllable_count(word) for word in words), word_count)
    # Case-insensitive pronoun match, excluding the all-caps token "US", which
    # is the country abbreviation rather than the pronoun "us"
    personal_pronouns = sum(
        1
        for match in re.finditer(r'\b(I|we|my|ours|us)\b', article_text, re.I)
        if match.group(0) != 'US'
    )
    avg_word_length = _safe_ratio(sum(len(word) for word in words), word_count)

    # Get the URL from the DataFrame
    matching_url = url_df.loc[url_df['URL_ID'] == f'blackassign{url_id}', 'URL']
    if not matching_url.empty:
        current_url = matching_url.values[0]
    else:
        print(f"URL_ID blackassign{url_id} not found in the DataFrame.")
        continue

    # Append the results to the list
    results.append({
        'URL_ID': f'blackassign{url_id}',
        'URL': current_url,
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_of_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_number_of_words_per_sentence,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllable_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length
    })



In [11]:
# Build a single table from the collected result rows
results_df = pd.DataFrame(data=results)

# Write the table to the output workbook without the index column
results_df.to_excel(excel_writer=OUTPUT_FILE_PATH, index=False)
print(f"Results saved to {OUTPUT_FILE_PATH}")


Results saved to Output Data.xlsx
