In [30]:
# Imports & Configuration

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import os

# Download necessary NLTK resources
for resource in ["punkt", "stopwords"]:
    try:
        nltk.data.find(f"tokenizers/{resource}" if resource == "punkt" else f"corpora/{resource}")
    except LookupError:
        nltk.download(resource)

# Confirm working directory
print("Current working directory:", os.getcwd())


Current working directory: C:\Users\sunik


In [29]:
# Load Input file directly from your Jupyter home directory
data = pd.read_excel("Input.xlsx")

print(f"Loaded {len(data)} URLs from Input.xlsx")
data.head()


Loaded 147 URLs from Input.xlsx


Unnamed: 0,URL_ID,URL
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...
3,bctech2014,https://insights.blackcoffer.com/effective-man...
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...


In [10]:
# Create empty lists for scraped results
titles = []
texts = []
failed_urls = []

for index, row in data.iterrows():
    url_id = row["URL_ID"]
    url = row["URL"]
    print(f"Processing {index+1}/{len(data)}: {url_id}")

    try:
        # Send request
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        bsoup = bs(response.text, "html.parser")

        # Extract title
        title_tag = bsoup.find("h1", class_="entry-title") or bsoup.find("title")
        article_title = title_tag.get_text(strip=True) if title_tag else "TITLE NOT FOUND"

        # Extract article text
        content_tag = bsoup.find("div", class_="td-post-content tagdiv-type") or bsoup.find("div", class_="tdb-block-inner td-fix-index")
        article_text = content_tag.get_text(separator=" ", strip=True) if content_tag else "ARTICLE TEXT NOT FOUND"

        titles.append(article_title)
        texts.append(article_text)

    except requests.exceptions.RequestException as e:
        print(f" Failed to fetch URL_ID {url_id}: {e}")
        titles.append("ERROR")
        texts.append("ERROR")
        failed_urls.append(url_id)

data["Article_Title"] = titles
data["Article_Text"] = texts
print(f"Scraping complete — {len(failed_urls)} URLs failed.")


Processing 1/147: bctech2011
Processing 2/147: bctech2012
Processing 3/147: bctech2013
Processing 4/147: bctech2014
Processing 5/147: bctech2015
Processing 6/147: bctech2016
Processing 7/147: bctech2017
Processing 8/147: bctech2018
Processing 9/147: bctech2019
Processing 10/147: bctech2020
Processing 11/147: bctech2021
Processing 12/147: bctech2022
Processing 13/147: bctech2023
Processing 14/147: bctech2024
Processing 15/147: bctech2025
Processing 16/147: bctech2026
Processing 17/147: bctech2027
Processing 18/147: bctech2028
Processing 19/147: bctech2029
Processing 20/147: bctech2030
Processing 21/147: bctech2031
Processing 22/147: bctech2032
Processing 23/147: bctech2033
Processing 24/147: bctech2034
Processing 25/147: bctech2035
Processing 26/147: bctech2036
Processing 27/147: bctech2037
Processing 28/147: bctech2038
Processing 29/147: bctech2039
Processing 30/147: bctech2040
Processing 31/147: bctech2041
Processing 32/147: bctech2042
Processing 33/147: bctech2043
Processing 34/147: 

In [12]:
# Load stopword files
def load_wordlist(path):
    words = set()
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            w = line.strip()
            if w and not w.startswith(";"):
                words.add(w.lower())
    return words

stop_words = set(stopwords.words("english"))

# Adding custom stopword lists (merge all uploaded ones)
custom_stop_files = [
    "StopWords_Auditor.txt", "StopWords_Currencies.txt", "StopWords_DatesandNumbers.txt",
    "StopWords_Generic.txt", "StopWords_GenericLong.txt", "StopWords_Geographic.txt",
    "StopWords_Names.txt"
]
for file in custom_stop_files:
    stop_words.update(load_wordlist(file))

# Load positive and negative words
positive_words = load_wordlist("positive-words.txt")
negative_words = load_wordlist("negative-words.txt")

print(f"Loaded {len(stop_words)} stopwords, {len(positive_words)} positive, {len(negative_words)} negative words.")


Loaded 12797 stopwords, 2006 positive, 4783 negative words.


In [13]:
test_url = 'https://insights.blackcoffer.com/efficient-aws-infrastructure-setup-and-management-addressing-security-scalability-and-compliance/'
print(url)

https://insights.blackcoffer.com/amazon-buy-bot-an-automation-ai-tool-to-auto-checkouts/


In [19]:
# Full scraping loop for all articles
import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm

extracted_titles = []
extracted_texts = []
failed_urls = []

for index, row in tqdm(data.iterrows(), total=len(data)):
    url_id = row['URL_ID']
    url = row['URL']

    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        bsoup = bs(response.text, 'html.parser')

        title_tag = bsoup.find('h1', class_='entry-title') or bsoup.find('title')
        article_title = title_tag.get_text().strip() if title_tag else "TITLE NOT FOUND"

        content_tag = bsoup.find('div', class_='td-post-content tagdiv-type') or \
                      bsoup.find('div', class_='tdb-block-inner td-fix-index')
        article_text = content_tag.get_text(separator="\n").strip() if content_tag else "ARTICLE TEXT NOT FOUND"

        extracted_titles.append(article_title)
        extracted_texts.append(article_text)

    except requests.exceptions.RequestException as e:
        print(f" Failed URL_ID {url_id} ({url}): {e}")
        extracted_titles.append("ERROR")
        extracted_texts.append("ERROR")
        failed_urls.append(url_id)

data["Article_Title"] = extracted_titles
data["Article_Text"] = extracted_texts

print(f" Scraping complete. {len(failed_urls)} URLs failed.")


100%|████████████████████████████████████████████████████████████████████████████████| 147/147 [04:33<00:00,  1.86s/it]

 Scraping complete. 0 URLs failed.





In [20]:
print(data.head())

       URL_ID                                                URL  \
0  bctech2011  https://insights.blackcoffer.com/ml-and-ai-bas...   
1  bctech2012  https://insights.blackcoffer.com/streamlined-i...   
2  bctech2013  https://insights.blackcoffer.com/efficient-dat...   
3  bctech2014  https://insights.blackcoffer.com/effective-man...   
4  bctech2015  https://insights.blackcoffer.com/streamlined-t...   

                                       Article_Title  \
0  ML and AI-based insurance premium model to pre...   
1  Streamlined Integration: Interactive Brokers A...   
2  Efficient Data Integration and User-Friendly I...   
3  Effective Management of Social Media Data Extr...   
4  Streamlined Trading Operations Interface for M...   

                                        Article_Text  
0  Client Background\n\n\nClient:\n A leading ins...  
1  Client Background\n\n\nClient:\n A leading fin...  
2  Client Background\n\n\nClient:\n A leading tec...  
3  Client Background\n\n\nClient:\

In [25]:
import re
from nltk.tokenize import sent_tokenize, word_tokenize

def count_syllables(word):
    """Estimate syllables in a word."""
    word = word.lower()
    vowels = "aeiou"
    count, prev_vowel = 0, False
    for ch in word:
        if ch in vowels:
            if not prev_vowel:
                count += 1
            prev_vowel = True
        else:
            prev_vowel = False
    if word.endswith(("es", "ed")):
        count -= 1
    return max(1, count)

In [26]:
def analyze_text(text):
    """Compute sentiment and readability metrics for one article."""
    if not isinstance(text, str) or text.strip() in ["ERROR", "ARTICLE TEXT NOT FOUND"]:
        return {col: 0 for col in [
            "POSITIVE SCORE", "NEGATIVE SCORE", "POLARITY SCORE", "SUBJECTIVITY SCORE",
            "AVG SENTENCE LENGTH", "PERCENTAGE OF COMPLEX WORDS", "FOG INDEX",
            "AVG NUMBER OF WORDS PER SENTENCE", "COMPLEX WORD COUNT",
            "WORD COUNT", "SYLLABLE PER WORD", "PERSONAL PRONOUNS", "AVG WORD LENGTH"
        ]}

    sentences = sent_tokenize(text)
    words = [w for w in word_tokenize(text) if w.isalpha()]
    words_clean = [w.upper() for w in words if w.lower() not in stop_words]

    # Sentiment
    pos = sum(1 for w in words_clean if w in positive_words)
    neg = sum(1 for w in words_clean if w in negative_words)
    polarity = (pos - neg) / ((pos + neg) + 1e-6)
    subjectivity = (pos + neg) / (len(words_clean) + 1e-6)

    # Readability
    avg_sentence_len = len(words_clean) / max(1, len(sentences))
    complex_words = [w for w in words_clean if count_syllables(w) > 2]
    percent_complex = len(complex_words) / max(1, len(words_clean))
    fog_index = 0.4 * (avg_sentence_len + percent_complex)

    # Other metrics
    word_count = len(words_clean)
    syllables = sum(count_syllables(w) for w in words_clean)
    syllable_per_word = syllables / max(1, len(words_clean))
    pronouns = len(re.findall(r"\b(I|we|my|ours|us)\b", text, flags=re.I))
    avg_word_len = sum(len(w) for w in words_clean) / max(1, len(words_clean))

    return {
        "POSITIVE SCORE": pos,
        "NEGATIVE SCORE": neg,
        "POLARITY SCORE": polarity,
        "SUBJECTIVITY SCORE": subjectivity,
        "AVG SENTENCE LENGTH": avg_sentence_len,
        "PERCENTAGE OF COMPLEX WORDS": percent_complex,
        "FOG INDEX": fog_index,
        "AVG NUMBER OF WORDS PER SENTENCE": avg_sentence_len,
        "COMPLEX WORD COUNT": len(complex_words),
        "WORD COUNT": word_count,
        "SYLLABLE PER WORD": syllable_per_word,
        "PERSONAL PRONOUNS": pronouns,
        "AVG WORD LENGTH": avg_word_len
    }

In [22]:
print(" Running NLP analysis on all scraped articles...")
analysis_results = data["Article_Text"].apply(analyze_text).apply(pd.Series)

# Merge with main DataFrame
data = pd.concat([data, analysis_results], axis=1)

print(" NLP analysis complete.")


 Running NLP analysis on all scraped articles...
 NLP analysis complete.


In [23]:
output_path = "Output_Data_Structure_Completed.xlsx"
data.to_excel(output_path, index=False)
print(f" Final file saved successfully as: {output_path}")

 Final file saved successfully as: Output_Data_Structure_Completed.xlsx


In [24]:
data.head(3)

Unnamed: 0,URL_ID,URL,Article_Title,Article_Text,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,ML and AI-based insurance premium model to pre...,Client Background\n\n\nClient:\n A leading ins...,0.0,0.0,0.0,0.0,9.463277,0.507463,3.988296,9.463277,850.0,1675.0,2.660299,2.0,7.94806
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,Streamlined Integration: Interactive Brokers A...,Client Background\n\n\nClient:\n A leading fin...,0.0,0.0,0.0,0.0,6.269231,0.43865,2.683152,6.269231,143.0,326.0,2.631902,1.0,7.846626
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,Efficient Data Integration and User-Friendly I...,Client Background\n\n\nClient:\n A leading tec...,0.0,0.0,0.0,0.0,11.542857,0.34901,4.756747,11.542857,141.0,404.0,2.306931,1.0,7.264851


In [33]:
OUTPUT_SAVE = DATA_DIR / "Output_Data_Structure_Completed.xlsx"
OUTPUT_CSV = DATA_DIR / "Output_Data_Structure_Completed.csv"

print(OUTPUT_CSV)

C:\Users\sunik\Output_Data_Structure_Completed.csv
