In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
import syllables
import os
from tqdm import tqdm

# Download required NLTK resources (run this once)
nltk.download("punkt")

# Function to fetch the HTML content of a URL
def fetch_html_content(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to fetch content from {url}")
        return None

# Function to extract article title and text from HTML
def extract_article_data(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    article_title_element = soup.find("title")  # Find the <title> tag for the article title
    article_text_elements = soup.find_all("p")  # Find all <p> tags for the article text

    article_title = article_title_element.text.strip() if article_title_element else ""
    article_text = "\n".join([element.text.strip() for element in article_text_elements]) if article_text_elements else ""
    
    return article_title, article_text


# Function to calculate sentimental analysis
def sentimental_analysis(text, positive_words, negative_words):
    tokens = word_tokenize(text.lower())
    positive_score = sum(1 for word in tokens if word in positive_words)
    negative_score = sum(1 for word in tokens if word in negative_words)
    if positive_score + negative_score == 0:
        polarity_score = 0
        subjectivity_score = 0
    else:
        polarity_score = (positive_score - negative_score) / (positive_score + negative_score)
        subjectivity_score = (positive_score + negative_score) / len(tokens)
    return positive_score, negative_score, polarity_score, subjectivity_score

def text_analysis(text, stop_words):
    words = word_tokenize(text)
    sentences = nltk.sent_tokenize(text)

    complex_words = [word for word in words if len(word) > 2 and word.isalpha()]

    if len(sentences) == 0:
        avg_sentence_length = 0
        percentage_complex_words = 0
    else:
        # Remove stop words from the words list
        words = [word for word in words if word.lower() not in stop_words]

        # Calculate average sentence length
        avg_sentence_length = len(words) / len(sentences)

        # Calculate percentage of complex words
        if len(words) == 0:
            percentage_complex_words = 0
        else:
            percentage_complex_words = len(complex_words) / len(words)

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    if len(sentences) == 0:
        avg_words_per_sentence = 0
    else:
        avg_words_per_sentence = len(words) / len(sentences)

    word_count = len(words)

    if len(words) == 0:
        syllable_count_per_word = 0
    else:
        syllable_count_per_word = sum([syllables.estimate(word) for word in words]) / len(words)

    personal_pronouns = ["i", "we", "my", "ours", "us"]
    personal_pronouns_count = sum(1 for word in words if word.lower() in personal_pronouns)

    if len(words) == 0:
        avg_word_length = 0
    else:
        total_chars = sum(len(word) for word in words)
        avg_word_length = total_chars / len(words)

    return (
        avg_sentence_length,
        percentage_complex_words,
        fog_index,
        avg_words_per_sentence,
        len(complex_words),  # Use len(complex_words) directly here
        word_count,
        syllable_count_per_word,
        personal_pronouns_count,
        avg_word_length,
    )

# Read positive and negative words from text files
positive_words = set()
negative_words = set()

with open("/Users/vikranthreddimasu/Downloads/Assignment/positive-words.txt", "r", encoding="latin-1") as file:
    positive_words.update(file.read().splitlines())

with open("/Users/vikranthreddimasu/Downloads/Assignment/negative-words.txt", "r", encoding="latin-1") as file:
    negative_words.update(file.read().splitlines())

# Read stop words from the text files
stop_words = set()
stop_words_files = [
    "StopWords_Auditor.txt",
    "StopWords_Currencies.txt",
    "StopWords_DatesandNumbers.txt",
    "StopWords_Generic.txt",
    "StopWords_GenericLong.txt",
    "StopWords_Geographic.txt",
    "StopWords_Names.txt",
]

for stop_words_file in stop_words_files:
    with open(f"/Users/vikranthreddimasu/Downloads/Assignment/{stop_words_file}", "r", encoding="latin-1") as file:
        stop_words.update(file.read().splitlines())

# Read the URLs from the input Excel file
input_file_path = "/Users/vikranthreddimasu/Downloads/Assignment/input.xlsx"
df_urls = pd.read_excel(input_file_path)

# Create a new DataFrame to store the results
results_df = pd.DataFrame(columns=[
    "URL_ID", "URL", "Article Title", "Article Text", "Positive Score", "Negative Score",
    "Polarity Score", "Subjectivity Score", "Avg Sentence Length", "Percentage of Complex Words",
    "Fog Index", "Avg Words per Sentence", "Complex Word Count", "Word Count", "Syllable per Word",
    "Personal Pronouns", "Avg Word Length",
])

for index, row in tqdm(df_urls.iterrows(), total=df_urls.shape[0], desc="Processing URLs"):
    url = row["URL"]
    try:
        html_content = fetch_html_content(url)
        if html_content:
            article_title, article_text = extract_article_data(html_content)

            # Save the extracted article text to a text file
            file_name = f"{row['URL_ID']}.txt"
            file_path = os.path.join("/Users/vikranthreddimasu/Downloads/Assignment/ExtractedArticles", file_name)

            with open(file_path, "w", encoding="utf-8") as text_file:
                text_file.write(article_text)

            # Sentimental analysis
            positive_score, negative_score, polarity_score, subjectivity_score = sentimental_analysis(
                article_text, positive_words, negative_words
            )

            # Text analysis
            text_analysis_results = text_analysis(article_text, stop_words)

            results_df = results_df.append(
                {
                    "URL_ID": row["URL_ID"], "URL": url, "Article Title": article_title,
                    "Article Text": article_text, "Positive Score": positive_score,
                    "Negative Score": negative_score, "Polarity Score": polarity_score,
                    "Subjectivity Score": subjectivity_score, "Avg Sentence Length": text_analysis_results[0],
                    "Percentage of Complex Words": text_analysis_results[1], "Fog Index": text_analysis_results[2],
                    "Avg Words per Sentence": text_analysis_results[3], "Complex Word Count": text_analysis_results[4],
                    "Word Count": text_analysis_results[5], "Syllable per Word": text_analysis_results[6],
                    "Personal Pronouns": text_analysis_results[7], "Avg Word Length": text_analysis_results[8],
                },
                ignore_index=True,
            )
    except Exception as e:
        print(f"An error occurred for URL: {url}")
        print(f"Error: {str(e)}")

# Save the results DataFrame to an Excel file
output_file_path = "/Users/vikranthreddimasu/Downloads/Assignment/output_results.xlsx"
results_df.to_excel(output_file_path, index=False)

print("Data extraction and analysis completed. Results saved to:", output_file_path)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vikranthreddimasu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
Processing URLs:   7%|█▊                        | 8/114 [00:19<04:03,  2.30s/it]

Failed to fetch content from https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/


  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
Processing URLs:  18%|████▌                    | 21/114 [00:50<03:34,  2.30s/it]

Failed to fetch content from https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/


  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df =

Failed to fetch content from https://insights.blackcoffer.com/ensuring-growth-through-insurance-technology/


  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
  results_df = results_df.append(
Processing URLs: 100%|████████████████████████| 114/114 [05:02<00:00,  2.65s/it]

Data extraction and analysis completed. Results saved to: /Users/vikranthreddimasu/Downloads/Assignment/output_results.xlsx



