In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

In [4]:
# Read input file
# The workbook must provide the 'URL_ID' and 'URL' columns: later cells read them
# from `df` row by row and write the computed metrics back into the same frame.
input_file = "input.xlsx"
df = pd.read_excel(input_file)

In [5]:
# Create a directory
# Receives one "<URL_ID>.txt" file per scraped article. exist_ok=True lets this
# cell be re-run when the folder is already present.
output_directory = "extracted_articles"
os.makedirs(output_directory, exist_ok=True)

# Function to extract article text from a given URL
def extract_article_text(url, timeout=30):
    """Download a page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            single unresponsive host cannot stall the whole batch.

    Returns:
        A ``(title, article_text)`` tuple. ``title`` is the stripped text of
        the page's ``<title>`` element; ``article_text`` is the text of every
        ``<p>`` element, each followed by a newline. ``(None, None)`` is
        returned when the page cannot be fetched, answers with an HTTP error
        status, or has no ``<title>``. The failure is printed rather than
        raised so one bad URL does not abort the run.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing the error page
        # as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract article title
        title_tag = soup.title
        if title_tag is None:
            # Reported through the handler below with a readable message.
            raise ValueError("page has no <title> element")
        title = title_tag.text.strip()

        # Extract article text
        article_text = ''.join(paragraph.text + '\n' for paragraph in soup.find_all('p'))

        return title, article_text
    except Exception as e:
        # Deliberately broad: scraping is best-effort and the caller skips
        # any URL that yields (None, None).
        print(f"Error extracting text from {url}: {e}")
        return None, None

# Scrape every URL listed in the DataFrame and store each article on disk
for index, row in df.iterrows():
    url_id, url = row['URL_ID'], row['URL']

    # Fetch the page; a failed extraction comes back as (None, None)
    title, article_text = extract_article_text(url)

    # Nothing is written when the title or the body is missing or empty
    if not (title and article_text):
        continue

    # One text file per article, named after its URL_ID
    output_file_path = os.path.join(output_directory, f"{url_id}.txt")
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.writelines([f"Title: {title}\n\n", article_text])

print("Extraction complete. Text files saved in the 'extracted_articles' directory.")


Extraction complete. Text files saved in the 'extracted_articles' directory.


# Text analysis: sentiment and readability metrics

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# 'punkt' provides the models behind word_tokenize / sent_tokenize.
nltk.download('punkt')
nltk.download('stopwords')

# Load stop words from multiple files
stop_words_files = ["StopWords_Auditor.txt", "StopWords_Currencies.txt", "StopWords_DatesandNumbers.txt",
                    "StopWords_Generic.txt", "StopWords_GenericLong.txt", "StopWords_Geographic.txt",
                    "StopWords_Names.txt"]

# calculate_variables() keeps only isalpha() tokens and looks them up in lower
# case, so an entry can only ever match if it is a bare lower-case word. Each
# line is therefore normalised: anything after a '|' is dropped
# (NOTE(review): presumably a trailing annotation - confirm against the files),
# whitespace is trimmed, the word is lower-cased and blank lines are skipped.
# latin-1 maps every byte to a character, so the read cannot fail with
# UnicodeDecodeError the way a strict utf-8 read can.
stop_words = set()
for file in stop_words_files:
    with open(file, 'r', encoding='latin-1') as f:
        entries = (line.split('|')[0].strip().lower() for line in f.read().splitlines())
        stop_words.update(entry for entry in entries if entry)

# Load positive and negative words from Master Dictionary
positive_words = set()
negative_words = set()

# Same normalisation as above: lookups are done on lower-cased tokens.
with open("MasterDictionary/positive-words.txt", 'r', encoding='latin-1') as f:
    positive_words.update(line.strip().lower() for line in f.read().splitlines() if line.strip())

with open("MasterDictionary/negative-words.txt", 'r', encoding='latin-1') as f:
    negative_words.update(line.strip().lower() for line in f.read().splitlines() if line.strip())

# Function to calculate variables
def calculate_variables(article_text):
    """Compute the sentiment and readability metrics for one article.

    "Words" below means the alphabetic tokens of the text, lower-cased, that
    are not in ``stop_words``.

    Args:
        article_text: Full text of the article as a single string.

    Returns:
        A 13-tuple, in this order: positive score, negative score, polarity
        score, subjectivity score, average sentence length (words per
        sentence), percentage of complex words, fog index, average number of
        words per sentence (same value as the average sentence length),
        complex word count, word count, syllables per word, personal pronoun
        count and average word length. Averages and percentages are 0.0 when
        the text yields no words or no sentences.
    """
    # Tokenize the text into words
    tokens = word_tokenize(article_text)

    # Count personal pronouns on the raw tokens, so the count does not depend
    # on whether the stop lists contain these pronouns, and so the upper-case
    # abbreviation "US" is not mistaken for the pronoun "us".
    personal_pronouns = {'i', 'we', 'my', 'ours', 'us'}
    personal_pronoun_count = sum(
        1 for token in tokens
        if token != 'US' and token.lower() in personal_pronouns
    )

    # Remove stop words
    words = [token.lower() for token in tokens
             if token.isalpha() and token.lower() not in stop_words]

    # Count total words
    total_words = len(words)

    # Count complex words (more than two syllables)
    complex_word_count = sum(1 for word in words if syllable_count(word) > 2)

    # Calculate positive and negative scores. Both are kept as non-negative
    # counts: the formulas below add them together, which is only meaningful
    # (polarity within [-1, 1], subjectivity >= 0) when neither is negative.
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)

    # Calculate polarity score (the small constant avoids division by zero)
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)

    # Calculate subjectivity score
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)

    # Calculate average sentence length
    total_sentences = len(sent_tokenize(article_text))
    avg_sentence_length = total_words / total_sentences if total_sentences else 0.0

    # Calculate average number of words per sentence
    avg_words_per_sentence = avg_sentence_length

    # Per-word averages are undefined for a text without words; report 0.0
    # instead of raising ZeroDivisionError.
    if total_words:
        percentage_complex_words = (complex_word_count / total_words) * 100
        avg_syllables_per_word = syllables_per_word(words)
        avg_word_length = sum(len(word) for word in words) / total_words
    else:
        percentage_complex_words = avg_syllables_per_word = avg_word_length = 0.0

    # Calculate Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    return (positive_score, negative_score, polarity_score, subjectivity_score,
            avg_sentence_length, percentage_complex_words, fog_index,
            avg_words_per_sentence, complex_word_count, total_words,
            avg_syllables_per_word, personal_pronoun_count, avg_word_length)

# Function to count syllables in a word
def syllable_count(word):
    """Estimate the number of syllables in ``word``.

    One syllable is counted for every vowel (a, e, i, o, u, y) that starts a
    run of consecutive vowels. One is subtracted for words ending in "es" or
    "ed", and every non-empty word reports at least one syllable.

    Args:
        word: The word to analyse; upper and lower case are treated alike.

    Returns:
        The estimated syllable count, or 0 for an empty string.
    """
    # An empty string has no characters to inspect.
    if not word:
        return 0

    vowels = "aeiouy"
    word = word.lower()
    count = 0

    # Count each vowel that is not directly preceded by another vowel
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    # Handling exceptions
    if word.endswith(("es", "ed")):
        count -= 1

    # At least one syllable is reported for any non-empty word
    return max(count, 1)

# Function to calculate syllables per word
def syllables_per_word(words):
    """Return the mean syllable count over ``words``.

    Args:
        words: Sequence of words; each is measured with ``syllable_count``.

    Returns:
        Total syllables divided by the number of words, or 0.0 when ``words``
        is empty (instead of raising ZeroDivisionError).
    """
    if not words:
        return 0.0
    syllable_sum = sum(syllable_count(word) for word in words)
    return syllable_sum / len(words)

# Columns filled for every article, in the order calculate_variables() returns them
metric_columns = ['POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
                  'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
                  'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
                  'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']

# Create any missing metric column up front so the sheet layout does not depend
# on which rows end up being processed.
for column in metric_columns:
    if column not in df.columns:
        df[column] = float('nan')

# Iterate through each row in the DataFrame
for index, row in df.iterrows():
    url_id = row['URL_ID']
    article_text_file_path = os.path.join(output_directory, f"{url_id}.txt")

    # The scraping step writes no file for a URL it could not extract, so a
    # missing file is expected; that row keeps empty metrics.
    if not os.path.exists(article_text_file_path):
        print(f"No extracted article for {url_id}; skipping.")
        continue

    # Read the article text from the file
    with open(article_text_file_path, 'r', encoding='utf-8') as file:
        article_text = file.read()

    # Calculate variables
    variables = calculate_variables(article_text)

    # Update the DataFrame with calculated variables. df.at only addresses a
    # single cell, so the 13 values are assigned through df.loc instead.
    df.loc[index, metric_columns] = variables

# Save the DataFrame to the output Excel file
output_file = "Output Data Structure.xlsx"
df.to_excel(output_file, index=False, engine='openpyxl')

print(f"Data analysis complete. Results saved in {output_file}.")


In [6]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# 'punkt' provides the models behind word_tokenize / sent_tokenize.
nltk.download('punkt')
# NOTE(review): the cells shown here build their stop list from the
# StopWords_*.txt files and never use nltk's 'stopwords' corpus - confirm this
# download is still needed.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dvaru\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dvaru\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [8]:
# Load stop words from multiple files
stop_words_files = ["StopWords_Auditor.txt", "StopWords_Currencies.txt", "StopWords_DatesandNumbers.txt",
                    "StopWords_Generic.txt", "StopWords_GenericLong.txt", "StopWords_Geographic.txt",
                    "StopWords_Names.txt"]

# calculate_variables() keeps only isalpha() tokens and looks them up in lower
# case, so an entry can only ever match if it is a bare lower-case word. Each
# line is therefore normalised: anything after a '|' is dropped
# (NOTE(review): presumably a trailing annotation - confirm against the files),
# whitespace is trimmed, the word is lower-cased and blank lines are skipped.
stop_words = set()
for file in stop_words_files:
    with open(file, 'r', encoding='latin-1') as f:
        entries = (line.split('|')[0].strip().lower() for line in f.read().splitlines())
        stop_words.update(entry for entry in entries if entry)


In [10]:
import chardet

# Function to detect file encoding
def detect_encoding(file_path):
    """Guess a file's text encoding from its raw bytes.

    The whole file is read in binary mode and handed to ``chardet.detect``;
    the value stored under the ``'encoding'`` key of its result is returned.
    """
    with open(file_path, 'rb') as raw_file:
        raw_bytes = raw_file.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']

# ...

# Update the encoding for reading positive and negative words files
positive_encoding = detect_encoding("positive-words.txt")
negative_encoding = detect_encoding("negative-words.txt")

# Start from empty sets in this cell so it does not rely on another cell
# having created them, and so re-running it always yields the same contents.
positive_words = set()
negative_words = set()

# Entries are trimmed and lower-cased because calculate_variables() looks up
# lower-cased tokens; blank lines are skipped.
with open("positive-words.txt", 'r', encoding=positive_encoding) as f:
    positive_words.update(line.strip().lower() for line in f.read().splitlines() if line.strip())

with open("negative-words.txt", 'r', encoding=negative_encoding) as f:
    negative_words.update(line.strip().lower() for line in f.read().splitlines() if line.strip())


In [17]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import chardet

# 'punkt' provides the models behind word_tokenize / sent_tokenize.
nltk.download('punkt')
# NOTE(review): the cells shown here build their stop list from the
# StopWords_*.txt files and never use nltk's 'stopwords' corpus - confirm this
# download is still needed.
nltk.download('stopwords')


# Function to count syllables in a word
def syllable_count(word):
    """Estimate the number of syllables in ``word``.

    One syllable is counted for every vowel (a, e, i, o, u, y) that starts a
    run of consecutive vowels. One is subtracted for words ending in "es" or
    "ed", and every non-empty word reports at least one syllable.

    Args:
        word: The word to analyse; upper and lower case are treated alike.

    Returns:
        The estimated syllable count, or 0 for an empty string.
    """
    # An empty string has no characters to inspect.
    if not word:
        return 0

    vowels = "aeiouy"
    word = word.lower()
    count = 0

    # Count each vowel that is not directly preceded by another vowel
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    # Handling exceptions
    if word.endswith(("es", "ed")):
        count -= 1

    # At least one syllable is reported for any non-empty word
    return max(count, 1)


# Function to calculate variables
def calculate_variables(article_text):
    """Compute the sentiment and readability metrics for one article.

    "Words" below means the alphabetic tokens of the text, lower-cased, that
    are not in ``stop_words``.

    Args:
        article_text: Full text of the article as a single string.

    Returns:
        A 13-tuple, in this order: positive score, negative score, polarity
        score, subjectivity score, average sentence length (words per
        sentence), percentage of complex words, fog index, average number of
        words per sentence (same value as the average sentence length),
        complex word count, word count, syllables per word, personal pronoun
        count and average word length. Averages and percentages are 0.0 when
        the text yields no words or no sentences.
    """
    # Tokenize the text into words
    tokens = word_tokenize(article_text)

    # Count personal pronouns on the raw tokens, so the count does not depend
    # on whether the stop lists contain these pronouns, and so the upper-case
    # abbreviation "US" is not mistaken for the pronoun "us".
    personal_pronouns = {'i', 'we', 'my', 'ours', 'us'}
    personal_pronoun_count = sum(
        1 for token in tokens
        if token != 'US' and token.lower() in personal_pronouns
    )

    # Remove stop words
    words = [token.lower() for token in tokens
             if token.isalpha() and token.lower() not in stop_words]

    # Count total words
    total_words = len(words)

    # Count complex words (more than two syllables)
    complex_word_count = sum(1 for word in words if syllable_count(word) > 2)

    # Calculate positive and negative scores. Both are kept as non-negative
    # counts: the formulas below add them together, which is only meaningful
    # (polarity within [-1, 1], subjectivity >= 0) when neither is negative.
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)

    # Calculate polarity score (the small constant avoids division by zero)
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)

    # Calculate subjectivity score
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)

    # Calculate average sentence length
    total_sentences = len(sent_tokenize(article_text))
    avg_sentence_length = total_words / total_sentences if total_sentences else 0.0

    # Calculate average number of words per sentence
    avg_words_per_sentence = avg_sentence_length

    # Per-word averages are undefined for a text without words; report 0.0
    # instead of raising ZeroDivisionError.
    if total_words:
        percentage_complex_words = (complex_word_count / total_words) * 100
        avg_syllables_per_word = syllables_per_word(words)
        avg_word_length = sum(len(word) for word in words) / total_words
    else:
        percentage_complex_words = avg_syllables_per_word = avg_word_length = 0.0

    # Calculate Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    return (positive_score, negative_score, polarity_score, subjectivity_score,
            avg_sentence_length, percentage_complex_words, fog_index,
            avg_words_per_sentence, complex_word_count, total_words,
            avg_syllables_per_word, personal_pronoun_count, avg_word_length)

# ...


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dvaru\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dvaru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# Function to calculate syllables per word
def syllables_per_word(words):
    """Return the mean syllable count over ``words``.

    Args:
        words: Sequence of words; each is measured with ``syllable_count``.

    Returns:
        Total syllables divided by the number of words, or 0.0 when ``words``
        is empty (instead of raising ZeroDivisionError).
    """
    if not words:
        return 0.0
    syllable_sum = sum(syllable_count(word) for word in words)
    return syllable_sum / len(words)

# Columns filled for every article, in the order calculate_variables() returns them
metric_columns = ['POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
                  'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
                  'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
                  'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']

# Create any missing metric column up front so the sheet layout does not depend
# on which rows end up being processed.
for column in metric_columns:
    if column not in df.columns:
        df[column] = float('nan')

# Iterate through each row in the DataFrame
for index, row in df.iterrows():
    url_id = row['URL_ID']
    article_text_file_path = os.path.join(output_directory, f"{url_id}.txt")

    # The scraping step writes no file for a URL it could not extract, so a
    # missing file is expected; that row keeps empty metrics.
    if not os.path.exists(article_text_file_path):
        print(f"No extracted article for {url_id}; skipping.")
        continue

    # Read the article text from the file
    with open(article_text_file_path, 'r', encoding='utf-8') as file:
        article_text = file.read()

    # Calculate variables
    variables = calculate_variables(article_text)

    # Update the DataFrame with calculated variables
    df.loc[index, metric_columns] = variables



In [20]:
# Save the DataFrame to the output Excel file
# index=False keeps the pandas row index out of the sheet; the workbook is
# written with the openpyxl engine.
output_file = "Output Data Structure.xlsx"
df.to_excel(output_file, index=False, engine='openpyxl')

print(f"Data analysis complete. Results saved in {output_file}.")

Data analysis complete. Results saved in Output Data Structure.xlsx.
