In [14]:
# Importing Libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import re
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob


# Load the data from 'input.xlsx' and initialize new columns with None values for sentiment analysis and readability metrics


In [15]:
# Load the input sheet, then add one empty (None-filled) column per metric
# that the later cells are meant to compute.
input_file = 'input.xlsx'
df = pd.read_excel(input_file)

metric_columns = [
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH',
]
for metric_column in metric_columns:
    df[metric_column] = None

# Show (rows, columns) as the cell output
df.shape

(114, 15)

# Define a function to extract article text from a URL, process an Excel file, and save extracted articles as text files.


In [16]:
# Function to extract article text from a URL
def extract_article_text(url, timeout=30):
    """Fetch a web page and return its title and <article> paragraph text.

    Args:
        url: Address of the page to download.
        timeout: Value passed to ``requests.get`` as ``timeout`` so that a
            stalled server cannot block the whole run. Defaults to 30.

    Returns:
        A ``(title, article_text)`` tuple.

        * On an HTTP 200 response, ``title`` is the text of the page's
          ``<title>`` tag ('' when the page has none) and ``article_text`` is
          the text of every ``<p>`` inside the first ``<article>`` element,
          each followed by a newline ('' when there is no ``<article>``).
        * ``(None, None)`` when the status code is not 200 or when fetching
          or parsing raises.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return None, None

        soup = BeautifulSoup(response.text, 'html.parser')

        # A page without a <title> tag must not discard its article text.
        title = soup.title.text if soup.title is not None else ''

        # You may need to adjust this based on the HTML structure of the web pages.
        article = soup.find('article')
        if article is None:
            return title, ''

        article_text = ''.join(p.get_text() + '\n' for p in article.find_all('p'))
        return title, article_text
    except Exception as e:
        # Deliberately best-effort: one bad URL must not abort the batch, but
        # report the reason instead of swallowing it silently.
        print(f'Could not extract {url}: {e}')
        return None, None

# Load the sheet listing the URLs to scrape
input_file = 'input.xlsx'
df = pd.read_excel(input_file)

# Folder that receives one text file per URL_ID
output_directory = 'article_texts'
os.makedirs(output_directory, exist_ok=True)

# Bookkeeping for pages that could not be scraped
failed_extraction_count = 0
failed_urls = []

# Walk the sheet row by row and scrape each URL
for index, row in df.iterrows():
    url = row['URL']
    url_id = row['URL_ID']

    try:
        title, article_text = extract_article_text(url)
        file_name = os.path.join(output_directory, f'{url_id}.txt')

        if title is None or not article_text:
            # Nothing usable came back: leave an empty placeholder file for
            # this URL_ID and record the failure.
            with open(file_name, 'w', encoding='utf-8') as file:
                file.write('')
            print(f'Failed to extract or save article from {url_id}')
            failed_extraction_count += 1
            failed_urls.append((url_id, url))
            continue

        # Title header first, then the article body
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write(f'Title: {title}\n\n')
            file.write(article_text)
        print(f'Saved {file_name}')
    except Exception as e:
        # Any error while scraping or writing counts as a failed URL
        print(f'Error for {url_id}: {str(e)}')
        failed_extraction_count += 1
        failed_urls.append((url_id, url))

# Rows for failed URLs are not removed; the sheet is written back as loaded.
output_file = 'modified_input.xlsx'
df.to_excel(output_file, index=False)

# Summary of the run
print(f'Extraction and saving completed. {failed_extraction_count} URLs were not extracted successfully.')
print('Failed URLs:')
for url_id, url in failed_urls:
    print(f'URL_ID: {url_id}, URL: {url}')

Saved article_texts\123.0.txt
Saved article_texts\321.0.txt
Saved article_texts\2345.0.txt
Saved article_texts\4321.0.txt
Saved article_texts\432.0.txt
Saved article_texts\2893.8.txt
Saved article_texts\3355.6.txt
Saved article_texts\3817.4.txt
Saved article_texts\4279.2.txt
Saved article_texts\4741.0.txt
Saved article_texts\5202.8.txt
Saved article_texts\5664.6.txt
Saved article_texts\6126.4.txt
Saved article_texts\6588.2.txt
Saved article_texts\7050.0.txt
Saved article_texts\7511.8.txt
Saved article_texts\7973.6.txt
Saved article_texts\8435.4.txt
Saved article_texts\8897.2.txt
Saved article_texts\9359.0.txt
Saved article_texts\9820.8.txt
Saved article_texts\10282.6.txt
Saved article_texts\10744.4.txt
Saved article_texts\11206.2.txt
Failed to extract or save article from 11668.0
Saved article_texts\12129.8.txt
Saved article_texts\12591.6.txt
Saved article_texts\13053.4.txt
Saved article_texts\13515.2.txt
Saved article_texts\13977.0.txt
Saved article_texts\14438.8.txt
Saved article_tex

# Download NLTK data, define a function to clean and preprocess text, and clean text files in a directory.


In [17]:
# Download the NLTK resources used by the cleaning and analysis cells
# (NLTK skips packages that are already up to date).
#   'stopwords' -> stopwords.words('english')
#   'punkt'     -> tokenizer model used by older NLTK releases
#   'punkt_tab' -> tokenizer tables that recent NLTK releases load instead of
#                  'punkt'; without it sent_tokenize / word_tokenize raise
#                  LookupError on a fresh environment.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Function to clean and preprocess text
def clean_text(text):
    """Lowercase text, strip markup, tokenize it and drop English stop words.

    Args:
        text: Raw text to clean.

    Returns:
        The remaining tokens joined by single spaces. Punctuation tokens
        produced by the tokenizer are not removed.
    """
    # Lowercase everything up front; the tokens therefore need no second
    # lowercasing pass.
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove '#' characters (the only special character stripped here)
    text = text.replace('#', '')

    # Tokenize into words and filter out NLTK's English stop words
    stop_words = set(stopwords.words('english'))
    kept_words = [word for word in nltk.word_tokenize(text) if word not in stop_words]

    return ' '.join(kept_words)

# Folder holding the scraped article files
directory = 'article_texts'

# Snapshot of the folder contents
files = os.listdir(directory)

# Overwrite every .txt file with its cleaned text
for file in files:
    if not file.endswith(".txt"):
        continue

    file_path = os.path.join(directory, file)

    # Load the current content
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Run it through the cleaner
    cleaned_text = clean_text(text)

    # Write the result back over the same file
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)

    print(f'Cleaned and saved: {file}')

print('Text cleaning for all files is completed.')


Cleaned and saved: 10282.6.txt
Cleaned and saved: 10744.4.txt
Cleaned and saved: 11206.2.txt
Cleaned and saved: 11668.0.txt
Cleaned and saved: 12129.8.txt
Cleaned and saved: 123.0.txt
Cleaned and saved: 12591.6.txt
Cleaned and saved: 13053.4.txt
Cleaned and saved: 13515.2.txt
Cleaned and saved: 13977.0.txt
Cleaned and saved: 14438.8.txt


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tunky\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tunky\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Cleaned and saved: 14900.6.txt
Cleaned and saved: 15362.4.txt
Cleaned and saved: 15824.2.txt
Cleaned and saved: 16286.0.txt
Cleaned and saved: 16747.8.txt
Cleaned and saved: 17209.6.txt
Cleaned and saved: 17671.4.txt
Cleaned and saved: 18133.2.txt
Cleaned and saved: 18595.0.txt
Cleaned and saved: 19056.8.txt
Cleaned and saved: 19518.6.txt
Cleaned and saved: 19980.4.txt
Cleaned and saved: 20442.2.txt
Cleaned and saved: 20904.0.txt
Cleaned and saved: 21365.8.txt
Cleaned and saved: 21827.6.txt
Cleaned and saved: 22289.4.txt
Cleaned and saved: 22751.2.txt
Cleaned and saved: 23213.0.txt
Cleaned and saved: 2345.0.txt
Cleaned and saved: 23674.8.txt
Cleaned and saved: 24136.6.txt
Cleaned and saved: 24598.4.txt
Cleaned and saved: 25060.2.txt
Cleaned and saved: 25522.0.txt
Cleaned and saved: 25983.8.txt
Cleaned and saved: 26445.6.txt
Cleaned and saved: 26907.4.txt
Cleaned and saved: 27369.2.txt
Cleaned and saved: 27831.0.txt
Cleaned and saved: 28292.8.txt
Cleaned and saved: 28754.6.txt
Cleaned a

# Process text files in a directory, perform sentiment analysis and readability metrics, merge data with 'Input.xlsx', and save the result.


In [21]:

# Collect one dictionary of calculated values per text file
data = []

folder_path = 'article_texts'

# Metric columns, in the order they appear in the output sheet
metric_columns = [
    'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
    'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH',
]

personal_pronouns = ['i', 'me', 'my', 'mine', 'myself', 'you', 'your', 'yours', 'yourself', 'he', 'him', 'his', 'himself',
                     'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'we', 'us', 'our', 'ours', 'ourselves', 'they', 'them', 'their', 'theirs', 'themselves']
_pronoun_lookup = frozenset(personal_pronouns)


def _count_syllables(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Tokens of three characters or fewer count as one syllable. Otherwise each
    vowel ('aeiouy') that starts the word or follows a non-vowel adds one, a
    trailing 'e' subtracts one, a trailing 'le' adds one back, and the result
    is never less than one.
    """
    if len(word) <= 3:
        return 1
    vowels = "aeiouy"
    count = 1 if word[0] in vowels else 0
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if word.endswith("le"):
        count += 1
    if count == 0:
        count += 1
    return count


def _compute_metrics(text):
    """Return a dict mapping every name in ``metric_columns`` to its value for ``text``.

    Empty or whitespace-only text (and text that yields no tokens or no
    sentences) gets the string 'NA' for every metric instead of dividing by zero.
    """
    if not text.strip():
        return dict.fromkeys(metric_columns, 'NA')

    # Every token returned by word_tokenize is counted as a word.
    words = nltk.word_tokenize(text)
    sentences = nltk.sent_tokenize(text)
    word_count = len(words)
    sentence_count = len(sentences)
    if word_count == 0 or sentence_count == 0:
        return dict.fromkeys(metric_columns, 'NA')

    # Sentiment analysis
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    subjectivity = blob.sentiment.subjectivity

    # Readability
    avg_sentence_length = word_count / sentence_count
    syllables = [_count_syllables(word) for word in words]
    complex_word_count = sum(1 for n in syllables if n > 2)
    percentage_complex_words = (complex_word_count / word_count) * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Per-word average; the column previously received the total syllable count.
    syllables_per_word = sum(syllables) / word_count
    avg_word_length = sum(len(word) for word in words) / word_count

    # NOTE(review): the files were rewritten by the cleaning cell after
    # stop-word removal, so pronouns have likely been stripped already —
    # confirm whether this count should run on the raw text instead.
    personal_pronoun_count = sum(1 for word in words if word in _pronoun_lookup)

    return {
        'POSITIVE SCORE': max(0, polarity),
        'NEGATIVE SCORE': max(0, -polarity),
        'POLARITY SCORE': polarity,
        'SUBJECTIVITY SCORE': subjectivity,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        # Same quantity as AVG SENTENCE LENGTH
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllables_per_word,
        'PERSONAL PRONOUNS': personal_pronoun_count,
        'AVG WORD LENGTH': avg_word_length,
    }


def _url_id_sort_key(ids):
    """Sort key for URL_ID: numeric when every id parses as a number, else the strings as-is."""
    numeric = pd.to_numeric(ids, errors='coerce')
    return ids if numeric.isna().any() else numeric


# Process each text file in the folder
for filename in os.listdir(folder_path):
    if not filename.endswith('.txt'):
        continue

    # The file name without its extension is the URL_ID
    file_name = os.path.splitext(filename)[0]
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    values = {'URL_ID': file_name}
    values.update(_compute_metrics(text))

    # Append the dictionary to the data list
    data.append(values)

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data)

# Read the 'URL' data from the input sheet. The name is spelled as in the
# earlier cells; on a case-sensitive filesystem the two spellings would not
# both resolve.
input_file = 'input.xlsx'
url_data = pd.read_excel(input_file)

# Convert 'URL_ID' to string so it matches the ids taken from the file names
url_data['URL_ID'] = url_data['URL_ID'].astype(str)

# Merge the 'URL' data with the metrics based on 'URL_ID'
df = df.merge(url_data, on='URL_ID', how='left')

# Reorder the columns to place 'URL' between 'URL_ID' and the metrics
df = df[['URL_ID', 'URL'] + metric_columns]

# Sort on 'URL_ID' in ascending order. The ids are strings here, so a plain
# sort would be lexicographic ('10282.6' before '123.0'); the key sorts them
# by numeric value when they are all numeric.
df_sorted = (
    df.sort_values(by='URL_ID', ascending=True, key=_url_id_sort_key)
      .reset_index(drop=True)
)

# Save the DataFrame to an Excel file
output_file = 'Final_Answer.xlsx'
df_sorted.to_excel(output_file, index=False, engine='openpyxl')

print(f'Dataframe sorted and saved to {output_file}.')


Dataframe sorted and saved to Final_Answer.xlsx.
