In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
import re
import os
# Ensure required NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Load the input and output structure Excel files
output_structure_path ="C:\\Users\\mryad\\Downloads\\Output Data Structure (1).xlsx"
input_path="C:\\Users\\mryad\\Downloads\\Input (1).xlsx"
output_structure_df = pd.read_excel(output_structure_path)
input_df=pd.read_excel(input_path)

directory ="C:\\Users\\mryad\\Downloads\\URL_ID"
def load_text_files(directory):
    text_data = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                text = file.read()
                text_data.append({'filename': filename, 'text': text})
    return pd.DataFrame(text_data)
# Load the text files into a DataFrame
text_df = load_text_files(directory)
print(text_df.head())
# Define positive and negative word lists (these should be more comprehensive in a real use case)
negative_words ="C:\\Users\\mryad\\Downloads\\MasterDictionary-20240619T101032Z-001\\MasterDictionary\\negative-words.txt"
positive_words ="C:\\Users\\mryad\\Downloads\\MasterDictionary-20240619T101032Z-001\\MasterDictionary\\positive-words.txt"





# Function to count syllables in a word
def count_syllables(word):
    word = word.lower()
    vowels = "aeiouy"
    count = 0
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if word.endswith("le") and len(word) > 2 and word[-3] not in vowels:
        count += 1
    if count == 0:
        count += 1
    return count

# Function to extract article content
def extract_article(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract the article title
            title_tag = soup.find('h1')
            title = title_tag.get_text(strip=True) if title_tag else 'No Title'
            
            # Extract the article text
            paragraphs = soup.find_all('p')
            article_text = "\n".join([para.get_text(strip=True) for para in paragraphs])
            
            return title, article_text
        else:
            return None, None
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None, None

# Function to perform textual analysis
def analyze_text(text):
    # Tokenize sentences and words
    sentences = nltk.sent_tokenize(text)
    words = nltk.word_tokenize(text)
    num_sentences = len(sentences)
    num_words = len(words)
    
    # Count positive and negative words
    pos_count = sum(1 for word in words if word.lower() in positive_words)
    neg_count = sum(1 for word in words if word.lower() in negative_words)
    
    # Calculate polarity and subjectivity
    polarity_score = (pos_count - neg_count) / ((pos_count + neg_count) + 0.000001)
    subjectivity_score = (pos_count + neg_count) / (num_words + 0.000001)
    
    # Calculate average sentence length
    avg_sentence_length = num_words / num_sentences
    
    # Count complex words
    complex_words = [word for word in words if count_syllables(word) >= 3]
    num_complex_words = len(complex_words)
    
    # Calculate percentage of complex words
    perc_complex_words = (num_complex_words / num_words) * 100
    
    # Calculate fog index
    fog_index = 0.4 * (avg_sentence_length + perc_complex_words)
    
    # Count syllables per word
    total_syllables = sum(count_syllables(word) for word in words)
    syllable_per_word = total_syllables / num_words
    
    # Count personal pronouns
    personal_pronouns = re.findall(r'\b(I|we|my|ours|us)\b', text, re.I)
    num_personal_pronouns = len(personal_pronouns)
    
    # Calculate average word length
    total_characters = sum(len(word) for word in words)
    avg_word_length = total_characters / num_words
    
    return {
        'POSITIVE SCORE': pos_count,
        'NEGATIVE SCORE': neg_count,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': perc_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
        'COMPLEX WORD COUNT': num_complex_words,
        'WORD COUNT': num_words,
        'SYLLABLE PER WORD': syllable_per_word,
        'PERSONAL PRONOUNS': num_personal_pronouns,
        'AVG WORD LENGTH': avg_word_length
    }

# Create a list to store the results
results_list = []

# Iterate over each row in the input dataframe
for index, row in input_df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    
    title, article_text = extract_article(url)
    if title and article_text:
        content = f"{title}\n\n{article_text}"
        
        # Perform textual analysis
        analysis_results = analyze_text(article_text)
        
        # Append the results to the list
        results_list.append({
            'URL_ID': url_id,
            'Title': title,
            'Article': article_text,
            **analysis_results
        })
        
        print(f"Article {'text'} analyzed successfully.")
    else:
        print(f"Failed to extract article for URL_ID {'text'}.")

# Create a DataFrame from the results list
results_df = pd.DataFrame(results_list)

# Ensure the results DataFrame matches the output structure
for column in output_structure_df.columns:
    if column not in results_df.columns:
        results_df[column] = None

# Reorder the DataFrame columns to match the output structure
results_df = results_df[output_structure_df.columns]

# Save the results to an Excel file
output_file_path ="C:\\Users\\mryad\\Downloads\\text_analysis_exel.xlsx"
results_df.to_excel(output_file_path, index=False)

print("Textual analysis completed and results saved.")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mryad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mryad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              filename                                               text
0  blackassign0001.txt  Rising IT cities and its impact on the economy...
1  blackassign0002.txt  Rising IT Cities and Their Impact on the Econo...
2  blackassign0003.txt  Internet Demand’s Evolution, Communication Imp...
3  blackassign0004.txt  Rise of Cybercrime and its Effect in upcoming ...
4  blackassign0005.txt  OTT platform and its impact on the entertainme...
Article text analyzed successfully.
Article text analyzed successfully.
Article text analyzed successfully.
Article text analyzed successfully.
Article text analyzed successfully.
Article text analyzed successfully.
Article text analyzed successfully.
Article text analyzed successfully.
Article text analyzed successfully.
Article text analyzed successfully.
Article text analyzed successfully.
Article text analyzed successfully.
Article text analyzed successfully.
Article text analyzed successfully.
Article text analyzed successfully.
Article text ana