In [13]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import glob

# Ensure the necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91638\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91638\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# Load default NLTK stop words
stop_words = set(stopwords.words('english'))

# Load additional stop words from provided files
stop_words_files = [
    'C:/Users/91638/Downloads/StopWords_Names.txt', 
    'C:/Users/91638/Downloads/StopWords_Geographic.txt', 
    'C:/Users/91638/Downloads/StopWords_GenericLong.txt', 
    'C:/Users/91638/Downloads/StopWords_Generic.txt', 
    'C:/Users/91638/Downloads/StopWords_DatesandNumbers.txt',
    'C:/Users/91638/Downloads/StopWords_Currencies.txt',
    'C:/Users/91638/Downloads/StopWords_Auditor.txt'
]

for stop_words_file in stop_words_files:
    if os.path.exists(stop_words_file):
        with open(stop_words_file, 'r') as file:
            additional_stop_words = file.read().split()
            stop_words.update(additional_stop_words)
    else:
        print(f"Stop words file {stop_words_file} not found.")

# Load positive and negative words from Master Dictionary
try:
    positive_words = set(open('C:/Users/91638/Downloads//positive-words.txt').read().split())
    negative_words = set(open('C:/Users/91638/Downloads//negative-words.txt').read().split())
except FileNotFoundError:
    print("The positive or negative words file is not found. Please check the path.")
    positive_words = set()
    negative_words = set()


In [16]:
# Load URLs from Input.xlsx
input_df = pd.read_excel('C:/Users/91638/Downloads/Input.xlsx')

# Function to extract article text
def extract_article_text(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract title and content (adjust selectors based on actual HTML structure)
        title = soup.find('h1').get_text(strip=True)
        paragraphs = soup.find_all('p')
        content = ' '.join([para.get_text(strip=True) for para in paragraphs])
        
        return title, content
    except Exception as e:
        print(f"Failed to extract article from {url}: {e}")
        return None, None


In [17]:
# Directory to save text files
os.makedirs('articles', exist_ok=True)

# Extract and save articles
for index, row in input_df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    title, content = extract_article_text(url)
    
    if title and content:
        with open(f'articles/{url_id}.txt', 'w', encoding='utf-8') as file:
            file.write(title + '\n' + content)
    else:
        print(f"Skipping {url_id} due to extraction issues.")


In [18]:
# Function to clean text
def clean_text(text):
    words = word_tokenize(text)
    return [word for word in words if word.lower() not in stop_words and word.isalnum()]

# Function to count syllables in a word
def count_syllables(word):
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if word.endswith("le") and len(word) > 2 and word[-3] not in vowels:
        count += 1
    if count == 0:
        count += 1
    return count

# Function to compute variables
def compute_variables(text):
    cleaned_words = clean_text(text)
    sentences = sent_tokenize(text)
    
    word_count = len(cleaned_words)
    positive_score = sum(1 for word in cleaned_words if word in positive_words)
    negative_score = sum(1 for word in cleaned_words if word in negative_words)
    
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)
    
    avg_sentence_length = word_count / len(sentences)
    complex_words = [word for word in cleaned_words if count_syllables(word) > 2]
    percentage_complex_words = len(complex_words) / word_count
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    
    avg_word_length = sum(len(word) for word in cleaned_words) / word_count
    syllables_per_word = sum(count_syllables(word) for word in cleaned_words) / word_count
    personal_pronouns = sum(1 for word in cleaned_words if re.match(r'\b(I|we|my|ours|us)\b', word, re.IGNORECASE))
    
    return {
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
        'COMPLEX WORD COUNT': len(complex_words),
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllables_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length
    }


In [19]:
# Analyze text files and compute variables
output_data = []

text_files = glob.glob('articles/*.txt')
for file in text_files:
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()
        variables = compute_variables(text)
        url_id = os.path.splitext(os.path.basename(file))[0]
        row = [url_id] + list(variables.values())
        output_data.append(row)

# Save results to Output Data Structure.xlsx
output_columns = ['URL_ID', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']
output_df = pd.DataFrame(output_data, columns=output_columns)
output_df.to_excel('Downloads/Output Data Structure.xlsx', index=False)


In [25]:
# Check the content of a sample file
sample_file = 'articles/bctech2011.txt'

with open(sample_file, 'r', encoding='utf-8') as file:
    content = file.read()

print("Sample File Content:")
print(content[:1000])  # Print the first 1000 characters for inspection





Sample File Content:
ML and AI-based insurance premium model to predict premium to be charged by the insurance company
Healthcare AI ChatBot using LLAMA, LLM, Langchain Efficient Supply Chain Assessment: Overcoming Technical Hurdles for Web Application Development Streamlined Integration: Interactive Brokers API with Python for Desktop Trading Application Efficient Data Integration and User-Friendly Interface Development: Navigating Challenges in Web Application Deployment AI Chatbot using LLM, Langchain, LLama AI Bot Audio to audio Methodology for ETL Discovery Tool using LLMA, OpenAI, Langchain Methodology for database discovery tool using openai, LLMA, Langchain Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040. Rising IT Cities and Their Impact on the Economy, Environment, Infrastructure, and City Life in Future Internet Demand’s Evolution, Communication Impact, and 2035’s Alternative Pathways Rise of Cybercrime and its Effe

In [37]:
import pandas as pd
from IPython.display import display

# Set Pandas display options to show more rows
pd.set_option('display.max_rows', None)  # Show all rows

# Assuming 'output_df' is your DataFrame
display(output_df)

# Save DataFrame to Excel if needed
output_df.to_excel('Downloads/Output Data Structure.xlsx', index=False)

print("Output file saved as 'Output Data Structure.xlsx' in Downloads folder.")



Unnamed: 0,URL_ID,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,bctech2011,34,8,0.619048,0.074074,23.625,0.486772,9.644709,23.625,276,567,2.578483,0,7.601411
1,bctech2012,4,0,1.0,0.021164,27.0,0.407407,10.962963,27.0,77,189,2.465608,0,7.444444
2,bctech2013,4,0,1.0,0.02139,26.714286,0.433155,10.858976,26.714286,81,187,2.486631,0,7.475936
3,bctech2014,4,0,1.0,0.021164,27.0,0.428571,10.971429,27.0,81,189,2.497354,0,7.470899
4,bctech2015,4,0,1.0,0.021053,27.142857,0.431579,11.029774,27.142857,82,190,2.494737,0,7.489474
5,bctech2016,4,0,1.0,0.021505,26.571429,0.430108,10.800614,26.571429,80,186,2.494624,0,7.505376
6,bctech2017,4,0,1.0,0.021053,27.142857,0.415789,11.023459,27.142857,79,190,2.447368,0,7.389474
7,bctech2018,4,0,1.0,0.021164,27.0,0.417989,10.967196,27.0,79,189,2.465608,0,7.433862
8,bctech2019,4,0,1.0,0.020202,28.285714,0.424242,11.483983,28.285714,84,198,2.479798,0,7.424242
9,bctech2020,4,0,1.0,0.021164,27.0,0.433862,10.973545,27.0,82,189,2.492063,0,7.492063


Output file saved as 'Output Data Structure.xlsx' in Downloads folder.
