<h2 style="color: blue;">Project: NLP Assignment</h2>


### Importing Necessary Libraries

In [11]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Read input data
input_data = pd.read_excel("Input.xlsx")

# Function to extract article title and text from URL
def extract_title_and_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract article title
    title = soup.title.string if soup.title else ""

    # Potential classes for the content div
    potential_classes = ['td-post-content tagdiv-type', 'td_block_wrap tdb_single_content tdi_130 td-pb-border-top td_block_template_1 td-post-content tagdiv-type']  # Add more classes as needed

    # Attempt to find the content div using different classes
    article_text = ""
    for class_name in potential_classes:
        article_body = soup.find('div', class_=class_name)
        if article_body:
            article_text = article_body.get_text()
            break  # Stop searching once content is found

    return title, article_text

# Loop through each row in the input data
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # Extract article title and text
    title, article_text = extract_title_and_text(url)

    # Save extracted title and text to a file with URL_ID as the file name
    with open(f"{url_id}.txt", "w", encoding="utf-8") as file:
        file.write(f"Title: {title}\n\nArticle Text: {article_text}")

    # Print a message if no content is found
    if not article_text.strip():
        print(f"Missing text for URL_ID: {url_id}")


Missing text for URL_ID: blackassign0036
Missing text for URL_ID: blackassign0049


#### Loading the Stopwords and MasterDictionary folders

In [32]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import os

# Specify the path to the folders
stop_words_folder = 'StopWords'
master_dict_folder = 'MasterDictionary'

# Load Stop Words Lists
stop_words_files = os.listdir(stop_words_folder)
stop_words_set = set()

for stop_words_file in stop_words_files:
    with open(os.path.join(stop_words_folder, stop_words_file), 'r', encoding='ISO-8859-1') as file:
        stop_words_set.update(file.read().splitlines())

# Load Positive and Negative words from Master Dictionary
positive_words_path = os.path.join(master_dict_folder, 'positive-words.txt')
negative_words_path = os.path.join(master_dict_folder, 'negative-words.txt')

positive_words = set(open(positive_words_path, 'r', encoding='ISO-8859-1').read().splitlines())
negative_words = set(open(negative_words_path, 'r', encoding='ISO-8859-1').read().splitlines())


#### Preparing analysis function 

In [33]:
def analyze_text(text):
    # Tokenize the text into sentences and words
    sentences = text.split('.')  # Assuming that '.' is used to denote the end of a sentence
    words = text.split()  # Splitting by space to get individual words


    # Remove stop words
    cleaned_words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words_set]

    # Calculate Positive Score
    positive_score = sum(1 for word in cleaned_words if word in positive_words)

    # Calculate Negative Score
    negative_score = sum(1 for word in cleaned_words if word in negative_words)

    # Calculate Polarity Score
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)

    # Calculate Subjectivity Score
    subjectivity_score = (positive_score + negative_score) / (len(cleaned_words) + 0.000001)

    # Calculate Average Sentence Length
    avg_sentence_length = len(words) / len(sentences)

    # Calculate Percentage of Complex Words
    complex_words = [word for word in cleaned_words if len(word) > 2]
    percentage_complex_words = len(complex_words) / len(cleaned_words) if len(cleaned_words) > 0 else 0

    # Calculate Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Calculate Average Number of Words Per Sentence
    avg_words_per_sentence = len(words) / len(sentences)

    # Calculate Complex Word Count
    complex_word_count = len(complex_words)

    # Calculate Word Count
    word_count = len(cleaned_words)

    # Calculate Syllable Per Word
    syllable_per_word = sum(syllable_count(word) for word in cleaned_words) / len(cleaned_words) if len(cleaned_words) > 0 else 0

    # Calculate Personal Pronouns Count
    personal_pronouns_count = sum(1 for word in cleaned_words if word.lower() in {'i', 'we', 'my', 'ours', 'us'})

    # Calculate Average Word Length
    avg_word_length = sum(len(word) for word in cleaned_words) / len(cleaned_words) if len(cleaned_words) > 0 else 0

    # Return the computed variables
    return {
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllable_per_word,
        'PERSONAL PRONOUNS': personal_pronouns_count,
        'AVG WORD LENGTH': avg_word_length
    }

def syllable_count(word):
    # A simple function to count syllables in a word
    vowels = "aeiouy"
    count = 0

    # Count consecutive vowels as one syllable
    for char in word:
        if char.lower() in vowels:
            count += 1

    # Adjust for silent 'e'
    if word.endswith('e') and count > 1:
        count -= 1

    return max(count, 1)


#### finding  variables scores for the first 2 text files 

In [34]:
import os

# Function to read the content of a file
def read_file_content(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# List of file names (adjust these based on your file names)
file_names = ["blackassign0001.txt", "blackassign0002.txt"]

# Loop through each file and analyze the text
for file_name in file_names:
    file_path = os.path.join(os.getcwd(), file_name)

    # Read the content of the file
    text_content = read_file_content(file_path)

    # Analyze the text and print the variables
    variables = analyze_text(text_content)
    print(f"Variables for {file_name}:")
    print(variables)
    print("\n" + "="*50 + "\n")


Variables for blackassign0001.txt:
{'POSITIVE SCORE': 30, 'NEGATIVE SCORE': 6, 'POLARITY SCORE': 0.6666666481481487, 'SUBJECTIVITY SCORE': 0.08017817354080585, 'AVG SENTENCE LENGTH': 15.5625, 'PERCENTAGE OF COMPLEX WORDS': 0.9933184855233853, 'FOG INDEX': 6.622327394209354, 'AVG NUMBER OF WORDS PER SENTENCE': 15.5625, 'COMPLEX WORD COUNT': 446, 'WORD COUNT': 449, 'SYLLABLE PER WORD': 2.4788418708240534, 'PERSONAL PRONOUNS': 0, 'AVG WORD LENGTH': 6.599109131403118}


Variables for blackassign0002.txt:
{'POSITIVE SCORE': 48, 'NEGATIVE SCORE': 23, 'POLARITY SCORE': 0.35211267109700467, 'SUBJECTIVITY SCORE': 0.11993243222984387, 'AVG SENTENCE LENGTH': 18.085365853658537, 'PERCENTAGE OF COMPLEX WORDS': 0.9983108108108109, 'FOG INDEX': 7.63347066578774, 'AVG NUMBER OF WORDS PER SENTENCE': 18.085365853658537, 'COMPLEX WORD COUNT': 591, 'WORD COUNT': 592, 'SYLLABLE PER WORD': 2.8175675675675675, 'PERSONAL PRONOUNS': 0, 'AVG WORD LENGTH': 7.298986486486487}




[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


#### finding scores  for all the variables

In [68]:
import os
import pandas as pd

# Function to read the content of a file
def read_file_content(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Function to extract URL from existing DataFrame based on URL_ID
def extract_url(url_id, existing_df):
    url_row = existing_df[existing_df['URL_ID'] == url_id]['URL']
    return url_row.iloc[0] if not url_row.empty else ''

# List all files in the current directory
all_files = [file for file in os.listdir() if file.endswith(".txt")]

# Create a list to store dictionaries for each file
data = []

# Read the existing Excel file
existing_df = pd.read_excel("Output Data Structure.xlsx")

# Loop through each file and analyze the text
for file_name in all_files:
    file_path = os.path.join(os.getcwd(), file_name)

    # Read the content of the file
    text_content = read_file_content(file_path)

    # Extract URL_ID from the file name
    url_id = file_name.split('.')[0]

    # Extract URL from the existing DataFrame
    url = extract_url(url_id, existing_df)

    # Analyze the text
    variables = analyze_text(text_content)

    # Create a dictionary for the current file
    file_data = {'URL_ID': url_id, 'URL': url, **variables}

    # Append the dictionary to the list
    data.append(file_data)



In [69]:
pd.DataFrame(data)

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,30,6,0.666667,0.080178,15.562500,0.993318,6.622327,15.562500,446,449,2.478842,0,6.599109
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,48,23,0.352113,0.119932,18.085366,0.998311,7.633471,18.085366,591,592,2.817568,0,7.298986
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,34,20,0.259259,0.112500,18.964912,0.995833,7.984298,18.964912,478,480,3.014583,0,8.089583
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,29,62,-0.362637,0.195699,20.557692,0.997849,8.622217,20.557692,464,465,3.043011,0,8.051613
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,18,7,0.440000,0.085616,17.024390,1.000000,7.209756,17.024390,292,292,2.630137,0,7.383562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...,26,50,-0.315789,0.156057,21.377358,1.000000,8.950943,21.377358,487,487,2.704312,0,7.275154
96,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...,20,32,-0.230769,0.151163,27.325000,0.985465,11.324186,27.325000,339,344,2.502907,0,6.680233
97,blackassign0098,https://insights.blackcoffer.com/contribution-...,5,3,0.250000,0.043011,16.040000,1.000000,6.816000,16.040000,186,186,2.650538,0,7.198925
98,blackassign0099,https://insights.blackcoffer.com/how-covid-19-...,15,2,0.764706,0.071730,17.828571,0.966245,7.517926,17.828571,229,237,2.345992,0,6.417722


In [74]:
pd.DataFrame(data).to_excel("Output Data Structure.xlsx", index=False)