## Data Extraction from URLs

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Load the input.xlsx file
file_path = '/Users/veer/Downloads/Input.xlsx'
df = pd.read_excel(file_path)

In [3]:
print(df)

     URL_ID                                                URL
0        37  https://insights.blackcoffer.com/ai-in-healthc...
1        38  https://insights.blackcoffer.com/what-if-the-c...
2        39  https://insights.blackcoffer.com/what-jobs-wil...
3        40  https://insights.blackcoffer.com/will-machine-...
4        41  https://insights.blackcoffer.com/will-ai-repla...
..      ...                                                ...
109     146  https://insights.blackcoffer.com/blockchain-fo...
110     147  https://insights.blackcoffer.com/the-future-of...
111     148  https://insights.blackcoffer.com/big-data-anal...
112     149  https://insights.blackcoffer.com/business-anal...
113     150  https://insights.blackcoffer.com/challenges-an...

[114 rows x 2 columns]


In [33]:
# Iterate over each row in the dataframe
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # Send an HTTP GET request to the URL and retrieve the webpage content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, "lxml")

    # Find the article title and text
    article_title_elem = soup.find('h1')
    article_text_elem = soup.find('article')

    # Check if the elements are found before accessing their text
    if article_title_elem and article_text_elem:
        article_title = article_title_elem.get_text()
        article_text = article_text_elem.get_text()

        # Save the extracted article in a text file
        file_name = f"{url_id}.txt"
        with open(file_name, 'w') as file:
            file.write(article_title + '\n')
            file.write(article_text)

        print(f"Article extracted and saved: {file_name}")
    else:
        print(f"Error: Article elements not found for URL_ID: {url_id}")

Article extracted and saved: 37.txt
Article extracted and saved: 38.txt
Article extracted and saved: 39.txt
Article extracted and saved: 40.txt
Article extracted and saved: 41.txt
Article extracted and saved: 42.txt
Article extracted and saved: 43.txt
Error: Article elements not found for URL_ID: 44
Article extracted and saved: 45.txt
Article extracted and saved: 46.txt
Article extracted and saved: 47.txt
Article extracted and saved: 48.txt
Article extracted and saved: 49.txt
Article extracted and saved: 50.txt
Article extracted and saved: 51.txt
Article extracted and saved: 52.txt
Article extracted and saved: 53.txt
Article extracted and saved: 54.txt
Article extracted and saved: 55.txt
Article extracted and saved: 56.txt
Error: Article elements not found for URL_ID: 57
Article extracted and saved: 58.txt
Article extracted and saved: 59.txt
Article extracted and saved: 60.txt
Article extracted and saved: 61.txt
Article extracted and saved: 62.txt
Article extracted and saved: 63.txt
Ar

## Data Clean Up

In [4]:
import os

# Define the path to the "StopWords" folder
stop_words_folder = 'StopWords'
# Load stop words from each .txt file in the "StopWords" folder
stop_words = set()
for file_name in os.listdir(stop_words_folder):
    if file_name.endswith('.txt'):
        file_path = os.path.join(stop_words_folder, file_name)
        with open(file_path, 'r', encoding='latin-1') as file:
            words = file.read().splitlines()
            stop_words.update(words)

In [5]:
# Create a new set to store the split words
stop_words_split = set()

# Split words in stop_words set
for word in stop_words:
    if '|' in word:
        split_words = [w.strip() for w in word.split('|')]
        stop_words_split.update(split_words)
    else:
        stop_words_split.add(word)

# Update the stop_words set with the split words
stop_words = stop_words_split

In [7]:
len(stop_words)

12992

In [8]:
import nltk
from nltk.tokenize import word_tokenize
import re

# Initialize a dictionary to store cleaned tokens for each file
cleaned_file_tokens = {}

# Iterate over each text file
for index, row in df.iterrows():
    url_id = row['URL_ID']
    file_name = f"{url_id}.txt"
    
    # Check if the file exists
    if not os.path.isfile(file_name):
        print(f"Error: File not found for URL_ID: {url_id}")
        continue
    
    # Read the contents of the text file
    with open(file_name, 'r') as file:
        text = file.read()
    
    # Tokenize the text using NLTK
    tokens = word_tokenize(text)
    
    # Clean the tokens by removing stop words and special characters
    cleaned_tokens = [re.sub(r'[^a-zA-Z0-9]', '', token) for token in tokens if token.lower() not in stop_words]
    
    # Remove empty tokens
    cleaned_tokens = [token for token in cleaned_tokens if token]
    
    # Store the cleaned tokens in the dictionary with the file name as the key
    cleaned_file_tokens[file_name] = cleaned_tokens

Error: File not found for URL_ID: 44
Error: File not found for URL_ID: 57
Error: File not found for URL_ID: 144


In [9]:
# Load the positive and negative dictionaries
positive_words = set()
negative_words = set()

positive_file = "MasterDictionary/positive-words.txt"
negative_file = "MasterDictionary/negative-words.txt"

encodings = ['utf-8-sig', 'latin1', 'utf-16', 'utf-32']

for encoding in encodings:
    try:
        with open(positive_file, 'r', encoding=encoding) as file:
            positive_words = set(file.read().splitlines())

        with open(negative_file, 'r', encoding=encoding) as file:
            negative_words = set(file.read().splitlines())

        break  # Break out of the loop if the files are successfully read
    except UnicodeDecodeError:
        continue  # Continue to the next encoding if an error occurs

In [10]:
print(len(positive_words))

2006


In [11]:
print(len(negative_words))

4783


In [12]:
# Calculate sentiment analysis scores for individual tokens
sentiment_scores = {}

for file_name, tokens in cleaned_file_tokens.items():
    positive_score = 0
    negative_score = 0

    for token in tokens:
        if token in positive_words:
            positive_score += 1
        if token in negative_words:
            negative_score += 1

    total_words = len(tokens)
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)

    sentiment_scores[file_name] = {
        'Positive Score': positive_score,
        'Negative Score': negative_score,
        'Polarity Score': polarity_score,
        'Subjectivity Score': subjectivity_score
    }

In [13]:
sentiment_scores

{'37.txt': {'Positive Score': 64,
  'Negative Score': 33,
  'Polarity Score': 0.3195876255712616,
  'Subjectivity Score': 0.08326180250363793},
 '38.txt': {'Positive Score': 57,
  'Negative Score': 37,
  'Polarity Score': 0.2127659551833409,
  'Subjectivity Score': 0.12929848675474762},
 '39.txt': {'Positive Score': 66,
  'Negative Score': 35,
  'Polarity Score': 0.3069306900303892,
  'Subjectivity Score': 0.10171198378478148},
 '40.txt': {'Positive Score': 58,
  'Negative Score': 26,
  'Polarity Score': 0.3809523764172336,
  'Subjectivity Score': 0.09976247019030586},
 '41.txt': {'Positive Score': 52,
  'Negative Score': 25,
  'Polarity Score': 0.35064934609546305,
  'Subjectivity Score': 0.08071278817535348},
 '42.txt': {'Positive Score': 49,
  'Negative Score': 24,
  'Polarity Score': 0.3424657487333459,
  'Subjectivity Score': 0.10209790195510783},
 '43.txt': {'Positive Score': 29,
  'Negative Score': 12,
  'Polarity Score': 0.4146341362284357,
  'Subjectivity Score': 0.08779443236

In [44]:
import pandas as pd

# Create a list to store the sentiment analysis scores
output_data = []

# Iterate over the sentiment scores dictionary
for file_name, scores in sentiment_scores.items():
    url_id = file_name[:-4]  # Extract URL_ID from the file name
    url = df.loc[df['URL_ID'] == int(url_id), 'URL'].values[0]  # Get the corresponding URL from the original DataFrame

    # Append the scores to the output data list as a dictionary
    output_data.append({
        'URL_ID': url_id,
        'URL': url,
        'Positive Score': scores['Positive Score'],
        'Negative Score': scores['Negative Score'],
        'Polarity Score': scores['Polarity Score'],
        'Subjectivity Score': scores['Subjectivity Score']
    })

# Convert the output data list to a DataFrame
output_df = pd.DataFrame.from_records(output_data)

# Save the DataFrame to an Excel file
output_file = 'ODS.xlsx'
output_df.to_excel(output_file, index=False)

print(f"Sentiment analysis scores saved to {output_file}")


Sentiment analysis scores saved to ODS.xlsx


In [14]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [15]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/veer/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Readability 

In [16]:
import nltk
from nltk.corpus import stopwords
import os
import re

In [22]:
import nltk
import os
from nltk.corpus import stopwords
import string
import re

nltk.download('stopwords')

# Function to calculate the Average Sentence Length
def calculate_average_sentence_length(text):
    sentences = nltk.sent_tokenize(text)
    total_sentences = len(sentences)
    total_words = 0
    
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        cleaned_words = [word.lower() for word in words if word.lower() not in stop_words and word.lower() not in string.punctuation]
        total_words += len(cleaned_words)
    
    average_sentence_length = total_words / total_sentences
    return average_sentence_length

# Function to calculate the Percentage of Complex Words
def calculate_percentage_complex_words(text):
    words = nltk.word_tokenize(text)
    complex_words = [word for word in words if count_syllables(word) > 2]
    percentage_complex_words = (len(complex_words) / len(words)) * 100
    return percentage_complex_words

# Function to count the number of syllables in a word
def count_syllables(word):
    vowels = "aeiou"
    exceptions = ["es", "ed"]
    count = 0
    if word[-2:] in exceptions or word[-1:] in exceptions:
        return 1
    for vowel in vowels:
        count += word.count(vowel)
    return count


# Function to calculate the Word Count
def calculate_word_count(text):
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    cleaned_words = [word.lower() for word in words if word.lower() not in stop_words and word.lower() not in string.punctuation]
    word_count = len(cleaned_words)
    return word_count

# Function to calculate Syllable Count Per Word
def calculate_syllable_count_per_word(text):
    total_syllables = 0
    total_words = calculate_word_count(text)  # Call calculate_word_count function
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    for word in words:
        if word.lower() not in stop_words and word.lower() not in string.punctuation:
            total_syllables += count_syllables(word)
    if total_words == 0:
        syllable_count_per_word = 0
    else:
        syllable_count_per_word = total_syllables / total_words
    return syllable_count_per_word


# Function to calculate Personal Pronouns
def calculate_personal_pronouns(text):
    pronoun_count = 0
    # Regex pattern to match personal pronouns
    pronoun_pattern = r'\b(I|we|my|ours|us)\b'
    # Find matches using regex
    matches = re.findall(pronoun_pattern, text, flags=re.IGNORECASE)
    # Count the matches
    pronoun_count = len(matches)
    # Remove the count for "US" if present
    if re.search(r'\bUS\b', text, flags=re.IGNORECASE):
        pronoun_count -= 1
    return pronoun_count

def calculate_average_word_length(text):
    words = nltk.word_tokenize(text)
    total_characters = sum(len(word) for word in words)
    total_words = len(words)
    average_word_length = total_characters / total_words
    return average_word_length

# Create a dictionary to store the readability metrics
readability_metrics = {}

# Iterate over the txt files
for file_name in os.listdir():
    if file_name.endswith('.txt'):
        with open(file_name, 'r') as file:
            text = file.read()
            if text.strip() == "":
                continue  # Skip empty files
            average_sentence_length = calculate_average_sentence_length(text)
            percentage_complex_words = calculate_percentage_complex_words(text)
            average_words_per_sentence = calculate_average_words_per_sentence(text)
            word_count = calculate_word_count(text)
            fog_index = 0.4 * (average_sentence_length + percentage_complex_words)
            syllable_count_per_word = calculate_syllable_count_per_word(text)
            pronoun_count = calculate_personal_pronouns(text)
            average_word_length = calculate_average_word_length(text)
            readability_metrics[file_name] = {
                'Average Sentence Length': average_sentence_length,
                'Percentage of Complex Words': percentage_complex_words,
                'Fog Index': fog_index,
                'Average Number of Words Per Sentence': average_sentence_length,
                'Word Count': word_count,
                'Syllable Count Per Word': syllable_count_per_word,
                'Personal Pronouns' : pronoun_count,
                'Average Word Length' : average_word_length
            }
             

# Print the readability metrics for each txt file
for file_name, metrics in readability_metrics.items():
    print(f"Readability metrics for {file_name}:")
    print(f"Average Sentence Length: {metrics['Average Sentence Length']}")
    print(f"Percentage of Complex Words: {metrics['Percentage of Complex Words']}")
    print(f"Fog Index: {metrics['Fog Index']}")
    print(f"Average Number of Words Per Sentence: {metrics['Average Number of Words Per Sentence']}")
    print(f"Word Count: {metrics['Word Count']}")
    print(f"Syllable Count Per Word: {metrics['Syllable Count Per Word']}")
    print(f"Personal Pronouns: {metrics['Personal Pronouns']}")
    print(f"Average Word Length: {metrics['Average Word Length']}")
    print()


[nltk_data] Downloading package stopwords to /Users/veer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Readability metrics for 114.txt:
Average Sentence Length: 12.287671232876713
Percentage of Complex Words: 16.71907756813417
Fog Index: 11.602699520404355
Average Number of Words Per Sentence: 12.287671232876713
Word Count: 998
Syllable Count Per Word: 2.031062124248497
Personal Pronouns: 7
Average Word Length: 4.560796645702306

Readability metrics for 100.txt:
Average Sentence Length: 15.067796610169491
Percentage of Complex Words: 18.694362017804153
Fog Index: 13.50486345118946
Average Number of Words Per Sentence: 15.067796610169491
Word Count: 966
Syllable Count Per Word: 2.0610766045548656
Personal Pronouns: 8
Average Word Length: 4.74540059347181

Readability metrics for 128.txt:
Average Sentence Length: 11.541666666666666
Percentage of Complex Words: 19.953775038520803
Fog Index: 12.598176682074987
Average Number of Words Per Sentence: 11.541666666666666
Word Count: 647
Syllable Count Per Word: 2.2194744976816074
Personal Pronouns: 6
Average Word Length: 4.5416024653312785

Read

In [23]:
import re

# Function to extract the numeric part from the file name
def extract_numeric_part(file_name):
    return int(re.search(r'\d+', file_name).group())

# Sort the readability_metrics dictionary based on the numeric part of the keys
sorted_metrics = dict(sorted(readability_metrics.items(), key=lambda x: extract_numeric_part(x[0])))

# Print the readability metrics for each txt file in the sorted order
for file_name, metrics in sorted_metrics.items():
    print(f"Readability metrics for {file_name}:")
    print(f"Average Sentence Length: {metrics['Average Sentence Length']}")
    print(f"Percentage of Complex Words: {metrics['Percentage of Complex Words']}")
    print(f"Fog Index: {metrics['Fog Index']}")
    print(f"Average Number of Words Per Sentence: {metrics['Average Number of Words Per Sentence']}")
    print(f"Word Count: {metrics['Word Count']}")
    print(f"Syllable Count Per Word: {metrics['Syllable Count Per Word']}")
    print(f"Personal Pronouns: {metrics['Personal Pronouns']}")
    print(f"Average Word Length: {metrics['Average Word Length']}")
    print()

Readability metrics for 37.txt:
Average Sentence Length: 15.337662337662337
Percentage of Complex Words: 25.591782045556048
Fog Index: 16.371777753287354
Average Number of Words Per Sentence: 15.337662337662337
Word Count: 1304
Syllable Count Per Word: 2.3849693251533743
Personal Pronouns: 5
Average Word Length: 5.177311299687361

Readability metrics for 38.txt:
Average Sentence Length: 8.823529411764707
Percentage of Complex Words: 16.170903190914007
Fog Index: 9.997773041071486
Average Number of Words Per Sentence: 8.823529411764707
Word Count: 895
Syllable Count Per Word: 2.0726256983240225
Personal Pronouns: 8
Average Word Length: 4.392644672796106

Readability metrics for 39.txt:
Average Sentence Length: 11.188888888888888
Percentage of Complex Words: 23.0624706434946
Fog Index: 13.700543812953397
Average Number of Words Per Sentence: 11.188888888888888
Word Count: 1137
Syllable Count Per Word: 2.380826737027265
Personal Pronouns: 4
Average Word Length: 4.913574448097698

Readabil

In [27]:
import pandas as pd

# Create a list to store the sentiment analysis scores
output_data = []

# Iterate over the sentiment scores dictionary
for file_name, scores in sorted_metrics.items():
    url_id = file_name[:-4]  # Extract URL_ID from the file name
    url = df.loc[df['URL_ID'] == int(url_id), 'URL'].values[0]  # Get the corresponding URL from the original DataFrame

    # Append the scores to the output data list as a dictionary
    output_data.append({
        'URL_ID': url_id,
        'URL': url,
        'Average Sentence Length': scores['Average Sentence Length'],
        'Percentage of Complex Words': scores['Percentage of Complex Words'],
        'Fog Index': scores['Fog Index'],
        'Average Number of Words Per Sentence': scores['Average Number of Words Per Sentence'],
        'Word Count': scores['Word Count'],
        'Syllable Count Per Word': scores['Syllable Count Per Word'],
        'Personal Pronouns': scores['Personal Pronouns'],
        'Average Word Length': scores['Average Word Length']
    })

# Convert the output data list to a DataFrame
output_df = pd.DataFrame.from_records(output_data)

# Save the DataFrame to an Excel file
output_file = 'ODS2.xlsx'
output_df.to_excel(output_file, index=False)

print(f"Readability scores saved to {output_file}")


Readability scores saved to ODS2.xlsx
