In [12]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import TextBlob


In [13]:
# Download NLTK resources
# 'punkt' is fetched for the sent_tokenize / word_tokenize calls used later in the notebook,
# 'stopwords' for the stopwords corpus imported at the top.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# Load the input data
# - input_file: contents of Input.csv
# - output_file: contents of "Output Data Structure.csv" (this same path is written to
#   when the results are saved at the end of the notebook)
input_file = pd.read_csv("/content/Input.csv")
output_file = pd.read_csv("/content/Output Data Structure.csv")

In [16]:
# Function to extract text from URL
def extract_text_from_url(url, timeout=30):
    """Download a page and return the text of its ``<p>`` elements.

    Elements carrying the classes ``td-footer-template-wrap`` and
    ``td-header-template-wrap`` as well as all ``<img>`` tags are removed
    from the parsed document before the paragraphs are collected.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled host would block the whole scraping loop.

    Returns:
        The paragraph texts in document order, each followed by one space
        (an empty string when the page has no ``<p>``), or ``None`` when the
        download or the parsing failed. Failures are reported via ``print``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of analysing the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove unwanted content
        for css_class in ("td-footer-template-wrap", "td-header-template-wrap"):
            for section in soup.find_all(class_=css_class):
                section.decompose()
        for img in soup.find_all('img'):
            img.decompose()
        # No pass over 'href' here: href is an attribute, not a tag name, so
        # soup.find_all('href') never matches anything.

        # Extract text
        return "".join(paragraph.get_text() + " " for paragraph in soup.find_all('p'))
    except Exception as e:
        # Deliberately broad: one bad URL must not abort the whole batch.
        print(f"Error extracting text from {url}: {str(e)}")
        return None

In [17]:
# Function to compute text analysis variables
def compute_text_analysis(text):
    """Compute sentiment and readability metrics for a piece of text.

    The text is split with ``sent_tokenize`` / ``word_tokenize``; every token
    returned by ``word_tokenize`` is counted as a word. ``syllable_count``,
    ``positive_words`` and ``negative_words`` are looked up as module-level
    names when the function is called.

    Args:
        text: The text to analyse.

    Returns:
        A list of 13 values, in this order: positive score, negative score,
        polarity, subjectivity, average sentence length, percentage of
        complex words, FOG index, average words per sentence, complex word
        count, word count, syllables per word, personal pronoun count,
        average word length. Every ratio is reported as 0.0 when the text
        yields no sentences or no tokens.
    """
    # Tokenize text into sentences and words
    sentences = sent_tokenize(text)
    words = word_tokenize(text)

    total_words = len(words)
    total_sentences = len(sentences)

    def _ratio(numerator, denominator):
        # Empty or whitespace-only text produces zero tokens/sentences;
        # report 0.0 rather than raising ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    # NOTE(review): 'us' also matches the token "US" once lower-cased - confirm
    # whether that is intended.
    personal_pronouns = {'i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours'}

    total_syllables = 0
    complex_word_count = 0
    personal_pronoun_count = 0

    # NOTE(review): word_tokenize also emits punctuation tokens, so they are
    # included in every per-word figure below - confirm this is acceptable.
    for word in words:
        syllables = syllable_count(word)
        total_syllables += syllables

        # A word counts as complex when it has more than two syllables.
        if syllables > 2:
            complex_word_count += 1

        if word.lower() in personal_pronouns:
            personal_pronoun_count += 1

    # Average sentence length and words-per-sentence are the same quantity
    # here; both are kept because both appear in the returned list.
    avg_sentence_length = _ratio(total_words, total_sentences)
    avg_words_per_sentence = avg_sentence_length

    percentage_complex_words = _ratio(complex_word_count, total_words) * 100

    # Compute FOG index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    syllables_per_word = _ratio(total_syllables, total_words)

    # Compute polarity and subjectivity scores
    sentiment = TextBlob(text).sentiment
    polarity_score = sentiment.polarity
    subjectivity_score = sentiment.subjectivity

    # Compute positive and negative score
    positive_score = sum(1 for word in words if word.lower() in positive_words)
    negative_score = sum(1 for word in words if word.lower() in negative_words)

    # Compute average word length
    avg_word_length = _ratio(sum(len(word) for word in words), total_words)

    return [positive_score, negative_score, polarity_score, subjectivity_score, avg_sentence_length,
            percentage_complex_words, fog_index, avg_words_per_sentence, complex_word_count,
            total_words, syllables_per_word, personal_pronoun_count, avg_word_length]




In [18]:
# Function to count syllables in a word
def syllable_count(word):
    """Estimate the number of syllables in ``word``.

    The estimate is the number of runs of the letters ``aeiouy`` in the
    lower-cased word, where a lone ``e`` at the very end is not counted
    (e.g. "make" -> 1). Words that start with "tri" but not "tria" and are
    longer than four characters get one extra syllable.

    A word that contains a vowel letter never scores below 1, so words whose
    only vowel is the final "e" ("the", "we", "be") count as one syllable
    instead of zero. Tokens without any vowel letter (punctuation, the empty
    string) score 0.

    Args:
        word: The token to examine.

    Returns:
        The estimated syllable count as an int.
    """
    word = word.lower()
    # Vowel runs; the lookahead skips an 'e' that is the last character.
    count = len(re.findall('(?!e$)[aeiouy]+', word))
    if word[:3] == "tri" and len(word) > 4 and word[:4] != "tria":
        count += 1
    # The only way a word with a vowel reaches 0 is a lone trailing 'e'.
    if count == 0 and re.search('[aeiouy]', word):
        count = 1
    return count

In [22]:
def _load_word_set(path, encoding='ISO-8859-1'):
    """Read a one-word-per-line lexicon file into a set of lower-cased words.

    The file is read as plain text rather than through ``pd.read_csv`` so that
    entries such as "null" or "NA" stay strings instead of being parsed as
    missing values, and so that quote or comma characters need no CSV handling.
    Surrounding whitespace is stripped and blank lines are skipped. Entries are
    lower-cased because callers compare them against ``word.lower()``.

    Args:
        path: Location of the lexicon file.
        encoding: Text encoding of the file.

    Returns:
        A set of str.
    """
    with open(path, encoding=encoding) as handle:
        return {line.strip().lower() for line in handle if line.strip()}


positive_words = _load_word_set("/content/positive-words.txt")
negative_words = _load_word_set("/content/negative-words.txt")

In [25]:
# Drop rows with NaN values in the 'URL' column.
# Derived from `input_file` (the frame read from Input.csv) so the cell works on a
# fresh kernel and gives the same result every time it is re-run.
input_data = input_file.dropna(subset=['URL'])

# Extract text and compute variables for each URL
output_data = []
for url_id, url in zip(input_data['URL_ID'], input_data['URL']):
    text = extract_text_from_url(url)
    # None (download failed) and empty / whitespace-only text both mean there is
    # nothing to analyse; keep the row but leave the metrics empty.
    if text and text.strip():
        output_data.append([url_id] + compute_text_analysis(text))
    else:
        # 13 = number of values compute_text_analysis returns
        output_data.append([url_id] + [None] * 13)


In [26]:
# Sanity-check the scraped rows.
# This cell used to repeat the scraping loop of the previous cell verbatim, which
# downloaded every URL a second time only to rebuild the same `output_data`.
assert len(output_data) == len(input_data), "expected one result row per input URL"
assert all(len(row) == 14 for row in output_data), "each row should hold URL_ID plus 13 metrics"




In [27]:
# Build the results table: the identifier first, then the 13 metrics in the
# order in which compute_text_analysis returns them
output_columns = [
    'URL_ID',
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH',
]
output_df = pd.DataFrame(data=output_data, columns=output_columns)

In [31]:
# Specify the file path for the output CSV file
# NOTE(review): this is the same path that was read into `output_file` when the
# inputs were loaded, so saving replaces that file on disk.
output_csv_file_path = "/content/Output Data Structure.csv"

# Save output DataFrame to CSV file (index=False leaves the DataFrame index out of the file)
output_df.to_csv(output_csv_file_path, index=False)
