## Extracting the Data from given input file for each link

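1. BeautifulSoup is used to parse the HTML content of each page.
2. The following code generates a separate .txt file for each URL in the input file.
3. If the article text or title cannot be found on a page, an error message is printed for that URL_ID.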


In [19]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

# Function to extract article text and title and save them in a file
def extract_article(url_id, url, timeout=30):
    """Download one article and save its title and body to ``{url_id}.txt``.

    Args:
        url_id: Identifier of the article; used as the output file name.
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the run forever.

    Returns:
        None. On success a UTF-8 text file is written to the current working
        directory; on any failure a message is printed and no file is written.
    """
    # Fetch the page; report network/HTTP failures instead of aborting the run
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f'Error: Could not fetch URL_ID {url_id}: {exc}')
        return

    # Parse the HTML and locate the article body and headline
    soup = BeautifulSoup(response.text, 'html.parser')
    article_elem = soup.find(class_="td-post-content tagdiv-type")
    title_elem = soup.find('h1', class_='entry-title')

    # Both elements are required; pages with a different layout are skipped
    if article_elem is None or title_elem is None:
        print(f'Error: Article text or title not found for URL_ID {url_id}')
        return

    article_text = article_elem.text.strip()
    title = title_elem.text.strip()

    # Save the article text and title in a text file with URL_ID as its filename
    with open(f'{url_id}.txt', 'w', encoding='utf-8') as f:
        f.write(f'Title: {title}\n\n')
        f.write(f'Article Text:\n\n{article_text}')

# Load the spreadsheet that lists every URL_ID together with its URL
df = pd.read_excel('input.xlsx')

# Walk the two columns in step and scrape one article per spreadsheet row
for url_id, url in zip(df['URL_ID'], df['URL']):
    extract_article(url_id, url)

Error: Article text or title not found for URL_ID 44
Error: Article text or title not found for URL_ID 51
Error: Article text or title not found for URL_ID 57
Error: Article text or title not found for URL_ID 91
Error: Article text or title not found for URL_ID 92
Error: Article text or title not found for URL_ID 100
Error: Article text or title not found for URL_ID 107
Error: Article text or title not found for URL_ID 108
Error: Article text or title not found for URL_ID 112
Error: Article text or title not found for URL_ID 144


Analysis

In [22]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os
import re

## Analyse the extracted txt files

1. The stop-word files are loaded, together with the positive and negative word lists.
2. The output file analysis.csv is generated with the headers listed in the code.
3. Every .txt file in the generated_txt folder (which contains all the extracted articles) is processed; the file name, without its extension, is used as the URL_ID.
4. The code loops through each file and computes the required variables using the formulas.


In [227]:
import os
import csv
import pyphen
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from nltk.corpus import stopwords

# Set up stop words
stop_words_files = ['StopWords_Generic.txt', 'StopWords_GenericLong.txt', 'StopWords_Names.txt','StopWords_Currencies.txt',
                    'StopWords_Auditor.txt', 'StopWords_DatesandNumbers.txt']

# The article text is lower-cased before tokens are compared with this set,
# so every entry is normalised to lower case; otherwise upper-case entries
# would silently never match.
stop_words = set()
for file in stop_words_files:
    with open(file, 'r') as f:
        for line in f:
            # Keep only the token before any "|" annotation: tokens are single
            # alphabetic words, so a line of the form "WORD | note" could never
            # match as a whole. Lines without "|" are unaffected.
            word = line.split('|')[0].strip().lower()
            if word:  # skip blank lines
                stop_words.add(word)
        
# Set up positive and negative words
def _read_word_list(path):
    """Return the set of non-blank lines of *path*, stripped and lower-cased.

    Entries are normalised because the article tokens they are compared with
    are lower-cased; stray whitespace or capitals would prevent a match.
    """
    with open(path, 'r') as word_file:
        return {line.strip().lower() for line in word_file if line.strip()}


positive_words = _read_word_list('positive-words.txt')
negative_words = _read_word_list('negative-words.txt')
        
# Set up output CSV file
output_file = 'analysis.csv'
text_dir = 'C:/Users/modx/Desktop/generated_txt/'
header = ['URL_ID', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', "SUBJECTIVITY SCORE", "AVG SENTENCE LENGTH",
         "PERCENTAGE OF COMPLEX WORDS","COMPLEX WORD COUNT","FOG INDEX","AVG NUMBER OF WORDS PER SENTENCE","WORD COUNT",
         "AVG WORD LENGTH","PERSONAL PRONOUNS","SYLLABLE PER WORD"]

# Built once and reused for every file: the hyphenation dictionary used for
# syllable counting and the compiled pattern used for pronoun counting.
dic = pyphen.Pyphen(lang='en')
personal_pronouns = ["I", "we", "my", "ours", "us"]
pronoun_pattern = re.compile(r"\b(?:" + "|".join(personal_pronouns) + r")\b", re.IGNORECASE)


def _syllable_count(word):
    """Estimate the syllables in *word* as its number of pyphen hyphenation segments."""
    return len(dic.inserted(word).split('-'))


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator rounded to 2 places, or 0 if denominator is 0."""
    return round(numerator / denominator, 2) if denominator else 0


def _analyse_text(text):
    """Compute the metrics for one article.

    Args:
        text: Full content of one extracted article file.

    Returns:
        A list with the thirteen metric values, in the same order as the
        columns of ``header`` that follow 'URL_ID'.
    """
    text = text.lower()

    # Clean the text: alphabetic tokens that are not stop words
    words_cleaned = [word for word in word_tokenize(text)
                     if word not in stop_words and word.isalpha()]
    word_count = len(words_cleaned)
    total_sentences = len(sent_tokenize(text))

    # Positive and negative scores
    positive_score = sum(1 for word in words_cleaned if word in positive_words)
    negative_score = sum(1 for word in words_cleaned if word in negative_words)

    # Polarity and subjectivity; the small constant avoids division by zero.
    # The sum is parenthesised so that both scores are divided by the word
    # count (previously only the negative score was).
    polarity_score = round((positive_score - negative_score) / ((positive_score + negative_score) + 0.000001), 2)
    subjectivity_score = round((positive_score + negative_score) / (word_count + 0.000001), 2)

    # Average sentence length (cleaned words per sentence); 0 for empty text
    avg_sentence_length = _safe_ratio(word_count, total_sentences)
    avg_words_per_sentence = avg_sentence_length

    # Complex words are those with more than two syllables
    syllables = [_syllable_count(word) for word in words_cleaned]
    complex_word_count = sum(1 for count in syllables if count > 2)
    complex_words = (complex_word_count / word_count) * 100 if word_count else 0
    percent_complex_words = str(round(complex_words, 2)) + "%"

    # FOG index
    fog_index = round(0.4 * (avg_sentence_length + complex_words), 2)

    # Average word length in characters
    total_chars = sum(len(word) for word in words_cleaned)
    avg_word_length = _safe_ratio(total_chars, word_count)

    # Personal pronouns, matched as whole words
    # NOTE(review): the text is lower-cased above, so the acronym "US" is
    # counted as the pronoun "us" - confirm whether that is intended.
    pronoun_count = len(pronoun_pattern.findall(text))

    # Average syllables per word (previously only the last word's count was kept)
    syllable_per_word = _safe_ratio(sum(syllables), word_count)

    return [positive_score, negative_score, polarity_score, subjectivity_score, avg_sentence_length,
            percent_complex_words, complex_word_count, fog_index, avg_words_per_sentence, word_count,
            avg_word_length, pronoun_count, syllable_per_word]


with open(output_file, 'w', newline='') as csv_out:
    writer = csv.writer(csv_out)
    writer.writerow(header)

    # Loop through text files and perform analysis
    for filename in os.listdir(text_dir):
        if not filename.endswith('.txt'):
            continue

        # A distinct handle name keeps the CSV file handle from being shadowed
        with open(text_dir + filename, 'r', encoding='utf-8') as txt_file:
            text = txt_file.read()

        # The part of the file name before the first dot is the URL_ID
        url_id = filename.split('.')[0]
        writer.writerow([url_id] + _analyse_text(text))

## Sort the rows by URL_ID as required in the output file, then merge with the input file to get the required output

`os.listdir` returns the file names in arbitrary order (not in numeric URL_ID order), so the rows of analysis.csv come out unordered; this step sorts them numerically by URL_ID.

In [228]:
#sort the rows

import csv

filename = "analysis.csv"

# Load the file, keeping the header row apart from the data rows
with open(filename, 'r') as csv_file:
    reader = csv.reader(csv_file)
    header = next(reader)
    rows = list(reader)

# Order the data rows numerically by their first column (the URL_ID)
rows = sorted(rows, key=lambda record: int(record[0]))

# Overwrite the same file with the header followed by the ordered rows
with open(filename, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows([header] + rows)

In [229]:
#Final output file

In [230]:

import csv
import pandas as pd

# Load the computed metrics (CSV) and the original URL list (Excel)
df1 = pd.read_csv('analysis.csv')
df2 = pd.read_excel('Input.xlsx')

# Attach the input columns to each metrics row, matching rows on 'URL_ID';
# the left join keeps every row of the metrics table
merged_df = df1.merge(df2, on='URL_ID', how='left')

# Column order required in the final output
output_columns = [
    "URL_ID",
    "URL",
    "POSITIVE SCORE",
    "NEGATIVE SCORE",
    "POLARITY SCORE",
    "SUBJECTIVITY SCORE",
    "AVG SENTENCE LENGTH",
    "PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX",
    "AVG NUMBER OF WORDS PER SENTENCE",
    "COMPLEX WORD COUNT",
    "WORD COUNT",
    "SYLLABLE PER WORD",
    "PERSONAL PRONOUNS",
    "AVG WORD LENGTH",
]
merged_df = merged_df[output_columns]

# Write the final table without the dataframe index
merged_df.to_csv('output.csv', index=False)