In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import json
import numpy as np
import requests
from requests.models import MissingSchema
import spacy
import re
from nltk.tokenize import RegexpTokenizer, sent_tokenize


df = pd.read_csv("Input.csv")

In [2]:
def beautifulsoup_extract_text_fallback(response_content):
    
    
    paragraphs = " "    
    # Create the beautifulsoup object:
    soup = BeautifulSoup(response_content, 'html.parser')

    for each in soup.find_all('p'):
        #each = each.replace('\t','')
        each = remove_tags(str(each))
        each = each.replace('\t', '').strip()
        #each = re.sub(r'[^a-zA-Z ]','',each)
        #print(each)
        paragraphs += each

    return paragraphs
    
def extract_text_from_single_web_page(url):
    resp = requests.get(url,headers={"User-Agent": "XY"})
    # We will only extract the text from successful requests:
    if resp.status_code == 200:
          return beautifulsoup_extract_text_fallback(resp.content)

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

def['paragraph'] = df["URL"].apply(extract_text_from_single_web_page)

In [3]:
def tokenizer(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z ]','',text)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    filtered_words = list(filter(lambda token: token not in stopWordList, tokens))
    return filtered_words

In [4]:
with open('StopWords_Generic.txt','r') as stop_words:
    stopWords = stop_words.read().lower()
stopWordList = stopWords.split('\n')
stopWordList[-1:] = []

with open('PositiveWords.txt','r') as posfile:
    positivewords=posfile.read().lower()
positiveWordList=positivewords.split('\n')

with open('NegativeWords.txt','r') as negfile:
    negativeword=negfile.read().lower()
negativeWordList=negativeword.split('\n')

def positive_Score(text):
    positive = 0
    text = tokenizer(text)
    for each in text:
        if each in positiveWordList:
            positive += 1
    return positive

def negative_Score(text):
    negative = 0
    text = tokenizer(text)
    for each in text:
        if each in negativeWordList:
            negative += 1
    return negative

def average_sentence_length(text):
    return len(tokenizer(text))/len(sent_tokenize(text))

def percentage_complex_word(text):
    tokens = tokenizer(text)
    complexWord = 0
    complex_word_percentage = 0
    
    for word in tokens:
        vowels=0
        if word.endswith(('es','ed')):
            pass
        else:
            for w in word:
                if(w=='a' or w=='e' or w=='i' or w=='o' or w=='u'):
                    vowels += 1
            if(vowels > 2):
                complexWord += 1
    if len(tokens) != 0:
        complex_word_percentage = complexWord/len(tokens)
    
    return round(complex_word_percentage*100,2)


def word_count(text):
    text = re.sub(r'[^a-zA-Z ]','',text)
    return len(tokenizer(text))

def complex_word_count(text):
    text = re.sub(r'[^a-zA-Z ]','',text)
    tokens = tokenizer(text)
    complexWord = 0
    
    for word in tokens:
        vowels=0
        if word.endswith(('es','ed')):
            pass
        else:
            for w in word:
                if(w=='a' or w=='e' or w=='i' or w=='o' or w=='u'):
                    vowels += 1
            if(vowels > 2):
                complexWord += 1
    return complexWord

def count_syllables(word):
    return len(
        re.findall('(?!e$)[aeiouy]+', word, re.I) +
        re.findall('^[^aeiouy]*e$', word, re.I)
    )

def count_syllablesperoword(text):
    count_syllab = 0
    tokens = tokenizer(text)
    for each in tokens:
        count_syllab += count_syllables(each)
    return count_syllab/len(tokens)

def personal_pronoun(text):
    pronounRegex = re.compile(r'\b(I|we|my|ours|(?-i:us))\b',re.I)
    pronouns = pronounRegex.findall(text)
    return len(pronouns)

def avg_word_length(text):
    tokens = tokenizer(text)
    total_words = 0
    for each in tokens:
       total_words += len(each)
    return total_words/len(tokens)


In [5]:
df['POSITIVE SCORE'] = df['paragraph'].apply(positive_Score)
df['Negaive SCORE'] = df['paragraph'].apply(negative_Score)
df['POLARITY SCORE'] = (df['POSITIVE SCORE'] - df['Negaive SCORE']) / (df['POSITIVE SCORE'] + df['Negaive SCORE']+ 0.00001)
df['SUBJECTIVITY SCORE'] = (df['POSITIVE SCORE'] + df['Negaive SCORE']) / (len(df['paragraph'])+ 0.0001)
df['AVG SENTENCE LENGTH'] = df['paragraph'].apply(average_sentence_length)
df['PERCENTAGE OF COMPLEX WORDS'] = df['paragraph'].apply(percentage_complex_word)
df['FOG INDEX'] = 0.4 * (df['AVG SENTENCE LENGTH']  + df['PERCENTAGE OF COMPLEX WORDS'])
df['COMPLEX WORD COUNT'] = df['paragraph'].apply(complex_word_count)
df['WORD COUNT'] = df['paragraph'].apply(word_count)
df['SYLLABLE PER WORD'] = df['paragraph'].apply(count_syllablesperoword)
df['PERSONAL PRONOUNS'] = df['paragraph'].apply(personal_pronoun)
df['AVG WORD LENGTH'] =  df['paragraph'].apply(avg_word_length)


Unnamed: 0,URL_ID,URL,paragraph,POSITIVE SCORE,Negaive SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,1,https://insights.blackcoffer.com/how-is-login-...,When people hear AI they often think about se...,4,4,0.000000,0.047059,23.277778,33.65,22.771111,141,419,2.081146,4,6.491647
1,2,https://insights.blackcoffer.com/how-does-ai-h...,With increasing computing power and more data...,9,6,0.200000,0.088235,19.300000,39.12,23.368000,151,386,2.220207,2,6.787565
2,3,https://insights.blackcoffer.com/ai-and-its-im...,If you were a fan of the 90’s film Clueless b...,31,20,0.215686,0.300000,27.230769,43.22,28.180308,459,1062,2.333333,13,7.051789
3,4,https://insights.blackcoffer.com/how-do-deep-l...,"Understanding exactly how data is ingested, a...",5,1,0.666666,0.035294,18.357143,44.75,25.242857,115,257,2.233463,1,6.789883
4,5,https://insights.blackcoffer.com/how-artificia...,"From the stone age to the modern world, from ...",10,11,-0.047619,0.123529,12.709677,44.67,22.951871,176,394,2.327411,21,6.969543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,167,https://insights.blackcoffer.com/role-big-data...,"Can academia, researchers, decision makers an...",17,37,-0.370370,0.317647,18.276596,40.51,23.514638,348,859,2.275902,15,6.786962
166,168,https://insights.blackcoffer.com/sales-forecas...,Inventory planning is a fundamental part of r...,20,11,0.290322,0.182353,20.409091,45.21,26.247636,203,449,2.407572,0,7.113586
167,169,https://insights.blackcoffer.com/detect-data-e...,Insider threat detection specifically to dete...,4,45,-0.836735,0.288235,14.261905,42.24,22.600762,253,599,2.419032,6,7.033389
168,170,https://insights.blackcoffer.com/data-exfiltra...,"If we talk in terms of our general life, Exfi...",4,4,0.000000,0.047059,13.550000,38.38,20.772000,104,271,2.143911,11,6.464945


In [6]:
df.drop('URL_ID', axis=1, inplace=True)
df.drop('paragraph', axis=1, inplace=True)

df.to_csv('output_vaishnavi.csv', sep=',', encoding='utf-8', index = False)