In [36]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
import os
import re

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('cmudict')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VinitKate\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\VinitKate\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\VinitKate\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [37]:
data_urls = pd.read_csv('input.csv')

In [38]:
def getText(text, req=['h1', 'p']):
    useful = ''
    for t in text:
        if t.parent.name in req:
            useful += '{} '.format(t)
    return useful

ROOT_DIR = os.curdir

def saveFile(name, content):
    if not os.path.isdir(os.path.join(ROOT_DIR, 'textFiles')):
        os.mkdir(os.path.join(ROOT_DIR, 'textFiles'))
    targetPath = os.path.join(ROOT_DIR, 'textFiles', name+'.txt')
    with open(targetPath, 'w', encoding='utf-8') as file:
        file.write(content)

# --------- FOR IMPORTING DATA ============ uncomment the following if want to download data may take time
# for i in range(len(data_urls['URL'])):
#     url = data_urls['URL'][i]
#     urlID = data_urls['URL_ID'][i]
#     res = requests.get(url)
#     html_pg = res.content
#     soup = BeautifulSoup(html_pg, 'html.parser')
#     text = soup.find_all(text=True)
#     text = getText(text)
#     text = text[:text.rindex('Contact us')]
#     fileName = urlID
#     saveFile(str(urlID), text)

In [39]:
stopWordFileDest = os.path.join(os.curdir, 'StopWords')
fileNames = os.listdir(stopWordFileDest)
stop_words = []
for fileName in fileNames:
    with open(os.path.join(stopWordFileDest, fileName), 'r') as file:
        for line in file:
            st = line.rstrip()
            f = st.split(" | ")
            for w in f:
                stop_words.append(w.strip())
        
# stop_words

In [40]:
positiveDict = []
with open('positive-words.txt', 'r') as file:
    for line in file:
        positiveDict.append(line.rstrip())

negativeDict = []
with open('negative-words.txt', 'r') as file:
    for line in file:
        negativeDict.append(line.rstrip())

In [41]:
def is_complex(word):
    syllables = 0
    vowels = ['a', 'e', 'i', 'o', 'u']
    for w in word:
        if w in vowels:
            syllables += 1    
    return syllables >= 2

def numberOfComplexWords(data, words):
    complex_word_count = 0
    for word in words:
        if is_complex(word):
            complex_word_count += 1
    return complex_word_count

def numberOfPersonalPronouns(data, words, pronouns=['i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours']):
#     words = word_tokenize(data)
    personalPronouns = 0
    for word in words:
        if word.lower() in pronouns:
            personalPronouns += 1
    return personalPronouns

def syllablePerWord(data, words):
    syllables = 0
    vowels = ['a', 'e', 'i', 'o', 'u']
    for word in words:
        if word.endswith('ed') or word.endswith('es'):
            continue
        temp = 0
        for w in word.lower():
            if w in vowels:
                temp += 1
        
        syllables += temp
            
    if len(words) > 0:
        return syllables / len(words)
    return 0

def totalWords(data, words):
    return len(words)

def averageWordLength(data, words):
    charCount = sum(len(word) for word in words)
    if len(words) > 0:
        return charCount / len(words)
    return 0

def averageWordsPerSentense(data, sentenses):
    words = 0
    for sentense in sentenses:
        wds = word_tokenize(sentense)
        words += len(wds)
        
    if len(sentenses) > 0:
        return words / len(sentenses)
    return 0

def fogIndex(data, sentenses, words):
    complex_words = numberOfComplexWords(data, words)
    total_words = totalWords(data, words)
    total_sentenses = len(sentenses)
    return 0.4*((total_words/total_sentenses) 
                + 100*(complex_words/total_words))

def percentComplexWords(data, words):
    complex_words = numberOfComplexWords(data, words)
    total_words = totalWords(data, words)
    return (complex_words/total_words)*100

def averageSentenseLength(data, sentenses):
    sentensesLength = sum(len(sentense) for sentense in sentenses)
    if len(sentenses) > 0:
        return sentensesLength / len(sentenses)
    return 0

def polarity(positive, negative):
    return (positive - negative)/(positive + negative) + 10**(-6)
    

def subjectivity(data, words):
    tws = totalWords(data, words)
    pos, neg = positiveNegativeScore(data, words)
    return (pos + neg) / tws + 10**(-6)
    
def positiveNegativeScore(data, words):
    positive, negative = 0, 0
    for word in words:
        if word in positiveDict:
            positive += 1
        elif word in negativeDict:
            negative += 1
    
    return (positive, negative)

def clean(data):
    
    #Removing URLs with a regular expression
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    data = url_pattern.sub(r'', data)

    # Remove Emails
    data = re.sub('\S*@\S*\s?', '', data)

#     Remove new line characters
    data = re.sub('\s+', ' ', data)

    # Remove single quotes
    data = re.sub("\'", "", data)
    
    #Remove comma, fullstop
    data = re.sub(r'[^\w\s]', '', data)
        
    return data

def removeStopWords(data):
    words = data.split(" ")
    for i in range(len(words)):
        if words[i] in stop_words:
            words[i] = ""
            
    return " ".join(map(str, words))

In [42]:
textFileDest = os.path.join(os.path.curdir, 'textFiles')

out_df = pd.read_csv('outputData.csv')

# files = os.listdir(textFileDest)
out_df


Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,,,,,,,,,,,,,
1,38,https://insights.blackcoffer.com/what-if-the-c...,,,,,,,,,,,,,
2,39,https://insights.blackcoffer.com/what-jobs-wil...,,,,,,,,,,,,,
3,40,https://insights.blackcoffer.com/will-machine-...,,,,,,,,,,,,,
4,41,https://insights.blackcoffer.com/will-ai-repla...,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,146,https://insights.blackcoffer.com/blockchain-fo...,,,,,,,,,,,,,
110,147,https://insights.blackcoffer.com/the-future-of...,,,,,,,,,,,,,
111,148,https://insights.blackcoffer.com/big-data-anal...,,,,,,,,,,,,,
112,149,https://insights.blackcoffer.com/business-anal...,,,,,,,,,,,,,


In [43]:
for file in os.listdir(textFileDest):
#     print(file)
    fileDest = os.path.join(textFileDest, file)
    with open(fileDest, 'r', encoding='utf-8') as f:
        text = f.read()
        dataWithoutStop = removeStopWords(text)
        data = clean(dataWithoutStop)
        row = out_df[out_df['URL_ID'] == int(file[:file.find('.txt')])].index
        words = word_tokenize(data)
        sentenses = sent_tokenize(dataWithoutStop)
        pos, neg = positiveNegativeScore(data, words)
        out_df.loc[row, 'POSITIVE SCORE'], out_df.loc[row, 'NEGATIVE SCORE'] = pos, neg
        out_df.loc[row, 'POLARITY SCORE'] = polarity(pos, neg)
        out_df.loc[row, 'SUBJECTIVITY SCORE'] = subjectivity(data, words)
        out_df.loc[row, 'AVG SENTENCE LENGTH'] = averageSentenseLength(dataWithoutStop, sentenses)
        out_df.loc[row, 'PERCENTAGE OF COMPLEX WORDS'] = percentComplexWords(data, words)
        out_df.loc[row, 'FOG INDEX'] = fogIndex(data, sentenses, words)
        out_df.loc[row, 'AVG NUMBER OF WORDS PER SENTENCE'] = averageWordsPerSentense(dataWithoutStop, sentenses)
        out_df.loc[row, 'COMPLEX WORD COUNT'] = numberOfComplexWords(data, words)
        out_df.loc[row, 'WORD COUNT'] = totalWords(data, words)
        out_df.loc[row, 'SYLLABLE PER WORD'] = syllablePerWord(data, words)
        out_df.loc[row, 'PERSONAL PRONOUNS'] = numberOfPersonalPronouns(data, words)
        out_df.loc[row, 'AVG WORD LENGTH'] = averageWordLength(data, words)
        

In [44]:
out_df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,64.0,33.0,0.319589,0.090995,133.486486,85.928705,40.133644,17.405405,916.0,1066.0,2.420263,2.0,7.429644
1,38,https://insights.blackcoffer.com/what-if-the-c...,59.0,37.0,0.229168,0.134267,78.333333,72.167832,32.397997,11.765432,516.0,715.0,2.055944,1.0,6.639161
2,39,https://insights.blackcoffer.com/what-jobs-wil...,67.0,33.0,0.340001,0.105043,102.290698,85.189076,38.503537,13.662791,811.0,952.0,2.341387,1.0,7.301471
3,40,https://insights.blackcoffer.com/will-machine-...,59.0,23.0,0.439025,0.110364,73.222222,76.446837,33.880957,10.233333,568.0,743.0,2.259758,2.0,6.648721
4,41,https://insights.blackcoffer.com/will-ai-repla...,51.0,23.0,0.378379,0.080875,104.384615,77.704918,35.774275,14.769231,711.0,915.0,2.209836,6.0,6.821858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,146,https://insights.blackcoffer.com/blockchain-fo...,26.0,26.0,0.000001,0.100001,97.854167,80.000000,36.333333,12.812500,416.0,520.0,2.205769,5.0,7.157692
110,147,https://insights.blackcoffer.com/the-future-of...,39.0,11.0,0.560001,0.078617,112.480000,77.987421,36.282969,15.560000,496.0,636.0,2.226415,2.0,6.904088
111,148,https://insights.blackcoffer.com/big-data-anal...,29.0,39.0,-0.147058,0.104136,86.969231,79.938744,35.993959,12.138462,522.0,653.0,2.361409,2.0,6.803982
112,149,https://insights.blackcoffer.com/business-anal...,31.0,3.0,0.823530,0.096318,125.884615,84.135977,39.085160,16.076923,297.0,353.0,2.439093,1.0,7.453258


In [34]:
out_df.columns

Index(['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
       'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
       'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
       'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
       'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'],
      dtype='object')

In [35]:
out_df.to_csv('Answers.csv', index=False)

In [45]:
import math

for file in os.listdir(textFileDest):
#     print(file)
    fileDest = os.path.join(textFileDest, file)
    with open(fileDest, 'r', encoding='utf-8') as f:
        text = f.read()
        dataWithoutStop = removeStopWords(text)
        data = clean(dataWithoutStop)
        row = out_df[out_df['URL_ID'] == int(file[:file.find('.txt')])].index
        words = word_tokenize(data)
        sentenses = sent_tokenize(dataWithoutStop)
        pos, neg = positiveNegativeScore(data, words)
        out_df.loc[row, 'POSITIVE SCORE'], out_df.loc[row, 'NEGATIVE SCORE'] = pos, neg
        out_df.loc[row, 'POLARITY SCORE'] = polarity(pos, neg)
        out_df.loc[row, 'SUBJECTIVITY SCORE'] = subjectivity(data, words)
        out_df.loc[row, 'AVG SENTENCE LENGTH'] = math.floor(averageSentenseLength(dataWithoutStop, sentenses))
        out_df.loc[row, 'PERCENTAGE OF COMPLEX WORDS'] = percentComplexWords(data, words)
        out_df.loc[row, 'FOG INDEX'] = fogIndex(data, sentenses, words)
        out_df.loc[row, 'AVG NUMBER OF WORDS PER SENTENCE'] = math.floor(averageWordsPerSentense(dataWithoutStop, sentenses))
        out_df.loc[row, 'COMPLEX WORD COUNT'] = numberOfComplexWords(data, words)
        out_df.loc[row, 'WORD COUNT'] = totalWords(data, words)
        out_df.loc[row, 'SYLLABLE PER WORD'] = syllablePerWord(data, words)
        out_df.loc[row, 'PERSONAL PRONOUNS'] = numberOfPersonalPronouns(data, words)
        out_df.loc[row, 'AVG WORD LENGTH'] = math.floor(averageWordLength(data, words))

In [49]:
out_df.to_csv('Answers.csv', index=False)