# Objective: 
To extract textual data articles from the number of URL's and perfrom text analysis to compute following variables.

1.	POSITIVE SCORE
2.	NEGATIVE SCORE
3.	POLARITY SCORE
4.	SUBJECTIVITY SCORE
5.	AVG SENTENCE LENGTH
6.	PERCENTAGE OF COMPLEX WORDS
7.	FOG INDEX
8.	AVG NUMBER OF WORDS PER SENTENCE
9.	COMPLEX WORD COUNT
10.	WORD COUNT
11.	SYLLABLE PER WORD
12.	PERSONAL PRONOUNS
13.	AVG WORD LENGTH


# STEP 1

In [362]:
!pip install validators
!pip install syllables



In [784]:
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from urllib.request import urlopen
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [None]:
# Load basic package and dataset for NLP
import os
import pandas as pd
import json
import validators
import requests
import syllables
import re
import nltk

import warnings
warnings.filterwarnings('ignore')

#Get the URL's
input_file = pd.read_excel('Input.xlsx')
input_file.head()

In [786]:
#We had 114 URL's contains two columns URL_ID and URL'S
len(input_file)

114

# STEP 2

# Checking for Invalid URLS


In [686]:
from urllib.request import urlopen
from urllib.error import *

In [754]:

working_urls_id=[]
not_working_urls_id=[]
# try block to read URL
for i in range(0,114):  
    try:
        html = urlopen(input_file['URL'].values[i])
    except HTTPError as e:
        #print("HTTP error", e)
        not_working_urls_id+=[i]

    except URLError as e:
        #print("Opps ! Page not found!", e)
        not_working_urls_id+=[i]

    else:
        #print('Yeah !  found ')
        working_urls_id+=[i]

In [755]:
not_working_urls_id

[7, 20, 107]

In [788]:
#Dropping URL's that are not working
input_file.drop( input_file.index[not_working_urls_id], axis=0, inplace=True)
input_file

In [793]:
len(input_file)

111

In [792]:
urls = input_file['URL'].values
len(urls)

111

In [691]:
#Dropping url's from output_file
output_file = pd.read_excel('Output Data Structure.xlsx')
output_file.drop(not_working_urls_id,axis=0, inplace=True)


In [692]:
len(output_file)

110

# STEP 3

# Merging stopword files

In [794]:
#Given different files that contain stopwords cleaning and merging all those files
import re
sw_files = ['StopWords_Currencies.txt','StopWords_Geographic.txt','StopWords_Names.txt','StopWords_Auditor.txt','StopWords_DatesandNumbers.txt','StopWords_Generic.txt','StopWords_GenericLong.txt']
user_stopwords =[]

for name in sw_files:
    with open(name) as sw:
        sw_content = sw.read()
        sw_content = sw_content.lower()
        sw_content = sw_content.replace('|','')
        sw_list = sw_content.split() 
        user_stopwords += [x for x in sw_list]

len(user_stopwords)


14238

# Extracting Positive Words

In [None]:
#loading positive words and cleaning

import re
positive_words = []
with open('positive-words.txt') as pw:
            pw_content = pw.read()
            pw_content = pw_content.split('\n')
            positive_words = [x for x in pw_content]
print(len(positive_words))
print(positive_words)

# Extracting Negative Words

In [None]:
#loading negative words and cleaning

import re
negative_words = []
with open('negative-words.txt') as nw:
            nw_content = nw.read()
            nw_content = nw_content.split('\n')
            negative_words = [x for x in nw_content]
print(len(negative_words))
print(negative_words)


# STEP 4

# Extracting Data using Beautifulsoup

In [797]:
#Functions to extract data, sentence tokenizaton, word tokenization. 
def Extracting_Data(url):
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'lxml') 
    #title = soup.findAll('h1')[0].text
    s= (soup.title.text).split()
    title=' '.join(s[0:(len(s)-3)])
    p_tag = soup.findAll('p', {'class' : ''})
    String = ''
    for i in p_tag:
        N = i.text
        String += ''.join(N)+ ' '
    S = String
    Data = title + " "+ String
    Data = Data.lower()
    return Data

def sentence(Data):
    sentences = nltk.sent_tokenize(Data)
    corpus=[]
    for i in range(len(sentences)):
        review = re.sub('[^a-zA-Z]',' ', sentences[i] )
        review = review.split()
        r=[word for word in review if word not in user_stopwords]
        r=' '.join(r)
        corpus.append(r)
    return corpus


def total_words(sentence):
    total_words=[]
    for i in sentence:
        total_words.extend(nltk.word_tokenize(i))
        #total_words.append(words)        
    return total_words


# STEP 5

# Functions to Derrived variables


In [798]:

def positive_score(words):
    positive_score=0
    for j in words:
        if j in positive_words:
            positive_score+=1
    return positive_score

            
def negative_score(words):
    negative_score=0
    for j in words:
        if j in negative_words:
            negative_score+=1
    return negative_score

def polarity_score(positive_score,negative_score):
    polarity_score = (positive_score - negative_score)/ ((positive_score + negative_score) + 0.000001)
    return polarity_score


def subjectivity_score(positive_score,negative_score,total_words):
    subjectivity_score = (positive_score + negative_score)/ ((len(total_words)) + 0.000001)
    return subjectivity_score



# Functions for Analysis of Readability

In [799]:
def avg_sentence_lengths(l_w,l_s):
    #Average_length_words = (len(total_words))/(len(sentence))
    Average_length_words = (((l_w))/((l_s)))
    return float(Average_length_words)


def per_of_complex_numbers(total_words):
    complex_words=[]
    complex_count=0
    for j in total_words:
        if (syllables.estimate(j))>2:
            complex_words.append(j)
            complex_count+=1
    percentage = ((complex_count)/(len(total_words)))*100
    return percentage

def fog_index(Average_length_words, per_complex_words):
    Fog_Index = 0.4 * (Average_length_words + per_complex_words)
    return float(Fog_Index)



In [800]:
def avg_no_words_per_sec(l_w,l_s):
    #Average_Number_of_Words_Per_Sentence = len(total_words) / len(corpus)
    Average_Number_of_Words_Per_Sentence = (l_w)/(l_s)
    return Average_Number_of_Words_Per_Sentence
    
def complex_words_count(total_words):
    complex_words=[]
    complex_count=0
    for j in total_words:
        if (syllables.estimate(j))>2:
            complex_words.append(j)
            complex_count+=1
    return complex_count

def word_count(data):   
    new_text=[]
    for i in data:
        x=i.split()
        for j in range(0,len(x)):
            if x[j] in stopwords.words('english'):
                new_text.append('')
            else:
                new_text.append(j)          
    return len(new_text)



def personal_pronouns(text):
    pronouns = re.compile(r'\b(I|we|my|ours|(?-i:us))\b',re.I)
    pronoun = pronouns.findall(text)
    return len(pronoun)

def avg_word_length(word):
    s=0
    for i in word:
        s+=len(i)
    Avg_w_L = (s)/(len(word))
    return Avg_w_L

# STEP 6

In [None]:
urls

# Calculating all scores and appending in lists

In [802]:
p_score=[]
n_score=[]
pol_score=[]
sub=[]
avg_sentence_length = []
per_complex_w = []
for_index = []
avg_number_words_per_second=[]
com_count=[]
word_counts=[]
personal_pronoun=[]
avg_w_len =[]


#For loop takes each url and calculate all scores and append in lists
for i in urls:
    d=Extracting_Data(i)
    s=sentence(d)
    t_w = total_words(s)
    length_sentence = len(s)
    length_word = len(t_w)
    p=positive_score(t_w)
    n=negative_score(t_w)
    pol=polarity_score(p,n)
    sub_score = subjectivity_score(p,n,t_w)
    avg_sent_length = avg_sentence_lengths(length_word,length_sentence)
    per_c_w =per_of_complex_numbers(t_w)
    fog_ind = fog_index(avg_sent_length,per_c_w)
    avg_number_words_per_sec = avg_no_words_per_sec(length_word,length_sentence)
    complex_count = complex_words_count(t_w)
    w_c = word_count(d)
    p_pronoun =personal_pronouns(d)
    avg_word_len =avg_word_length(t_w)
    
    
    
    #Appending all values into above lists
    p_score.append(p)
    n_score.append(n)
    pol_score.append(pol)
    sub.append(sub_score)
    avg_sentence_length.append(avg_sent_length)
    per_complex_w.append(per_c_w)
    for_index.append(fog_ind)
    avg_number_words_per_second.append(avg_number_words_per_sec)
    word_counts.append(w_c)
    com_count.append(complex_count)
    personal_pronoun.append(p_pronoun)
    avg_w_len.append(avg_word_len)

# Created DataFrame and stored all values

In [804]:
df =pd.DataFrame()   

In [805]:
len(input_file['URL_ID'])

111

In [806]:
df['URL_ID']=input_file['URL_ID']
df['POSITIVE SCORE']= p_score
df['NEGATIVE SCORE']=n_score
df['POLARITY SCORE']=pol_score 
df['SUBJECTIVITY SCORE'] = sub
df['AVG SENTENCE LENGTH'] = avg_sentence_length
df['PERCENTAGE OF COMPLEX WORDS'] = per_complex_w
df['FOG INDEX'] =  for_index 
df['AVG NUMBER OF WORDS PER SENTENCE'] = avg_number_words_per_second
df['COMPLEX WORD COUNT'] =com_count
df['WORD COUNT'] =  word_counts
df['PERSONAL PRONOUNS'] = personal_pronoun
df['AVG WORD LENGTH'] = avg_w_len

In [807]:
df.shape

(111, 13)

In [808]:
df.head()

Unnamed: 0,URL_ID,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,66,34,0.32,0.104493,12.76,51.515152,25.710061,12.76,493,10316,1,7.579937
1,38.0,58,37,0.221053,0.17658,6.810127,42.379182,19.675723,6.810127,228,7037,7,7.154275
2,39.0,64,35,0.292929,0.123288,9.447059,55.292653,25.895885,9.447059,444,9352,3,7.712329
3,40.0,60,27,0.37931,0.147708,6.402174,43.803056,20.082092,6.402174,258,7846,18,7.185059
4,41.0,58,25,0.39759,0.116246,9.272727,47.338936,22.644665,9.272727,338,8850,18,7.327731


# STEP 7

# Writing into output excel file

In [553]:
!pip install xlwt




In [554]:
!pip install openpyxl



In [810]:
columns = ['URL_ID', 'POSITIVE SCORE', 'NEGATIVE SCORE','POLARITY SCORE','SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
           'PERCENTAGE OF COMPLEX WORDS','FOG INDEX','AVG NUMBER OF WORDS PER SENTENCE',
           'COMPLEX WORD COUNT','WORD COUNT','PERSONAL PRONOUNS','AVG WORD LENGTH']
for i in output_file:
    for j in columns:
        output_file[j]=df[j]
        
output_file.to_excel('output_Data_Final.xlsx', sheet_name='Sheet1')

# Comments

In [None]:
1. Syllable per word was not filled in result output sheet as it contains list of syllable words
2. Final output data is in output_Data_Final.xlsx file