## Extract information from any e-book (PDF) to get Summarization and Sentiment Analysis

### Part 1 - Installing and importing the libraries + Extracting PDF file to text

In [1]:
#install the packages

#!pip install --upgrade pip
#!pip install PyPDF2
#!pip install nltk
#!pip install -U spacy
#!python -m spacy download en_core_web_lg

In [2]:
#import the libraries

import PyPDF2 as pdf #used for text entraction from pdf
import nltk #nlp framework
#nltk.download('all') #need to only install/download once. this is installing nltk data (all)

import re #can use regex functions
import spacy #open source library for advanced nlp
from spacy.lang.en.stop_words import STOP_WORDS #spacy.lang.en is an english language class in spacy. Library for STOP_WORDS package
from string import punctuation #Library for punctuation package

In [3]:
# read the input pdf file

PDF_PATH = r"C:\Users\prathikm\Documents\Project_101\ebooks\Harry_Potter_Book1_and_The_Philosophers_Stone.pdf"

# Open the file in a `with` block so the handle is closed even if extraction fails.
# Pages are read lazily from the open handle, so all extraction happens inside the block.
with open(PDF_PATH, 'rb') as file:
    if hasattr(pdf, "PdfReader"):
        # Current PyPDF2 API. The camelCase names used below were deprecated and can no
        # longer be used on PyPDF2 3.x, so prefer the new names whenever they exist.
        pdf_reader = pdf.PdfReader(file)
        page_texts = [(page.extract_text() or '') for page in pdf_reader.pages]
    else:
        # Older PyPDF2 releases only provide the legacy camelCase API.
        pdf_reader = pdf.PdfFileReader(file)
        page_texts = [pdf_reader.getPage(i).extractText() for i in range(0, pdf_reader.numPages)]

# Concatenate the pages (first to last) in one pass instead of growing a string page by page,
# then print the extracted text.
text = ''.join(page_texts)
print(text)

1Harry Potter and the Sorcerer's Stone
CHAPTER ONE
THE BOY WHO LIVED

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, thank you very much. They were the last
people you'd expect to be involved in anything strange or mysterious,

because they just didn't hold with such nonsense.
Mr. Dursley was the director of a firm called Grunnings, which made
drills. He was a big, beefy man with hardly any neck, although he did
have a very large mustache. Mrs. Dursley was thin and blonde and had
nearly twice the usual amount of neck, which came in very useful as she

spent so much of her time craning over garden fences, spying on the

neighbors. The Dursleys had a small son called Dudley and in their

opinion there was no finer boy anywhere.
The Dursleys had everything they wanted, but they also had a secret, and
their greatest fear was that somebody would discover it. They didn't

think they could bear it if anyone found out about the Potters. M

In [4]:
type(text)

str

### Part 2 - Cleaning & formatting of extracted text for further NLP analysis

In [5]:
# Text cleaning, step 1: cut the extracted string into chunks with str.split().
# The delimiter is TWO consecutive spaces ('  ') rather than the default split on any whitespace,
# so the result is a list of larger text chunks instead of a list of individual words.
# Author's assumption: a double space marks the start of a new paragraph or a new page.

new_text = text.split('  ')
#new_text = [line for line in text.split('\n') if line.strip() != '']  # alternative: one chunk per non-empty line
new_text



In [6]:
# Clean every chunk: turn line breaks and dash separators into a single space.
# Order matters: '\n\n' has to be replaced BEFORE '\n'. With the single-newline replacement
# first, no '\n\n' is left to match and every paragraph break becomes two spaces instead of one.
# These patterns suit a fiction book; other inputs (e.g. a maths book full of formulas) may
# need additional unwanted sequences mapped to a single space here.

list2 = [
    x.replace('\n\n', ' ').replace('\n', ' ').replace(' -- ', ' ').replace('- ', ' ')
    for x in new_text
]

In [7]:
# all the \n, \n\n, space -- space and - space in the above list have been converted to single whitespace list.

list2



In [8]:
# the datatype is still list

type(list2)

list

In [9]:
# we need to work with this list and thus for NLP pre-processing, we will have to convert it to a string

# LTS Function  - to convert list to string
def listToString(s, sep=""):
    """Join the strings in ``s`` into a single string.

    Parameters
    ----------
    s : iterable of str
        The pieces to concatenate, in order.
    sep : str, optional
        Text inserted between consecutive pieces. Defaults to the empty string,
        i.e. plain concatenation.

    Returns
    -------
    str
        The joined string; an empty string when ``s`` is empty.

    Raises
    ------
    TypeError
        If any element of ``s`` is not a string.
    """
    # str.join builds the result in one pass instead of re-copying a growing string per element
    return sep.join(s)
        
        
# Driver code: pass the cleaned list to listToString to get one string back, then print it.
# NOTE(review): the chunks are joined with no separator, and the double spaces that
# text.split('  ') cut on are not put back, so the text on either side of every cut is fused
# (e.g. "end.  Next" -> "end.Next"). Confirm whether the chunks should be joined with ' ' instead.
s = list2
now_string = listToString(s)
print(now_string) 



In [10]:
# check our output of function

now_string



In [11]:
#is our output string? yes

type(now_string)

str

### Part 3 - Generate the Summary of all input text

In [12]:
# Why summarization? It keeps only the most important information from the complete text,
# which makes selection easier and makes it more feasible to process multiple books at once.

In [13]:
# Copy spaCy's predefined English stop words (STOP_WORDS) into a list and display it.
stop_words = list(STOP_WORDS)
stop_words

['besides',
 'where',
 'ours',
 'least',
 'from',
 '’ll',
 'than',
 'call',
 'nevertheless',
 'make',
 'front',
 'until',
 'third',
 'ever',
 'all',
 'they',
 '’m',
 'full',
 'part',
 'often',
 'sometimes',
 'many',
 'otherwise',
 'but',
 'through',
 'done',
 'themselves',
 'ten',
 'twenty',
 'about',
 'wherever',
 'when',
 'too',
 'would',
 'n‘t',
 'whereupon',
 'due',
 'seemed',
 'whence',
 'if',
 'we',
 'toward',
 'meanwhile',
 'has',
 'four',
 'see',
 'becoming',
 'yours',
 '‘m',
 'out',
 'however',
 'empty',
 'himself',
 'thereupon',
 'under',
 'latter',
 'should',
 'that',
 'seeming',
 'as',
 'somewhere',
 'moreover',
 'except',
 'what',
 'me',
 'used',
 '’re',
 'yourselves',
 'had',
 'seems',
 'wherein',
 'its',
 'something',
 'is',
 'keep',
 'it',
 'same',
 'his',
 'whom',
 'via',
 'while',
 'of',
 'hereby',
 'become',
 'during',
 'now',
 'there',
 'myself',
 'anyhow',
 'in',
 'may',
 'please',
 'various',
 'yourself',
 'two',
 'or',
 'whose',
 'was',
 'even',
 'take',
 'which'

In [14]:
nlp = spacy.load('en_core_web_lg') #NLP Trained Models & Pipelines in Spacy

# The pipeline rejects texts longer than nlp.max_length. The limit used to be a number tuned by
# hand for each book (1230000 for book 4, 1638156 for book 5); derive it from the text that is
# about to be processed, keeping the previous value as a floor so shorter books behave as before.
MIN_MAX_LENGTH = 1638156
nlp.max_length = max(MIN_MAX_LENGTH, len(now_string) + 1)

In [None]:
# Run the loaded spaCy pipeline over the whole cleaned string. The result is the tokenized,
# annotated document that the following cells iterate over (tokens and sentences).
doc = nlp(now_string)

In [None]:
type(doc)

In [None]:
# Collect the raw text of every token in the document.
# NOTE: nothing is filtered here - punctuation and stop words are still included.
tokens = [tok.text for tok in doc]
print(tokens)

In [None]:
# Treat the newline character as punctuation too (string.punctuation does not contain it).
# The membership guard keeps this cell idempotent: re-running it must not append another '\n'.
if '\n' not in punctuation:
    punctuation = punctuation + '\n'
punctuation

In [None]:
# Count how often each meaningful word occurs in the document.
# - Keys are LOWER-CASED: the sentence-scoring step looks words up with word.text.lower(), so
#   counting under the original casing would make capitalised words (names, sentence starts)
#   invisible to the scoring or split their counts across several keys.
# - Stop words are skipped (set lookup instead of scanning the list for every token).
# - A token is treated as punctuation when EVERY character is a punctuation character; the old
#   `token in punctuation` was a substring test on a string and let tokens such as '--' through.
# - Whitespace-only tokens, if the tokenizer yields any, are skipped as well.
stop_word_set = set(stop_words)
punctuation_chars = set(punctuation)

word_frequencies = {}
for word in doc:
    key = word.text.lower()
    if key in stop_word_set:
        continue
    if not key.strip() or all(ch in punctuation_chars for ch in key):
        continue
    # first occurrence starts at 1, later occurrences add 1
    word_frequencies[key] = word_frequencies.get(key, 0) + 1

In [None]:
word_frequencies

In [None]:
from operator import itemgetter
sorted(word_frequencies.items(), key=itemgetter(1), reverse = True)

In [None]:
# Highest count of any single word in word_frequencies; used to normalise the counts below.
# max() raises ValueError if word_frequencies is empty (i.e. no countable words were found).
max_frequency = max(word_frequencies.values())

In [None]:
max_frequency #max frequency of a word in the above is 2270.

In [None]:
# Normalise: divide every count by the highest count, so the most frequent word ends up with
# exactly 1 and every other word with a smaller fraction.
# Only existing keys are reassigned, so updating the dict while iterating over it is safe.
for word, count in word_frequencies.items():
    word_frequencies[word] = count / max_frequency

In [None]:
print(word_frequencies) #print the normalized frequency

In [None]:
# Sentence tokenization: materialise the document's sentences into a list.
sentence_tokens = list(doc.sents)
print(sentence_tokens)

In [None]:
# Score each sentence as the sum of the normalised frequencies of its words.
# Words missing from word_frequencies contribute nothing, and a sentence made up only of such
# words gets no entry in sentence_scores at all.
sentence_scores = {}
for sent in sentence_tokens:
    for word in sent:
        weight = word_frequencies.get(word.text.lower())
        if weight is None:
            continue
        sentence_scores[sent] = sentence_scores.get(sent, 0) + weight

In [None]:
sentence_scores #each sentence is scored

In [None]:
from heapq import nlargest #task is to get 30% of sentences with maximum score from above. this gives important sentences only.

In [None]:
# Number of sentences to keep in the summary: 30% of all sentences, rounded down.
SUMMARY_FRACTION = 0.3
select_length = int(len(sentence_tokens) * SUMMARY_FRACTION)
select_length

In [None]:
# Keep the `select_length` sentences with the highest score. nlargest(n, iterable, key) iterates
# over the keys of sentence_scores (the sentences) and ranks them by sentence_scores.get, i.e.
# by score. The result is ordered from highest score downwards, not in document order.
summary = nlargest(select_length, sentence_scores,key = sentence_scores.get)

In [None]:
summary #these sentences represent the summary of the text, based on max importance of each senetence. 1887 (30%) most important sentences

In [None]:
# Turn the selected sentences into a list of plain strings (one string per sentence).
final_summary = [sentence.text for sentence in summary]

In [None]:
final_summary

In [None]:
# Join the summary sentences into one paragraph.
# The sentence strings come from `.text`, which does not carry the whitespace that followed the
# sentence in the source, so they must be joined with a space - an empty separator would fuse
# the end of one sentence to the start of the next ("...the door.He turned...").
summary_2 = ' '.join(final_summary)
print(summary_2)

In [None]:
len(text) #length of original text characters

In [None]:
len(summary_2) #length of summary text characters

In [None]:
summary_2

In [None]:
summary_2.count('Snape')

In [None]:
# Size of the summary as a percentage of the original text, measured in characters.
# Note: the denominator is the raw extracted `text`, not the cleaned `now_string`.
# Raises ZeroDivisionError if `text` is empty.
percentage_of_text_in_summary = (len(summary_2)/len(text))*100
print("Percentage of text in final summary is :", percentage_of_text_in_summary)

### Part 4 - EDA of Summary - Done with Tableau (links in PPT)

### Part 5 - Sentiment Analysis

In [None]:
type(final_summary) #type of input is list

In [None]:
final_summary[0:10] #get first 10 items/sentences in list

In [None]:
!pip install vaderSentiment

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [None]:
def sentiment_analyzer_scores(sentence):
    """Print ``sentence`` followed by its polarity scores on one line.

    The sentence is left-aligned and padded with '-' to a width of 40 characters, then a
    space and the string form of the scores returned by the module-level ``analyser``.
    Nothing is returned.
    """
    score = analyser.polarity_scores(sentence)
    print(f"{sentence:-<40} {score}")

In [None]:
import pandas as pd

# Score every sentence of the summary with the analyser; each result is a dictionary of scores.
# Example input for a quick manual check:
#   ["A really bad, horrible book.", "A good, awesome, wonderful, cool book !!!  :)"]
scores = [analyser.polarity_scores(sentence) for sentence in final_summary]

# One row per sentence, one column per score key.
dataFrame = pd.DataFrame(scores)

print("Sentiment Score for each sentence in the book (Summarized i.e. Important sentences) :-\n")
print(dataFrame)

# Column-wise mean over all summary sentences.
print("Overall Sentiment Score for complete book :-\n",dataFrame.mean())