## Preliminaries

In [1]:
import pandas as pd
import numpy as np
import PyPDF2
import textract
import re

## Reading Text

- converted PDF file to txt format for better pre-processing

In [2]:
filename ='JavaBasics-notes.pdf' 

pdfFileObj = open(filename,'rb')               #open allows you to read the file
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)   #The pdfReader variable is a readable object that will be parsed
num_pages = pdfReader.numPages                 #discerning the number of pages will allow us to parse through all the pages


count = 0
text = ""
                                                            
while count < num_pages:                       #The while loop will read each page
    pageObj = pdfReader.getPage(count)
    count +=1
    text += pageObj.extractText()
    
#Below if statement exists to check if the above library returned #words. It's done because PyPDF2 cannot read scanned files.

if text != "":
    text = text
    
#If the above returns as False, we run the OCR library textract to #convert scanned/image based PDF files into text

else:
    text = textract.process('http://bit.ly/epo_keyword_extraction_document', method='tesseract', language='eng')

    # Now we have a text variable which contains all the text derived from our PDF file.

In [3]:
text = text.encode('ascii','ignore').lower() #Lowercasing each word

## Extracting Keywords

In [4]:
keywords = re.findall(r'[a-zA-Z]\w+',text)
len(keywords)                               #Total keywords in document

3410

In [5]:
df = pd.DataFrame(list(set(keywords)),columns=['keywords'])  #Dataframe with unique keywords to avoid repetition in rows

## Calculating Weightage

 - In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. 

- __TF: Term Frequency__, which measures how frequently a term occurs in a document. Since every document is different in length, it is possible that a term would appear much more times in long documents than shorter ones. Thus, the term frequency is often divided by the document length (aka. the total number of terms in the document) as a way of normalization: 

__TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).__

- __IDF: Inverse Document Frequency__, which measures how important a term is. While computing TF, all terms are considered equally important. However it is known that certain terms, such as "is", "of", and "that", may appear a lot of times but have little importance. Thus we need to weigh down the frequent terms while scale up the rare ones, by computing the following: 

__IDF(t) = log_e(Total number of documents / Number of documents with term t in it).__

In [6]:
def weightage(word,text,number_of_documents=1):
    word_list = re.findall(word,text)
    number_of_times_word_appeared =len(word_list)
    tf = number_of_times_word_appeared/float(len(text))
    idf = np.log((number_of_documents)/float(number_of_times_word_appeared))
    tf_idf = tf*idf
    return number_of_times_word_appeared,tf,idf ,tf_idf    

In [7]:
df['number_of_times_word_appeared'] = df['keywords'].apply(lambda x: weightage(x,text)[0])
df['tf'] = df['keywords'].apply(lambda x: weightage(x,text)[1])
df['idf'] = df['keywords'].apply(lambda x: weightage(x,text)[2])
df['tf_idf'] = df['keywords'].apply(lambda x: weightage(x,text)[3])

In [8]:
df = df.sort_values('tf_idf',ascending=True)
df.to_csv('Keywords.csv')
df.head(25)

Unnamed: 0,keywords,number_of_times_word_appeared,tf,idf,tf_idf
194,in,369,0.014913,-5.910797,-0.088146
317,re,258,0.010427,-5.55296,-0.057899
880,at,247,0.009982,-5.509388,-0.054996
783,on,243,0.009821,-5.493061,-0.053945
690,the,203,0.008204,-5.313206,-0.04359
876,an,199,0.008042,-5.293305,-0.042571
25,to,190,0.007679,-5.247024,-0.04029
799,or,167,0.006749,-5.117994,-0.034542
878,as,157,0.006345,-5.056246,-0.032082
588,java,135,0.005456,-4.905275,-0.026763


***

## Second Method - Using Gensim library

In [9]:
from gensim.summarization import keywords
import warnings
warnings.filterwarnings("ignore")



In [10]:
values = keywords(text=text,split='\n',scores=True)

In [11]:
data = pd.DataFrame(values,columns=['keyword','score'])
data = data.sort_values('score',ascending=False)
data.head(10)

Unnamed: 0,keyword,score
0,java basics,0.314014
2,methods,0.247325
1,method,0.247325
3,applets,0.241786
4,applet,0.241786
5,class,0.2198
6,classes,0.2198
7,objects,0.190636
8,object,0.190636
9,programs,0.163243


***

### Third Approach - Using RAKE (Rapid Automatic Keyword Extraction)

In [12]:
from rake_nltk import Rake

In [13]:
r = Rake()
r.extract_keywords_from_text(text)

In [14]:
phrases = r.get_ranked_phrases_with_scores()

In [15]:
table = pd.DataFrame(phrases,columns=['score','Phrase'])
table = table.sort_values('score',ascending=False)
# table.head(10)

***