<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# NLP Basics

**PDF Documents, Key Words, Key Sentences, Word Clouds**

&copy; Dr. Yves J. Hilpisch

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>

## PDF Documents

In [None]:
!git clone https://github.com/tpq-classes/natural_language_processing.git
import sys
sys.path.append('natural_language_processing')


In [None]:
# PDF document to download and analyze; the commented-out lines are
# alternative documents that can be swapped in instead.
url = 'https://certificate.tpq.io/ftwp_ch_01.pdf'
# url = 'https://arxiv.org/pdf/1706.03762.pdf'
# url = 'https://www.nber.org/system/files/working_papers/w31975/w31975.pdf'

In [None]:
!wget $url

In [None]:
!pip install pypdf2

In [None]:
from PyPDF2 import PdfReader

In [None]:
# The download is opened by the last path component of the URL,
# i.e. the local file name (presumably what wget saved it as).
pdf = PdfReader(url.split('/')[-1])

In [None]:
pdf.metadata

In [None]:
page_0 = pdf.pages[0]

In [None]:
page_0.extract_text()

In [None]:
print(page_0.extract_text())

In [None]:
# page_1 = pdf.pages[1]

In [None]:
# page_1.extract_text()

In [None]:
!pip install pdfminer.six

In [None]:
import pdfminer

In [None]:
from pdfminer.high_level import extract_text

In [None]:
%time text = extract_text(url.split('/')[-1])

In [None]:
text[:1000]

In [None]:
print(text[:1000])

## Key Word Extraction

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
tokens = word_tokenize(text)

In [None]:
tokens = [t.lower() for t in tokens]

In [None]:
# tokens = [t.strip('-') for t in tokens]

In [None]:
# Keep only tokens that are not English stop words and that are
# longer than three characters.
tokens = [t for t in tokens if t not in stop_words and len(t) > 3]

In [None]:
tokens[:10]

In [None]:
freq_words = FreqDist(tokens)

In [None]:
freq_words

In [None]:
freq_words.most_common(5)

In [None]:
def key_words(tokens, n_key_words=5):
    ''' Return the most frequent tokens, most frequent first.
        => Frequentist Approach: a word is "key" if it occurs often.

    tokens: iterable of (pre-processed) tokens to be counted
    n_key_words: maximum number of key words to return
    '''
    ranked = FreqDist(tokens).most_common(n_key_words)
    # most_common() yields (word, count) pairs; only the words are needed
    return [word for word, _ in ranked]

In [None]:
key_words(tokens, 10)

## Key Sentence Extraction

In [None]:
from heapq import nlargest
from collections import defaultdict
from nltk.tokenize import sent_tokenize

In [None]:
sentences = sent_tokenize(text)

In [None]:
sentences[15:17]

In [None]:
# Count of the most frequent token; the sentence scoring divides word
# counts by it to work with relative frequencies.
max_freq = max(freq_words.values())

In [None]:
# Score each sentence by the relative frequencies of its content words,
# scaled down by the sentence's length in characters.
sent_scores = defaultdict(int)
for sent in sentences:
    # same clean-up as for the key word tokens: lower case, no stop words,
    # no tokens with three characters or fewer
    words = [w for w in map(str.lower, word_tokenize(sent))
             if len(w) > 3 and w not in stop_words]
    if len(words) <= 6:
        continue  # too few content words -- sentence is not ranked
    for word in words:
        sent_scores[sent] += freq_words[word] / max_freq / len(sent)

In [None]:
# The three highest-scoring sentences; key=sent_scores.get makes nlargest
# rank the sentences (dict keys) by their scores.
key_sent = nlargest(3, sent_scores, key=sent_scores.get)

In [None]:
for ks in key_sent:
    print(ks)

In [None]:
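# Without key=, nlargest compares the sentence strings themselves
# (lexicographic order) instead of their scores:
# nlargest(3, sent_scores)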

## Word Clouds

In [None]:
# !pip install wordcloud

In [None]:
import wordcloud

In [None]:
# Word cloud built from the raw document text: at most 15 words on a
# white 1024x768 canvas.
wc = wordcloud.WordCloud(max_words=15, background_color='white',
                        width=1024, height=768).generate(text)

In [None]:
# wc = wordcloud.WordCloud(max_words=15, background_color='white',
#                         width=1024, height=768).generate(' '.join(tokens))

In [None]:
from pylab import plt

In [None]:
# Show the word cloud as an image and switch the axes off.
plt.imshow(wc)
plt.axis('off');

<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>