<img src="http://hilpisch.com/tpq_logo.png" width="36%" align="right" style="vertical-align: top;">

# Natural Language Processing

**Basic Techniques and Algorithms**

_Illustrated using the texts of three Apple press releases._

Dr Yves J Hilpisch | Michael Schwed

The Python Quants GmbH

## Data Retrieval

In [None]:
import requests  

In [None]:
# Short links to the three Apple press releases analysed in this notebook;
# the trailing comment on each entry names the product it covers.
sources = [
    'https://nr.apple.com/dE0b1T5G3u',  # iPad Pro
    'https://nr.apple.com/dE4c7T6g1K',  # MacBook Air
    'https://nr.apple.com/dE4q4r8A2A',  # Mac Mini
]

In [None]:
# Download the raw HTML of every press release.  The timeout keeps the cell
# from hanging on an unresponsive server, and raise_for_status() stops the
# notebook on an HTTP error instead of silently analysing an error page.
html = []
for url in sources:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html.append(response.text)

## Preprocessing

In [None]:
import sys
sys.path.append('../../modules/')
import nlp_functions as nlp  

In [None]:
# Run every downloaded page through the project's text clean-up helper.
data = list(map(nlp.clean_up_text, html))

In [None]:
data[0][:500]  

## Basic Text Analysis

In [None]:
import nltk

In [None]:
nltk.__version__

In [None]:
# to be executed once
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('vader_lexicon')

### Basic Tokenization and Vocabulary

In [None]:
t = 'A sentence about NLP; and about ML.'

In [None]:
nltk.word_tokenize(t)

In [None]:
nltk.word_tokenize(data[0])[:10]  

In [None]:
# Tokenize all press releases as one corpus (documents joined by a space).
tokens = nltk.word_tokenize(' '.join(data))

In [None]:
text = nltk.Text(tokens)  

In [None]:
text.count('ipad')  

In [None]:
text.count('mojave')  

In [None]:
text.collocations()  

In [None]:
text.similar('performance')  

In [None]:
text.common_contexts(['ipad', 'macbook'])  

In [None]:
text.vocab()  

In [None]:
text.concordance('macbook', width=70, lines=7)

In [None]:
text.concordance('mojave', width=70, lines=7)

In [None]:
# Case-insensitive vocabulary: the unique lower-cased tokens, sorted.
words = sorted({token.lower() for token in tokens})

In [None]:
len(words)  

In [None]:
words[:7]  

### Improved Tokenization and Vocabulary

In [None]:
t = 'A sentence about NLP; and about ML.'

In [None]:
# Tokenize the sample sentence with the project helper instead of NLTK.
# NOTE(review): min_char=3 presumably drops tokens shorter than three
# characters — confirm against nlp_functions.tokenize.
tokens = nlp.tokenize(t, min_char=3)
tokens

In [None]:
%time tokens = nlp.tokenize(' '.join(data))  

In [None]:
tokens[:7]  

In [None]:
# Vocabulary of the improved tokenization: unique lower-cased tokens, sorted.
words = sorted({token.lower() for token in tokens})

In [None]:
len(words)  

In [None]:
words[:6]  

## Similarity

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Three toy sentences used to illustrate TF-IDF vectors and similarity
# before the same steps are applied to the press releases.
ts = [
    'A sentence about NLP; and about ML.',
    'Another one about NLP and algorithms.',
    'A text fragment about Apple.'
]

In [None]:
# TF-IDF vectorizer that filters scikit-learn's built-in English stop words.
vec = TfidfVectorizer(stop_words='english')

In [None]:
mat = vec.fit_transform(ts)  

In [None]:
# Dense view of the sparse TF-IDF matrix.  toarray() replaces the `.A`
# shorthand, which recent SciPy releases no longer provide.
mat.toarray().round(3)

In [None]:
# Tabulate the TF-IDF weights with one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2, so it is used only
# as a fallback when the newer get_feature_names_out() is unavailable;
# toarray() replaces the `.A` shorthand dropped by recent SciPy releases.
terms = (vec.get_feature_names_out()
         if hasattr(vec, 'get_feature_names_out')
         else vec.get_feature_names())
df = pd.DataFrame(mat.toarray(), columns=terms)

In [None]:
df.round(3)  

In [None]:
# Pairwise similarity of the sentences: matrix product of the TF-IDF matrix
# with its transpose (TfidfVectorizer L2-normalises rows by default, which
# makes these cosine similarities).  `@` states the matrix product
# explicitly and toarray() replaces the `.A` shorthand dropped by recent
# SciPy releases.
(mat @ mat.T).toarray()

In [None]:
# Tokenize each press release with the project helper and re-join the
# tokens into one string per document, the input the vectorizer is fitted
# on below.
tl = [' '.join(nlp.tokenize(t)) for t in data]

In [None]:
mat = vec.fit_transform(tl)  
mat  

In [None]:
# Document-by-document similarity matrix of the press releases.  `@` states
# the matrix product explicitly and toarray() replaces the `.A` shorthand
# dropped by recent SciPy releases.
sm = (mat @ mat.T).toarray()
sm

In [None]:
# Short labels for the three press releases, in the same order as `sources`.
prs = ['ipad', 'air', 'mini']

In [None]:
df = pd.DataFrame(sm, index=prs, columns=prs)  

In [None]:
df  

## Word Clouds

In [None]:
# Draw one word cloud per press release.  The loop variable is `doc` rather
# than `text` so that the nltk.Text object `text` built in the tokenization
# section is not overwritten by a plain string.
for name, doc in zip(prs, data):
    # To also save each image, additionally pass:
    #     name='../../images/wc_{}.png'.format(name)
    nlp.generate_word_cloud(doc, 35)

## Topic Modeling

### NMF Clustering

In [None]:
from sklearn.decomposition import NMF

In [None]:
# Fresh TF-IDF vectorizer (English stop words filtered) for topic modeling.
vec = TfidfVectorizer(stop_words='english')

In [None]:
mat = vec.fit_transform(data)  

In [None]:
# Factorise into as many components (topics) as there are documents.
nmf = NMF(n_components=len(data))

In [None]:
nmf.fit(mat)  

In [None]:
nmf.components_.round(3)

In [None]:
# Vocabulary terms in column order of the TF-IDF matrix.
# get_feature_names() was removed in scikit-learn 1.2, so it is used only
# as a fallback when the newer get_feature_names_out() is unavailable.
feature_names = (vec.get_feature_names_out()
                 if hasattr(vec, 'get_feature_names_out')
                 else vec.get_feature_names())

In [None]:
n_words = 8  

In [None]:
# Print the n_words highest-weighted terms of every NMF topic.
# All components are iterated: n_words is the number of terms shown per
# topic, so it must not be used to limit the number of topics.
for i, topic in enumerate(nmf.components_):
    top_terms = topic.argsort()[:-n_words - 1:-1]  # indices, largest weight first
    print('TOPIC %d' % i)
    print(60 * '=')
    print(', '.join(feature_names[j] for j in top_terms) + '\n')

### KMeans Clustering

In [None]:
import collections
from sklearn.cluster import KMeans

In [None]:
n_clusters = len(data)

In [None]:
# One cluster per document (n_clusters was set to len(data)).
# NOTE(review): no random_state is passed, so cluster assignments may differ
# between runs — consider fixing a seed if reproducible output matters.
km = KMeans(n_clusters=n_clusters)

In [None]:
km.fit(mat)  

In [None]:
n_kw = 7

In [None]:
# Rank the vocabulary of every cluster by centroid weight (largest first)
# and keep the n_kw top-ranked terms as that cluster's keywords.
sorted_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2, so it is used only
# as a fallback when the newer get_feature_names_out() is unavailable.
words = (vec.get_feature_names_out()
         if hasattr(vec, 'get_feature_names_out')
         else vec.get_feature_names())
# A comprehension avoids binding a module-level list named `keywords`,
# which would hide the gensim function of the same name.
kw_list = [[words[w] for w in sorted_centroids[c, :n_kw]]
           for c in range(n_clusters)]

In [None]:
kw_list

In [None]:
# Keyword table: one row per cluster ("topic"), one column per keyword rank.
topic_labels = list(map('topic_{}'.format, range(n_clusters)))
keyword_labels = list(map('keyword_{}'.format, range(n_kw)))
kw_df = pd.DataFrame(kw_list, index=topic_labels, columns=keyword_labels)

In [None]:
kw_df

## Sentiment Analysis

In [None]:
import warnings; warnings.simplefilter('ignore')

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
sid = SentimentIntensityAnalyzer()  

In [None]:
p = 'The product is amazing. I love it.'  

In [None]:
sid.polarity_scores(p)  

In [None]:
n = 'The product is of low quality. I cannot recommend it.'  

In [None]:
sid.polarity_scores(n)  

In [None]:
sid.polarity_scores(' '.join(data))  

In [None]:
# Sentiment scores of each press release on its own.
scores = list(map(sid.polarity_scores, data))

In [None]:
scores 

## Summarization

In [None]:
import pprint
from gensim.summarization import keywords
from gensim.summarization.summarizer import summarize

In [None]:
# Extract scored keywords (words=8) from every tokenized press release.
# NOTE: gensim.summarization was removed in gensim 4.0, so this section
# needs gensim < 4.0.
# Each entry of `tl` is a joined string, so the loop variable is `doc`;
# this also keeps the token list `tokens` from the tokenization section
# from being overwritten.
for i, doc in enumerate(tl):
    print('\nTEXT {}\n'.format(i) + 50 * '=')
    kws = keywords(doc, words=8, split=False,
                   lemmatize=True, scores=True)
    pprint.pprint(kws)

In [None]:
texts = [nlp.clean_up_html(h) for h in html]  

In [None]:
# Print a summary (word_count=45) of every press release.  The loop
# variable is `doc` rather than `text` so that the nltk.Text object `text`
# built in the tokenization section is not overwritten by a plain string.
for i, doc in enumerate(texts):
    print('\nTEXT {}\n'.format(i) + 50 * '=')
    print(summarize(doc, word_count=45))

<img src="http://hilpisch.com/tpq_logo.png" width="36%" align="right" style="vertical-align: top;">