<img src="http://hilpisch.com/tpq_logo.png" width="36%" align="right" style="vertical-align: top;">

# Natural Language Processing

**Open Information Extraction**

_Illustrated with a simple example and the texts of three Apple press releases._

Dr Yves J Hilpisch | Michael Schwed

The Python Quants GmbH

## Simple Example

In [None]:
import os
import nltk
import requests
import pandas as pd

In [None]:
import sys
# Extend the module search path with two relative directories; the
# project-local imports below presumably rely on these entries --
# TODO confirm against the repository layout.
sys.path.append('../../../')
sys.path.append('../../modules/')
import soiepy.main as ie  # used below via ie.stanford_ie
import ng_functions as ng  # used below via ng.create_graph / ng.plot_graph
import nlp_functions as nlp  # used below via nlp.clean_up_text / nlp.tokenize

In [None]:
# Example corpus: nine short sentences, one per line, wrapped in a leading
# and a trailing newline.
t = (
    '\n'
    'Peter studies data science.\n'
    'Peter knows Java.\n'
    'Peter prefers Python.\n'
    'Peter works as a data scientist.\n'
    'Peter applies machine learning.\n'
    'A data scientist uses Python.\n'
    'Python revolutionized data science.\n'
    'Python is preferred for NLP.\n'
    'Python is used for machine learning.\n'
)

In [None]:
s = nltk.sent_tokenize(t)  # split the example text into sentences

In [None]:
s[:3]  # preview the first three sentences

In [None]:
# First pass: apply the project's clean-up helper to every sentence.
s = list(map(nlp.clean_up_text, s))
# Second pass: tokenize each sentence, re-join the tokens with single
# spaces and terminate the result with a period.
s = [' '.join(nlp.tokenize(sentence)) + '.' for sentence in s]

In [None]:
s[:3]  # preview the first three entries after clean-up and tokenization

In [None]:
# Absolute path of the directory two levels above the current working
# directory; the data directory is derived from it below.
abs_path = os.path.abspath('../../')

In [None]:
# Locate (and create if necessary) the directory that holds the token files.
data_path = os.path.join(abs_path, 'data')
tokens_path = os.path.join(data_path, 'tokens')
# makedirs also creates a missing parent ``data`` directory, and
# exist_ok=True turns the call into a no-op when the directory already
# exists, so no separate (race-prone) isdir() check is needed.
os.makedirs(tokens_path, exist_ok=True)

In [None]:
fn = os.path.join(tokens_path, 'tokens_example.txt')  # file that receives the example sentences

In [None]:
# Write one sentence per line. The encoding is pinned to UTF-8 so the file
# content does not depend on the platform's default text encoding.
with open(fn, 'w', encoding='utf-8') as f:
    f.writelines(sentence + '\n' for sentence in s)

In [None]:
r = ie.stanford_ie(fn, verbose=True)  # run the extraction on the sentence file; r is tabulated below

In [None]:
r[:3]  # preview the first three extraction results

In [None]:
d = pd.DataFrame(r, columns=['Node1', 'Relation', 'Node2'])  # one row per entry of r

In [None]:
# Strip leading/trailing whitespace from every cell.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map, so use whichever element-wise method this pandas version
# provides.
elementwise = d.map if hasattr(pd.DataFrame, 'map') else d.applymap
d = elementwise(lambda cell: cell.strip())

In [None]:
# Preview the first three rows of the cleaned table.
d.iloc[:3]

In [None]:
g = ng.create_graph(d)  # hand the DataFrame to the project's graph helper

In [None]:
G = ng.plot_graph(g, central_gravity=0.01)  # returned object is shown in the next cell

In [None]:
G.show('ng_example.html')  # presumably renders the graph to this HTML file -- confirm in ng_functions

## Apple Press Releases

In [None]:
import requests

In [None]:
# Short links to the three Apple press releases used as input texts.
sources = [
    'https://nr.apple.com/dE0b1T5G3u',  # iPad Pro
    'https://nr.apple.com/dE4c7T6g1K',  # MacBook Air
    'https://nr.apple.com/dE4q4r8A2A',  # Mac Mini
]

In [None]:
# Download the body of every press release.
# A timeout prevents the cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() stops with an HTTPError instead of silently
# feeding an error page into the text pipeline.
html = []
for url in sources:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html.append(response.text)

In [None]:
# One list of sentences per downloaded page.
sents = list(map(nltk.sent_tokenize, html))

In [None]:
# Flatten the per-page sentence lists into one list, keeping page order.
s = [sentence for page in sents for sentence in page]

In [None]:
# Total number of sentences collected from all pages.
len(s)

In [None]:
# Apply the project's clean-up helper to every sentence.
s = list(map(nlp.clean_up_text, s))

In [None]:
# Tokenize each sentence, re-join its tokens with single spaces and
# terminate the result with a period.
normalized = []
for sentence in s:
    tokens = nlp.tokenize(sentence)
    normalized.append(' '.join(tokens) + '.')
s = normalized

In [None]:
# Discard entries of five characters or fewer (the appended period counts
# towards the length).
min_length = 5
s = [sentence for sentence in s if len(sentence) > min_length]

In [None]:
# Write the press-release sentences to their own token file, one per line.
fn = os.path.join(tokens_path, 'tokens_apple.txt')
# The text comes from downloaded web pages and may contain non-ASCII
# characters, so the encoding is pinned to UTF-8 instead of relying on the
# platform default.
with open(fn, 'w', encoding='utf-8') as f:
    f.writelines(sentence + '\n' for sentence in s)

In [None]:
# Run the extraction on the press-release file; the %time IPython magic
# reports how long the statement takes.
%time r = ie.stanford_ie(fn, verbose=False)

In [None]:
# Preview the first three extraction results.
r[:3]

In [None]:
# Tabulate the results: one row per entry of r, three named columns.
d = pd.DataFrame(r, columns=['Node1', 'Relation', 'Node2'])

In [None]:
# Strip leading/trailing whitespace from every cell.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map, so use whichever element-wise method this pandas version
# provides.
elementwise = d.map if hasattr(pd.DataFrame, 'map') else d.applymap
d = elementwise(lambda cell: cell.strip())

In [None]:
# Preview the first ten rows of the cleaned table.
d.iloc[:10]

In [None]:
# Keep only rows in which every cell is shorter than 25 characters: cells
# that fail the test become NaN under boolean-mask indexing, and dropna()
# then removes the rows containing them.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map, so use whichever element-wise method this pandas version
# provides.
elementwise = d.map if hasattr(pd.DataFrame, 'map') else d.applymap
is_short = elementwise(lambda cell: len(cell) < 25)
d = d[is_short].dropna()

In [None]:
# Preview the first five rows that remain after the length filter.
d.iloc[:5]

In [None]:
# Hand the filtered DataFrame to the project's graph helper.
g = ng.create_graph(d)

In [None]:
# Plot the graph; judging by their names, the keyword arguments switch off
# edge labels and set a grey font colour (confirm in ng_functions). The
# returned object is shown in the next cell.
G = ng.plot_graph(g, with_edge_label=False,
                  font_color='grey', central_gravity=0.01)

In [None]:
# Presumably renders the graph to this HTML file -- confirm in ng_functions.
G.show('ng_apple.html')

<img src="http://hilpisch.com/tpq_logo.png" width="36%" align="right" style="vertical-align: top;">