<img src="http://hilpisch.com/tpq_logo.png" width="36%" align="right" style="vertical-align: top;">

# Natural Language Processing

**Open Information Extraction**

_Illustrated with a simple example and with the texts of three Apple press releases._

Dr Yves J Hilpisch | Michael Schwed

The Python Quants GmbH

## Simple Example

In [1]:
import os
import nltk
import pandas as pd
import soiepy.main as ie  # <1>
import ng_functions as ng  # <2>
import nlp_functions as nlp

In [2]:
# Toy corpus: nine short statements, one per line. The text starts and ends
# with a newline, which is why the first tokenized sentence begins with '\n'.
t = (
    '\n'
    'Peter studies data science.\n'
    'Peter knows Java.\n'
    'Peter prefers Python.\n'
    'Peter works as a data scientist.\n'
    'Peter applies machine learning.\n'
    'A data scientist uses Python.\n'
    'Python revolutionized data science.\n'
    'Python is preferred for NLP.\n'
    'Python is used for machine learning.\n'
)

In [3]:
# Split the raw text into a list of sentence strings with NLTK's sentence tokenizer.
s = nltk.sent_tokenize(t)  # <1>

In [4]:
s[:3]  # <1>

['\nPeter studies data science.', 'Peter knows Java.', 'Peter prefers Python.']

In [5]:
# Clean every sentence first, then rebuild each one as its space-joined tokens
# followed by a closing period (the resulting form is shown in the next output).
s = list(map(nlp.clean_up_text, s))  # <2>
s = ['{}.'.format(' '.join(nlp.tokenize(sentence))) for sentence in s]  # <2>

In [6]:
s[:3]  # <3>

['peter study data science.', 'peter know java.', 'peter prefer python.']

In [7]:
# Directory and file that receive the normalized sentences for the extractor.
path = os.path.join(os.getcwd(), 'tokens')  # <1>
# open(fn, 'w') does not create missing directories, so make sure the target
# directory exists (no error if it is already there).
os.makedirs(path, exist_ok=True)
fn = os.path.join(path, 'tokens.txt')  # <2>

In [8]:
# Write the normalized sentences to the token file, one sentence per line.
with open(fn, 'w') as f:
    f.write(''.join(line + '\n' for line in s))  # <3>

In [9]:
# Run the information extraction on the token file and time the call; r holds
# the extracted triples (see the next cell's output).
# NOTE(review): the recorded timing (about 10 s wall vs. a few ms CPU) suggests
# the work happens outside this Python process -- confirm in soiepy.
%time r = ie.stanford_ie(fn, verbose=False)  # <4>

CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 10.3 s


In [10]:
r[:3]  # <5>

[['peter', ' know', ' java'],
 ['peter', ' prefer', ' python'],
 ['peter', ' works', ' data scientist']]

In [11]:
# Load the extracted triples into a DataFrame with one column per triple element.
d = pd.DataFrame(r, columns=['Node1', 'Relation', 'Node2'])  # <1>

In [12]:
# Strip surrounding whitespace from every cell; the raw triples carry leading
# blanks (e.g. ' know', ' java' in the output above).
d = d.applymap(str.strip)  # <2>

In [13]:
d.iloc[:3]

Unnamed: 0,Node1,Relation,Node2
0,peter,know,java
1,peter,prefer,python
2,peter,works,data scientist


In [14]:
# Build a graph object from the triple DataFrame with the project-local ng helper.
g = ng.create_graph(d)  # <1>

In [15]:
# Create the plot object for the graph; central_gravity is forwarded to
# ng.plot_graph (presumably a layout/physics setting -- confirm in ng_functions).
G = ng.plot_graph(g, central_gravity=4)  # <2>

In [16]:
# Render the plot through its show() method, passing the HTML file name
# 'ng_example.html' (presumably written relative to the working directory).
G.show('ng_example.html')  # <3>

## Apple Press Releases

In [None]:
# Short links to the three Apple press releases whose texts are analyzed below.
sources = [
    'https://nr.apple.com/dE0b1T5G3u',  # iPad Pro
    'https://nr.apple.com/dE4c7T6g1K',  # MacBook Air
    'https://nr.apple.com/dE4q4r8A2A',  # Mac Mini
]

In [None]:
# requests is not imported in the notebook's import cell, so import it here;
# without this the cell fails with a NameError.
import requests

# Download the raw response text of every press release.
html = []
for url in sources:
    resp = requests.get(url, timeout=30)  # bound the wait instead of hanging forever
    resp.raise_for_status()  # fail loudly rather than analyzing an error page
    html.append(resp.text)

In [None]:
# Sentence-tokenize each downloaded page separately: one list of sentences per page.
# NOTE(review): the input is the raw response text, markup included -- presumably
# nlp.clean_up_text removes it later; confirm.
sents = list(map(nltk.sent_tokenize, html))

In [None]:
# Flatten the per-page sentence lists into one list, keeping page order.
sentences = [sentence for page in sents for sentence in page]

In [None]:
len(sentences)

In [None]:
# Re-import the nlp helper module so that edits to it take effect without
# restarting the kernel.
from importlib import reload

reload(nlp)

In [None]:
# Same normalization as in the simple example: clean each sentence, then
# rebuild it as its space-joined tokens plus a closing period.
sentences = list(map(nlp.clean_up_text, sentences))
sentences = ['{}.'.format(' '.join(nlp.tokenize(text))) for text in sentences]
# Drop results of five characters or fewer (the appended period included).
sentences = list(filter(lambda text: len(text) > 5, sentences))

In [None]:
sentences[:4]

In [None]:
# Directory for the tokenized press-release sentences; create it if needed.
# makedirs(exist_ok=True) replaces the check-then-create pair, which can fail
# if the directory appears between the isdir() test and mkdir().
token_path = os.path.join(os.getcwd(), 'tokenized_data')
os.makedirs(token_path, exist_ok=True)

In [None]:
# Write the normalized sentences to the token file, one sentence per line.
filename = os.path.join(token_path, 'tokens_apple.txt')
with open(filename, 'w') as f:
    f.writelines(line + '\n' for line in sentences)

In [None]:
filename

In [None]:
%time relations = main.stanford_ie(filename, verbose=False)

In [None]:
relations[:3]

In [None]:
# Load the extracted triples into a DataFrame with one column per triple element.
relations_df = pd.DataFrame(relations, columns=['Node1', 'Relation', 'Node2'])

In [None]:
# Strip surrounding whitespace from every cell of the triple table.
relations_df = relations_df.applymap(str.strip)

In [None]:
# Keep only triples whose three fields are all shorter than 25 characters:
# cells that fail the length test become NaN, and dropna() then removes every
# row that contains at least one such cell.
relations_df = relations_df.where(
    relations_df.applymap(lambda x: len(x) < 25)
).dropna()

In [None]:
relations_df.iloc[:5]

In [None]:
# Build the graph object from the filtered triples. The graph helpers are
# called on the ng module in the simple example above, so use it here as well.
graph = ng.create_graph(relations_df)

In [None]:
ng = nlp.plot_graph(graph, with_edge_label=False,
                  font_color='grey', central_gravity=3)

In [None]:
ng.show('apple_graph.html')