<a href="https://colab.research.google.com/github/yohanesnuwara/66DaysOfData/blob/main/D11_NLTK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing with NLTK

In [1]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.draw import TreeWidget
from nltk.draw.util import CanvasFrame

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
example_string = 'The rest of Hugin Formation which was water filled, showing very low resistivities < 0.05 ohmm in the best sands, is reflecting the very saline formation water in Volve (~ 130000 ppm). This might indicate that Hugin Formation never has been oil filled, and that the two observed oil pockets are results of hydrocarbon migration. Hugin Formation properties seem to be poorer than in Hugin Formation in previous Volve wells, with an average porosity of 0.17.'

In [3]:
sent_tokenize(example_string)

['The rest of Hugin Formation which was water filled, showing very low resistivities < 0.05 ohmm in the best sands, is reflecting the very saline formation water in Volve (~ 130000 ppm).',
 'This might indicate that Hugin Formation never has been oil filled, and that the two observed oil pockets are results of hydrocarbon migration.',
 'Hugin Formation properties seem to be poorer than in Hugin Formation in previous Volve wells, with an average porosity of 0.17.']

In [4]:
words = word_tokenize(example_string)
words

['The',
 'rest',
 'of',
 'Hugin',
 'Formation',
 'which',
 'was',
 'water',
 'filled',
 ',',
 'showing',
 'very',
 'low',
 'resistivities',
 '<',
 '0.05',
 'ohmm',
 'in',
 'the',
 'best',
 'sands',
 ',',
 'is',
 'reflecting',
 'the',
 'very',
 'saline',
 'formation',
 'water',
 'in',
 'Volve',
 '(',
 '~',
 '130000',
 'ppm',
 ')',
 '.',
 'This',
 'might',
 'indicate',
 'that',
 'Hugin',
 'Formation',
 'never',
 'has',
 'been',
 'oil',
 'filled',
 ',',
 'and',
 'that',
 'the',
 'two',
 'observed',
 'oil',
 'pockets',
 'are',
 'results',
 'of',
 'hydrocarbon',
 'migration',
 '.',
 'Hugin',
 'Formation',
 'properties',
 'seem',
 'to',
 'be',
 'poorer',
 'than',
 'in',
 'Hugin',
 'Formation',
 'in',
 'previous',
 'Volve',
 'wells',
 ',',
 'with',
 'an',
 'average',
 'porosity',
 'of',
 '0.17',
 '.']

## Stemming

In [5]:
stemmer = PorterStemmer()

stemmed_words = [stemmer.stem(word) for word in words]
stemmed_words

['the',
 'rest',
 'of',
 'hugin',
 'format',
 'which',
 'wa',
 'water',
 'fill',
 ',',
 'show',
 'veri',
 'low',
 'resist',
 '<',
 '0.05',
 'ohmm',
 'in',
 'the',
 'best',
 'sand',
 ',',
 'is',
 'reflect',
 'the',
 'veri',
 'salin',
 'format',
 'water',
 'in',
 'volv',
 '(',
 '~',
 '130000',
 'ppm',
 ')',
 '.',
 'thi',
 'might',
 'indic',
 'that',
 'hugin',
 'format',
 'never',
 'ha',
 'been',
 'oil',
 'fill',
 ',',
 'and',
 'that',
 'the',
 'two',
 'observ',
 'oil',
 'pocket',
 'are',
 'result',
 'of',
 'hydrocarbon',
 'migrat',
 '.',
 'hugin',
 'format',
 'properti',
 'seem',
 'to',
 'be',
 'poorer',
 'than',
 'in',
 'hugin',
 'format',
 'in',
 'previou',
 'volv',
 'well',
 ',',
 'with',
 'an',
 'averag',
 'poros',
 'of',
 '0.17',
 '.']

## Part-of-Speech (POS) Tagging

In [6]:
nltk.pos_tag(words)

[('The', 'DT'),
 ('rest', 'NN'),
 ('of', 'IN'),
 ('Hugin', 'NNP'),
 ('Formation', 'NNP'),
 ('which', 'WDT'),
 ('was', 'VBD'),
 ('water', 'NN'),
 ('filled', 'VBN'),
 (',', ','),
 ('showing', 'VBG'),
 ('very', 'RB'),
 ('low', 'JJ'),
 ('resistivities', 'NNS'),
 ('<', 'VBP'),
 ('0.05', 'CD'),
 ('ohmm', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('best', 'JJS'),
 ('sands', 'NNS'),
 (',', ','),
 ('is', 'VBZ'),
 ('reflecting', 'VBG'),
 ('the', 'DT'),
 ('very', 'RB'),
 ('saline', 'JJ'),
 ('formation', 'NN'),
 ('water', 'NN'),
 ('in', 'IN'),
 ('Volve', 'NNP'),
 ('(', '('),
 ('~', '$'),
 ('130000', 'CD'),
 ('ppm', 'NN'),
 (')', ')'),
 ('.', '.'),
 ('This', 'DT'),
 ('might', 'MD'),
 ('indicate', 'VB'),
 ('that', 'IN'),
 ('Hugin', 'NNP'),
 ('Formation', 'NNP'),
 ('never', 'RB'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('oil', 'NN'),
 ('filled', 'VBN'),
 (',', ','),
 ('and', 'CC'),
 ('that', 'IN'),
 ('the', 'DT'),
 ('two', 'CD'),
 ('observed', 'JJ'),
 ('oil', 'NN'),
 ('pockets', 'NNS'),
 ('are', 'VBP'),
 (

## Lemmatization

In [7]:
statement = 'I have some beautifully woven scarves.'
words2 = word_tokenize(statement)

lemmatizer = WordNetLemmatizer()

# Lemmatizing noun
lemmatized_words = [lemmatizer.lemmatize(word, pos='n') for word in words2]
print(lemmatized_words) # scarves to scarf

# Lemmatizing verb
lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words2]
print(lemmatized_words) # woven to weave

['I', 'have', 'some', 'beautifully', 'woven', 'scarf', '.']
['I', 'have', 'some', 'beautifully', 'weave', 'scarves', '.']


## Chunking

In [8]:
statement = 'The formation has some resistive sands which is very porous.'

words3 = word_tokenize(statement)
pos_tagged = nltk.pos_tag(words3)

# Focus on: some resistive sands, as DT, JJ, NNS after POS tagging
grammar = "NP: {<DT>?<JJ>*<NNS>}"
chunk_parser = nltk.RegexpParser(grammar)
chunks = chunk_parser.parse(pos_tagged)

chunks.draw()

TclError: ignored

References:

* https://realpython.com/nltk-nlp-python/