## Natural Language Processing

#### As seen on the [Python-Natural-Language-Processing-Cookbook](https://github.com/PacktPublishing/Python-Natural-Language-Processing-Cookbook/tree/master)

#### Chapter 1

In [4]:
%pwd

'/home/ale/git/main/nlp'

In [5]:
import nltk

my_filename = "Python-Natural-Language-Processing-Cookbook/Chapter01/sherlock_holmes_1.txt"
my_file = open( my_filename, 'r', encoding='utf-8' )
text = my_file.read()

In [6]:
text = text.replace('\n', ' ')

In [7]:
# Initialiize an NLTK tokenizer using the PUNKT model
tokenizer_instance = nltk.data.load('tokenizers/punkt/english.pickle')

In [8]:
# Divide text into sentences:
sentences_output = tokenizer_instance.tokenize( text )
sentences_output

['To Sherlock Holmes she is always _the_ woman.',
 'I have seldom heard him mention her under any other name.',
 'In his eyes she eclipses and predominates the whole of her sex.',
 'It was not that he felt any emotion akin to love for Irene Adler.',
 'All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind.',
 'He was, I take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a false position.',
 'He never spoke of the softer passions, save with a gibe and a sneer.',
 'They were admirable things for the observer—excellent for drawing the veil from men’s motives and actions.',
 'But for the trained reasoner to admit such intrusions into his own delicate and finely adjusted temperament was to introduce a distracting factor which might throw a doubt upon all his mental results.',
 'Grit in a sensitive instrument, or a crack in one of his own high-power lenses, would n

In [9]:
# Divide text into words
words_output = nltk.tokenize.word_tokenize( text )
words_output

['To',
 'Sherlock',
 'Holmes',
 'she',
 'is',
 'always',
 '_the_',
 'woman',
 '.',
 'I',
 'have',
 'seldom',
 'heard',
 'him',
 'mention',
 'her',
 'under',
 'any',
 'other',
 'name',
 '.',
 'In',
 'his',
 'eyes',
 'she',
 'eclipses',
 'and',
 'predominates',
 'the',
 'whole',
 'of',
 'her',
 'sex',
 '.',
 'It',
 'was',
 'not',
 'that',
 'he',
 'felt',
 'any',
 'emotion',
 'akin',
 'to',
 'love',
 'for',
 'Irene',
 'Adler',
 '.',
 'All',
 'emotions',
 ',',
 'and',
 'that',
 'one',
 'particularly',
 ',',
 'were',
 'abhorrent',
 'to',
 'his',
 'cold',
 ',',
 'precise',
 'but',
 'admirably',
 'balanced',
 'mind',
 '.',
 'He',
 'was',
 ',',
 'I',
 'take',
 'it',
 ',',
 'the',
 'most',
 'perfect',
 'reasoning',
 'and',
 'observing',
 'machine',
 'that',
 'the',
 'world',
 'has',
 'seen',
 ',',
 'but',
 'as',
 'a',
 'lover',
 'he',
 'would',
 'have',
 'placed',
 'himself',
 'in',
 'a',
 'false',
 'position',
 '.',
 'He',
 'never',
 'spoke',
 'of',
 'the',
 'softer',
 'passions',
 ',',
 'save

In [10]:
tweet = "@EmpireStateBldg Central Park Tower is reaaaaaaaaaally hiiiiiiiiigh"
tweet_words = nltk.tokenize.casual.casual_tokenize( tweet,
                                                   preserve_case=True,
                                                   reduce_len=True,
                                                   strip_handles=True)
tweet_words

['Central', 'Park', 'Tower', 'is', 'reaaally', 'hiiigh']

In [11]:
# Word tokenization with spacy
# The NLTK package only has word tokenization for English.
# spaCy has models for several other languages: Chinese, Danish, Dutch, English, French, German, Greek, Italian, Japanese, Lithuanian, Norwegian, Polish, Portuguese, Romanian, and Spanish. In order to use those models, you would have to download them separately
import spacy
nlp = spacy.load("en_core_web_sm")

doc = nlp(text)
words = [token.text for token in doc]
words

['To',
 'Sherlock',
 'Holmes',
 'she',
 'is',
 'always',
 '_',
 'the',
 '_',
 'woman',
 '.',
 'I',
 'have',
 'seldom',
 'heard',
 'him',
 'mention',
 'her',
 'under',
 'any',
 'other',
 'name',
 '.',
 'In',
 'his',
 'eyes',
 'she',
 'eclipses',
 'and',
 'predominates',
 'the',
 'whole',
 'of',
 'her',
 'sex',
 '.',
 'It',
 'was',
 'not',
 'that',
 'he',
 'felt',
 'any',
 'emotion',
 'akin',
 'to',
 'love',
 'for',
 'Irene',
 'Adler',
 '.',
 'All',
 'emotions',
 ',',
 'and',
 'that',
 'one',
 'particularly',
 ',',
 'were',
 'abhorrent',
 'to',
 'his',
 'cold',
 ',',
 'precise',
 'but',
 'admirably',
 'balanced',
 'mind',
 '.',
 'He',
 'was',
 ',',
 'I',
 'take',
 'it',
 ',',
 'the',
 'most',
 'perfect',
 'reasoning',
 'and',
 'observing',
 'machine',
 'that',
 'the',
 'world',
 'has',
 'seen',
 ',',
 'but',
 'as',
 'a',
 'lover',
 'he',
 'would',
 'have',
 'placed',
 'himself',
 'in',
 'a',
 'false',
 'position',
 '.',
 'He',
 'never',
 'spoke',
 'of',
 'the',
 'softer',
 'passions',
 '

In [12]:
# Let's now tag words with their function in the speech
words = [token.text for token in doc]
pos = [token.pos_ for token in doc]
word_pos_tuples = list(zip(words, pos))
word_pos_tuples


[('To', 'ADP'),
 ('Sherlock', 'PROPN'),
 ('Holmes', 'PROPN'),
 ('she', 'PRON'),
 ('is', 'AUX'),
 ('always', 'ADV'),
 ('_', 'PUNCT'),
 ('the', 'DET'),
 ('_', 'PROPN'),
 ('woman', 'NOUN'),
 ('.', 'PUNCT'),
 ('I', 'PRON'),
 ('have', 'AUX'),
 ('seldom', 'ADV'),
 ('heard', 'VERB'),
 ('him', 'PRON'),
 ('mention', 'VERB'),
 ('her', 'PRON'),
 ('under', 'ADP'),
 ('any', 'DET'),
 ('other', 'ADJ'),
 ('name', 'NOUN'),
 ('.', 'PUNCT'),
 ('In', 'ADP'),
 ('his', 'PRON'),
 ('eyes', 'NOUN'),
 ('she', 'PRON'),
 ('eclipses', 'VERB'),
 ('and', 'CCONJ'),
 ('predominates', 'VERB'),
 ('the', 'DET'),
 ('whole', 'NOUN'),
 ('of', 'ADP'),
 ('her', 'PRON'),
 ('sex', 'NOUN'),
 ('.', 'PUNCT'),
 ('It', 'PRON'),
 ('was', 'AUX'),
 ('not', 'PART'),
 ('that', 'SCONJ'),
 ('he', 'PRON'),
 ('felt', 'VERB'),
 ('any', 'DET'),
 ('emotion', 'NOUN'),
 ('akin', 'ADJ'),
 ('to', 'PART'),
 ('love', 'VERB'),
 ('for', 'ADP'),
 ('Irene', 'PROPN'),
 ('Adler', 'PROPN'),
 ('.', 'PUNCT'),
 ('All', 'DET'),
 ('emotions', 'NOUN'),
 (',', 'PU

In [13]:
# Stemming test
from nltk.stem.snowball import SnowballStemmer  # Import nltk stemmer module
stemmer = SnowballStemmer('english')  # Set the English language
words = ['leaf', 'leaves', 'booking', 'writing', 'completed', 'stemming', 'skies']
stemmed_words = [stemmer.stem(word) for word in words]
stemmed_words

['leaf', 'leav', 'book', 'write', 'complet', 'stem', 'sky']

In [14]:
# Check which languages are available
print(SnowballStemmer.languages)

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


### Counting nouns - plural & singular nouns

In [1]:
import nltk

my_filename = "sherlock_holmes_1.txt"
my_file = open( my_filename, 'r', encoding='utf-8' )
text = my_file.read()

In [4]:
import sys
sys.version

'3.11.6 (main, Nov  2 2023, 04:39:43) [Clang 14.0.3 (clang-1403.0.22.14.1)]'

In [1]:
import spacy
spacy.__version__

'3.7.4'

## Natural Language Processing

#### As seen on the [Python-Natural-Language-Processing-Cookbook](https://github.com/PacktPublishing/Python-Natural-Language-Processing-Cookbook/tree/master)

#### Chapter 2

In [None]:
# # Tech requirements:
# pip install inflect
# python -m spacy download en_core_web_md
# pip install textacy
# pip install neuralcoref

In [19]:
# lemmatization test
# lemmatization is similar to stemming, but lemmatization provides us with a real word, the canonical form, instead of the word root like stemming does
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
words = ['duck', 'geese', 'cats', 'books']

lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
lemmatized_words

['duck', 'goose', 'cat', 'book']

In [3]:
for i in reversed(range(10)):
    print(i)

9
8
7
6
5
4
3
2
1
0


In [21]:
for i in range(100):
    fizz = (i%5)
    buzz = (i%3)

    if fizz == 0 and buzz == 0:
        print("Fizzbuzz",i)
    elif (i%5) == 0 and i > 5:
        print('Buzz',i)
    elif (i%3) == 0 and i > 3:
        print('Fizz',i)
    else:
        print(i)

Fizzbuzz 0
1
2
3
4
5
Fizz 6
7
8
Fizz 9
Buzz 10
11
Fizz 12
13
14
Fizzbuzz 15
16
17
Fizz 18
19
Buzz 20
Fizz 21
22
23
Fizz 24
Buzz 25
26
Fizz 27
28
29
Fizzbuzz 30
31
32
Fizz 33
34
Buzz 35
Fizz 36
37
38
Fizz 39
Buzz 40
41
Fizz 42
43
44
Fizzbuzz 45
46
47
Fizz 48
49
Buzz 50
Fizz 51
52
53
Fizz 54
Buzz 55
56
Fizz 57
58
59
Fizzbuzz 60
61
62
Fizz 63
64
Buzz 65
Fizz 66
67
68
Fizz 69
Buzz 70
71
Fizz 72
73
74
Fizzbuzz 75
76
77
Fizz 78
79
Buzz 80
Fizz 81
82
83
Fizz 84
Buzz 85
86
Fizz 87
88
89
Fizzbuzz 90
91
92
Fizz 93
94
Buzz 95
Fizz 96
97
98
Fizz 99


In [15]:
21 % 5

1