In [1]:
'''
These are the notes from the book
Nugues, P. M. (2024). Python for Natural Language Processing: Programming with NumPy, Scikit-learn, Keras, and PyTorch. Springer.
'''

'''
Topics Covered
1. Language Processing -> Python
2. Corpus Processing Tools
3. Encoding and Annotation Schemes
4. Python for Numerical Computations
5. Information Theory and Machine Learning
6. Linear and Logistic Regression
7. Neural Networks
8. Counting and Indexing Words
9. Word Sequences
10. Dense Vector Representations
11. Words, Parts of Speech and Morphology
12. Subword Segmentation
13. Part-of-Speech and Sequence Annotation
14. Self-Attention and Transformers
15. Pretraining an Encoder: The BERT Language Model
16. Sequence-to-sequence Architectures: Encoder-Decoder and Decoders
'''

'\nTopics Covered\n1. Language Processing -> Python\n2. Corpus Processing Tools\n3. Encoding and Annotation Schemes\n4. Python for Numerical Computations\n5. Information Theory and Machine Learning\n6. Linear and Logistic Regression\n7. Neural Networks\n8. Counting and Indexing Words\n9. Word Sequences\n10. Dense Vector Representations\n11. Words, Parts of Speech and Morphology\n12. Subword Segmentation\n13. Part-of-Speech and Sequence Annotation\n14. Self-Attention and Transformers\n15. Pretraining an Encoder: The BERT Language Model\n16. Sequence-to-sequence Architectures: Encoder-Decoder and Decoders\n'

In [None]:
'''
Applications of NLP
1. Spelling and grammar checks
2. Text indexing and information retrieval from the internet
3. Speech Transcription
4. Voice Control of domestic voices
5. Interactive voice response applications
6. Question answering
7. Machine Translation
'''

'\nApplications of NLP\n1. Spelling and grammar checks\n2. Text indexing and information retrieval from the internet\n3. Speech Transcription\n4. Voice Control of domestic voices\n5. Interactive voice response applications\n6. Question answering\n7. Machine Translation\n'

In [2]:
# A simple for loop
for i in [1,2,3,4,5,6,7]:
	print(i)
print('Done')

1
2
3
4
5
6
7
Done


In [3]:
iliad_opening2 = 'Sing, O goddess, the anger of Achilles son of \
Peleus, that brought countless ills upon the Achaeans'
print(iliad_opening2)

Sing, O goddess, the anger of Achilles son of Peleus, that brought countless ills upon the Achaeans


In [4]:
# if else condition
for i in [1,2,3,4,5,6,7]:
	if i%2 == 0:
		print("Even", i)
	else:
		print("Odd", i)
print('Done')

Odd 1
Even 2
Odd 3
Even 4
Odd 5
Even 6
Odd 7
Done


In [6]:
# str.join(list)
''.join(['abc', 'def', 'gi']) # join without spaces

'abcdefgi'

In [7]:
# join stings in the list with spaces
' '.join(['abc', 'def', 'ghi'])

'abc def ghi'

In [8]:
# join the strings in the list with the ', '
', '.join(['abc', 'def', 'ghi'])

'abc, def, ghi'

In [9]:
# Usage of
# str.upper()
# str.lower()
accented_e = "eéèêë"
print(accented_e.upper())
accented_E = "EÉÈÊË"
print(accented_E.lower())

EÉÈÊË
eéèêë


In [10]:
# Searching and replacing substrings in strings
# str.find() - returns the index of the first occurence of the substring
# str.replace() - replaces all the occurences of the substring and returns a new string
alphabet = 'αβγdefghijklmnopqrstuvwxyz'
print(alphabet.find('def'))
print(alphabet.find('é')) # returns -1 if string not found
alphabet.replace('abc', 'αβγ') # 'αβγdefghijklmnopqrstuvwxyz'
print(alphabet)

3
-1
αβγdefghijklmnopqrstuvwxyz


In [11]:
iliad_opening = '''Sing, O goddess, the anger of Achilles son of Peleus, that brought countless ills upon the Achaeans. Many a brave soul did it send hurrying down to Hades, and many a hero did it yield a prey to dogs and vultures, for so were the counsels of Jove fulfilled from the day on which the son of Atreus, king of men, and great Achilles, first fell out with one another.
'''
text_vowels = ''
for c in iliad_opening:
	if c in 'aeiou':
		text_vowels += c
print(text_vowels) # 'ioeeaeoieooeeuaououeiuoeaea'

ioeeaeoieooeeuaououeiuoeaeaaaaeouiieuiooaeaaaeoiiieaeooauueooeeeoueooeuieoeaoieooeuioeaeaieieouioeaoe


In [12]:
# Slices - extract substrings of a string
print(alphabet[0:3])
print(alphabet[:3])
print(alphabet[3:6])
print(alphabet[-3:])
print(alphabet[10:-10])
print(alphabet[:])

αβγ
αβγ
def
xyz
klmnop
αβγdefghijklmnopqrstuvwxyz


In [13]:
# this will result in original string
print(alphabet[:i] + alphabet[i:])

αβγdefghijklmnopqrstuvwxyz


In [14]:
# adding step to the slices
# [start:end:step]
print(alphabet[0::2])

αγegikmoqsuwy


In [15]:
# Special Characters
print('Python\'s strings') # using escape character

Python's strings


In [16]:
# raw strings
print(r"can we see how this works'' ''' "" ")

can we see how this works'' '''  


In [17]:
# Formatting Strings
begin = 'my'
print('{} string {}'.format(begin, 'is empty'))

# Reordering arguments
begin = 'my'
print('{1} string {0}'.format('is empty', begin))

my string is empty
my string is empty


In [18]:
# Data Identities and Types
# Object identity id()
# Object type type()
var1 = 12
print(var1)
print(id(var1)) # Object identity
print(type(var1)) # Object type

12
10758088
<class 'int'>


In [19]:
# None value is a unique member, equivalent to null in C
print(type(12.0))
print(type(True))
print(type(1<2))
print(type(None))

<class 'float'>
<class 'bool'>
<class 'bool'>
<class 'NoneType'>


In [20]:
# String datatype
print('12')
print(id('12'))
print(type('12'))

12
132536116506480
<class 'str'>


In [21]:
print(alphabet)
print(id(alphabet))
print(type(alphabet))

αβγdefghijklmnopqrstuvwxyz
132536126322512
<class 'str'>


In [22]:
# Data Structures
# Lists
list1 = [] # An empty list
print(list1)
list1 = list() # Another way to create an empty list
print(list1)

list2 = [1,2,3]
print(list2)
print(type(list2))

[]
[]
[1, 2, 3]
<class 'list'>


In [23]:
# List can contain elements of different types
var1 = 3.14
var2 = 'my string'
list3 = [1, var1, 'Prolog', 'my string', var2]
print(list3)

[1, 3.14, 'Prolog', 'my string', 'my string']


In [24]:
# Slicing in Lists
print(list3[1:3])

# Assigning a list to slice
list3[1:3] = [2.72, 'Perl', 'Python']

[3.14, 'Prolog']


In [25]:
# Creating a lists of lists
list4 = [list2, list3]
print(list4)

[[1, 2, 3], [1, 2.72, 'Perl', 'Python', 'my string', 'my string']]


In [26]:
# Accessing the elements of the inner list
print(list4[0][1])
print(list4[1][3])

2
Python


In [27]:
# assigning a complete list to a variable
# list of list of variables
list5 = list2
[v1, v2, v3] = list5

print(v1, v2, v3)

1 2 3


In [28]:
# List Copy
# copy() does not copy the inner objects of the list, only the identities
list6 = list2.copy()

print(list2)
print(list5)

print(id(list2))
print(id(list5))

list6 = list2.copy()
print(id(list6))

# list5 = list2, this means the lists are identical
# list6 = list2.copy, this means lists are equal

print(list2 == list5)
print(list2 == list6)

[1, 2, 3]
[1, 2, 3]
132536116549824
132536116549824
132536116549504
True
True


In [29]:
# To create a complete independent list from the original
# use deepcopy()

import copy
print(id(copy.deepcopy(list4)))

132536125886400


In [30]:
# Built-in list operations and functions
print(list2)
print(list3[:-1])
print([1,2,3] + ['a', 'b'])
print(list2[:2] + list3[2:-1])
print(list2 * 2)
print([0.0] * 4)

[1, 2, 3]
[1, 2.72, 'Perl', 'Python', 'my string']
[1, 2, 3, 'a', 'b']
[1, 2, 'Perl', 'Python', 'my string']
[1, 2, 3, 1, 2, 3]
[0.0, 0.0, 0.0, 0.0]


In [31]:
# some common functions for the lists
# list.extend(elements)
# list.append(element)
# list.insert(idx, element)
# list.remove(value)
# list.pop(i)
# del list[i]
# len()
# list.sort()
# sorted()

In [32]:
print(list2)
print(len(list2))
print(list2.extend([4,5]))
print(list2.append(6))
print(list2.append([7,8]))
print(list2.pop(-1))
print(list2.remove(1))
print(list2.insert(0, 'a'))

[1, 2, 3]
3
None
None
None
[7, 8]
None
None


In [33]:
# To know all the functions associated with a type
# dir(list)
# dir(str)
# help(list)
# help(list.append)

In [None]:
# Tuples
# Tuples are sequences enclosed in parentheses
# They are immutable

tuple1 = () # An empty tuple
tuple1 = tuple() # Another way of creating an empty tuple
tuple2 = (1,2,3,4)
tuple2[3]
tuple2[1:4]
# tuple2[3] = 8 # type error, tuples are immutable

(2, 3, 4)

In [None]:
# Enclosing in parenthesis
print(type((1)))
print(type((1, )))

<class 'int'>
<class 'tuple'>


In [None]:
# we can convert list to tuples
# and tuples to list

list7 = ['a', 'b', 'c']
print(list7)
tuple3 = tuple(list7)
print(tuple3)
print(type(tuple3))

['a', 'b', 'c']
('a', 'b', 'c')
<class 'tuple'>


In [None]:
list8 = list(tuple2)
print(list8)
print(tuple([1]))
print(list((1,)))

[1, 2, 3, 4]
(1,)
[1]


In [None]:
# Tuple can include elements of different type

tuple4 = (tuple2, list7)
print(tuple4)

print(tuple4[0])
print(tuple4[1])

print(tuple4[0][2])
print(tuple4[1][1])

((1, 2, 3, 4), ['a', 'b', 'c'])
(1, 2, 3, 4)
['a', 'b', 'c']
3
b


In [None]:
# Sets
# Sets are collections that have no duplicates

set1 = set() # An empty set
set2 = {'a', 'b', 'c', 'c', 'b'}
print(set2)
print(type(set2))

{'a', 'b', 'c'}
<class 'set'>


In [None]:
# adding and removing elements
# add()
# remove()

set2.add('d')
print(set2)

set2.remove('a')
print(set2)

{'d', 'a', 'b', 'c'}
{'d', 'b', 'c'}


In [None]:
# Sets are useful to extract the unique elements of lists or strings
list9 = ['a', 'b', 'c', 'c', 'b']
print(list9)
set3 = set(list9)
print(set3)

['a', 'b', 'c', 'c', 'b']
{'a', 'b', 'c'}


In [None]:
iliad_chars = set(iliad_opening.lower())
print(iliad_opening)
print(iliad_chars)

Sing, O goddess, the anger of Achilles son of Peleus, that brought countless ills upon the Achaeans. Many a brave soul did it send hurrying down to Hades, and many a hero did it yield a prey to dogs and vultures, for so were the counsels of Jove fulfilled from the day on which the son of Atreus, king of men, and great Achilles, first fell out with one another.

{'e', 's', '.', 'g', 't', 'o', 'd', 'n', ',', 'm', '\n', 'p', 'w', 'l', 'b', 'u', 'j', 'y', 'i', 'r', 'a', 'f', ' ', 'k', 'h', 'v', 'c'}


In [None]:
# Sets are unordered
# sorted()
print(sorted(iliad_chars))

['\n', ' ', ',', '.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']


In [None]:
# Built-in Set Functions
# The set methods include the classical set operations
# set1.intersection(set2, ...)
# set1.union(set2, ...)
# set1.difference(set2, ...)
# set1.symmetric_difference(set2)
# set1.issuperset(set2)
# set1.issubset(set2)

In [None]:
print(set2.intersection(set3))
print(set2 & set3)

print(set2.union(set3))
print(set2 | set3)

set2.symmetric_difference(set3)
set2.issubset(set3)

print(sorted(iliad_chars.intersection(set(alphabet))))

{'b', 'c'}
{'b', 'c'}
{'d', 'a', 'b', 'c'}
{'d', 'a', 'b', 'c'}
['d', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']


In [None]:
# Dictionaries
# Dictionaries are the collections
# The values are indexed by keys instead of ordered positions

wordcount = {} # Creating an empty dictionary
wordcount = dict() # Another way to create an empty dictionary

wordcount['a'] = 21 # stored in a key value pair
wordcount['And'] = 10
wordcount['the'] = 18

print(wordcount)
print(type(wordcount))

{'a': 21, 'And': 10, 'the': 18}
<class 'dict'>


In [None]:
print(wordcount['a'])
print(wordcount['And'])

21
10


In [None]:
# To access a key in a dictionary without risking an error
# we can use the get() function that has a default value
# if the key is undefined

print(wordcount.get('And'))
print(wordcount.get('is', 0))

10
0


In [None]:
from collections import defaultdict

missing_proof = defaultdict(int)
print(missing_proof['the'])
print(missing_proof)

0
defaultdict(<class 'int'>, {'the': 0})


In [None]:
# Built-in Dictionary Functions
# keys() - returns the keys of a dictionary
# values() - returns the values of a dictionary
# items() - returns the key-value pairs of a dictionary

# the keys can be strings, numbers, or imutable structures
# Mutable keys, like a list will generate an error

In [None]:
print(wordcount.keys())
print(wordcount.values())
print(wordcount.items())

dict_keys(['a', 'And', 'the'])
dict_values([21, 10, 18])
dict_items([('a', 21), ('And', 10), ('the', 18)])


In [None]:
# Counting the letters of a text

letter_count = {}
for letter in iliad_opening.lower():
  if letter in alphabet:
    if letter in letter_count:
      letter_count[letter] += 1
    else:
      letter_count[letter] = 1
print(letter_count)

{'s': 22, 'i': 15, 'n': 21, 'g': 8, 'o': 27, 'd': 16, 'e': 31, 't': 20, 'h': 17, 'r': 15, 'f': 11, 'l': 17, 'p': 3, 'u': 12, 'm': 4, 'y': 6, 'v': 3, 'w': 4, 'j': 1, 'k': 1}


In [None]:
print(letter_count.keys())
print(letter_count.values())
print(letter_count.items())

dict_keys(['s', 'i', 'n', 'g', 'o', 'd', 'e', 't', 'h', 'r', 'f', 'l', 'p', 'u', 'm', 'y', 'v', 'w', 'j', 'k'])
dict_values([22, 15, 21, 8, 27, 16, 31, 20, 17, 15, 11, 17, 3, 12, 4, 6, 3, 4, 1, 1])
dict_items([('s', 22), ('i', 15), ('n', 21), ('g', 8), ('o', 27), ('d', 16), ('e', 31), ('t', 20), ('h', 17), ('r', 15), ('f', 11), ('l', 17), ('p', 3), ('u', 12), ('m', 4), ('y', 6), ('v', 3), ('w', 4), ('j', 1), ('k', 1)])


In [None]:
# printing the letter in an alphabetical order
for letter in sorted(letter_count.keys()):
  print(letter, letter_count[letter])

d 16
e 31
f 11
g 8
h 17
i 15
j 1
k 1
l 17
m 4
n 21
o 27
p 3
r 15
s 22
t 20
u 12
v 3
w 4
y 6


In [None]:
# sorting the letters from the least frequent to the most frequent
for letter in sorted(letter_count.keys(),
                     key=letter_count.get, reverse=True):
  print(letter, letter_count[letter])

e 31
o 27
s 22
n 21
t 20
h 17
l 17
d 16
i 15
r 15
u 12
f 11
g 8
y 6
m 4
w 4
p 3
v 3
j 1
k 1


In [None]:
# Control Structures
# Two parts header and the suite
# if, for, while

In [None]:
# Conditionals
# if, elif, else

digits = '0123456789'
punctuation = '.,;:?!'

char = '.'
if char in alphabet:
  print('Letter')
elif char in digits:
  print('Number')
elif char in punctuation:
  print('Punctuation')
else:
  print('Other')

Punctuation


In [None]:
# For Loop
# iterates over the elements of a sequence
# range(start, stop, step)
# range() uses a constant memory

sum = 0
for i in range(100):
  sum += 1
print(sum)

100


In [None]:
# converting range to list
list10 = list(range(5))
print(list10)

[0, 1, 2, 3, 4]


In [None]:
# enumerate()
# takes a sequence as argument and returns a sequence of (index, element) pairs

for idx, letter in enumerate(alphabet):
  print(idx, letter)

0 α
1 β
2 γ
3 d
4 e
5 f
6 g
7 h
8 i
9 j
10 k
11 l
12 m
13 n
14 o
15 p
16 q
17 r
18 s
19 t
20 u
21 v
22 w
23 x
24 y
25 z


In [None]:
# While loop
sum, i = 0, 0
while i < 100:
  sum += i
  i += 1
print(sum)

4950


In [None]:
# Another possible structure using an infinite loop and a break statement to exit another loop

sum, i = 0, 0
while True:
  sum += i
  i += 1
  if i >= 100:
    break
print(sum)

4950


In [None]:
a = bool(True)
b = bool(False)

print(a, b)
print(int(a), int(b))
print(int(True), int(False))

True False
1 0
1 0


In [None]:
# Exceptions
# Python has a mechanism to handle errors
# so that they do not stop a program
# it uses try and except keywords

In [None]:
try:
  int(alphabet)
  int('12.0')
except:
  pass
  print('Cleared the exception!')

Cleared the exception!


In [None]:
try:
  int(alphabet)
  int('12.0')
except ValueError:
  print('Caught a value error!')
except TypeError:
  print('Caught a type error!')

Caught a value error!


In [None]:
# Functions
def count_letters(text, lc):
  letter_count = {}
  if lc:
    text = text.lower()
  for letter in text:
    if letter.lower() in alphabet:
      if letter in letter_count:
        letter_count[letter] += 1
      else:
        letter_count[letter] = 1
  return letter_count

In [None]:
# calling the function
count_letters(iliad_opening, True)

{'s': 22,
 'i': 15,
 'n': 21,
 'g': 8,
 'o': 27,
 'd': 16,
 'e': 31,
 't': 20,
 'h': 17,
 'r': 15,
 'f': 11,
 'l': 17,
 'p': 3,
 'u': 12,
 'm': 4,
 'y': 6,
 'v': 3,
 'w': 4,
 'j': 1,
 'k': 1}

In [None]:
print(type(count_letters))

<class 'function'>


In [None]:
def count_letters(text, lc=True):
  """
  Count the letters in a text
  Arguments:
    text: input text
    lc: lowercase. If true, sets the characters
    in lowercase
  Returns: The letter counts
  """

In [None]:
# Type Hinting (a.k.a. Type Annotations)
def count_letters(text: str, lc=True: bool) -> dict[str, int]:
  """
  Count the letters in a text
  Arguments:
    text: input text
    lc: lowercase. If true, sets the characters
    in lowercase
  Returns: The letter counts
  """

SyntaxError: invalid syntax (<ipython-input-85-931af20fcb0c>, line 1)

In [None]:
# Comprehensions and Generators

# Instead of loops the comprehensions are an alternative, consise
# syntactic notation to create lists, sets, or dictionaries

In [None]:
# Given an input word, we can generate all the one-character
# deletions in two steps
# first, we split the word into two parts
# then we delete the first letter of the second part

word = 'acress'
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
print(word)
print(splits)

acress
[('', 'acress'), ('a', 'cress'), ('ac', 'ress'), ('acr', 'ess'), ('acre', 'ss'), ('acres', 's'), ('acress', '')]


In [None]:
deletes = [a + b[1:] for a, b in splits if b]
print(deletes)

['cress', 'aress', 'acess', 'acrss', 'acres', 'acres']


In [None]:
# without comprehension
splits = []
for i in range(len(word) + 1):
  splits.append((word[:i], word[i:]))

print(splits)

[('', 'acress'), ('a', 'cress'), ('ac', 'ress'), ('acr', 'ess'), ('acre', 'ss'), ('acres', 's'), ('acress', '')]


In [None]:
deletes = []
for a,b in splits:
  if b:
    deletes.append(a+b[1:])
print(deletes)

['cress', 'aress', 'acess', 'acrss', 'acres', 'acres']


In [None]:
# Generators
# List comprehensions are stored in memory
# If the list is large, it can exceed the computer capacity
# Generators generate the elements on demand instead and can
# handle much longer sequences
# same syntax structure as comprehensions, just replaced by ()

In [None]:
# same example as above
splits_generator = ((word[:1], word[i:])
    for i in range(len(word) + 1)
)

for i in splits_generator: print(i)

('a', 'acress')
('a', 'cress')
('a', 'ress')
('a', 'ess')
('a', 'ss')
('a', 's')
('a', '')


In [None]:
def splits_generator_function():
  for i in range(len(word) + 1):
    yield (word[:i], word[i:])

splits_generator = splits_generator_function()
print(splits_generator)

<generator object splits_generator_function at 0x7fce3c4b4d60>


In [None]:
# Iterators
# iter() and next()

my_iterator = iter('abc')
print(next(my_iterator)) # a
print(next(my_iterator)) # b
print(next(my_iterator)) # c

a
b
c


In [None]:
# zip
# zip() weaves strings, lists, or tuples and creates an iterator of tuples
# where each tuple contains the iterms with the same index

latin_alphabet = "abcdefghijklmnopqrstuvwxyz"
print(len(latin_alphabet))

greek_alphabet = "αβγδεζηθικλμνξοπρστυφχψω"
print(len(greek_alphabet))

cyrillic_alphabet = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
print(len(cyrillic_alphabet))

26
24
33


In [None]:
la_gr = zip(latin_alphabet[:3], greek_alphabet[:3])
la_gr_cy = zip(latin_alphabet[:3], greek_alphabet[:3], cyrillic_alphabet[:3])
print(la_gr)
print(la_gr_cy)

<zip object at 0x7fce3c4d2540>
<zip object at 0x7fce3c4d09c0>


In [None]:
# Start at page 33