# Imports

In [1]:
import nltk
import re

# Fetch the 'punkt' tokenizer data package into the local nltk_data directory.
# NOTE(review): presumably needed by nltk.word_tokenize further down; newer
# NLTK releases may look for the 'punkt_tab' resource instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Lowercase

In [2]:
# Normalize casing: keep the raw text around and derive the lowercase corpus
# from it
raw_corpus = "Learning% makes 'me' happy. I am happy be-cause I am learning! :)"
corpus = raw_corpus.lower()

# After lowercasing, 'Learning' at the start of the sentence and 'learning'
# at the end are the same token
print(corpus)

learning% makes 'me' happy. i am happy be-cause i am learning! :)


# Remove Special Characters

In [3]:
# Remove special characters
corpus = "learning% makes 'me' happy. i am happy be-cause i am learning! :)"

# Characters that are kept (every other character is deleted):
#   a-zA-Z : any lowercase or uppercase letter
#   0-9    : any digit
#   .?!    : period, question mark, exclamation mark
#   ' '    : the space that separates words
# The uppercase range must be written A-Z. The range A-z would also cover the
# ASCII characters that sit between 'Z' and 'a' ([ \ ] ^ _ `), so those
# special characters would not be removed.
special_chars = re.compile(r'[^a-zA-Z0-9.?! ]+')
corpus = special_chars.sub("", corpus)

print(corpus)

learning makes me happy. i am happy because i am learning! 


# Text Splitting

In [4]:
# A timestamp whose fields are separated by spaces (note the double space in
# front of the single-digit day)
input_date = "Sat May  9 07:33:35 CEST 2020"

# Splitting on exactly one space leaves an empty string where the two
# consecutive spaces meet
date_parts = input_date.split(" ")
print(f"date parts = {date_parts}")

# Counting that empty entry, the clock field is at index 4; split it on ':'
# to get hours, minutes and seconds
clock_field = date_parts[4]
time_parts = clock_field.split(":")
print(f"time parts = {time_parts}")

date parts = ['Sat', 'May', '', '9', '07:33:35', 'CEST', '2020']
time parts = ['07', '33', '35']


# Sentence Tokenizing

In [5]:
# Word-level tokenization of a sentence:
# hand the sentence string to NLTK and get back a list of word tokens
sentence = 'i am happy because i am learning'
tokenized_sentence = nltk.word_tokenize(sentence, language="english")
print(f"{sentence} -> {tokenized_sentence}")

i am happy because i am learning -> ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']


In [6]:
# Pair every token of the tokenized sentence with its character count
sentence = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']
# zip each word with len(word) to build the list of (word, length) tuples
word_lengths = list(zip(sentence, map(len, sentence)))
print(f"Lengths of the words : \n{word_lengths}")

Lengths of the words : 
[('i', 1), ('am', 2), ('happy', 5), ('because', 7), ('i', 1), ('am', 2), ('learning', 8), ('.', 1)]


# N-grams

In [7]:
def sentence_to_trigam(tokenized_sentence):
    """Print every trigram (3 consecutive tokens) of a tokenized sentence.

    Args:
        tokenized_sentence: A sliceable sequence of tokens, e.g. a list of
            words.

    Returns:
        None. Each trigram is printed on its own line, in order of its start
        position. Nothing is printed when there are fewer than 3 tokens.
    """
    window = 3
    # Last index at which a full window still fits inside the sentence
    last_start = len(tokenized_sentence) - window
    start = 0
    while start <= last_start:
        print(tokenized_sentence[start:start + window])
        start += 1

# Build the token list by splitting a space-separated sentence (the final
# '.' is its own token)
tokenized_sentence = "i am happy because i am learning .".split()

print(f"List all trigrams of sentence : {tokenized_sentence}\n")
sentence_to_trigam(tokenized_sentence)

List all trigrams of sentence : ['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']

['i', 'am', 'happy']
['am', 'happy', 'because']
['happy', 'because', 'i']
['because', 'i', 'am']
['i', 'am', 'learning']
['am', 'learning', '.']


# Prefix of an n-gram

In [8]:
# The trigram prefix of a 4-gram is the 4-gram without its final word
fourgram = ['i', 'am', 'happy', 'because']
# Leaving out the start index slices from the first element; the stop index
# -1 ends just before the last element
trigram = fourgram[:-1]
print(trigram)

['i', 'am', 'happy']


# Start-of-sentence and end-of-sentence tokens `<s>` and `<e>`



In [9]:
# For an n-gram model the sentence is padded with n-1 start tokens <s> in
# front and a single end token <e> at the back; with trigrams (n = 3) that is
# two <s> and one <e>
n = 3
tokenized_sentence = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']
start_padding = ["<s>"] * (n - 1)
tokenized_sentence = [*start_padding, *tokenized_sentence, "<e>"]
print(tokenized_sentence)

['<s>', '<s>', 'i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.', '<e>']
