<a href="https://colab.research.google.com/github/ugurklc/Deep_Learning/blob/Master/Natural_Language_Processing/nlp_start.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Small training corpus used to fit the tokenizer.
sentences = [
    "Today is a sunny day",
    "Today is a rainy day",
    "Is it sunny today?",
]

In [None]:
# Create the tokenizer. `num_words=100` caps how many of the most frequent
# words are used when texts are later encoded (see the Keras Tokenizer docs);
# that is far more than this tiny corpus contains.
tokenizer = Tokenizer(num_words=100)

In [None]:
# Build the tokenizer's vocabulary from the training sentences.
tokenizer.fit_on_texts(sentences)

In [None]:
# Mapping from each word to its integer index. As the output shows, words are
# lowercased, punctuation is removed ("today?" -> "today"), and the most
# frequent words receive the lowest indices.
word_index = tokenizer.word_index
word_index

{'today': 1, 'is': 2, 'a': 3, 'sunny': 4, 'day': 5, 'rainy': 6, 'it': 7}

In [None]:
# How many times each word occurred in the sentences the tokenizer was fitted on.
word_count = tokenizer.word_counts
word_count

OrderedDict([('today', 3),
             ('is', 3),
             ('a', 2),
             ('sunny', 2),
             ('day', 2),
             ('rainy', 1),
             ('it', 1)])

In [None]:
# Encode every sentence as the list of indices of its words.
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[1, 2, 3, 4, 5], [1, 2, 3, 6, 5], [2, 7, 4, 1]]

In [None]:
# Unseen sentences that contain words the tokenizer was not fitted on.
test_data = [
    "Today is a snowy day",
    "Will it be rainy tomorrow",
]

In [None]:
# Without an OOV token, words missing from the vocabulary are silently dropped:
# the encoded sequences in the output are shorter than the input sentences.
tokenizer.texts_to_sequences(test_data)

[[1, 2, 3, 5], [7, 6]]

In [None]:
# Re-create the tokenizer with an out-of-vocabulary token. Unknown words are
# now encoded with the "<OOV>" index (1 in the output) instead of being
# dropped, so the sequences keep the length of the original sentences.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
tokenizer.texts_to_sequences(test_data)

[[2, 3, 4, 1, 6], [1, 8, 1, 7, 1]]

In [None]:
# Decode the sequences back into text to see which words were replaced by <OOV>.
tokenizer.sequences_to_texts(tokenizer.texts_to_sequences(test_data))

['today is a <OOV> day', '<OOV> it <OOV> rainy <OOV>']

# Understanding Padding

In [None]:
# Sentences of different lengths, so their encoded sequences will need padding.
sentences = [
    "Today is a sunny day",
    "Today is a rainy day",
    "Is it sunny today?",
    "I really enjoyed walking in the snow today",
]

In [None]:
# Fit a fresh tokenizer (with an OOV token) on the longer sentence list.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)

In [None]:
# Encode the sentences; the resulting index lists have different lengths.
tokenized = tokenizer.texts_to_sequences(sentences)
tokenized

[[2, 3, 4, 5, 6],
 [2, 3, 4, 7, 6],
 [3, 8, 5, 2],
 [9, 10, 11, 12, 13, 14, 15, 2]]

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Default padding: zeros are added at the front of each row so that every row
# is as long as the longest sequence.
padded = pad_sequences(tokenized)
padded

array([[ 0,  0,  0,  2,  3,  4,  5,  6],
       [ 0,  0,  0,  2,  3,  4,  7,  6],
       [ 0,  0,  0,  0,  3,  8,  5,  2],
       [ 9, 10, 11, 12, 13, 14, 15,  2]], dtype=int32)

In [None]:
# padding='post' places the zeros after the tokens instead of before them.
padded = pad_sequences(tokenized, padding='post')
padded

array([[ 2,  3,  4,  5,  6,  0,  0,  0],
       [ 2,  3,  4,  7,  6,  0,  0,  0],
       [ 3,  8,  5,  2,  0,  0,  0,  0],
       [ 9, 10, 11, 12, 13, 14, 15,  2]], dtype=int32)

In [None]:
# maxlen=6 fixes the row length. The longest sentence is truncated from its
# start (the default), as the last row of the output shows.
padded = pad_sequences(tokenized, padding='post', maxlen=6)
padded

array([[ 2,  3,  4,  5,  6,  0],
       [ 2,  3,  4,  7,  6,  0],
       [ 3,  8,  5,  2,  0,  0],
       [11, 12, 13, 14, 15,  2]], dtype=int32)

In [None]:
# truncating='post' drops tokens from the end of over-long sequences instead of
# from the start.
padded = pad_sequences(tokenized, padding='post', maxlen=6, truncating='post')
padded

array([[ 2,  3,  4,  5,  6,  0],
       [ 2,  3,  4,  7,  6,  0],
       [ 3,  8,  5,  2,  0,  0],
       [ 9, 10, 11, 12, 13, 14]], dtype=int32)

# Removing Stopwords and Cleaning Text

In [None]:
# Sample sentences containing HTML tags (<br>), punctuation and common
# stopwords, i.e. the kinds of noise the cells below remove.
sentences = [
    "Today is the sunniest day",
    "Today is a rainy day <br> and it's cold",
    "Is it sunny today?",
    "I really enjoyed walking in <br> the snow today but it was cold"
]

In [None]:
# Use BeautifulSoup to strip HTML tags (such as <br>) from this sample sentence
sentence = "I really enjoyed walking in <br> the snow today but it was cold"

In [None]:
from bs4 import BeautifulSoup

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a GuessedAtParserWarning, so behaviour can
# differ between machines. "html.parser" ships with Python.
soup = BeautifulSoup(sentence, "html.parser")
# get_text() returns only the text content, dropping tags such as <br>.
souped = soup.get_text()
souped

'I really enjoyed walking in  the snow today but it was cold'

In [None]:
# Stopwords to remove: a short, all-lowercase example list
stopwords = ['a','about','yours','the','but','and']

In [None]:
sentence = "I really enjoyed walking in <br> the snow today but it was cold"

# Split on whitespace and keep only the tokens that are not stopwords.
words = sentence.split()
kept_words = [word for word in words if word not in stopwords]

# Every kept token is followed by a single space, so the result ends with one.
filtered_sentence = "".join(f"{word} " for word in kept_words)
new_sentence = [filtered_sentence]
new_sentence

['I really enjoyed walking in <br> snow today it was cold ']

In [None]:
# Strip punctuation
import string

sentence = "I really enjoyed walking in <br> the snow today but it was cold"

# Translation table that deletes every ASCII punctuation character.
table = str.maketrans("", "", string.punctuation)
words = sentence.split()

# Strip punctuation from each token, then drop stopwords. A token made up only
# of punctuation (e.g. "-" or "?") becomes an empty string; skip it so it does
# not leave a stray extra space in the result.
kept_words = []
for word in words:
  word = word.translate(table)
  if word and word not in stopwords:
    kept_words.append(word)

# Every kept word is followed by a single space, so the result ends with one.
filtered_sentence = "".join(word + " " for word in kept_words)
new_sentence = [filtered_sentence]
new_sentence

['I really enjoyed walking in br snow today it was cold ']

In [None]:
# Do it all

# 1. Remove HTML tags
# 2. Strip punctuation
# 3. Remove stopwords

import string

from bs4 import BeautifulSoup

sentence = "I really enjoyed walking in <br> the snow today but it was cold"

# 1. Remove HTML tags. The parser is named explicitly so BeautifulSoup does not
# guess one (which emits a GuessedAtParserWarning and can vary by machine).
soup = BeautifulSoup(sentence, "html.parser")
souped = soup.get_text()

# 2. Strip punctuation: the table deletes every ASCII punctuation character.
table = str.maketrans("", "", string.punctuation)
words = souped.split()

# 3. Remove stopwords. Tokens that are empty after punctuation stripping are
# skipped as well, so they do not leave stray extra spaces in the result.
kept_words = []
for word in words:
  word = word.translate(table)
  if word and word not in stopwords:
    kept_words.append(word)

# Every kept word is followed by a single space, so the result ends with one.
filtered_sentence = "".join(word + " " for word in kept_words)
new_sentence = [filtered_sentence]
new_sentence

['I really enjoyed walking in snow today it was cold ']