<a href="https://colab.research.google.com/github/vishwvir-singh/DeepLearningForNLP/blob/main/natural_language_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Preprocessing**

1.   Tokenization
2.   Lowercasing (Vishwvir -> vishwvir)
3.   Remove stopwords
4.   Remove punctuation
5.   Stemming or lemmatization
6.   Handle n-grams ([vishwvir, singh] -> bigram -> [vishwvir_singh])

In [1]:
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.stem.porter import *
from nltk.corpus import gutenberg, stopwords
# Fetch the NLTK resources this notebook relies on: the punkt tokenizer
# models plus the gutenberg and stopwords corpora.
for nltk_resource in ('punkt', 'gutenberg', 'stopwords'):
    nltk.download(nltk_resource)

import string

import spacy

import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.phrases import Phraser, Phrases

import sklearn
from sklearn.manifold import TSNE

import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import figure, show

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Sentence-level view of the Gutenberg corpus; indexing it (e.g. gberg_sents[4])
# yields one sentence as a list of word tokens.
gberg_sents = gutenberg.sents()

In [3]:
#Example of each
#1. Tokenization

#using nltk
# Read the raw corpus once and reuse the slice: the original called
# gutenberg.raw() three times and sent_tokenize() twice for the same text.
first_passage = gutenberg.raw()[:289]
first_passage_sents = sent_tokenize(first_passage)

print('a sentence :', first_passage)
print('sentence token: ', first_passage_sents)
# Word-tokenize only the first detected sentence of the passage.
print('word token: ', word_tokenize(first_passage_sents[0]))

a sentence : [Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.
sentence token:  ['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.']
word token:  ['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER', 'I', 'Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', ',', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of', 'existence', ';', 'and', 'had', 'lived', 'nearly', 'twenty

In [4]:
#2. To Lowercase

# Lower-case every token of the sentence at index 4 of the corpus.
gberg_sent_4 = list(map(str.lower, gberg_sents[4]))
print(gberg_sent_4)

['she', 'was', 'the', 'youngest', 'of', 'the', 'two', 'daughters', 'of', 'a', 'most', 'affectionate', ',', 'indulgent', 'father', ';', 'and', 'had', ',', 'in', 'consequence', 'of', 'her', 'sister', "'", 's', 'marriage', ',', 'been', 'mistress', 'of', 'his', 'house', 'from', 'a', 'very', 'early', 'period', '.']


In [5]:
#3 & 4. Remove Stopwords and Punctuations

# Single drop-list: NLTK's English stop words followed by every individual
# character of string.punctuation.
stpwords = [*stopwords.words('english'), *string.punctuation]
print(stpwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
# Keep only the tokens that are absent from the combined stop-word /
# punctuation drop-list.
gberg_sent_4 = list(filter(lambda token: token not in stpwords, gberg_sent_4))
print(gberg_sent_4)

['youngest', 'two', 'daughters', 'affectionate', 'indulgent', 'father', 'consequence', 'sister', 'marriage', 'mistress', 'house', 'early', 'period']


In [7]:
#5. Apply Stemming

# Create one Porter stemmer instance; the next cell calls stem.stem(word)
# on each remaining token.
stem = PorterStemmer()

In [8]:
# Reduce each remaining token to its Porter stem.
gberg_sent_4_pstem = list(map(stem.stem, gberg_sent_4))
print(gberg_sent_4_pstem)

['youngest', 'two', 'daughter', 'affection', 'indulg', 'father', 'consequ', 'sister', 'marriag', 'mistress', 'hous', 'earli', 'period']


In [None]:
####### Lemmatization

# Load the spaCy pipeline named 'en_core_web_sm'; the resulting `nlp` callable
# is applied to raw text in a later cell. The model package must already be
# installed in the environment for this call to succeed.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Show characters 291-476 of the raw corpus text: the same slice that is
# passed to the spaCy pipeline in the following cell.
gutenberg.raw()[291:477]

In [None]:
# Run the spaCy pipeline over the raw passage.
spacy_doc = nlp(gutenberg.raw()[291:477])
# Iterate over the tokens of the Doc, not over spacy_doc.sents: the latter
# yields whole-sentence spans, so the original produced one string per
# sentence instead of one lemma per word (the counterpart of the per-word
# stems computed above).
gberg_sent_4_lemm = [token.lemma_ for token in spacy_doc]
print(gberg_sent_4_lemm)

In [None]:
# 6. Handle bigram collocations


phrases = Phrases(gberg_sents) # train the collocation detector on the tokenized sentences (no tuning arguments passed)
print(phrases)
bigram = Phraser(phrases) # create a more efficient Phraser object for transforming sentences
print(bigram.phrasegrams) # output the score of each detected bigram


In [None]:
# Apply the trained Phraser to a whitespace-split example sentence and print
# the transformed token list.
tokenized_sentence = "Jon lives in New York City".split()
print(bigram[tokenized_sentence])

**Preprocess the corpus**

`gberg_sents` is an already-tokenized dataset: a sequence of sentences, each a list of word tokens.

1.   Lowercase and remove punctuation
2.   Handle bigram collocations
3.   Train a word2vec model
4.   Evaluate the model
5.   Plot in 2D



In [None]:
# Build the punctuation lookup once. The original rebuilt
# list(string.punctuation) for every single word of the corpus and then
# scanned it linearly; a set gives the same membership result at O(1).
punctuation_chars = set(string.punctuation)

# Lower-case every token and drop tokens that are exactly one punctuation
# character (multi-character tokens are kept, as before).
gberg_sent_preprs = [
    [word.lower() for word in sent if word not in punctuation_chars]
    for sent in gberg_sents
]
print(gberg_sent_preprs[:10])

In [None]:
# Retrain the bigram detector on the lower-cased, punctuation-free sentences,
# this time with explicit min_count / threshold settings rather than the
# no-argument call used in the earlier demonstration.
bigram_detector = Phrases(gberg_sent_preprs, min_count=32, threshold=64)
bigram = Phraser(bigram_detector)
print(bigram.phrasegrams)

# NOTE: this rebinds gberg_sent_preprs to the bigram-merged sentences, so
# re-running this cell feeds already-merged sentences back into Phrases.
gberg_sent_preprs = [bigram[sentence] for sentence in gberg_sent_preprs]
print(gberg_sent_preprs[:10])

In [16]:
# Train word2vec (sg=1) on the preprocessed, bigram-merged sentences.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; newer
# gensim releases are understood to call them `vector_size` and `epochs` -
# confirm against the installed version.
model = Word2Vec(
    sentences=gberg_sent_preprs,
    size=64,
    window=10,
    min_count=10,
    workers=8,
    sg=1,
    iter=5,
)

In [17]:
# Five entries most similar to 'dog' in the trained model, with their scores.
model.wv.most_similar('dog', topn=5)

[('puppy', 0.7834013104438782),
 ('pet', 0.7651119232177734),
 ('brahmin', 0.7613773345947266),
 ('fox', 0.7533220052719116),
 ('ginger', 0.7437171936035156)]

In [18]:
# Five entries most similar to 'king' in the trained model, with their scores.
model.wv.most_similar('king', topn=5)

[('babylon', 0.743516206741333),
 ('governor', 0.7386270761489868),
 ('haman', 0.7130187749862671),
 ('vashti', 0.7023959755897522),
 ('ahasuerus', 0.695083498954773)]

In [20]:
# Vector-arithmetic query: 'king' and 'queen' contribute positively, 'man' negatively.
# NOTE(review): the husband/woman/man query in the next cell follows the usual
# "a - b + c" analogy shape; by that pattern this one would be
# positive=['king', 'woman'] (expecting 'queen'). With 'queen' among the inputs
# it presumably cannot be returned as a result - confirm the intent.
model.wv.most_similar(positive=['king', 'queen'], negative=['man'], topn=5)

[('vashti', 0.6396568417549133),
 ('banquet', 0.628044605255127),
 ('ahasuerus', 0.6165652871131897),
 ('haman', 0.6091808080673218),
 ('governor', 0.5966606140136719)]

In [21]:
# Analogy query: husband - man + woman. No topn is passed, so the library's
# default number of results is returned.
model.wv.most_similar(positive=['husband', 'woman'], negative=['man'])

[('sister', 0.7125319242477417),
 ('wife', 0.6884117126464844),
 ('daughter', 0.6449052095413208),
 ('conceived', 0.6437048316001892),
 ('elder', 0.6412549614906311),
 ('maid', 0.6367870569229126),
 ('child', 0.6325640678405762),
 ('mother', 0.6313331127166748),
 ('nurse', 0.6146925687789917),
 ('widow', 0.6055256128311157)]

In [28]:
#Reduce dimensions

# Materialise the vocabulary as a concrete list (rather than a live dict view)
# so the word order is fixed and can be reused positionally alongside the rows
# of the vector matrices below.
words = list(model.wv.vocab.keys())
print(words[:1])
word_vector_nd = model.wv[words]
print(word_vector_nd[:1])


# t-SNE is stochastic: pin random_state so the same input vectors always give
# the same 2-D layout. (The word2vec training above is not seeded, so the
# input vectors themselves may still differ between runs.)
tsne = TSNE(n_components=2, n_iter=1000, random_state=42)
word_vector_2d = tsne.fit_transform(word_vector_nd)
print(word_vector_2d[:1])

['emma']
[[ 4.2879564e-01 -2.7309585e-02  4.1091430e-01  8.4065415e-02
  -6.3307948e-02 -2.5161645e-01  1.5850502e-01  1.2286302e-01
  -6.0415232e-01 -1.5979947e-01  5.6659406e-01 -4.6423998e-01
   3.7104055e-01 -4.1548055e-01  3.6612841e-01 -2.8310094e-02
  -3.3924177e-01 -2.1649595e-01  5.9744227e-01  2.4929845e-01
   4.1158038e-01 -2.2750539e-01  2.1515630e-01 -8.9843982e-01
   3.7863460e-01 -3.1438455e-01 -4.0062022e-01  2.7327988e-01
  -4.5707256e-01 -6.0167819e-02 -2.4086465e-01  4.6665373e-01
  -2.3591351e-02  3.4608859e-01  7.1361944e-02 -1.4508757e-01
  -7.0988983e-01  2.2660539e-01 -5.0377172e-01  5.1288396e-01
   5.2390915e-01  2.8564909e-01 -8.6089045e-01 -2.6839727e-01
  -3.1854957e-01  3.0735999e-02 -2.0463902e-01 -8.3088758e-04
   5.1767665e-01  9.1752332e-01  7.4425071e-02  2.0915511e-01
   2.3749003e-02  9.6467078e-02  6.5241106e-02 -6.2591982e-01
  -1.3873084e-03 -2.3203838e-01  2.7807847e-01  1.6536984e-01
   2.9808125e-01 -4.7697228e-01 -2.7500945e-01  1.4074080e-01

In [30]:
# Pair every 2-D coordinate row with its word: columns x, y and word.
df = (
    pd.DataFrame(word_vector_2d, columns=['x', 'y'])
    .assign(word=words)
)
print(df.head())

           x          y    word
0  25.964874  59.073162    emma
1 -34.341629  30.957411      by
2  23.674562  60.877571    jane
3  -5.358441  16.970301  volume
4  51.950905   8.095816       i


In [34]:
# Plot at most 5000 words. Capping n at len(df) avoids the ValueError that
# df.sample raises when asked for more rows than exist, and the fixed
# random_state makes the plotted subset the same on every run.
sample_df = df.sample(n=min(5000, len(df)), random_state=42)

In [39]:
# Configure Bokeh to render its output inline in the notebook.
output_notebook()

In [40]:
plot = figure(plot_width=800, plot_height=800)
# Place each word at its t-SNE position via the data-space x / y arguments.
# The original passed the coordinates as x_offset / y_offset, which are
# screen-space nudges applied on top of x / y rather than the glyph positions.
# The trailing semicolon suppresses the renderer repr in the cell output.
plot.text(x=sample_df.x, y=sample_df.y, text=sample_df.word);

In [41]:
# Render the figure built in the previous cell.
show(plot)