In [1]:
para = '''A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic. Almost every piece of writing you do that is longer than a few sentences should be organized into paragraphs. This is because paragraphs show a reader where the subdivisions of an essay begin and end, and thus help the reader see the organization of the essay and grasp its main points. Paragraphs can contain many different kinds of information. A paragraph could contain a series of brief examples or a single long illustration of a general point. It might describe a place, character, or process; narrate a series of events; compare or contrast two or more things; classify items into categories; or describe causes and effects. Regardless of the kind of information they contain, all paragraphs share certain characteristics. One of the most important of these is a topic sentence.'''

In [2]:
para

'A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic. Almost every piece of writing you do that is longer than a few sentences should be organized into paragraphs. This is because paragraphs show a reader where the subdivisions of an essay begin and end, and thus help the reader see the organization of the essay and grasp its main points. Paragraphs can contain many different kinds of information. A paragraph could contain a series of brief examples or a single long illustration of a general point. It might describe a place, character, or process; narrate a series of events; compare or contrast two or more things; classify items into categories; or describe causes and effects. Regardless of the kind of information they contain, all paragraphs share certain characteristics. One of the most important of these is a topic sentence.'

In [3]:
import numpy as np
import nltk
import torch
import transformers
import spacy
import tensorflow_hub as hub
import tensorflow.compat.v1 as tf
from spacy import displacy
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import PunktSentenceTokenizer

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [38]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
# Convert paragraph into list of sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentences = tokenizer.tokenize(para.strip())
sentences = [sentence for sentence in raw_sentences if len(sentence) > 0]
print('Sentences :')
print(sentences, end='\n\n\n')

Sentences :
['A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic.', 'Almost every piece of writing you do that is longer than a few sentences should be organized into paragraphs.', 'This is because paragraphs show a reader where the subdivisions of an essay begin and end, and thus help the reader see the organization of the essay and grasp its main points.', 'Paragraphs can contain many different kinds of information.', 'A paragraph could contain a series of brief examples or a single long illustration of a general point.', 'It might describe a place, character, or process; narrate a series of events; compare or contrast two or more things; classify items into categories; or describe causes and effects.', 'Regardless of the kind of information they contain, all paragraphs share certain characteristics.', 'One of the most important of these is a topic sentence.']




In [5]:
# 1.i) Word2Vec

wordvecs = [nltk.word_tokenize(sentence) for sentence in sentences]
stop_words = list(set(stopwords.words("english")))

for sentence in wordvecs:
    for word in sentence:
        if word in stop_words:
            sentence.remove(word)

model = Word2Vec(wordvecs, min_count=1)

# Vector for word 'sentence'
print('Vector for word "sentence" : ')
print(model.wv['sentence'])



Vector for word "sentence" : 
[ 6.0347753e-04  3.7514158e-03 -4.5773359e-03 -1.3764277e-03
  2.0012059e-03  2.0728500e-03  2.6279264e-03  4.3870611e-03
 -2.5364948e-03 -2.0148912e-04 -1.1314824e-03  1.7957615e-05
  1.7568787e-03 -4.1548072e-04 -2.4156375e-03  1.6446401e-03
  1.3864461e-03 -6.9752429e-04 -1.9017775e-03  7.9472450e-04
 -1.7543996e-03  2.4456582e-03  4.3933643e-03 -1.0579716e-03
  1.8010638e-04 -6.7879102e-04  4.2462624e-03 -1.4603051e-03
 -1.8175399e-03 -4.2822547e-03 -4.4280519e-03 -3.9219777e-03
  4.9678935e-03  2.3596936e-03 -3.5515763e-03  2.2862533e-03
  8.9634839e-04 -1.2832240e-03  6.6427368e-05  1.8717718e-03
 -1.2962266e-03 -4.4023818e-03 -1.0836938e-03  2.9238665e-03
  1.5134945e-03 -7.7210505e-05  2.2671104e-03 -5.3429761e-04
  4.9312217e-03  2.4417371e-03  3.7531301e-03  4.5012143e-03
  2.3243505e-04  1.6363548e-03  3.4421652e-03  3.6332740e-03
  2.6970885e-03  1.7557279e-03 -1.9867085e-03  1.8741086e-03
  3.1472641e-04  3.1674630e-03  3.8985212e-04  1.261958

In [6]:
# 1.ii) USE

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embeddings = embed(sentences)
print(embeddings, '\n\n')
for i in range(2):
    print('Sentence :')
    print(sentences[i])
    print('Converted to :')
    print(embeddings[i], '\n\n')

tf.Tensor(
[[ 0.01168496 -0.03060572  0.06116336 ... -0.08641755  0.0002505
   0.05482749]
 [ 0.02972507 -0.03655469  0.08002593 ... -0.07038905 -0.0283213
   0.04804676]
 [ 0.07221661 -0.04182237  0.05336686 ... -0.06942353  0.01795934
   0.06641504]
 ...
 [ 0.01586947 -0.05243036  0.0606509  ... -0.06435591  0.04215747
   0.06304204]
 [ 0.04141247  0.02588907 -0.0062563  ... -0.02162989  0.00910817
   0.03623575]
 [ 0.01578411 -0.02142678  0.00402448 ... -0.09605585 -0.06707881
   0.07970381]], shape=(8, 512), dtype=float32) 


Sentence :
A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic.
Converted to :
tf.Tensor(
[ 1.16849625e-02 -3.06057241e-02  6.11633621e-02  8.47723782e-02
 -5.83404116e-03  2.84162583e-03  2.59479079e-02  3.90261016e-03
 -5.55586144e-02  5.68111017e-02 -8.95012915e-03 -4.50471742e-03
 -6.06310219e-02  3.18566561e-02 -6.86047673e-02 -9.39451605e-02
 -4.23613675e-02  3.93056758e-02 -9.02280435e-02 -5.536637

In [7]:
# 1. iii) ELMO
import tensorflow_hub as hub
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()
#1024 sized vectors
elmo=hub.Module("https://tfhub.dev/google/elmo/3",trainable=True)
embeddings=elmo(
    sentences,
    signature="default",
    as_dict=True)["elmo"]
init=tf.initialize_all_variables()
sess=tf.Session()
sess.run(init)
print("\n\n")
print(sess.run(embeddings[0]))
print("shape=",embeddings[0].shape)

Instructions for updating:
Use `tf.global_variables_initializer` instead.





[[ 0.29287004 -0.14378013 -0.32574052 ... -0.39559263 -0.35758853
  -0.03588088]
 [-0.59441584  0.09640743  0.50537694 ...  0.22031914  0.269769
   0.46307266]
 [-0.1708326  -0.18744111 -0.27626696 ... -0.67550904  0.25389987
   0.6540271 ]
 ...
 [-0.0284084  -0.04353216  0.04130162 ...  0.02583168 -0.01429836
  -0.01650422]
 [-0.0284084  -0.04353216  0.04130162 ...  0.02583168 -0.01429836
  -0.01650422]
 [-0.0284084  -0.04353216  0.04130162 ...  0.02583168 -0.01429836
  -0.01650422]]
shape= (32, 1024)


In [8]:
# 1.iv) GPT2

gp2tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2-large')
model = transformers.GPT2LMHeadModel.from_pretrained('gpt2-large')
res_vectors = gp2tokenizer.encode(para, add_special_tokens=False, return_tensors="pt")
print("shape=", res_vectors.shape)
print(res_vectors)

shape= torch.Size([1, 171])
tensor([[   32,  7322,   318,   257,  2168,   286, 13439,   326,   389,  8389,
           290, 24870,    11,   290,   389,   477,  3519,   284,   257,  2060,
          7243,    13, 16699,   790,  3704,   286,  3597,   345,   466,   326,
           318,  2392,   621,   257,  1178, 13439,   815,   307,  8389,   656,
         23549,    13,   770,   318,   780, 23549,   905,   257,  9173,   810,
           262, 45944,  3279,   286,   281, 14268,  2221,   290,   886,    11,
           290,  4145,  1037,   262,  9173,   766,   262,  4009,   286,   262,
         14268,   290, 13180,   663,  1388,  2173,    13,  2547,  6111,    82,
           460,  3994,   867,  1180,  6982,   286,  1321,    13,   317,  7322,
           714,  3994,   257,  2168,   286,  4506,  6096,   393,   257,  2060,
           890, 20936,   286,   257,  2276,   966,    13,   632,  1244,  6901,
           257,  1295,    11,  2095,    11,   393,  1429,    26,  6664,   378,
           257,  2168,  

In [9]:
# 1.v) Sentence-BERT

bert = hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim128/2")
embeddings = bert(sentences)
print(embeddings)
print("shape=",embeddings[0].shape)
print("The sentence in the paragraph:\n",sentences[0],"\nis converted into vector as :\n", embeddings[0])

Tensor("keras_layer/StatefulPartitionedCall:0", shape=(None, 128), dtype=float32)
shape= (128,)
The sentence in the paragraph:
 A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic. 
is converted into vector as :
 Tensor("strided_slice_3:0", shape=(128,), dtype=float32)


In [10]:
# 2) Named Entity Recognition

nlp = spacy.load("en_core_web_sm")
res = nlp(para)

for word in res.ents:
    print(word.text,word.label_)
    
print(spacy.explain('GPE'))

print(displacy.render(res,style="ent",jupyter=True))

two CARDINAL
One CARDINAL
Countries, cities, states


None


In [11]:
# 3) Find similar sentences (repeated sentences) from the above paragraph? (Cosine Similarity, use BERT to encode)

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

se_embeddings = sbert_model.encode(sentences)
q1_vec= sbert_model.encode(sentences[0])

def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

for sent in sentences:
    sim = cosine(q1_vec, sbert_model.encode([sent])[0])
    # similarity == 1 - repeated sentence
    # similarity > 0.5 - similar sentence
    if sim>0.5:
        print("Sentence1 =",sentences[0],"\n \nSentence2=", sent, "\n\nsimilarity = ", sim,end="\n ----------------------------- \n")

Sentence1 = A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic. 
 
Sentence2= A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic. 

similarity =  1.0
 ----------------------------- 
Sentence1 = A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic. 
 
Sentence2= Almost every piece of writing you do that is longer than a few sentences should be organized into paragraphs. 

similarity =  0.6477537
 ----------------------------- 
Sentence1 = A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic. 
 
Sentence2= This is because paragraphs show a reader where the subdivisions of an essay begin and end, and thus help the reader see the organization of the essay and grasp its main points. 

similarity =  0.5238008
 ----------------------------- 
Sentence1 = A paragraph is a se

In [12]:
# 4) POS Tagging for Above Given Paragraph
    
tokenized = sent_tokenize(para)
for i in tokenized:

    wordList = nltk.word_tokenize(i)
    wordList = [word for word in wordList if word not in stop_words]

    tagged = nltk.pos_tag(wordList)
 
    print(tagged)

[('A', 'DT'), ('paragraph', 'NN'), ('series', 'NN'), ('sentences', 'NNS'), ('organized', 'VBN'), ('coherent', 'NN'), (',', ','), ('related', 'VBN'), ('single', 'JJ'), ('topic', 'NN'), ('.', '.')]
[('Almost', 'RB'), ('every', 'DT'), ('piece', 'NN'), ('writing', 'VBG'), ('longer', 'JJR'), ('sentences', 'NNS'), ('organized', 'VBN'), ('paragraphs', 'NN'), ('.', '.')]
[('This', 'DT'), ('paragraphs', 'NN'), ('show', 'NN'), ('reader', 'NN'), ('subdivisions', 'NNS'), ('essay', 'VBP'), ('begin', 'JJ'), ('end', 'NN'), (',', ','), ('thus', 'RB'), ('help', 'NN'), ('reader', 'VB'), ('see', 'VB'), ('organization', 'NN'), ('essay', 'VB'), ('grasp', 'NN'), ('main', 'JJ'), ('points', 'NNS'), ('.', '.')]
[('Paragraphs', 'NNP'), ('contain', 'VBP'), ('many', 'JJ'), ('different', 'JJ'), ('kinds', 'NNS'), ('information', 'NN'), ('.', '.')]
[('A', 'DT'), ('paragraph', 'NN'), ('could', 'MD'), ('contain', 'VB'), ('series', 'NN'), ('brief', 'NN'), ('examples', 'VBZ'), ('single', 'JJ'), ('long', 'JJ'), ('illustr