<a href="https://colab.research.google.com/github/vi0908/Machine-Learning/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## NATURAL LANGUAGE PROCESSING (NLP) IN PYTORCH

In [1]:
from urllib.request import urlretrieve
import os 
# Download the labelled movie-review file once; later runs reuse the local copy.
if not os.path.isfile('movie-simple.txt'):
  text_url = 'https://raw.githubusercontent.com/duke-mlss/Duke-MLSS-2018/master/movie-simple.txt'
  # Fetch into a temporary name and rename only after the transfer finishes,
  # so an interrupted download never leaves a truncated 'movie-simple.txt'
  # that the isfile() guard above would accept as complete on the next run.
  urlretrieve(text_url, 'movie-simple.txt.part')
  os.replace('movie-simple.txt.part', 'movie-simple.txt')


In [2]:
# DOWNLOAD PRE-TRAINED WORD VECTORS (WORD EMBEDDINGS) 
from urllib.request import urlretrieve
import os
# Download the pre-trained embeddings once; later runs reuse the local copy.
if not os.path.isfile('mini.h5'):
    print("Downloading Conceptnet Numberbatch word embeddings")
    conceptnet_url = 'http://conceptnet.s3.amazonaws.com/precomputed-data/2016/numberbatch/17.06/mini.h5'
    # Fetch into a temporary name and rename only after the transfer finishes,
    # so an interrupted download never leaves a truncated 'mini.h5' that the
    # isfile() guard above would accept as complete on the next run.
    urlretrieve(conceptnet_url, 'mini.h5.part')
    os.replace('mini.h5.part', 'mini.h5')

Downloading Conceptnet Numberbatch word embeddings


In [3]:
# USE THE h5py PACKAGE TO READ THE FILE. 
import h5py

# Read the vocabulary and the embedding matrix out of the HDF5 file.
with h5py.File('mini.h5', 'r') as h5_file:
  mat_group = h5_file['mat']
  # Entries of 'axis1' are bytes; decode each one into a utf-8 string.
  all_words = [raw_word.decode('utf-8') for raw_word in mat_group['axis1'][:]]  # List of 362891 strings
  all_embeddings = mat_group['block0_values'][:]  # 362891x300 matrix

# Quick summary of what was loaded, plus one sample entry.
print("all_words dimensions: {} type: {}".format(len(all_words), type(all_words)))
print("all_embeddings dimensions: {} type: {}".format(all_embeddings.shape, type(all_embeddings)))
print("Random example: \n word:{}".format(all_words[1118]))


all_words dimensions: 362891 type: <class 'list'>
all_embeddings dimensions: (362891, 300) type: <class 'numpy.ndarray'>
Random example: 
 word:/c/de/armenier


In [4]:
# KEEP ONLY THE ENGLISH PART OF THE VOCABULARY
# English entries look like '/c/en/<word>': find their row numbers once, then
# derive both the bare words (6-character prefix removed) and their embeddings.
english_word_indices = [i for i, word in enumerate(all_words) if word.startswith('/c/en/')]
english_words = [all_words[i][6:] for i in english_word_indices]
english_embeddings = all_embeddings[english_word_indices]

# Sizes of the restricted vocabulary and one sample word with its raw vector.
print("English words: {}".format(len(english_words)))
print("English_embeddings dimension: {}".format(english_embeddings.shape))
print("Word: {}".format(english_words[888]))
print("Embedding: {}".format(english_embeddings[888]))


English words: 150875
English_embeddings dimension: (150875, 300)
Word: accessibility
Embedding: [  0   0   1   5  -5   0   1   1   3   6  -1   4  -3  -3  -1   0  -2  -3
   3  -7   2  -1  -3  -7  -1   2  -5   1   0  -3   0   0   1   1  -4  -3
   3   2   3  -1  -1   4   0   2   3   3  -3  -4   5  -2   0   4   1   5
   4  -2   1   0  -8   2   2  -2   1   0  -5   0   2   4   1  -5  -8   1
   1   3   0   0   1  10  -6   0  -3   6  -6  -1  -1   1  -2   4  -4  -2
  -3  -1  -3   0   2  -2  -4   1   3   5  -3   7   1   0   0   1  -6 -11
   9  -7  -1   0   0   0   6   1   3   3   2   1   4  -1  -1   6   0   0
  -2  -1  -4  -4   0  -4   1  -3  -1   2  -3   6  -4  -4  -1   5  -4   4
   5  -5  -4   1   3   3  -8   0  -2   4  -1  -5   2   0   1  -4  -7   0
  -4   3   2   7  -3   0  -2   1  -1  -1   0  -3  -7   3   3  -2  -2  -2
   2   7   3  -2   2   9   2   3  -5  -2  -4   1   3  -1  -3  -2  -6   0
  -5  -7   0   0   3  -1  -4   1  -6  -4  -5  -6  -1   4   6   2  -1   2
  -1   0   2   2  -2   0  -

In [5]:
import numpy as np
# Each row is rescaled to unit length so that comparisons between words depend
# only on the direction of their vectors, not on their magnitude.
# NOTE(review): magnitude is said to reflect how often a word is used - this
# is not verified anywhere in this notebook.
norms = np.linalg.norm(english_embeddings, axis=1)
# Turn the per-row lengths into a float32 column so they broadcast over each row.
row_lengths = norms.astype('float32')[:, np.newaxis]
norm_embeddings = english_embeddings.astype('float32') / row_lengths

# Lookup table: word -> row number of that word in the embedding matrix
index = dict(zip(english_words, range(len(english_words))))



In [10]:
def similarity_score(w1, w2):
  """Return the dot product of the normalized embeddings of two words.

  Both words are looked up in `index`; a word that is not in the vocabulary
  raises KeyError. Because `norm_embeddings` rows are divided by their norm,
  the value is the cosine of the angle between the two word vectors.
  """
  first_vector = norm_embeddings[index[w1], :]
  second_vector = norm_embeddings[index[w2], :]
  return np.dot(first_vector, second_vector)
# Compare two example words, then show the normalized vector of each.
w1 = 'dog'
w2 = 'sun'
print("similarity between {} and {}: {}".format(w1, w2, similarity_score(w1,w2)))
for shown_word in (w1, w2):
  print("\n {} norm_embedding: \n{} ".format(shown_word, norm_embeddings[index[shown_word]]))

similarity between dog and sun: 0.06799643486738205

 dog norm_embedding: 
[ 0.          0.          0.03474716 -0.08686789  0.01737358  0.
 -0.03474716 -0.06949431  0.01737358 -0.03474716  0.          0.03474716
  0.01737358  0.         -0.05212073  0.         -0.05212073 -0.06949431
  0.          0.05212073  0.01737358 -0.08686789  0.01737358 -0.05212073
  0.          0.08686789 -0.01737358 -0.03474716  0.          0.03474716
 -0.06949431  0.01737358  0.05212073  0.         -0.05212073  0.03474716
 -0.05212073  0.08686789 -0.03474716  0.         -0.06949431  0.06949431
 -0.05212073  0.         -0.01737358 -0.08686789 -0.03474716 -0.10424147
  0.05212073 -0.05212073  0.05212073  0.06949431 -0.03474716  0.
  0.03474716  0.         -0.03474716  0.05212073  0.08686789  0.
  0.10424147  0.01737358 -0.01737358 -0.01737358 -0.08686789 -0.08686789
  0.05212073  0.          0.01737358  0.         -0.03474716 -0.06949431
  0.06949431  0.         -0.08686789  0.01737358 -0.03474716 -0.08686789


In [7]:
def closest_to_wordvector(vector, number):
  """Return the `number` vocabulary words whose embeddings best match `vector`.

  Every row of `norm_embeddings` is scored by its dot product with `vector`,
  and the words are returned from highest to lowest score.

  Args:
    vector: 1-D array with one entry per embedding dimension.
    number: how many words to return (used as a slice bound).

  Returns:
    A list of words taken from `english_words`, best match first.
  """
  all_scores = np.dot(norm_embeddings, vector)
  # Rank rows by descending score and cut to `number` *before* looking the
  # words up, so no Python list of the whole vocabulary is built per call.
  best_indices = np.argsort(all_scores)[::-1][:number]
  return [english_words[i] for i in best_indices]

def most_similar(word, number):
  """Return the `number` vocabulary words closest to `word`.

  The word is looked up in `index` (KeyError if it is not in the vocabulary)
  and its normalized embedding is handed to `closest_to_wordvector`.
  """
  word_vector = norm_embeddings[index[word]]
  return closest_to_wordvector(word_vector, number)

# Nearest neighbours of an example word (the word itself is part of the list):
w = 'star'
nearest_words = most_similar(w, 10)
print("word: {} \tmost similar words: {}".format(w, nearest_words))


word: star 	most similar words: ['star', 'stars', 'red_supergiant', 'epsilon_aurigae', 'starred', 'circumstellar', 'proxima_centauri', 'red_dwarf', 'starring', 'starlet']


In [8]:
def distant_to_wordvector(vector, number):
  """Return the `number` vocabulary words whose embeddings least match `vector`.

  Every row of `norm_embeddings` is scored by its dot product with `vector`,
  and the words are returned from lowest to highest score.

  Args:
    vector: 1-D array with one entry per embedding dimension.
    number: how many words to return (used as a slice bound).

  Returns:
    A list of words taken from `english_words`, worst match first.
  """
  all_scores = np.dot(norm_embeddings, vector)
  # Cut the ascending ranking to `number` *before* looking the words up, so
  # no Python list of the whole vocabulary is built per call.
  worst_indices = np.argsort(all_scores)[:number]
  return [english_words[i] for i in worst_indices]

def different_words(word, number):
  """Return the `number` vocabulary words least similar to `word`.

  The word is looked up in `index` (KeyError if it is not in the vocabulary)
  and its normalized embedding is handed to `distant_to_wordvector`.
  """
  word_vector = norm_embeddings[index[word]]
  return distant_to_wordvector(word_vector, number)

# Words with the lowest similarity score to an example word:
w = 'star'
farthest_words = different_words(w, 10)
print("word: {} \tmost different words: {}".format(w, farthest_words))


word: star 	most different words: ['asshats', 'agnew', 'exhibitionists', 'brautigan', 'djuna', 'longtail', 'fetishists', 'fariba', 'upshot', 'webpages']


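# SIMPLE WORD-EMBEDDING-BASED MODEL (SWEM)
Classify a movie review as positive or negative (sentiment analysis). Each review is represented by the mean of the embeddings of its words, and that vector is fed to a small feed-forward classifier.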

In [9]:
  import string
  # Translation table that deletes every ASCII punctuation character.
  remove_punct = str.maketrans('','',string.punctuation)

  def convert_line_to_example(line):
    """Convert one line of the review file into a training example ``(x, y)``.

    Args:
      line: text whose first character is the label (0 negative, 1 positive)
        and whose review text starts at the third character.

    Returns:
      A tuple ``(x, y)``: ``x`` is the mean of the normalized embeddings of the
      review words found in `index`, ``y`` is the integer label.

    Raises:
      ValueError: if the first character is not an integer, or if none of the
        review words is in the embedding vocabulary (there is nothing to average).
    """
    # First character: label (0 negative, 1 positive)
    y = int(line[0])

    # Drop the label and its separator, strip punctuation, lowercase, split into words
    words = line[2:].translate(remove_punct).lower().split()

    # Embeddings of the words we know; out-of-vocabulary words are skipped
    embeddings = [norm_embeddings[index[w]] for w in words if w in index]

    # np.vstack cannot stack an empty list; report which line is the problem
    # instead of letting it fail with a message unrelated to the data.
    if not embeddings:
      raise ValueError("No word of this review is in the embedding vocabulary: {!r}".format(line))

    # Mean of the embeddings, taken over the words (one value per embedding column)
    x = np.mean(np.vstack(embeddings), axis = 0)
    return x, y
# Convert every line of the review file into an (x, y) example.
with open("movie-simple.txt", "r", encoding = 'utf-8', errors='ignore') as f:
   examples = [convert_line_to_example(line) for line in f]

# Stack the features and the labels into two numpy arrays, one row per example
xs = np.vstack([features for features, _ in examples])
ys = np.vstack([label for _, label in examples])

In [11]:
# Sanity check: one feature row and one label row per review.
inputs_shape = xs.shape
labels_shape = ys.shape
print("Shape of inputs: {}".format(inputs_shape))
print("Shape of labels: {}".format(labels_shape))

Shape of inputs: (1411, 300)
Shape of labels: (1411, 1)


In [18]:
# Shuffle the examples (features and labels with the same permutation) so the
# train/test split below is not affected by the order of the file.
num_examples = xs.shape[0]

print("First 20 labels before shuffling: {}".format(ys[:20,0]))
# A seeded generator makes the permutation - and therefore the train/test
# split and every number reported below - the same on each fresh run.
shuffle_rng = np.random.default_rng(42)
index_shuffle = shuffle_rng.permutation(num_examples)
xs = xs[index_shuffle,:]
ys = ys[index_shuffle,:]
print("First 20 labels after shuffling: {}".format(ys[:20, 0]))

First 20 labels before shuffling: [0 1 1 1 1 0 0 0 0 1 1 0 0 0 1 0 0 1 1 1]
First 20 labels after shuffling: [0 1 0 1 1 1 0 0 1 0 0 1 0 1 1 1 0 1 1 0]


In [19]:
import torch
# 80/20 split: floor division keeps the training-set size a whole number
num_train = (4 * num_examples) // 5

train_rows = slice(None, num_train)
test_rows = slice(num_train, None)

# Train set (labels are cast to float32)
x_train = torch.tensor(xs[train_rows])
y_train = torch.tensor(ys[train_rows], dtype=torch.float32)

# Test set (labels are cast to float32)
x_test = torch.tensor(xs[test_rows])
y_test = torch.tensor(ys[test_rows], dtype=torch.float32)

In [20]:
# Pair each feature vector with its label
reviews_train = torch.utils.data.TensorDataset(x_train, y_train)
reviews_test = torch.utils.data.TensorDataset(x_test, y_test)

# DataLoaders: mini-batches of 100; only the training data is reshuffled
batch_size = 100
train_loader = torch.utils.data.DataLoader(reviews_train, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(reviews_test, batch_size=batch_size, shuffle=False)

In [21]:
import torch.nn as nn
import torch.nn.functional as F
from tqdm.notebook import tqdm

In [22]:
class SWEM(nn.Module):
  """Two-layer feed-forward classifier over an averaged word embedding.

  Args:
    embedding_dim: size of the input vector (300 by default, the width of the
      embeddings used in this notebook).
    hidden_dim: number of units in the hidden layer (64 by default).

  The forward pass returns one raw score (logit) per example; no sigmoid is
  applied here.
  """
  def __init__(self, embedding_dim=300, hidden_dim=64):
    super().__init__()
    self.fc1 = nn.Linear(embedding_dim, hidden_dim)
    # We are only doing binary classification, output will be a single value
    self.fc2 = nn.Linear(hidden_dim, 1)
  def forward(self, x):
    """Apply fc1, a ReLU, then fc2 to `x` and return the result."""
    hidden = F.relu(self.fc1(x))
    return self.fc2(hidden)

In [None]:
## TRAINING
# Instantiate model
model = SWEM()

# Loss function -> Binary cross-entropy (BCE) on raw logits, optimizer -> Adam
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)

for epoch in range(250):
  # Per-epoch running totals. They use their own names so the global
  # `num_examples` (dataset size, read by the train/test split cell) is not
  # overwritten when this cell runs.
  epoch_correct = 0
  epoch_seen = 0
  epoch_loss = 0.0
  for inputs, labels in tqdm(train_loader):
    optimizer.zero_grad()

    # Forward pass
    logits = model(inputs)
    loss = criterion(logits, labels)

    # Backward pass
    loss.backward()
    optimizer.step()

    # A sigmoid output rounded to 0/1 is the predicted label
    predictions = torch.round(torch.sigmoid(logits))
    n_batch = len(inputs)
    epoch_correct += torch.sum((predictions==labels).float()).item()
    epoch_seen += n_batch
    # `loss` is the mean over the batch; weight it by the batch size so the
    # epoch figure is the mean over all examples, not just the last batch.
    epoch_loss += loss.item() * n_batch

  # Training progress
  if epoch % 25 == 0:
    acc = epoch_correct / epoch_seen
    print("Epoch: {} \tTrain loss: {} \tTrain acc: {}".format(epoch, epoch_loss / epoch_seen, acc))



In [41]:
## Testing
# Running totals over the whole test set
correct = 0
num_test = 0

# Evaluation only: gradient tracking is switched off
with torch.no_grad():
  for inputs, labels in test_loader:
    probabilities = torch.sigmoid(model(inputs))

    # A probability rounded to 0/1 is the predicted label
    predictions = torch.round(probabilities)
    correct += (predictions == labels).float().sum()
    num_test += inputs.shape[0]

print("Test accuracy: {}".format(correct/num_test)) 
  

Test accuracy: 0.9328621625900269


In [40]:
# Ask the trained model for the sentiment of a few individual words
test_words = ["men", "you", "physics"]

for word in test_words:
  # A single word is fed as a batch of one 300-dimensional vector
  x = torch.tensor(norm_embeddings[index[word]].reshape(1,300))
  y = torch.round(torch.sigmoid(model(x)))
  label = 'positive' if y == 1 else 'negative'
  print("Sentiment of the word '{}': {}".format(word, label)) 

Sentiment of the word 'men': negative
Sentiment of the word 'you': positive
Sentiment of the word 'physics': positive
