In [3]:
import csv
import string

import numpy as np
import pandas as pd

# packages from torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import re

from io import StringIO
import requests

from textblob import TextBlob

# Load Data


In [4]:
# Function to load data from scifi.txt
def load_scifi(url):
    """Load the sci-fi corpus and return the first line of the file.

    Only the first line is returned (including its trailing newline, if any);
    any further lines are ignored.
    NOTE(review): presumably scifi.txt stores the whole corpus on a single
    line -- confirm against the dataset.

    Args:
        url: Path of the text file to read.

    Returns:
        The first line of the file as a string, or '' when the file is empty.
    """
    with open(url) as f:
        # readline() reads only the first line instead of building a list of
        # every line, and yields '' for an empty file rather than raising
        # IndexError.
        return f.readline()

# Function to load data from trip advisor
def load_tripadvisor(url, separator=' '):
    """Load the TripAdvisor reviews and merge them into one string.

    The CSV is read with pandas and its two columns are renamed to
    'Review' and 'Rating'; only the 'Review' column is used.

    Args:
        url: Path (or URL) of the CSV file, passed to ``pd.read_csv``.
        separator: String inserted between consecutive reviews. Defaults to a
            single space so the last word of one review does not fuse with the
            first word of the next. Pass '' to concatenate reviews directly.

    Returns:
        All non-missing reviews joined by ``separator``.
    """
    df = pd.read_csv(url)
    df.columns = ['Review', 'Rating']
    # Missing reviews are read as NaN (a float); drop them so the join cannot
    # fail, and cast the rest to str in case a review parsed as a number.
    reviews = df['Review'].dropna().astype(str)
    # A single join avoids rebuilding the growing string for every review.
    return separator.join(reviews)

Depending on where you run the Jupyter notebook, you may need to change the paths below. Since we run locally, we keep the files in the `dataset` folder (in the same directory as the notebook).

In [1]:
# Locations of the two corpora, relative to the notebook.
# Edit these to match your own environment.
scifi_url = './dataset/scifi.txt'
tripadvisor_url = './dataset/tripadvisor_hotel_reviews.csv'

In [6]:
# Read both corpora into memory as strings, using the loader functions and
# the paths defined in the cells above.
trip_text = load_tripadvisor(tripadvisor_url)
scifi_text = load_scifi(scifi_url)

# Preprocessing

### Todo 
- Sentence segmentation. (Split sentences by full stop "."). 
- Remove HTML tags
- Remove URLs (if needed)
- Lower case
- Correct spelling
- Emojis (if needed)
- Chat word treatment (if needed)
- Remove special characters and punctuation
- Remove rare words, because we do not have enough statistics for them (how infrequently must a word appear to be considered rare?)
- Things that do NOT have to be implemented:
    - Remove stop words? Not needed if we choose a large window size: stop words are frequent and carry little meaning, so increasing the window size makes them less relevant. If we choose a smaller window size, it makes sense to drop the CBOW training examples in which a stop word is the target word.
    - Stemming is not needed.

In [9]:
# Sentence segmentation 
def sentence_segmentation(text):
    """Split ``text`` into sentences at full stops ('.').

    Surrounding whitespace is stripped from every sentence, and empty
    fragments (e.g. after the final full stop or between '..') are dropped.

    Args:
        text: The string to split. ``None`` or '' yields an empty list.

    Returns:
        A list of sentence strings without their terminating full stop.
    """
    if not text:
        return []
    fragments = (fragment.strip() for fragment in text.split('.'))
    return [fragment for fragment in fragments if fragment]


In [27]:
class TextCleaner:
    """Apply a fixed sequence of cleaning steps to raw text.

    ``text`` may be a single string or a list of strings (e.g. one element
    per sentence). Every step is applied to each string and the container
    type is preserved. Each step stores its result back in ``self.text``.
    """

    # Compiled once for all instances: non-greedy match of '<' ... '>'.
    _RE_HTML = re.compile('<.*?>')

    def __init__(self, text):
        self.text = text

    def _apply(self, func):
        """Replace ``self.text`` with ``func`` applied to it.

        ``func`` maps one string to one string; for a list input it is
        applied element-wise. Returns the updated ``self.text``.
        """
        if isinstance(self.text, str):
            self.text = func(self.text)
        else:
            self.text = [func(item) for item in self.text]
        return self.text

    # Add function call here to choose which cleaning function is used
    def get_clean_text(self):
        """Run every cleaning step in order and return the cleaned text."""
        self.remove_html_tags()
        self.remove_url()
        self.convert_lowercase()
        self.correct_spelling()
        self.remove_emojies()
        self.chatword_treatment()
        # These two calls previously used names that do not exist on the
        # class (remove_spec / remove_rare_word) and raised AttributeError.
        self.remove_spec_char()
        self.remove_rare_words()

        return self.text

    # Remove html tags
    def remove_html_tags(self):
        """Delete everything that looks like an HTML tag (``<...>``)."""
        self._apply(lambda s: self._RE_HTML.sub(r'', s))

    # Remove url
    def remove_url(self):
        """Placeholder: not implemented yet; leaves ``self.text`` unchanged."""
        pass

    # Correct spelling function
    def correct_spelling(self):
        """Spell-correct the text with TextBlob and return the result.

        NOTE(review): TextBlob's correction is presumably slow on a full
        corpus -- consider whether this step is affordable.
        """
        return self._apply(lambda s: TextBlob(s).correct().string)

    # Remove emojies
    def remove_emojies(self):
        """Placeholder: not implemented yet; leaves ``self.text`` unchanged."""
        pass

    # Lowercase convert
    def convert_lowercase(self):
        """Lower-case the text and return the result."""
        return self._apply(str.lower)

    # Chat word treatment (abbreviation)
    def chatword_treatment(self):
        """Placeholder: not implemented yet; leaves ``self.text`` unchanged."""
        pass

    # Remove special characters and punctuation
    def remove_spec_char(self):
        """Placeholder: not implemented yet; leaves ``self.text`` unchanged."""
        pass

    # Remove rare words
    def remove_rare_words(self):
        """Placeholder: not implemented yet; leaves ``self.text`` unchanged."""
        pass
    

In [None]:
# Run the cleaning pipeline on the sci-fi corpus; `scifi` holds whatever
# get_clean_text() returns.
scifi_cleaner = TextCleaner(scifi_text)
scifi = scifi_cleaner.get_clean_text()

# Build CBOW model

Create the CBOW class. It takes the vocabulary size and the embedding dimension as inputs to construct its layers.

In [31]:
class CBOW(nn.Module):
    """Continuous Bag-of-Words model.

    The embeddings of the context words are summed, passed through a hidden
    linear layer with ReLU, and projected to log-probabilities over the
    vocabulary.

    Args:
        vocab_size: Number of words in the vocabulary.
        embedding_dim: Size of each word embedding.
        hidden_dim: Width of the hidden linear layer (default 128).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim=128):
        super().__init__()

        # One embedding_dim-sized vector per vocabulary entry.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        # embedding_dim -> hidden_dim, followed by ReLU.
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.act1 = nn.ReLU()

        # hidden_dim -> vocab_size, followed by log-softmax over the vocab.
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        self.act2 = nn.LogSoftmax(dim=1)

    def forward(self, inputs):
        """Return log-probabilities of the target word.

        Args:
            inputs: Long tensor of context word indices, either of shape
                (context_words,) for a single example or
                (batch, context_words) for a batch.

        Returns:
            Tensor of shape (1, vocab_size) for a single example, or
            (batch, vocab_size) for a batch.
        """
        # (..., context_words, embedding_dim)
        embedded = self.embeddings(inputs)

        # Sum over the context-word axis with a tensor op rather than the
        # Python builtin sum(), which iterates row by row.
        summed = embedded.sum(dim=-2)
        if summed.dim() == 1:
            # Single example: add the batch axis the linear layers and
            # LogSoftmax(dim=1) expect.
            summed = summed.unsqueeze(0)

        hidden = self.act1(self.linear1(summed))

        # Log-softmax output is what NLLLoss consumes.
        return self.act2(self.linear2(hidden))

def make_context_vector(context, word_to_ix):
    """Map each context word to its vocabulary index.

    Args:
        context: Iterable of words.
        word_to_ix: Mapping from word to integer index.

    Returns:
        A 1-D ``torch.long`` tensor holding the indices in context order.
    """
    indices = []
    for word in context:
        indices.append(word_to_ix[word])
    return torch.tensor(indices, dtype=torch.long)

Define vocab, etc.

In [32]:
# Tokenise the cleaned corpus on whitespace.
# NOTE(review): assumes `scifi` (the output of the cleaning cell) is a single
# string -- confirm; a list of sentences would need flattening first.
raw_text = scifi.split()

# Vocabulary = the distinct tokens of the corpus.
vocab = set(raw_text)
vocab_size = len(vocab)

# Sort before enumerating: set iteration order is not stable across runs, so
# indices taken straight from the set would not be reproducible.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}

# Build (context, target) pairs with two words on each side of the target.
# The window of 2 must match CONTEXT_SIZE used when setting up the model.
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))

Define parameters, network and loss function

In [None]:
CONTEXT_SIZE = 2 # 2 words to the left, 2 to the right
EMBEDDING_DIM = 300 # As recommended in the lecture

# Run on the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The model outputs log-softmax, so NLLLoss minimises the negative
# log-likelihood of the target word.
loss_function = nn.NLLLoss()
# Total loss of every epoch, appended by the training loop.
losses = []

# Set up model
model = CBOW(vocab_size, EMBEDDING_DIM).to(device)

# Choose optimizer as SGD
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

Training loop

In [14]:
for epoch in range(100):
    # Running sum of the per-example losses of this epoch.
    total_loss = 0
    for context, target in data:
        # Clear the gradients left over from the previous example.
        optimizer.zero_grad()

        # Index tensors for the context words and the target word, moved to
        # the device the model lives on.
        context_vector = make_context_vector(context, word_to_ix).to(device)
        target_vector = torch.tensor([word_to_ix[target]], dtype=torch.long).to(device)

        # Forward pass: log-probabilities over the vocabulary, then the
        # negative log-likelihood of the true target.
        log_probs = model(context_vector)
        loss = loss_function(log_probs, target_vector)

        # Backward pass and one SGD step.
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Keep one entry per epoch.
    losses.append(total_loss)

# Result

Nearest neighbor

In [None]:
def get_closest_word(word, topn=5, embeddings=None, vocab_index=None):
    """Return the ``topn`` words whose embeddings are closest to ``word``.

    Distance is ``nn.PairwiseDistance()`` with its default settings, computed
    between the embedding of ``word`` and the embedding of every other
    vocabulary entry.

    Args:
        word: The query word; must be a key of the vocabulary index.
        topn: Maximum number of neighbours to return.
        embeddings: ``nn.Embedding`` to search. Defaults to the notebook's
            trained ``model.embeddings``.
        vocab_index: Mapping word -> embedding row. Defaults to the
            notebook's ``word_to_ix``.

    Returns:
        A list of ``(word, distance)`` tuples sorted by increasing distance;
        the query word itself is excluded.

    Raises:
        KeyError: If ``word`` is not in the vocabulary index.
    """
    # Fall back to the names this notebook actually defines.
    if embeddings is None:
        embeddings = model.embeddings
    if vocab_index is None:
        vocab_index = word_to_ix

    if word not in vocab_index:
        raise KeyError(f"'{word}' is not in the vocabulary")
    i = vocab_index[word]
    index_to_word = {ix: w for w, ix in vocab_index.items()}

    pdist = nn.PairwiseDistance()
    # Read the weight matrix directly: it already lives on the model's
    # device, so no lookup tensors have to be created and moved.
    with torch.no_grad():
        weights = embeddings.weight
        # Repeat the query row so one call compares it with every row.
        anchor = weights[i].unsqueeze(0).expand_as(weights)
        distances = pdist(anchor, weights).tolist()

    word_distance = [
        (index_to_word[j], dist)
        for j, dist in enumerate(distances)
        if j != i
    ]
    word_distance.sort(key=lambda pair: pair[1])
    return word_distance[:topn]