# Demos

This is a temporary notebook for trying out various things without messing around too much in the assignment code.

## Sentence Embedding

In [16]:
!pip install -U gensim

import nltk

from nltk.tokenize import word_tokenize
import numpy as np

# Fetch NLTK's 'punkt' tokenizer data, which word_tokenize (imported above) relies on
nltk.download('punkt')

Requirement already up-to-date: gensim in /Users/enrico/anaconda3/lib/python3.8/site-packages (4.1.2)


[nltk_data] Downloading package punkt to /Users/enrico/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
# Small demo corpus: one plain-text sentence per entry
sentences = [
    "I ate dinner.",
    "We had a three-course meal.",
    "Brad came to dinner with us.",
    "He loves fish tacos.",
    "In the end, we all felt like we ate too much.",
    "We all agreed; it was a magnificent evening.",
]

# Lower-case every sentence and split it into a list of tokens
tokenized_sent = [word_tokenize(sentence.lower()) for sentence in sentences]

In [11]:
# Function for computing cosine similarity
def cosine(u, v):
    """Return the cosine similarity between the vectors ``u`` and ``v``.

    The similarity is the dot product of the two vectors divided by the
    product of their Euclidean norms.

    Parameters
    ----------
    u, v : array-like
        Vectors of equal length.

    Returns
    -------
    float
        The cosine similarity. When either vector has zero norm the
        similarity is mathematically undefined; 0.0 is returned in that
        case instead of dividing by zero (which would yield NaN and a
        RuntimeWarning).
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)

    # A zero-norm vector has no direction, so treat it as "not similar"
    if norm_product == 0:
        return 0.0

    return np.dot(u, v) / norm_product

### Doc2Vec

In [17]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Wrap each tokenized sentence in a TaggedDocument whose single tag is the
# sentence's position in the corpus
tagged_data = [
    TaggedDocument(words=tokens, tags=[sentence_id])
    for sentence_id, tokens in enumerate(tokenized_sent)
]
tagged_data

[TaggedDocument(words=['i', 'ate', 'dinner', '.'], tags=[0]),
 TaggedDocument(words=['we', 'had', 'a', 'three-course', 'meal', '.'], tags=[1]),
 TaggedDocument(words=['brad', 'came', 'to', 'dinner', 'with', 'us', '.'], tags=[2]),
 TaggedDocument(words=['he', 'loves', 'fish', 'tacos', '.'], tags=[3]),
 TaggedDocument(words=['in', 'the', 'end', ',', 'we', 'all', 'felt', 'like', 'we', 'ate', 'too', 'much', '.'], tags=[4]),
 TaggedDocument(words=['we', 'all', 'agreed', ';', 'it', 'was', 'a', 'magnificent', 'evening', '.'], tags=[5])]

In [27]:
## Train doc2vec model
# Parameters passed below:
#   vector_size = Dimensionality of the feature vectors.
#   window      = The maximum distance between the current and predicted word within a sentence.
#   min_count   = Ignores all words with total frequency lower than this.
#   epochs      = Number of training passes over the corpus.
#   seed        = Seed for the random number generator (1 is gensim's default, made explicit here).
#   workers     = Number of training threads; a single worker removes the
#                 thread-scheduling nondeterminism, so re-running the cell
#                 gives comparable vectors.
# alpha (the initial learning rate) is not set and keeps its default value.
# NOTE: according to the gensim documentation, fully deterministic results
# across interpreter launches additionally require a fixed PYTHONHASHSEED.
model = Doc2Vec(tagged_data, vector_size = 20, window = 2, min_count = 1, epochs = 100,
                seed = 1, workers = 1)

## Print model vocabulary
model.wv.key_to_index

{'.': 0,
 'we': 1,
 'ate': 2,
 'dinner': 3,
 'a': 4,
 'all': 5,
 'evening': 6,
 'came': 7,
 'us': 8,
 'with': 9,
 'to': 10,
 'three-course': 11,
 'brad': 12,
 'meal': 13,
 'loves': 14,
 'had': 15,
 'he': 16,
 'fish': 17,
 'magnificent': 18,
 'tacos': 19,
 'in': 20,
 'the': 21,
 'end': 22,
 ',': 23,
 'felt': 24,
 'like': 25,
 'too': 26,
 'much': 27,
 'agreed': 28,
 ';': 29,
 'it': 30,
 'was': 31,
 'i': 32}

We now take a new test sentence and find the sentences from our data that are most similar to it, displayed in order of decreasing similarity. The `infer_vector` method returns the vectorized form of the test sentence (including the paragraph vector). The `most_similar` method returns the most similar sentences as (tag, similarity) pairs; by default it returns at most 10 results, so all six sentences of our data are listed here.

In [31]:
# Tokenize the lower-cased query sentence, mirroring the corpus preprocessing
test_sentence = "I had pizza and pasta"
test_doc = word_tokenize(test_sentence.lower())

# Infer an embedding for the unseen sentence with the trained model
test_doc_vector = model.infer_vector(test_doc)

# Rank the training sentences (identified by tag) against the inferred vector
model.dv.most_similar(positive=[test_doc_vector])

[(4, 0.7299023866653442),
 (3, 0.5823500156402588),
 (1, 0.5418941378593445),
 (2, 0.5351356267929077),
 (5, 0.44702044129371643),
 (0, 0.2542485296726227)]

## Recurrent Neural Networks

From this article: https://towardsdatascience.com/recurrent-neural-networks-by-example-in-python-ffd204f99470 I've found a series of notebooks with some material on RNNs.

The functions below might be useful for creating sequences.

In [None]:
def get_data(file, filters='!"%;[\\]^_`{|}~\t\n', training_len=50,
             lower=False):
    """Load a CSV file and turn its abstracts into training/validation data.

    The file is read with pandas ('patent_date' parsed as dates), rows with a
    missing 'patent_abstract' are dropped, each abstract is cleaned with
    ``format_sequence`` and the result is converted to integer sequences by
    ``make_sequences`` before being split by ``create_train_valid``.

    NOTE(review): ``pd``, ``format_sequence`` and ``create_train_valid`` are
    not defined in the visible part of this notebook -- they must be
    imported/defined before this function can run.

    Parameters
    ----------
    file : path or file-like accepted by ``pd.read_csv``
    filters : str
        Characters forwarded to ``make_sequences`` as its ``filters``.
    training_len : int
        Forwarded to ``make_sequences`` as its ``training_length``.
    lower : bool
        Forwarded to ``make_sequences`` as its ``lower``.

    Returns
    -------
    tuple
        ``(training_dict, word_idx, idx_word, sequences)`` where
        ``training_dict`` has the keys 'X_train', 'X_valid', 'y_train'
        and 'y_valid'.
    """
    # Read the raw table and discard rows that have no abstract
    frame = pd.read_csv(file, parse_dates=['patent_date'])
    frame = frame.dropna(subset = ['patent_abstract'])

    # Clean every abstract
    abstracts = []
    for abstract in list(frame['patent_abstract']):
        abstracts.append(format_sequence(abstract))

    # Integer-encode the texts; only part of the returned tuple is used here
    encoded = make_sequences(abstracts, training_len, lower, filters)
    (word_idx, idx_word, num_words, word_counts,
     texts, sequences, features, labels) = encoded

    # Split features/labels into training and validation parts
    X_train, X_valid, y_train, y_valid = create_train_valid(features, labels, num_words)

    training_dict = {}
    training_dict['X_train'] = X_train
    training_dict['X_valid'] = X_valid
    training_dict['y_train'] = y_train
    training_dict['y_valid'] = y_valid

    return training_dict, word_idx, idx_word, sequences

def make_sequences(texts, training_length = 50,
                   lower = True, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                   min_extra_tokens = 20):
    """Turn a set of texts into integer sequences and (features, label) pairs.

    A tokenizer is fitted on ``texts`` and every text is converted to a list
    of integers. Only texts whose sequence is longer than
    ``training_length + min_extra_tokens`` are kept. From each kept sequence
    a sliding window produces one example per position: ``training_length``
    consecutive tokens as the features and the token that follows them as
    the label.

    NOTE(review): ``Tokenizer`` is not imported in the visible part of this
    notebook; it looks like the Keras text ``Tokenizer`` -- confirm and
    import it before running.

    Parameters
    ----------
    texts : sequence of str
        The texts to encode.
    training_length : int
        Number of tokens in the features of each example.
    lower : bool
        Passed to ``Tokenizer`` as ``lower``.
    filters : str
        Passed to ``Tokenizer`` as ``filters``.
    min_extra_tokens : int
        Extra tokens, on top of ``training_length``, that a sequence must
        exceed to be kept. The default of 20 reproduces the previously
        hard-coded threshold.

    Returns
    -------
    tuple
        ``(word_idx, idx_word, num_words, word_counts, new_texts,
        new_sequences, features, labels)`` where ``new_texts`` and
        ``new_sequences`` contain only the texts/sequences that passed the
        length filter.
    """
    # Create the tokenizer object and train on texts
    tokenizer = Tokenizer(lower=lower, filters=filters)
    tokenizer.fit_on_texts(texts)

    # Look-up dictionaries (word -> index and index -> word) and word counts
    word_idx = tokenizer.word_index
    idx_word = tokenizer.index_word
    # One more than the number of indexed words
    # (presumably to leave room for index 0 -- confirm)
    num_words = len(word_idx) + 1
    word_counts = tokenizer.word_counts

    print(f'There are {num_words} unique words.')

    # Convert text to sequences of integers
    sequences = tokenizer.texts_to_sequences(texts)

    # Keep only the texts whose sequence is strictly longer than the threshold
    min_length = training_length + min_extra_tokens
    new_texts = []
    new_sequences = []
    for text, seq in zip(texts, sequences):
        if len(seq) > min_length:
            new_texts.append(text)
            new_sequences.append(seq)

    features = []
    labels = []

    # Slide a window over every kept sequence to create the training examples
    for seq in new_sequences:
        for i in range(training_length, len(seq)):
            # training_length tokens of context followed by the token to predict
            extract = seq[i - training_length: i + 1]
            features.append(extract[:-1])
            labels.append(extract[-1])

    print(f'There are {len(features)} sequences.')

    # Return everything needed for setting up the model
    return word_idx, idx_word, num_words, word_counts, new_texts, new_sequences, features, labels