### Add Tokenized Text and Embeddings to CSVS
Run all of the following cells, in order

In [2]:
import sklearn
from sklearn.model_selection import train_test_split
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import pandas as pd
from scipy.special import softmax
imLoad DatasetLoad Datasetport timeLoad Dataset
import nltk

Load Dataset

In [24]:
def load_datasets():
    """
    Load training and test sets
    """

    train = pd.read_csv("./new_dataset/train_preprocessed.csv")
    dev = pd.read_csv("./new_dataset/val_preprocessed.csv")
    test = pd.read_csv("./new_dataset/test_preprocessed.csv")
    
    return train, dev, test

Tokenizing Input Text and Adding to Train and Dev Dataframes

In [35]:
def tokenizer(text: str):
    '''
    NLTK Tweet Tokenizer -- removes handles

    @param text        string tweet
    @ret tokens        list of tokens
    '''
    text = text.lower()
    tokenizer = nltk.tokenize.TweetTokenizer(strip_handles=True)
    tokens = tokenizer.tokenize(text)
    
    return tokens

Embedding Input Text Tokens

In [17]:
def load_embeddings(filename):
    """
    Loads embeddings from embedding file and creates 
    1) dictionary of embedding words to indices
    2) list of embedding indices to words
    3) dense word embedding matrix
    """
    embeddings = KeyedVectors.load_word2vec_format(filename, binary=False, no_header=True)
    vocab2indx = dict(embeddings.key_to_index)
    idx2vocab = list(embeddings.index_to_key)
    embed_array = embeddings.vectors # matrix of dense word embeddings 
                                     # rows: a word 
                                     # columns: dimensions (50) of the dense embeddings
    return vocab2indx, idx2vocab, embed_array


def add_the_embedding(embed_array, vocab2indx): 
    """
    Adds "the" embedding to the embed_array matrix
    """
    the_embedding = embed_array[vocab2indx["the"]]
    out = np.vstack((embed_array, the_embedding))
    return out


def add_oov(idx2vocab, vocab2indx, embed_array):
    """
    Adds <OOV> token to embedded vocabulary
    """
    print("len embed array: ", len(embed_array))
    new_oov_entry = len(embed_array)
    idx2vocab += ["<OOV>"]
    vocab2indx["<OOV>"] = new_oov_entry
    embed_array_w_oov = add_the_embedding(embed_array, vocab2indx)

    return idx2vocab, vocab2indx, embed_array_w_oov


def add_pad(idx2vocab, vocab2indx, embed_array):
    """
    Adds <PAD> token to embedded vocabulary
    """
    print("len embed array: ", len(embed_array))
    new_pad_entry = len(embed_array)
    idx2vocab += ["<PAD>"]
    vocab2indx["<PAD>"] = new_pad_entry
    embed_array_w_pad = add_the_embedding(embed_array, vocab2indx)
    
    return idx2vocab, vocab2indx, embed_array_w_pad


def truncate(original_indices_list: list, maximum_length=128) -> list: 
    """
    Truncates the original_indices_list to the maximum_length
    """
    return original_indices_list[0:maximum_length]


def pad(original_indices_list: list, pad_index: int, maximum_length=128) -> list: 
    """
    Given original_indices_list, concatenates the pad_index enough times 
    to make the list to maximum_length. 
    """
    while len(original_indices_list) < maximum_length:
        original_indices_list.append(pad_index)
        
    return original_indices_list


def get_padded_oov_embeddings():
    """
    Get embedding array which includes the <PAD> and <OOV> tokens
    """
    vocab2indx, idx2vocab, embed_array = load_embeddings("glove.twitter.27B.50d.txt")
    idx2vocab, vocab2indx, embed_array_w_oov = add_oov(idx2vocab, vocab2indx, embed_array)
    idx2vocab, vocab2indx, embed_array_w_oov_pad = add_pad(idx2vocab, vocab2indx, embed_array_w_oov)
    
    return embed_array_w_oov_pad, vocab2indx, idx2vocab

def create_word_indices(tokens, vocab2indx): 
    """
    For each example, translate each token into its corresponding index from vocab2indx
    
    Replace words not in the vocabulary with the symbol "<OOV>" 
        which stands for 'out of vocabulary'
        
    Arguments: 
       - tokens (List[str]): list of strings of tokens 
       - vocab2indx (dict): each vocabulary word as strings and its corresponding int index 
                           for the embeddings 
                           
    Returns: 
        - (List[int]): list of integers
    """
    indices = []
    num_oov = 0

    for token in tokens:
        if token not in vocab2indx:
            token = "<OOV>"
            num_oov += 1
        indices.append(vocab2indx[token])
    
    return indices, num_oov, len(tokens)


def convert_X(Xmat, embeddings, vocab2indx, idx2vocab):
    MAXIMUM_LENGTH = 128
    
    X_list_embedded = []
    X_list_tokenized = []
    num_total_tokens = 0
    num_oov = 0
    
    for one_example in Xmat:
        one_example = str(one_example)
        one_example_tokenized = tokenizer(one_example)
        X_list_tokenized.append(one_example_tokenized)
        example_indices, num_oov_in_example, num_tokens_in_example = create_word_indices(one_example, vocab2indx)
        example_indices = truncate(example_indices, maximum_length=MAXIMUM_LENGTH)
        example_indices = pad(example_indices, len(vocab2indx)-1, maximum_length=MAXIMUM_LENGTH)
        
        example_embeddings = [] # A list of token embeddings
        
        for index in example_indices:
            example_embeddings.append(embeddings[index])
        
        X_list_embedded.append(example_embeddings)
        
        num_total_tokens += num_tokens_in_example
        num_oov += num_oov_in_example
        percent_oov = (num_oov/num_total_tokens)
        
    return X_list_tokenized, X_list_embedded, percent_oov

Adding Embeddings and Tokenized Text to Train and Test Dataframes

In [31]:
# Load data
train, dev, test = load_datasets()

In [32]:
# Get GloVE embeddings
embeddings, vocab2indx, idx2vocab = get_padded_oov_embeddings()

len embed array:  1193514
len embed array:  1193515


In [43]:
def add_tokenized_embeddings_to_csvs():
    '''
    Converts twitter data into arrays of GloVE embeddings for each dataset
    and adds them to each DataFrame

    @ret       updated train, dev, and test datasets as Pandas DataFrames
    '''
    # Convert twitter data into arrays of GloVE embeddings for each dataset
    X_train_tokens, X_train_embedded, percent_train_oov = convert_X(train["text"], embeddings, vocab2indx, idx2vocab)
    X_dev_tokens, X_dev_embedded, percent_dev_oov = convert_X(dev["text"], embeddings, vocab2indx, idx2vocab)
    X_test_tokens, X_test_embedded, percent_test_oov = convert_X(test["text"], embeddings, vocab2indx, idx2vocab)

    # Add 'Embedding' and 'Tokenized Text' Columns to Dataframes
    train["Embedding"] = X_train_embedded
    train["Tokenized Text"] = X_train_tokens
    dev["Embedding"] = X_dev_embedded
    dev["Tokenized Text"] = X_dev_tokens
    test["Embedding"] = X_test_embedded
    test["Tokenized Text"] = X_test_tokens

    print("Percentage of train tokens out-of-vocabulary: ", percent_train_oov)
    print("Percentage of dev tokens out-of-vocabulary: ", percent_dev_oov)
    print("Percentage of test tokens out-of-vocabulary: ", percent_test_oov)
    
    return train, dev, test

In [44]:
# Update train, dev, and test sets with tokenized text and embeddings
train, dev, test = add_tokenized_embeddings_to_csvs()

Percentage of train tokens out-of-vocabulary:  0.2545306824346183
Percentage of dev tokens out-of-vocabulary:  0.2516598166297819
Percentage of test tokens out-of-vocabulary:  0.23862955833600685


In [None]:
# Sanity Check: Do the dataframes now contain 'Embedding' and 'Tokenized Text' columns?
print(train[0:5])

Adding Embeddings and Tokenized Text to Train and Test CSVs

In [None]:
train.to_csv("new_dataset/train_final.csv")
dev.to_csv("new_dataset/val_final.csv")
test.to_csv("new_dataset/test_final.csv")