In [21]:
import torch
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from torchtext.vocab import GloVe
from collections import Counter
import pandas as pd

# csv loaded up from 'text_cleaning/utterances_flattener.ipynb'
df = pd.read_csv('utterances_clean2014-2018.csv')

# pre trained model for tokenizing
nltk.download('punkt')

def get_glove_embeddings(df_column, dim=100):
    """
    Inputs:
    df_column: a single column of a data frame of textual data
    dim: choose dim of embedding vectors

    Outputs:
    returns tensor object of embeddings
    """
    glove = GloVe(name='6B', dim=dim)

    # set of unique words in the dataframe column, tried this to speed it up, not sure if it worked
    # tokenize to clean formatting
    unique_words = set(word for sentence in df_column for word in nltk.word_tokenize(sentence))
    
    # initialize dictionary
    word_to_vec = {word: glove[word].numpy() for word in unique_words if word in glove.stoi}

    # maps each word to embedding
    def map_to_embedding(sentence):
        return [word_to_vec.get(word, glove.get_vecs_by_tokens('<unk>').numpy()) 
                for word in nltk.word_tokenize(sentence)]

    # Use a new list to store embeddings instead of modifying the dataframe
    # supposedly faster, idk
    embeddings_list = df_column.apply(map_to_embedding).tolist()
    #convert each list to tensor
    embeddings = [torch.tensor(sentence) for sentence in embeddings_list]
    # padding
    embeddings = torch.nn.utils.rnn.pad_sequence(embeddings, batch_first=True)

    return embeddings



def get_bow_embeddings(df_column, vocab_size=10000):
    """
    Inputs:
    df_column: a single column of a data frame of textual data
    vocab_size: size of vocabulary to be used

    Outputs:
    returns tensor object of BoW representations
    """
    # Flatten the dataframe column into a single list of words and count the word frequencies
    words = [word for sentence in df_column for word in nltk.word_tokenize(sentence)]
    word_freqs = Counter(words)

    # Sort the words by frequency and keep only the top vocab_size words
    reduced_vocab = {word for word, _ in word_freqs.most_common(vocab_size)}

    # Initialize a CountVectorizer object with the reduced vocabulary
    vectorizer = CountVectorizer(vocabulary=reduced_vocab)

    # Fit the vectorizer on the text data and transform the data
    bow = vectorizer.fit_transform(df_column)

    # Convert the result to a dense matrix and then to a DataFrame
    df_bow = pd.DataFrame(bow.todense(), columns=vectorizer.get_feature_names_out())

    # Convert the DataFrame to a PyTorch tensor
    tensor_bow = torch.tensor(df_bow.values)

    return tensor_bow


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michaelwagner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Testing GloVe

In [5]:
glove_test = get_glove_embeddings(df['text'], 100)

In [6]:
# 73531 text entries
# sentences padded to 1915 (longest sentence)
# each word dimension 100
glove_test.shape

torch.Size([73531, 1915, 100])

In [18]:
# one sentence
glove_test[0]

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.2252,  0.5219,  0.4974,  ...,  0.0550,  0.4561,  0.6579],
        [-0.0378,  0.5388,  0.7541,  ...,  0.1866,  0.4622,  0.3770],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

### Testing BoW

In [22]:
bow_test = get_bow_embeddings(df['text'], 5000)



In [23]:
# 73531 text
# 5000 width for vocab
bow_test.shape

torch.Size([73531, 5000])

In [24]:
# one sentence
bow_test[0]

tensor([0, 0, 0,  ..., 0, 0, 0])

In [36]:
# look at most frequent words in a text value
bow_test[9].topk(10)

torch.return_types.topk(
values=tensor([8, 6, 5, 4, 3, 2, 2, 2, 2, 2]),
indices=tensor([4589, 4590, 2719, 2887, 3348, 4143, 1150, 1023, 4221, 2880]))