In [None]:
import numpy as np
import pandas as pd

# Goodreads Books Reviews example

In [None]:
# Relative path of the Goodreads training CSV that the loading cell reads.
BOOK_REVIEW_TRAIN_PATH = 'goodreads_train.csv'

In [None]:
# Read only the id and text columns of the first 100,000 rows and index the
# resulting frame by review id.
train = pd.read_csv(
    BOOK_REVIEW_TRAIN_PATH,
    usecols=['review_id', 'review_text'],
    nrows=100000,
).set_index('review_id')

In [None]:
# Small working sample for the examples below. The fixed seed keeps the
# displayed reviews (and everything derived from them) stable across
# "Restart Kernel -> Run All".
subsample = train.sample(10, random_state=42).copy()
subsample

In [None]:
# Show the raw text of the first sampled review.
subsample['review_text'].iloc[0]

## Step 1: Cleaning

In [None]:
import string
import re

# Character classes replaced by a space during cleaning, compiled once.
punct = re.compile('[' + re.escape(string.punctuation) + ']')
digits = re.compile('[' + re.escape(string.digits) + ']')
_whitespace = re.compile(r'\s+')


def clean(str_):
    """Normalise a raw text: lowercase it and strip punctuation and digits.

    Parameters
    ----------
    str_ : str, None or float NaN
        Raw text. Missing values (``None`` or a float NaN, which is what
        ``pd.read_csv`` produces for empty cells) are treated as empty text.

    Returns
    -------
    str
        Lowercased text in which every ASCII punctuation character and digit
        has been replaced by a space, runs of whitespace (including newlines)
        are collapsed to a single space, and leading/trailing whitespace is
        removed.
    """
    # NaN is the only float that is not equal to itself.
    if str_ is None or (isinstance(str_, float) and str_ != str_):
        return ''

    str_ = str_.lower()
    str_ = punct.sub(' ', str_)
    str_ = digits.sub(' ', str_)
    # `\s+` also matches newlines, so no separate newline pass is needed.
    str_ = _whitespace.sub(' ', str_)

    return str_.strip()


In [None]:
# Characters that the cleaning step replaces with a space.
string.punctuation + string.digits

In [None]:
# Cleaned version of the first sampled review.
clean(subsample['review_text'].iloc[0])

## Step 2: Reducing each word to a common base form (root)
https://blog.bitext.com/what-is-the-difference-between-stemming-and-lemmatization/

### Stemming
Stemming works by cutting off the end or the beginning of a word, taking into account a list of common prefixes and suffixes. <br> 
The same stem can be shared by the inflectional forms of different lemmas.

In [None]:
from nltk.stem import SnowballStemmer

# Build the English Snowball stemmer once instead of once per word.
_snowball_stemmer = SnowballStemmer('english')


def stem(str_):
    """Clean a text and replace each word with its English Snowball stem.

    Parameters
    ----------
    str_ : str
        Raw text; it is passed through ``clean`` first.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    words = clean(str_).split()
    return ' '.join(_snowball_stemmer.stem(word) for word in words)

In [None]:
# Store the stemmed text of every sampled review in a new column,
# then display the first stemmed review.
subsample['review_text_stem'] = subsample['review_text'].apply(stem)
subsample['review_text_stem'].iloc[0]

### Lemmatization
Lemmatization takes into account the morphological analysis of each word. <br>
The same lemma can correspond to forms with different stems.<br>
A lemma is the base form of all its inflectional forms, whereas a stem isn’t.<br>

Lemmatization is a common technique to increase recall (to make sure no relevant document gets lost).

In [None]:
from nltk.stem import WordNetLemmatizer
# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Shared WordNet lemmatizer instance used by `lemmatize`.
lemmatizer = WordNetLemmatizer()


def lemmatize(str_):
    """Clean a text and replace each word with its WordNet lemma.

    Parameters
    ----------
    str_ : str
        Raw text; it is passed through ``clean`` first.

    Returns
    -------
    str
        The lemmatized words joined by single spaces.
    """
    cleaned = clean(str_)
    tokens = cleaned.strip().split(' ')
    lemmas = map(lemmatizer.lemmatize, tokens)
    return ' '.join(lemmas)

In [None]:
# Store the lemmatized text of every sampled review in a new column,
# then display the first lemmatized review.
subsample['review_text_lemme'] = subsample['review_text'].apply(lemmatize)
subsample['review_text_lemme'].iloc[0]

## Step 4: Stopwords elimination

In [None]:
# spacy download en_core_web_sm 
# import nltk
# nltk.download('stopwords')

from spacy import load 
from nltk.corpus import stopwords
from sklearn.feature_extraction import _stop_words

# spaCy pipeline; only its default stop-word list is used here.
en = load('en_core_web_sm')

# English stop-word collections from the three libraries.
nltk_stopwords = stopwords.words('english')
sklearn_stopwords = _stop_words.ENGLISH_STOP_WORDS
spacy_stopwords = en.Defaults.stop_words

# Report the size of each list.
for source, words in (('nltk', nltk_stopwords),
                      ('sklearn', sklearn_stopwords),
                      ('spacy', spacy_stopwords)):
    print(f'There are {len(words)} stopwords in {source} list.')

In [None]:
def remove_stop_words(str_, reduce_funct, stopwords):
    """Normalise a text with ``reduce_funct`` and drop its stop words.

    Parameters
    ----------
    str_ : str
        Raw text.
    reduce_funct : callable
        Function mapping the raw text to a normalised, space-separated string
        (for example ``stem`` or ``lemmatize``).
    stopwords : iterable of str
        Words to remove. Any iterable is accepted (list, set, frozenset, ...).

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Set membership is O(1); a list would be scanned once per word.
    if isinstance(stopwords, (set, frozenset)):
        stop_set = stopwords
    else:
        stop_set = set(stopwords)

    # split() without an argument drops the empty tokens that split(' ')
    # yields for empty input or repeated spaces.
    words = reduce_funct(str_).split()

    return ' '.join(word for word in words if word not in stop_set)
    

In [None]:
# Stem each review and drop spaCy stop words. The extra arguments of
# `remove_stop_words` go through `args` only; passing `stem` positionally as
# well would bind it to the second parameter of `Series.apply`.
subsample['review_text_stem_no_sw'] = subsample['review_text'].apply(
    remove_stop_words, args=(stem, spacy_stopwords)
)
subsample['review_text_stem_no_sw'].iloc[0]

## Step 5: Vectorizing the text

### Bag-of-words + TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Upper bound on the number of vocabulary terms kept by the vectorizers.
max_features = 10

# TF-IDF over unigrams of at least two word characters, keeping terms that
# appear in at least 5% and at most 50% of the documents.
word_vectorizer = TfidfVectorizer(
    min_df=.05,
    max_df=.5,
    token_pattern=r'\w{2,}',
    ngram_range=(1, 1),
    max_features=max_features,
)

# Fit on the stemmed, stop-word-free text and vectorize it in one pass.
stem_tfidf = word_vectorizer.fit_transform(subsample['review_text_stem_no_sw'])

pd.DataFrame(stem_tfidf.todense(),
             columns=word_vectorizer.get_feature_names_out())


In [None]:
# Same vectorization applied directly to the raw review text: tokenization
# and stop-word removal are delegated to TfidfVectorizer (no stemming here).

word_vectorizer_raw = TfidfVectorizer(min_df=.05, max_df=.5,
                                      token_pattern=r'\w{2,}',
                                      # scikit-learn accepts 'english', a list or
                                      # None here; spaCy provides a set.
                                      stop_words=sorted(spacy_stopwords),
                                      analyzer='word',
                                      ngram_range=(1, 1),
                                      max_features=max_features)

word_vectorizer_raw.fit(subsample['review_text'])

pd.DataFrame(word_vectorizer_raw.transform(subsample['review_text']).todense(),
             columns=word_vectorizer_raw.get_feature_names_out())

### Word2Vec embedding
https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py

#### Initial learn

In [None]:
# Tokenize the stemmed, stop-word-free text by splitting on single spaces,
# giving one list of words per review.
subsample['review_words'] = subsample['review_text_stem_no_sw'].str.split(' ')

In [None]:
import gensim

# Training configuration for the demonstration Word2Vec model.
w2v_params = dict(
    vector_size=10,
    window=5,      # context
    min_count=2,
    sg=0,          # 0=CBOW, 1=Skip-gram
    epochs=5,
)

# Train on the tokenized reviews and persist the full model to disk.
w2v_model = gensim.models.Word2Vec(subsample['review_words'], **w2v_params)

w2v_model.save('word2vec_model')

In [None]:
# Vocabulary learned by the model, in the model's own key order.
vocab_keys = w2v_model.wv.index_to_key

# Set of the same keys for fast membership tests.
w2v_words = set(vocab_keys)

print(len(w2v_words))

vocab_keys[:30]

In [None]:
# Learned vector for the token 'book'.
w2v_model.wv['book']

In [None]:
# Aggregate review vectors based on the word vectors for each word in the given review
# Words missing from the model vocabulary (`w2v_words`) are skipped, so each
# review yields an array with one row per known word. Reviews differ in
# length, hence the outer array is built with dtype=object.

word_vector = np.array([np.array([w2v_model.wv[i] for i in ls if i in w2v_words]) for ls in subsample['review_words']], 
                       dtype=object)

word_vector[:1]

In [None]:
# Not every word has a vector: print, per review, the number of words
# followed by the number of word vectors found for it.

for words, vectors in zip(subsample['review_words'], word_vector):
    print(len(words), len(vectors))

In [None]:
class MeanEmbeddingVectorizer(object):
    """Represent a tokenized document by the mean of its word vectors.

    Parameters
    ----------
    word_model_vectors : object
        Word-vector store exposing ``vector_size``, ``key_to_index`` and
        ``get_vector(word)`` (for example gensim ``KeyedVectors``).
    """

    def __init__(self, word_model_vectors):
        self.word_model_vector = word_model_vectors
        self.vector_size = word_model_vectors.vector_size

    def fit(self, docs=None):
        """Do nothing and return ``self``.

        ``docs`` is accepted (and ignored) so that the call shape matches
        vectorizers that do need fitting.
        """
        return self

    def transform(self, docs):
        """Return one row per document, of width ``vector_size``.

        An empty ``docs`` gives an array of shape ``(0, vector_size)``.
        """
        rows = [self.word_average(sent) for sent in docs]
        if not rows:
            # np.vstack raises on an empty sequence.
            return np.empty((0, self.vector_size))
        return np.vstack(rows)

    def word_average(self, sent):
        """Average the vectors of the words of ``sent`` known to the model.

        Returns a zero vector when none of the words is known.
        """
        # Dict membership is O(1); scanning `index_to_key` (a list) for every
        # word is prohibitively slow on large pretrained vocabularies.
        known = self.word_model_vector.key_to_index
        vectors = [self.word_model_vector.get_vector(word)
                   for word in sent if word in known]

        if not vectors:  # no known words
            return np.zeros(self.vector_size)
        return np.array(vectors).mean(axis=0)

In [None]:
# Wrap the trained word vectors in the mean-embedding vectorizer.
mean_embedding_vectorizer = MeanEmbeddingVectorizer(w2v_model.wv)

# One averaged vector per review.
words_w2v_vectorized = mean_embedding_vectorizer.transform(subsample['review_words'])

# Show the vector of the first review.
words_w2v_vectorized[:1]

In [None]:
# Per review: number of words, then length of its averaged vector.
for words, doc_vector in zip(subsample['review_words'], words_w2v_vectorized):
    print(len(words), len(doc_vector))

#### Updating

In [None]:
# Second sample used to update the saved model; seeded so the run is
# reproducible.
subsample2 = train.sample(10, random_state=43).copy()
subsample2['review_text_stem'] = subsample2['review_text'].apply(stem)
# Extra arguments of `remove_stop_words` are passed through `args` only.
subsample2['review_text_stem_no_sw'] = subsample2['review_text'].apply(
    remove_stop_words, args=(stem, spacy_stopwords)
)
subsample2['review_words'] = subsample2['review_text_stem_no_sw'].str.split(' ')

In [None]:
# Reload the full Word2Vec model saved earlier under 'word2vec_model'.
w2v_model_reloaded = gensim.models.Word2Vec.load('word2vec_model')

In [None]:
# update=True asks gensim to extend the loaded model's vocabulary with the
# new reviews instead of rebuilding it (see gensim's online-training docs).
w2v_model_reloaded.build_vocab(subsample2['review_words'], update=True)

In [None]:
# Vocabulary size after the update.
len(w2v_model_reloaded.wv.index_to_key)

In [None]:
# Continue training on the new reviews for 5 epochs.
# NOTE(review): `corpus_count` is presumably refreshed by the preceding
# build_vocab call so that it matches this corpus; confirm against gensim docs.
w2v_model_reloaded.train(subsample2['review_words'], 
                         total_examples=w2v_model_reloaded.corpus_count, 
                         epochs=5)

In [None]:
# Persist the updated full model.
w2v_model_reloaded.save('w2v_model_updated')

# Persist only the word vectors (the `wv` attribute) under a separate name.
w2v_model_reloaded.wv.save('w2v_model_vectors_updated')

In [None]:
# Reload the saved word vectors; mmap='r' requests read-only memory mapping
# of the stored arrays (per gensim's load documentation).
w2v_model_vectors_reloaded = gensim.models.KeyedVectors.load('w2v_model_vectors_updated', mmap='r')
w2v_model_vectors_reloaded

### Word2Vec + TF-IDF

In [None]:
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

class TfidfEmbeddingVectorizer(object):
    """Represent a tokenized document by the mean of its IDF-weighted word vectors.

    Parameters
    ----------
    word_model_vectors : object
        Word-vector store exposing ``vector_size``, ``key_to_index`` and
        ``get_vector(word)`` (for example gensim ``KeyedVectors``).

    Attributes
    ----------
    word_idf_weight : defaultdict or None
        Maps a word to its IDF weight; words not seen in ``fit`` get the
        largest IDF observed. ``None`` until ``fit`` has been called.
    """

    def __init__(self, word_model_vectors):

        self.word_model_vectors = word_model_vectors
        self.word_idf_weight = None
        self.vector_size = word_model_vectors.vector_size

    def fit(self, docs):
        """Learn IDF weights from ``docs`` (an iterable of token lists)."""
        # The documents are already tokenized. An identity analyzer keeps the
        # TF-IDF vocabulary identical to those tokens; joining them and letting
        # the default analyzer re-tokenize would drop one-character tokens,
        # which would then wrongly receive the default (maximum) weight.
        tfidf = TfidfVectorizer(analyzer=lambda tokens: tokens)
        tfidf.fit(docs)

        max_idf = tfidf.idf_.max()  # used as default value for defaultdict

        self.word_idf_weight = defaultdict(
            lambda: max_idf,
            ((word, tfidf.idf_[i]) for word, i in tfidf.vocabulary_.items()))
        return self

    def transform(self, docs):
        """Return one row per document, of width ``vector_size``.

        An empty ``docs`` gives an array of shape ``(0, vector_size)``.
        """
        rows = [self.word_average(sent) for sent in docs]
        if not rows:
            # np.vstack raises on an empty sequence.
            return np.empty((0, self.vector_size))
        return np.vstack(rows)

    def word_average(self, sent):
        """Average the IDF-weighted vectors of the known words of ``sent``.

        Returns a zero vector when none of the words is known to the model.
        """
        # Dict membership is O(1); scanning `index_to_key` (a list) for every
        # word is prohibitively slow on large vocabularies.
        known = self.word_model_vectors.key_to_index
        weighted = [self.word_model_vectors.get_vector(word) * self.word_idf_weight[word]
                    for word in sent if word in known]

        if not weighted:
            return np.zeros(self.vector_size)
        return np.array(weighted).mean(axis=0)


In [None]:
# Wrap the trained word vectors in the IDF-weighted vectorizer.
tfidf_vec = TfidfEmbeddingVectorizer(w2v_model.wv)

tfidf_vec.fit(subsample['review_words'])  # fit tfidf model first

# One IDF-weighted averaged vector per review.
words_w2v_tfidf_vectorized = tfidf_vec.transform(subsample['review_words'])

# Show the vector of the first review.
words_w2v_tfidf_vectorized[:1]

### Pretrained GloVe model from gensim
https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html

In [None]:
# Models and corpora available in gensim

import json
import gensim.downloader as api

# Catalogue returned by gensim's downloader API.
info = api.info()

# Pretty-print the whole catalogue as indented JSON.
print(json.dumps(info, indent=4))

In [None]:
# One summary line per available model, in alphabetical order: name,
# record count (-1 when absent) and the first 100 characters of the description.

for model_name in sorted(info['models']):
    model_data = info['models'][model_name]
    record_count = model_data.get('num_records', -1)
    summary = model_data['description'][:100] + '...'
    print('%s (%d records): %s' % (model_name, record_count, summary))

In [None]:
# Load the "glove-wiki-gigaword-50" vectors through gensim's downloader API.
glove_model_vectors = api.load("glove-wiki-gigaword-50") 
glove_model_vectors

In [None]:
# Number of keys in the GloVe vocabulary.
len(glove_model_vectors.index_to_key)

In [None]:
# GloVe vector for the token 'book'.
glove_model_vectors.get_vector('book')

In [None]:
# Reuse the mean-embedding vectorizer with the pretrained GloVe vectors.
# NOTE(review): `review_words` holds stemmed tokens; stems that are not real
# words are presumably missing from the GloVe vocabulary and get skipped.
mev_glove = MeanEmbeddingVectorizer(glove_model_vectors)

# One averaged GloVe vector per review.
words_glove_vectorized = mev_glove.transform(subsample['review_words'])

# Show the vector of the first review.
words_glove_vectorized[:1]

### BERT embeddings
The BERT (Bidirectional Encoder Representations from Transformers) family of models uses the Transformer encoder architecture to process each token of input text in the full context of all tokens before and after. <br>
https://huggingface.co/docs/transformers/training#train-with-pytorch-trainer
https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#1-loading-pre-trained-bert

https://www.kaggle.com/code/colearninglounge/vectorization-embeddings-elmo-bert-gpt#Using-the-Transfomer-Method

In [None]:
from transformers import AutoTokenizer, BertModel
import tensorflow as tf
import torch

# The tokenizer must come from the same checkpoint as the model loaded below
# ('bert-base-uncased'); the cased and uncased checkpoints use different
# vocabularies, so their token ids are not interchangeable.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Pad or truncate every review to the model's maximum length and return
# PyTorch tensors.
encoded_reviews = tokenizer(subsample['review_text'].tolist(), padding='max_length', truncation=True, return_tensors='pt')

tokens_tensor = encoded_reviews['input_ids']
segments_tensors = encoded_reviews['token_type_ids']

encoded_reviews

In [None]:
# Pretrained uncased BERT encoder.
# NOTE(review): the token ids fed to this model must be produced by a
# tokenizer loaded from the same 'bert-base-uncased' checkpoint.
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )


In [None]:
# Switch the model to evaluation mode before running inference.
model.eval()

In [None]:
# Inference only, so gradients are not tracked.
with torch.no_grad():
    # Pass everything by keyword: the second positional parameter of
    # BertModel.forward is `attention_mask`, not `token_type_ids`, and the
    # attention mask is needed so that padding positions are ignored.
    outputs = model(input_ids=tokens_tensor,
                    attention_mask=encoded_reviews['attention_mask'],
                    token_type_ids=segments_tensors)
    # Available because the model was loaded with output_hidden_states=True.
    hidden_states = outputs.hidden_states

In [None]:
# Walk down the nesting of `hidden_states` (layer -> batch item -> token ->
# hidden unit), printing the size found at each level.
layer_i = batch_i = token_i = 0
first_layer = hidden_states[layer_i]

print(f'Number of layers: {len(hidden_states)} (initial embeddings + 12 BERT layers)')
print('Number of batches', len(first_layer))
print('Number of tokens', len(first_layer[batch_i]))
print('Number of hidden units', len(first_layer[batch_i][token_i]))


In [None]:
# `hidden_states` is a plain Python sequence holding one entry per layer
# (presumably a tuple in current transformers versions; the print below shows it).
print('Type of hidden_states: ', type(hidden_states))

In [None]:
# Every entry of `hidden_states` is a torch tensor; show the shape of the first.
first_layer_shape = hidden_states[0].shape
print('Tensor shape for each layer: ', first_layer_shape)

In [None]:
# Combine the per-layer tensors into a single tensor. `stack` (rather than
# `cat`) adds a new leading dimension that indexes the layer.
token_embeddings = torch.stack(hidden_states)

token_embeddings.shape


In [None]:
# `hidden_states` has shape [13 x 10 x 512 x 768]

# `token_vecs` is a tensor with shape [512 x 768]: second-to-last layer, first review
token_vecs = hidden_states[-2][0]

# Reviews are padded to 512 tokens; the attention mask is 1 for real tokens
# and 0 for padding. Shape [512 x 1] so it broadcasts over the hidden units.
token_mask = encoded_reviews['attention_mask'][0].unsqueeze(-1).to(token_vecs.dtype)

# Average only the real tokens, so padding does not dilute the embedding.
sentence_embedding = (token_vecs * token_mask).sum(dim=0) / token_mask.sum().clamp(min=1)


In [None]:
# Sentence embedding for every review: mean of the second-to-last layer over
# the real (non-padding) tokens only.
layer_vecs = hidden_states[-2]                                   # [batch x tokens x hidden]
pad_mask = encoded_reviews['attention_mask'].unsqueeze(-1).to(layer_vecs.dtype)  # [batch x tokens x 1]
pooled = (layer_vecs * pad_mask).sum(dim=1) / pad_mask.sum(dim=1).clamp(min=1)

# Keep the original container type: a list with one 1-D tensor per review.
embeddings = list(pooled.unbind(0))
embeddings[:1]

In [None]:
# Length of the first layer's tensor along its first dimension (one entry per review).
len(hidden_states[0])