In [1]:
import pandas as pd
import collections
from collections import Counter
import string
import numpy as np

In [2]:
# Load the review dataset from a CSV file into a DataFrame.
df_all = pd.read_csv("reviews_with_splits_lite.csv")

In [3]:
# Quick overview: (rows, columns) shape, a separator line, then the first rows.
print("shape of the data: ", df_all.shape)
print('-'*60)
print(df_all.head())

shape of the data:  (56000, 3)
------------------------------------------------------------
     rating                                             review  split
0  negative  terrible place to work for i just heard a stor...  train
1  negative   hours , minutes total time for an extremely s...  train
2  negative  my less than stellar review is for service . w...  train
3  negative  i m granting one star because there s no way t...  train
4  negative  the food here is mediocre at best . i went aft...  train


In [4]:
class Vocabulary(object):
    """Two-way mapping between tokens and integer indices.

    Tokens are stored in `_token_to_idx` (token -> index) and mirrored in
    `_idx_to_token` (index -> token). Optionally an UNK token is registered
    so that looking up an unseen token returns the UNK index instead of failing.
    """
    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices;
                it is copied, so the caller's dict is never modified
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the Vocabulary
        """
        ### Copy the incoming map: add_token() writes into _token_to_idx, and
        ### storing the caller's dict directly would silently mutate it.
        self._token_to_idx = dict(token_to_idx) if token_to_idx is not None else {}
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

        self._add_unk   = add_unk
        self._unk_token = unk_token
        ### -999 is the sentinel for "no UNK token"; lookup_token() treats any
        ### negative unk_index as UNK being disabled.
        self.unk_index  = -999
        ### If add_unk=True the UNK token is registered right away, and
        ### unk_index becomes its index (0 when starting from an empty map).
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
                (the existing index if the token was already present)
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            ### A new token gets the next index, i.e. the current vocabulary size
            index = len(self._token_to_idx)
            ### keep both directions of the mapping in sync
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Retrieve the index associated with the token
          or the UNK index if token isn't present.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Raises:
            KeyError: if the token is unknown and no UNK token was added
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary)
              for the UNK functionality
        """
        if self.unk_index >= 0:
            ### .get(): return self.unk_index if the key "token" does not exist.
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            ### %s (not %d) so that a non-integer index still produces the
            ### documented KeyError instead of a TypeError while formatting.
            raise KeyError("the index (%s) is not in the Vocabulary" % (index,))
        return self._idx_to_token[index]

    def __len__(self):
        """Number of tokens in the Vocabulary (including UNK, if it was added)."""
        return len(self._token_to_idx)

# 1. ReviewVectorizer class

In [5]:
class ReviewVectorizer(object):
    """Pairs a review Vocabulary with a rating Vocabulary and uses them to vectorize text"""
    def __init__(self, review_vocab, rating_vocab):
        """
        Args:
            review_vocab (Vocabulary): word -> integer index mapping
            rating_vocab (Vocabulary): class label -> integer index mapping
        """
        self.review_vocab, self.rating_vocab = review_vocab, rating_vocab

    @classmethod
    def from_dataframe(cls, review_df, cutoff):
        """Build a vectorizer whose vocabularies are derived from a dataframe

        Args:
            review_df (pandas.DataFrame): the review dataset; its `rating`
                and `review` columns are read
            cutoff (int): frequency threshold; a word is kept only when it
                occurs strictly more than `cutoff` times
        Returns:
            an instance of the ReviewVectorizer
        """
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)

        ### Register the distinct rating labels in sorted order,
        ### so that label indices do not depend on row order
        distinct_ratings = list(set(review_df.rating))
        distinct_ratings.sort()
        for label in distinct_ratings:
            rating_vocab.add_token(label)

        ### Tally every space-separated piece of every review.
        ### `in` against a str is a substring test, so single punctuation marks
        ### are skipped, and so is the empty string produced by repeated spaces.
        tally = Counter(
            piece
            for text in review_df.review
            for piece in text.split(" ")
            if piece not in string.punctuation
        )

        ### Keep the words seen more than `cutoff` times, in first-seen order
        frequent_words = [piece for piece, seen in tally.items() if seen > cutoff]
        for piece in frequent_words:
            review_vocab.add_token(piece)

        return cls(review_vocab, rating_vocab)

    ### Core job of the Vectorizer: string in, fixed-length numeric vector out.
    def vectorize(self, review):
        """
        Encode a review as a collapsed one-hot vector over the review vocabulary.
        Every position whose word occurs in the review is set to 1,
        regardless of how many times the word occurs.
        Drawbacks of this encoding:
        1 - Sparse: a review uses far fewer distinct words than the vocabulary holds
        2 - Word order is lost

        Args:
            review (str): the review
        Returns:
            one_hot (np.ndarray): the collapsed one-hot encoding
        """
        ### One slot per vocabulary entry, all zeros to begin with
        encoding = np.zeros(len(self.review_vocab), dtype=np.float32)
        ### Same filtering as in from_dataframe: skip punctuation and empty pieces
        kept_pieces = (piece for piece in review.split(" ")
                       if piece not in string.punctuation)
        ### Translate each remaining piece to its index and switch that slot on
        for slot in map(self.review_vocab.lookup_token, kept_pieces):
            encoding[slot] = 1
        return encoding

# 2. Instantiate the ReviewVectorizer from the training data

### First draw a sample (static, with a fixed random seed) from the entire dataset

In [6]:
# Draw 100 rows at random; random_state fixes the seed so the draw is repeatable.
df_sample = df_all.sample(100,random_state=100)

In [7]:
# Preview the first rows of the sample.
df_sample.head()

Unnamed: 0,rating,review,split
1834,negative,we re not fans . the cake itself is nothing sp...,train
12249,negative,"service at the bar was good , food not so good...",train
31400,positive,this place is just great ! we have been here f...,train
49759,positive,had some time to kill between the chandler bbq...,val
7228,negative,i m disappointed that people actually go here ...,train


In [8]:
# Count the sampled reviews for each (rating, split) combination.
pd.crosstab(df_sample['rating'], df_sample['split'])

split,test,train,val
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,6,34,9
positive,11,29,11


In [9]:
### A Vectorizer with cutoff = 8,
### i.e., only tokens that appear more than 8 times (count > cutoff) in the sampled reviews are added
vectorizer_cutoff_8 = ReviewVectorizer.from_dataframe(df_sample,8)

### A vectorizer has two vocabularies (attributes): one for reviews, one for ratings

In [10]:
# Show the instance attributes of the vectorizer as a dict.
vars(vectorizer_cutoff_8)

{'review_vocab': <__main__.Vocabulary at 0x7f99c4ded4c0>,
 'rating_vocab': <__main__.Vocabulary at 0x7f99c4deda60>}

In [11]:
# Rating vocabulary: print both directions of the label <-> index mapping.
print('rating_vocab')
print(vectorizer_cutoff_8.rating_vocab._token_to_idx)
print(vectorizer_cutoff_8.rating_vocab._idx_to_token)
print('-'*60)
# Review vocabulary: only report its size (number of tokens).
print('review_vocab')
print(f"Includes {len(vectorizer_cutoff_8.review_vocab)} tokens")

rating_vocab
{'negative': 0, 'positive': 1}
{0: 'negative', 1: 'positive'}
------------------------------------------------------------
review_vocab
Includes 207 tokens


# 3. Methods

### (classmethod) from_dataframe(review_df, cutoff): Instantiate the vectorizer from the dataset dataframe.
1. First instantiate two Vocabularies based on the input data "review_df". [See a walkthrough of Vocabulary class here](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/Yelp_Reviews/class_Vocabulary.ipynb).
2. Use the review_vocab and rating_vocab as inputs to instantiate a vectorizer.

### vectorize(review): It takes as an argument a string representing a review, and returns a vectorized representation of the review. This is the key functionality of the Vectorizer.

In [12]:
# A short sentence used below as the input to vectorize().
example_text = "the sun is shining and it is a beautiful day"

In [13]:
# For each cutoff, rebuild the vectorizer from the sample and encode the same example sentence.
# Only words whose count exceeds the cutoff are added to the review vocabulary.
cut_off_list = [10,50,100]
for c in cut_off_list:
    vectorizer = ReviewVectorizer.from_dataframe(df_sample,c)
    one_hot    = vectorizer.vectorize(example_text)
    print(f"cutoff={c}")
    print(f'Review Vocabulary: the words appear >{c} times')
    # Print the full index -> token mapping of the review vocabulary
    print(vectorizer.review_vocab._idx_to_token)
    print('One-hot representation:', one_hot)
    print('-'*100)

cutoff=10
Review Vocabulary: the words appear >10 times
{0: '<UNK>', 1: 'we', 2: 're', 3: 'not', 4: 'the', 5: 'is', 6: 'and', 7: 'to', 8: 'be', 9: 'on', 10: 'for', 11: 'us', 12: 'that', 13: 'or', 14: 's', 15: 'have', 16: 'n', 17: 'should', 18: 'a', 19: 'will', 20: 'but', 21: 'this', 22: 'like', 23: 'of', 24: 'nthe', 25: 'has', 26: 'much', 27: 'better', 28: 'than', 29: 'it', 30: 'what', 31: 'as', 32: 'in', 33: 'too', 34: 'time', 35: 'service', 36: 'at', 37: 'bar', 38: 'was', 39: 'good', 40: 'food', 41: 'so', 42: 'my', 43: 'restaurant', 44: 'very', 45: 'place', 46: 'just', 47: 'great', 48: 'been', 49: 'here', 50: 'lunch', 51: 'you', 52: 'always', 53: 'get', 54: 'they', 55: 'menu', 56: 'which', 57: 'are', 58: 'take', 59: 'nice', 60: 'had', 61: 'some', 62: 'night', 63: 'when', 64: 'i', 65: 'all', 66: 'before', 67: 'after', 68: 'can', 69: 'way', 70: 'out', 71: 'sure', 72: 'your', 73: 'still', 74: 'decent', 75: 'ordered', 76: 'chicken', 77: 'their', 78: 'our', 79: 'were', 80: 'if', 81: 'by',