In [1]:
import pandas as pd
import collections
from collections import Counter
import string
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
df_all = pd.read_csv("reviews_with_splits_lite.csv")
print("shape of the data: ", df_all.shape)
print('-'*60)
print(df_all.head())

shape of the data:  (56000, 3)
------------------------------------------------------------
     rating                                             review  split
0  negative  terrible place to work for i just heard a stor...  train
1  negative   hours , minutes total time for an extremely s...  train
2  negative  my less than stellar review is for service . w...  train
3  negative  i m granting one star because there s no way t...  train
4  negative  the food here is mediocre at best . i went aft...  train


# Define two relevant classes
### - Vocabulary ([see a walkthrough here](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/Yelp_Reviews/class_Vocabulary.ipynb))
### - ReviewVectorizer ([see a walkthrough here](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/Yelp_Reviews/class_Vectorizer.ipynb))

In [3]:
class Vocabulary(object):
    """Class to process text and extract vocabulary for mapping.

    Keeps two dictionaries in sync: token -> index and index -> token.
    """
    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the Vocabulary
        """
        ### Work on a private copy: registering UNK (or any later token) must not
        ### silently modify the dictionary object owned by the caller.
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = dict(token_to_idx)
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

        self._add_unk   = add_unk
        self._unk_token = unk_token
        ### -999 is the sentinel meaning "no UNK token registered";
        ### lookup_token() checks for unk_index >= 0.
        self.unk_index  = -999
        ### With add_unk=True the UNK token is registered right away, so
        ### unk_index becomes a real index (0 when starting from an empty map).
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
              (the existing index if the token was already present)
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            ### New tokens receive the next free index, i.e. the current size.
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Retrieve the index associated with the token
          or the UNK index if token isn't present.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Raises:
            KeyError: if the token is absent and no UNK token was registered
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary)
              for the UNK functionality
        """
        if self.unk_index >= 0:
            ### .get(): fall back to self.unk_index when the token is unknown.
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            ### %s (not %d) so that a non-integer index still produces the
            ### documented KeyError instead of a TypeError from formatting.
            raise KeyError("the index (%s) is not in the Vocabulary" % (index,))
        return self._idx_to_token[index]

    def __len__(self):
        """Number of tokens currently stored (including UNK, if registered)."""
        return len(self._token_to_idx)
    
class ReviewVectorizer(object):
    """Pairs a word Vocabulary with a label Vocabulary and uses them to encode reviews."""
    def __init__(self, review_vocab, rating_vocab):
        """
        Args:
            review_vocab (Vocabulary): maps words to integers
            rating_vocab (Vocabulary): maps class labels to integers
        """
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab

    @classmethod
    def from_dataframe(cls, review_df, cutoff):
        """Build both vocabularies from a dataframe and wrap them in a vectorizer

        Args:
            review_df (pandas.DataFrame): the review dataset; its `rating` and
              `review` columns are read
            cutoff (int): a word is kept only if it occurs strictly more than
              `cutoff` times
        Returns:
            an instance of the ReviewVectorizer
        """
        word_vocab  = Vocabulary(add_unk=True)
        label_vocab = Vocabulary(add_unk=False)

        ### Labels are registered in sorted order so their indices are deterministic.
        for label in sorted(set(review_df.rating)):
            label_vocab.add_token(label)

        ### Count every space-separated token of every review, skipping tokens that
        ### are a substring of string.punctuation (this also skips empty strings).
        frequencies = Counter(
            token
            for text in review_df.review
            for token in text.split(" ")
            if token not in string.punctuation
        )

        ### Keep the frequent tokens, in first-seen order.
        frequent_tokens = [token for token, seen in frequencies.items()
                           if seen > cutoff]
        for token in frequent_tokens:
            word_vocab.add_token(token)

        return cls(word_vocab, label_vocab)

    ### Core job of the Vectorizer: string in, fixed-length numeric vector out.
    def vectorize(self, review):
        """
        Encode a review as a collapsed one-hot (multi-hot) vector.
        Known limitations of this representation:
        1 - Sparse: a review uses far fewer distinct words than the vocabulary holds
        2 - Word order is lost

        Args:
            review (str): the review
        Returns:
            one_hot (np.ndarray): float32 vector of length len(review_vocab) with
              a 1 at the index of every token found in the review
        """
        vocab    = self.review_vocab
        encoding = np.zeros(len(vocab), dtype=np.float32)
        ### Same punctuation filter as in from_dataframe().
        kept_tokens = (piece for piece in review.split(" ")
                       if piece not in string.punctuation)
        for piece in kept_tokens:
            encoding[vocab.lookup_token(piece)] = 1
        return encoding

# 1. ReviewDataset class
### - The Dataset class will characterize the key features of the dataset.
### - In the initialization function of the class, make the class inherit the properties of torch.utils.data.Dataset so that we can later leverage its functionalities.
### - In the \_\_init\_\_() function and the set_split() function, store important information such as labels and the features that we wish to generate at each pass.
### - Each call requests a sample index for which the upperbound is specified in the \_\_len\_\_() method.
### - When the sample corresponding to a given index is called, the generator executes the \_\_getitem\_\_() method to generate it.

In [4]:
class ReviewDataset(Dataset):
    """Map-style dataset that serves vectorized reviews from one active split at a time."""
    def __init__(self,review_df,vectorizer):
        """
        Args:
            review_df (pandas.DataFrame): the dataset; its `split`, `review` and
              `rating` columns are read
            vectorizer: object exposing vectorize() and a `rating_vocab`
              with lookup_token()
        """
        self.review_df   = review_df
        self._vectorizer = vectorizer

        ### Partition the frame once, by the value of the `split` column.
        split_column  = self.review_df.split
        self.train_df = self.review_df[split_column=='train']
        self.val_df   = self.review_df[split_column=='val']
        self.test_df  = self.review_df[split_column=='test']

        self.train_size      = len(self.train_df)
        self.validation_size = len(self.val_df)
        self.test_size       = len(self.test_df)

        ### split name -> (dataframe, number of rows); consumed by set_split()
        self._lookup_dict = {
            'train': (self.train_df, self.train_size),
            'val': (self.val_df, self.validation_size),
            'test': (self.test_df, self.test_size),
        }
        ### The training split is active by default.
        self.set_split('train')

    @classmethod
    def load_csv_and_make_vectorizer(cls,review_csv,cut_off):
        """Read a csv file and build the dataset with a fresh vectorizer
        Args:
            review_csv (str): location of the dataset
            cut_off (int): frequency cutoff handed to ReviewVectorizer.from_dataframe
        Returns:
            an instance of ReviewDataset
        """
        ### Identical to the DataFrame path once the file has been read.
        return cls.load_df_and_make_vectorizer(pd.read_csv(review_csv),cut_off)

    @classmethod
    def load_df_and_make_vectorizer(cls,review_df,cut_off):
        """Build the dataset with a fresh vectorizer from an in-memory dataframe
        Args:
            review_df: dataset
            cut_off (int): frequency cutoff handed to ReviewVectorizer.from_dataframe
        Returns:
            an instance of ReviewDataset
        """
        ### Only the training rows are used to fit the vectorizer.
        training_rows = review_df[review_df.split=='train']
        fitted = ReviewVectorizer.from_dataframe(training_rows,cut_off)
        return cls(review_df,fitted)

    def set_split(self, split="train"):
        """ make one of the splits the active one
        Args:
            split (str): one of "train", "val", or "test"
        """
        self._target_split = split
        ### e.g. split='train' -> _target_df is the training frame
        active_df, active_size = self._lookup_dict[split]
        self._target_df   = active_df
        self._target_size = active_size

    def __len__(self):
        ### size of the active split, as recorded by set_split()
        return self._target_size

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets

        Args:
            index (int): position of the data point inside the active split
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """
        record = self._target_df.iloc[index]
        features = self._vectorizer.vectorize(record.review)
        label    = self._vectorizer.rating_vocab.lookup_token(record.rating)
        return {'x_data': features, 'y_target': label}

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer

    def get_num_batches(self, batch_size):
        """Number of complete batches the active split can provide
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset (an incomplete last batch is not counted)
        """
        complete_batches = len(self) // batch_size
        return complete_batches

# 2. Instantiate a ReviewDataset from the training data
### Two classmethods can be used to instantiate a ReviewDataset: load_csv_and_make_vectorizer() and load_df_and_make_vectorizer(). The difference is whether the input data comes from a csv file or from a pd.DataFrame.

### First draw a sample (static, fixed random seed) from the entire dataset

In [5]:
df_sample = df_all.sample(100,random_state=100)

In [6]:
df_sample.head()

Unnamed: 0,rating,review,split
1834,negative,we re not fans . the cake itself is nothing sp...,train
12249,negative,"service at the bar was good , food not so good...",train
31400,positive,this place is just great ! we have been here f...,train
49759,positive,had some time to kill between the chandler bbq...,val
7228,negative,i m disappointed that people actually go here ...,train


In [7]:
pd.crosstab(df_sample['rating'], df_sample['split'])

split,test,train,val
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,6,34,9
positive,11,29,11


### Create a ReviewDataset with cutoff = 50 (i.e., add a token to the review_vocab only if the token appears more than 50 times)

In [8]:
dataset_sample = ReviewDataset.load_df_and_make_vectorizer(df_sample,50)

## 2.1 - Attributes of a ReviewDataset

### .review_df: the input dataframe

In [9]:
dataset_sample.review_df

Unnamed: 0,rating,review,split
1834,negative,we re not fans . the cake itself is nothing sp...,train
12249,negative,"service at the bar was good , food not so good...",train
31400,positive,this place is just great ! we have been here f...,train
49759,positive,had some time to kill between the chandler bbq...,val
7228,negative,i m disappointed that people actually go here ...,train
...,...,...,...
39446,positive,my favorite place to park as they have great r...,train
55650,positive,jenni at the southwest airlines helping my dau...,test
55382,positive,always pleased . good portions of fish in roll...,test
38349,positive,we had our year anniversary dinner at differen...,train


In [10]:
dataset_sample.review_df.equals(df_sample)

True

### ._vectorizer

In [11]:
### Note that the vectorizer is derived from the training split. 
v = dataset_sample._vectorizer

In [12]:
example_text = "the sun is shining and it is a beautiful day"
one_hot      = v.vectorize(example_text)

In [13]:
print(f'Review Vocabulary: the words appear >50 times')
print('-'*60)
print("_idx_to_token: ", v.review_vocab._idx_to_token)
print('-'*60)
print('One-hot representation:', one_hot)

Review Vocabulary: the words appear >50 times
------------------------------------------------------------
_idx_to_token:  {0: '<UNK>', 1: 'not', 2: 'the', 3: 'is', 4: 'and', 5: 'to', 6: 'on', 7: 'for', 8: 'that', 9: 's', 10: 'have', 11: 'n', 12: 'a', 13: 'this', 14: 'of', 15: 'it', 16: 'in', 17: 'at', 18: 'was', 19: 'my', 20: 'you', 21: 'they', 22: 'are', 23: 'i', 24: 'with', 25: 'had', 26: 't', 27: 'there'}
------------------------------------------------------------
One-hot representation: [1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


### ._target_df, _target_size
**Defined by method set_split()**

In [14]:
dataset_sample._target_df

Unnamed: 0,rating,review,split
1834,negative,we re not fans . the cake itself is nothing sp...,train
12249,negative,"service at the bar was good , food not so good...",train
31400,positive,this place is just great ! we have been here f...,train
7228,negative,i m disappointed that people actually go here ...,train
42913,positive,"after the construction completed , this hyatt ...",train
...,...,...,...
9245,negative,my job forced me to spend long summer days in ...,train
11195,negative,horrible service . nobody to even help find ou...,train
39446,positive,my favorite place to park as they have great r...,train
38349,positive,we had our year anniversary dinner at differen...,train


In [15]:
dataset_sample._target_size

63

### ._lookup_dict - will be used in the method set_split()

In [16]:
dataset_sample._lookup_dict

{'train': (         rating                                             review  split
  1834   negative  we re not fans . the cake itself is nothing sp...  train
  12249  negative  service at the bar was good , food not so good...  train
  31400  positive  this place is just great ! we have been here f...  train
  7228   negative  i m disappointed that people actually go here ...  train
  42913  positive  after the construction completed , this hyatt ...  train
  ...         ...                                                ...    ...
  9245   negative  my job forced me to spend long summer days in ...  train
  11195  negative  horrible service . nobody to even help find ou...  train
  39446  positive  my favorite place to park as they have great r...  train
  38349  positive  we had our year anniversary dinner at differen...  train
  31507  positive  the chicken bryan is one of my favorite dishes...  train
  
  [63 rows x 3 columns],
  63),
 'val': (         rating                    

In [17]:
### Each value of the dictionary is a tuple holding a DataFrame and its row count
dataset_sample._lookup_dict['train']

(         rating                                             review  split
 1834   negative  we re not fans . the cake itself is nothing sp...  train
 12249  negative  service at the bar was good , food not so good...  train
 31400  positive  this place is just great ! we have been here f...  train
 7228   negative  i m disappointed that people actually go here ...  train
 42913  positive  after the construction completed , this hyatt ...  train
 ...         ...                                                ...    ...
 9245   negative  my job forced me to spend long summer days in ...  train
 11195  negative  horrible service . nobody to even help find ou...  train
 39446  positive  my favorite place to park as they have great r...  train
 38349  positive  we had our year anniversary dinner at differen...  train
 31507  positive  the chicken bryan is one of my favorite dishes...  train
 
 [63 rows x 3 columns],
 63)

In [18]:
### the dataframe
dataset_sample._lookup_dict['train'][0]

Unnamed: 0,rating,review,split
1834,negative,we re not fans . the cake itself is nothing sp...,train
12249,negative,"service at the bar was good , food not so good...",train
31400,positive,this place is just great ! we have been here f...,train
7228,negative,i m disappointed that people actually go here ...,train
42913,positive,"after the construction completed , this hyatt ...",train
...,...,...,...
9245,negative,my job forced me to spend long summer days in ...,train
11195,negative,horrible service . nobody to even help find ou...,train
39446,positive,my favorite place to park as they have great r...,train
38349,positive,we had our year anniversary dinner at differen...,train


In [19]:
### the sample size
dataset_sample._lookup_dict['train'][1]

63

## 2.2 - Methods of a ReviewDataset

### \_\_len\_\_()

In [20]:
len(dataset_sample)

63

### \_\_getitem\_\_()

In [21]:
### The 4th element in the "train" split
### In the __init__ function, self.set_split('train') defines ._target_df
dataset_sample[3]

{'x_data': array([1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
        1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32),
 'y_target': 0}

In [22]:
df_sample.head(5)

Unnamed: 0,rating,review,split
1834,negative,we re not fans . the cake itself is nothing sp...,train
12249,negative,"service at the bar was good , food not so good...",train
31400,positive,this place is just great ! we have been here f...,train
49759,positive,had some time to kill between the chandler bbq...,val
7228,negative,i m disappointed that people actually go here ...,train


### set_split()

In [23]:
dataset_sample = ReviewDataset.load_df_and_make_vectorizer(df_sample,50)

In [24]:
### Now the split for ._target_df and _target_size is 'train'
dataset_sample._target_df

Unnamed: 0,rating,review,split
1834,negative,we re not fans . the cake itself is nothing sp...,train
12249,negative,"service at the bar was good , food not so good...",train
31400,positive,this place is just great ! we have been here f...,train
7228,negative,i m disappointed that people actually go here ...,train
42913,positive,"after the construction completed , this hyatt ...",train
...,...,...,...
9245,negative,my job forced me to spend long summer days in ...,train
11195,negative,horrible service . nobody to even help find ou...,train
39446,positive,my favorite place to park as they have great r...,train
38349,positive,we had our year anniversary dinner at differen...,train


In [25]:
len(dataset_sample)

63

In [26]:
dataset_sample[3]

{'x_data': array([1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
        1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32),
 'y_target': 0}

In [27]:
### run set_split, switch the split to 'val'
dataset_sample.set_split('val')
# or 
# ReviewDataset.set_split(dataset_sample,'val')

In [28]:
### Now the split for ._target_df and _target_size is 'val'
dataset_sample._target_df

Unnamed: 0,rating,review,split
49759,positive,had some time to kill between the chandler bbq...,val
48068,positive,"love it here ! family favorite , we always mee...",val
20434,negative,the associates are slow and rude . one time i ...,val
49390,positive,sarah is one of the best stylists around vegas...,val
22769,negative,worst service ever . walked in and the woman r...,val
47653,positive,i seriously love this store . i wish i didn t ...,val
23209,negative,this place needs a minus nbecause i so have a ...,val
21945,negative,"it was okay , but meh , i ve experienced bette...",val
50744,positive,great place . perfect service . can t wait to ...,val
47945,positive,i ve been going to this same shop for about yr...,val


In [29]:
len(dataset_sample)

20

In [30]:
dataset_sample[3]

{'x_data': array([1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
        0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0.], dtype=float32),
 'y_target': 1}

### get_vectorizer()

In [31]:
dataset_sample.get_vectorizer()

<__main__.ReviewVectorizer at 0x7fe1e3730460>

In [32]:
### Equivalently
dataset_sample._vectorizer

<__main__.ReviewVectorizer at 0x7fe1e3730460>

### get_num_batches()

In [33]:
dataset_sample.get_num_batches(10)

2

In [34]:
len(dataset_sample._target_df)/10

2.0

In [35]:
len(dataset_sample)/10

2.0

In [36]:
### Switch the split to 'train'
dataset_sample.set_split('train')

In [37]:
dataset_sample.get_num_batches(10)

6

In [38]:
len(dataset_sample._target_df)/10

6.3

In [39]:
len(dataset_sample._target_df)//10

6

# 3. Define a batch generator
### - Wrap the DataLoader
### - Switch the data between the CPU and the GPU.

In [40]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device='cpu'):
    """
    Generator that wraps a PyTorch DataLoader and moves every tensor of
      each batch onto the requested device before yielding the batch.

    Args:
        dataset: dataset whose items are dictionaries (name -> value)
        batch_size (int): number of samples per batch
        shuffle (bool): forwarded to the DataLoader
        drop_last (bool): forwarded to the DataLoader
        device: target passed to .to() for every tensor in the batch
    Yields:
        a dictionary with the same keys as the batch, holding the moved tensors
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        ### Rebuild the batch dictionary with every entry placed on `device`.
        yield {key: value.to(device) for key, value in batch.items()}

## 3.1 Dataset Class
### - The Dataset class characterizes the key features of the dataset you want to generate.
### - The class uses \_\_init\_\_(), \_\_len\_\_(), and \_\_getitem\_\_() to store important information, and generate samples. 
### - The Dataset class is an important argument of the DataLoader class.

In [41]:
data = {'x1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        'x2': [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24],
        'y': [0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0]}
data
df = pd.DataFrame(data)
print("data:" ,data)
print("-"*60)
print("df:",df)

data: {'x1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 'x2': [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], 'y': [0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0]}
------------------------------------------------------------
df:     x1  x2  y
0    1  13  0
1    2  14  1
2    3  15  0
3    4  16  1
4    5  17  1
5    6  18  0
6    7  19  0
7    8  20  1
8    9  21  1
9   10  22  0
10  11  23  1
11  12  24  0


In [42]:
##### Define Dataset class
class CustomDataset(Dataset):
    """Dataset over a DataFrame: all columns but the last are features, the last is the target."""
    def __init__(self, dataframe):
        ### Keep a reference to the frame; rows are converted lazily in __getitem__.
        self.data = dataframe

    def __len__(self):
        ### One sample per row.
        return len(self.data)

    def __getitem__(self, index):
        """Return (features, target) for row `index` as float32 tensors."""
        feature_values = self.data.iloc[index, :-1].values
        target_value   = self.data.iloc[index, -1]
        features = torch.tensor(feature_values, dtype=torch.float32)
        label    = torch.tensor(target_value, dtype=torch.float32)
        return features, label

##### Instantiate the Dataset class
custom_dataset = CustomDataset(df)

##### Instantiate the DataLoader class
batch_size  = 3
data_loader = DataLoader(dataset=custom_dataset, batch_size=batch_size, shuffle=False)

##### Obtain the batch
i = 0
for batch in data_loader:
    print('Batch '+str(i))
    i+=1
    print(batch)
    print('-' * 60)

Batch 0
[tensor([[ 1., 13.],
        [ 2., 14.],
        [ 3., 15.]]), tensor([0., 1., 0.])]
------------------------------------------------------------
Batch 1
[tensor([[ 4., 16.],
        [ 5., 17.],
        [ 6., 18.]]), tensor([1., 1., 0.])]
------------------------------------------------------------
Batch 2
[tensor([[ 7., 19.],
        [ 8., 20.],
        [ 9., 21.]]), tensor([0., 1., 1.])]
------------------------------------------------------------
Batch 3
[tensor([[10., 22.],
        [11., 23.],
        [12., 24.]]), tensor([0., 1., 0.])]
------------------------------------------------------------


### An alternative is to use TensorDataset() directly

In [43]:
from torch.utils.data import TensorDataset

In [44]:
x1 = torch.from_numpy(df['x1'].values).float()
x2 = torch.from_numpy(df['x2'].values).float()
y  = torch.from_numpy(df['y'].values).float()
print("x1:", x1)
print("x2:", x2)
print("y:", y)

x1: tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.])
x2: tensor([13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.])
y: tensor([0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0.])


In [45]:
features = torch.stack([x1, x2], dim=1)
features

tensor([[ 1., 13.],
        [ 2., 14.],
        [ 3., 15.],
        [ 4., 16.],
        [ 5., 17.],
        [ 6., 18.],
        [ 7., 19.],
        [ 8., 20.],
        [ 9., 21.],
        [10., 22.],
        [11., 23.],
        [12., 24.]])

In [46]:
##### Create Tensor dataset
dataset     = TensorDataset(features, y)
batch_size  = 3

##### Instantiate the DataLoader class
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False)

##### Obtain the batch
i = 0
for batch in data_loader:
    print('Batch '+str(i))
    i+=1
    print(batch)
    print('-' * 60)

Batch 0
[tensor([[ 1., 13.],
        [ 2., 14.],
        [ 3., 15.]]), tensor([0., 1., 0.])]
------------------------------------------------------------
Batch 1
[tensor([[ 4., 16.],
        [ 5., 17.],
        [ 6., 18.]]), tensor([1., 1., 0.])]
------------------------------------------------------------
Batch 2
[tensor([[ 7., 19.],
        [ 8., 20.],
        [ 9., 21.]]), tensor([0., 1., 1.])]
------------------------------------------------------------
Batch 3
[tensor([[10., 22.],
        [11., 23.],
        [12., 24.]]), tensor([0., 1., 0.])]
------------------------------------------------------------


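### The two methods below build the same feature matrix (the second keeps the integer dtype of the DataFrame; append `.float()` to match the first exactly)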

In [47]:
x1 = torch.from_numpy(df['x1'].values).float()
x2 = torch.from_numpy(df['x2'].values).float()
torch.stack([x1, x2], dim=1)

tensor([[ 1., 13.],
        [ 2., 14.],
        [ 3., 15.],
        [ 4., 16.],
        [ 5., 17.],
        [ 6., 18.],
        [ 7., 19.],
        [ 8., 20.],
        [ 9., 21.],
        [10., 22.],
        [11., 23.],
        [12., 24.]])

In [48]:
numpy_array = df[['x1', 'x2']].to_numpy()
torch.from_numpy(numpy_array)

tensor([[ 1, 13],
        [ 2, 14],
        [ 3, 15],
        [ 4, 16],
        [ 5, 17],
        [ 6, 18],
        [ 7, 19],
        [ 8, 20],
        [ 9, 21],
        [10, 22],
        [11, 23],
        [12, 24]])

## 3.2 DataLoader
### - batch_size: denotes the number of samples contained in each generated batch.
### - shuffle: if set to True, we will get a new order of exploration at each pass (or just keep a linear exploration scheme otherwise). Shuffling the order in which examples are fed to the classifier is helpful so that batches between epochs do not look alike. Doing so will eventually make our model more robust.
### - drop_last: set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch size. If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller. (default: False)

In [49]:
type(dataset_sample[0]['x_data'])

numpy.ndarray

In [50]:
dataset_sample = ReviewDataset.load_df_and_make_vectorizer(df_sample,50)
batch_size     = 10
shuffle        = True
drop_last      = True
dataloader     = DataLoader(dataset=dataset_sample, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

In [51]:
one_batch = next(iter(dataloader))
print('x in one batch')
print(one_batch['x_data'])
print('size of x_data:', one_batch['x_data'].shape)
print('-' * 60)
print('y in one batch')
print(one_batch['y_target'])
print('size of y_data:', one_batch['y_target'].shape)

x in one batch
tensor([[1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0.,
         1., 1., 0., 0., 0., 1., 0., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
         1., 1., 0., 1., 0., 1., 1., 1., 1., 1.],
        [1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
         1., 1., 1., 1., 0., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.,
         1., 0., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 0., 1., 0., 0., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,

### In this example, dataloader utilizes the return from the \_\_getitem\_\_() method, which extracts related rows from the _target_df of dataset, with _target_size=63. Also, batch_size=10, and drop_last=True so there are 6 batches created (the last 3 rows are dropped)

In [52]:
print('number of rows in the target_df: ', len(dataset_sample._target_df))
print('number of rows in the target_df: ', dataset_sample._target_size)
print("The number of batches is:",dataset_sample.get_num_batches(batch_size = 10))

number of rows in the target_df:  63
number of rows in the target_df:  63
The number of batches is: 6


In [53]:
i = 0
for data_dict in dataloader:
    print('Batch '+str(i))
    i+=1
    print(data_dict)
    print(data_dict['x_data'].shape)
    print('-' * 60)

Batch 0
{'x_data': tensor([[1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
         1., 1., 1., 1., 1., 1., 0., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 0., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 0., 1., 0., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1.,
         1., 1., 0., 1., 0., 1., 0., 1., 0., 1.],
        [1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
         0., 0., 0., 1., 1., 0., 0., 0., 0., 1.],
        [1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1.,
         1., 0., 0., 0., 0., 0., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 0.,
         1., 1., 0., 0., 0., 1., 0., 0., 1., 1.],
        [1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.,

### This is equivalent to defining and using the generator function generate_batches().

In [54]:
### Same loop as the previous cell, but the batches now come from the
### generate_batches() generator defined above instead of from iterating the
### DataLoader directly. batch_size, shuffle and drop_last are the values set
### when `dataloader` was created, so the batches have the same shape.
batch_generator = generate_batches(dataset_sample, batch_size=batch_size,
                                   shuffle=shuffle, drop_last=drop_last)
for i, data_dict in enumerate(batch_generator):
    print('Batch '+str(i))
    print(data_dict)
    print(data_dict['x_data'].shape)
    print('-' * 60)

Batch 0
{'x_data': tensor([[1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
         1., 1., 1., 1., 0., 1., 1., 1., 1., 0.],
        [1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
         1., 0., 1., 0., 0., 1., 1., 1., 1., 1.],
        [1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0.,
         1., 1., 1., 1., 1., 1., 1., 0., 1., 1.],
        [1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 1., 0., 0., 0., 1., 0.],
        [1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
         1., 1., 1., 1., 1., 1., 0., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,

## 3.3 Generator
### - Generator functions declare a function that behaves like an iterator, i.e. it can be used in a for loop.
### - A generator function is defined just like a normal function, but whenever it needs to generate a value, it does so with the yield keyword rather than return. 
### - Yield is used in Python generators. If the body of a def contains yield, the function automatically becomes a generator function. 
### - *return* sends a specified value back to its caller whereas *yield* can produce a sequence of values. We should use *yield* when we want to iterate over a sequence, but don’t want to store the entire sequence in memory.

### Consider a task to calculate the sum of the first n integers

In [55]:
##### The function below builds the full list in memory
def first_n(n):
    """Return the integers 0, 1, ... that are smaller than n, as a list held fully in memory."""
    collected = []
    ### The next value to append always equals the number of values collected so far.
    while len(collected) < n:
        collected.append(len(collected))
    return collected
sum(first_n(100))

4950

In [56]:
##### The following implements generator as an iterable object.
class first_n(object):
    """Iterator object that produces 0, 1, ..., stopping before n."""

    def __init__(self, n):
        self.n = n      # exclusive upper bound
        self.num = 0    # next value to hand out

    def __iter__(self):
        # The object is its own iterator.
        return self

    # Python 3 compatibility
    def __next__(self):
        return self.next()

    def next(self):
        # Guard clause: signal exhaustion once the bound is reached.
        if not self.num < self.n:
            raise StopIteration
        value = self.num
        self.num = value + 1
        return value
        
a = first_n(10)
print('vars(a):', vars(a))
print('sum(a):', sum(a))

vars(a): {'n': 10, 'num': 0}
sum(a): 45


In [57]:
##### a generator that yields items instead of returning a list

def first_n(n):
    """Generator producing 0, 1, ... one value at a time, stopping before n."""
    current = 0
    while True:
        if not current < n:
            return          # ends the generator
        yield current
        current += 1

a = first_n(10)

print('next(a):', next(a))
print('sum(a):', sum(a))
##### If the generator has already produced all its values, calling next() 
##### again will raise a StopIteration exception, indicating that the 
##### generator has been exhausted. use next(generator, default) to 
##### provide a default value, avoiding the occurrence of an exception.
print('next(a):', next(a,None))

next(a): 0
sum(a): 45
next(a): None


In [58]:
##### The generator `a` is already exhausted, so the loop body never runs and nothing is printed
for i in a:
    print (i)

In [59]:
##### using a new generator
for i in first_n(10):
    print (i)

0
1
2
3
4
5
6
7
8
9
