In [1]:
import numpy as np
import torchtext
import random
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import torch
import torchvision
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import torch.nn.functional as F
from torchvision import transforms, utils, datasets
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader, random_split, SubsetRandomSampler, WeightedRandomSampler
from torch.utils.data import Dataset, Sampler, DataLoader
import operator 
import os
from sklearn.utils import shuffle

from tqdm import tqdm
tqdm.pandas()

In [2]:
#import tensorflow_datasets as tfds

In [3]:
# Directory that holds the IMDB 50k movie-review CSV.
path = '../input/imdb-dataset-of-50k-movie-reviews'
# Derive the CSV location from `path` so the directory is defined in one place only.
df = pd.read_csv(os.path.join(path, 'IMDB Dataset.csv'))
# Keep only the first 200 rows; every later cell works on this subset.
df = df.head(200)
df.head()

In [4]:

# Small hand-written toy corpus (one short text per entry).
reviews = [
    'No man is an island itself itself',
    'Entire of itself',
    'Every man is a piece of the continent itself',
    'part of the main',
    'If a clod be washed away by the sea',
    'Europe itself is the less',
    'As well as if a promontory were focus meaning',
    'As well as if a itself manor of thy friend focus focus focus',
    'Or of thine own were meaning',
    'Any man’s death diminishes me meaning',
    'Because I am involved in mankind focus focus island',
    'And therefore never send to know for whom the bell tolls',
    'It tolls for thee',
    'brrrr wrrrr drrrr island',
]


In [5]:
class tokenizer:
    """Whitespace tokenizer with a frequency-capped vocabulary and batch padding.

    Args:
        texts: iterable of strings used both to build the vocabulary and as
            the corpus returned by :meth:`tokenize`.
        max_features: number of most frequent tokens kept in the vocabulary
            (in addition to the two reserved tokens).
        max_len: maximum number of tokens kept per text by :meth:`tokenize`.
        zero_first: when True, :meth:`padded_sequence` pads on the left
            (zeros first); otherwise it pads on the right.
    """

    def __init__(self, texts, max_features, max_len, zero_first=False):
        self.texts = texts
        self.max_features = max_features
        self.max_len = max_len
        self.zero_first = zero_first
        self.token2id = self.buildvocab()

    def buildvocab(self):
        """Build the token -> id mapping from ``self.texts``.

        UNK - "unknown token" - is used to replace the rare words that did not
        fit in the vocabulary, so the sentence ``My name is guotong1988`` is
        translated into ``My name is _unk_``.

        PAD - data is processed in batches and all sequences in a batch must
        have the same length. If the longest sequence has 8 tokens, the
        sentence ``My name is guotong1988`` is padded to fit this length:
        ``My name is guotong1988 _pad_ _pad_ _pad_ _pad_``.

        Example :: texts = ['i love barcelona' , 'barcelona is in spain',
        'spain is in europe' , 'europe spain barcelona']

        Returns:
            dict mapping ``'<PAD>'`` to 0, ``'<UNK>'`` to 1 and the
            ``max_features`` most frequent whitespace-separated tokens to
            consecutive ids starting at 2 (most frequent first).
        """
        counter = Counter()
        for text in self.texts:
            counter.update(text.split())

        token2id = {'<PAD>': 0, '<UNK>': 1}
        for token, _count in counter.most_common(self.max_features):
            # setdefault keeps the reserved ids intact if the corpus happens
            # to contain a literal '<PAD>' or '<UNK>' token.
            token2id.setdefault(token, len(token2id))
        return token2id

    def tokenize(self):
        """Convert every text in ``self.texts`` to a list of token ids.

        Each text is split on whitespace and truncated to ``self.max_len``
        tokens; tokens missing from the vocabulary map to the ``<UNK>`` id.
        """
        unk_id = self.token2id['<UNK>']
        return [
            [self.token2id.get(token, unk_id) for token in text.split()[:self.max_len]]
            for text in self.texts
        ]

    def padded_sequence(self, tokens):
        """Pad a batch of id sequences with zeros to the longest one in the batch.

        Args:
            tokens: sequence of id lists (one per sample).

        Returns:
            ``torch.LongTensor`` of shape ``(len(tokens), longest)``. Padding
            zeros go in front of each sequence when ``self.zero_first`` is
            True and after it otherwise. An empty batch yields a ``(0, 0)``
            tensor.
        """
        if len(tokens) == 0:
            return torch.zeros(0, 0).long()

        lens = [len(seq) for seq in tokens]
        max_len = max(lens)
        padded_seqs = torch.zeros(len(tokens), max_len).long()

        for idx, seq in enumerate(tokens):
            # The left offset must be relative to the batch width (max_len),
            # not self.max_len: the tensor is only as wide as the longest
            # sequence of this batch.
            start = max_len - lens[idx] if self.zero_first else 0
            padded_seqs[idx, start:start + lens[idx]] = torch.LongTensor(seq)

        return padded_seqs

In [6]:
# Vocabulary size and per-text token cap used by the cells below.
max_features = 1500
max_len = 250

# Fit the tokenizer on the raw review strings.
texts = df['review'].tolist()
t = tokenizer(texts=texts, max_features=max_features, max_len=max_len)

In [7]:
#tokens = t.tokenize()
#tokens = t.padded_sequence(tokens)
#index, seqs = zip(*train_dataset)

In [8]:
class CLRPDataset(Dataset):
    """Dataset over pre-tokenized sequences; each item is ``(index, sequence)``."""

    def __init__(self, toks, max_len=250):
        # max_len is only consulted by get_keys(); stored sequences are not cut here.
        self.maxlen = max_len
        self.seqs = toks

    def __getitem__(self, idx):
        """Return the position together with the sequence stored at it."""
        sample = self.seqs[idx]
        return idx, sample

    def __len__(self):
        """Number of stored sequences."""
        return len(self.seqs)

    def get_keys(self):
        """Return an int32 array of sequence lengths, each capped at ``self.maxlen``."""
        capped_lengths = (min(self.maxlen, len(seq)) for seq in self.seqs)
        with_progress = tqdm(capped_lengths, desc='generate lens')
        return np.fromiter(with_progress, dtype=np.int32)

In [9]:
class BucketSampler(Sampler):
    """Sampler that groups samples of similar length into the same batch.

    The sample order is cut into buckets of ``bucket_size``; inside each
    bucket the samples are sorted by descending ``sort_keys`` value and split
    into runs of ``batch_size``. With shuffling enabled, the sample order is
    shuffled before bucketing and the full batches of each bucket are
    shuffled again. At most one incomplete batch exists and it is always
    emitted last.

    Args:
        data_source: forwarded to ``Sampler.__init__``; not used otherwise.
        sort_keys: array with one sort key (e.g. sequence length) per sample.
        bucket_size: samples per bucket; defaults to ``len(sort_keys)``. Must
            be a multiple of ``batch_size`` or equal to ``len(sort_keys)``.
        batch_size: size of the runs that are kept contiguous.
        shuffle_data: when False the order is computed once in the
            constructor; when True it is recomputed on every ``__iter__``.
    """

    def __init__(self, data_source, sort_keys, bucket_size=None, batch_size=64, shuffle_data=True):
        super().__init__(data_source)
        self.shuffle = shuffle_data
        self.batch_size = batch_size
        self.sort_keys = sort_keys
        self.bucket_size = bucket_size if bucket_size is not None else len(sort_keys)
        self.weights = None

        if not shuffle_data:
            self.index = self.prepare_buckets()
        else:
            self.index = None

    def set_weights(self, weights):
        """Set per-sample sampling weights; they are normalized to sum to 1.

        Raises:
            AssertionError: if any weight is negative or all weights are zero.
        """
        weights = np.asarray(weights, dtype=np.float64)
        # `assert weights >= 0` is ambiguous for arrays, so reduce explicitly.
        assert np.all(weights >= 0)
        total = np.sum(weights)
        assert total > 0
        if total != 1:
            weights = weights / total
        self.weights = weights

    def __iter__(self):
        """Iterate over sample indices in bucketed order.

        When weights are set, indices are drawn with replacement according to
        them. The order is only rebuilt here when shuffling is enabled;
        otherwise the order computed in the constructor is reused.
        """
        indices = None
        if self.weights is not None:
            total = len(self.sort_keys)
            indices = np.random.choice(total, (total,), p=self.weights)
        if self.shuffle:
            self.index = self.prepare_buckets(indices)
        return iter(self.index)

    def get_reverse_indexes(self):
        """Return the inverse of the current order: ``result[sample] = position``.

        Requires ``self.index`` to be set, i.e. the sampler was built with
        ``shuffle_data=False`` or has been iterated at least once.
        """
        indexes = np.zeros((len(self.index),), dtype=np.int32)
        for i, j in enumerate(self.index):
            indexes[j] = i
        return indexes

    def __len__(self):
        return len(self.sort_keys)

    def prepare_buckets(self, indices=None):
        """Compute the bucketed sample order.

        Args:
            indices: optional array of sample indices defining the base order
                (e.g. a weighted draw). When None, the base order is the
                identity, shuffled if shuffling is enabled.

        Returns:
            Array of sample indices in the order they should be served.
        """
        # Negate so that an ascending argsort yields descending keys.
        lens = -np.asarray(self.sort_keys)
        assert self.bucket_size % self.batch_size == 0 or self.bucket_size == len(lens)

        if indices is None:
            if self.shuffle:
                indices = shuffle(np.arange(len(lens), dtype=np.int32))
                lens = lens[indices]
            else:
                indices = np.arange(len(lens), dtype=np.int32)
        else:
            # Caller-supplied order: the keys must follow the same order,
            # otherwise the sort below ranks the wrong samples.
            indices = np.asarray(indices)
            lens = lens[indices]

        def divide_chunks(l, n):
            """Yield ``(positions, values)`` for consecutive chunks of ``l`` of size ``n``."""
            if n == len(l):
                yield np.arange(len(l), dtype=np.int32), l
            else:
                for i in range(0, len(l), n):
                    data = l[i:i + n]
                    yield np.arange(i, i + len(data), dtype=np.int32), data

        new_indices = []
        extra_batch = None
        for chunk_index, chunk in divide_chunks(lens, self.bucket_size):
            # Positions of this bucket sorted by descending key.
            indices_sorted = chunk_index[np.argsort(chunk, axis=-1)]
            batches = []
            for _, batch in divide_chunks(indices_sorted, self.batch_size):
                if len(batch) == self.batch_size:
                    batches.append(batch.tolist())
                else:
                    # Only the very last bucket can leave an incomplete batch.
                    assert extra_batch is None
                    assert batch is not None
                    extra_batch = batch

            # Shuffle the full batches within the bucket.
            if self.shuffle:
                batches = shuffle(batches)
            for batch in batches:
                new_indices.extend(batch)

        # The incomplete batch, if any, always goes last.
        if extra_batch is not None:
            new_indices.extend(extra_batch)
        return indices[new_indices]

In [10]:
def collate_fn(data):
    """Collate ``(index, sequence)`` samples into ``(indices, padded batch)``.

    Padding is delegated to ``padded_sequence`` of the notebook-level
    tokenizer instance ``t``.
    """
    index, raw_seqs = zip(*data)
    return index, t.padded_sequence(raw_seqs)

In [11]:
#tokens = t.tokenize()
#tokens = t.padded_sequence(tokens)
#index, seqs = zip(*train_dataset)

In [12]:
# Length-bucketed loading: print the padded width of every batch.
train_dataset = CLRPDataset(t.tokenize(), max_len=max_len)
train_sampler = BucketSampler(
    train_dataset,
    train_dataset.get_keys(),
    bucket_size=64,
    batch_size=64,
)
train_loader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=16,
    num_workers=0,
    collate_fn=collate_fn,
)
for i, data in enumerate(train_loader):
    # data is (indices, padded batch); dimension 1 is the padded sequence length.
    print(data[1].size(1))
    print('********************************')

In [13]:
# Same dataset without a sampler (sequential order): print the padded width of every batch.
train_dataset = CLRPDataset(t.tokenize(), max_len=max_len)
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    num_workers=0,
    collate_fn=collate_fn,
)
for i, data in enumerate(train_loader):
    # data is (indices, padded batch); dimension 1 is the padded sequence length.
    print(data[1].size(1))
    print('********************************')

In [14]:
def build_vocab(texts, max_features):
    """Build token <-> id mappings from whitespace-split ``texts``.

    Args:
        texts: iterable of strings.
        max_features: maximum number of most frequent tokens to keep.

    Returns:
        dict with two entries:
        ``'token2id'`` maps ``'<PAD>'`` to 0, the kept tokens to 1..n (most
        frequent first) and ``'<UNK>'`` to n + 1, where n is the number of
        tokens actually kept; ``'id2token'`` is the inverse mapping.
    """
    counter = Counter()
    for text in texts:
        counter.update(text.split())

    top_tokens = counter.most_common(max_features)
    # Place <UNK> right after the last real token. Using max_features + 1
    # would leave an id gap whenever the corpus has fewer distinct tokens
    # than max_features.
    token2id = {'<PAD>': 0, '<UNK>': len(top_tokens) + 1}
    token2id.update(
        {token: rank + 1 for rank, (token, _count) in enumerate(top_tokens)})

    return {
        'token2id': token2id,
        'id2token': {idx: token for token, idx in token2id.items()},
    }


def tokenize(texts, vocab, max_len=None):
    """Convert texts to lists of token ids using ``vocab['token2id']``.

    Args:
        texts: iterable of strings; each is split on whitespace.
        vocab: dict with a ``'token2id'`` mapping (as returned by
            ``build_vocab``).
        max_len: maximum number of tokens kept per text. When None, the
            module-level ``max_len`` variable is used if it exists (this is
            what the function relied on before the parameter was added);
            if it does not exist, texts are not truncated.

    Returns:
        list with one list of ids per text. Tokens missing from the mapping
        get the id of ``'<UNK>'``.
    """
    token2id = vocab['token2id']
    if max_len is None:
        # Backward compatibility with callers that rely on the notebook global.
        max_len = globals().get('max_len')

    # Look the unknown id up explicitly; `len(token2id) - 1` is kept only as
    # a fallback for mappings without an '<UNK>' entry.
    unk_id = token2id.get('<UNK>', len(token2id) - 1)

    return [
        [token2id.get(token, unk_id) for token in text.split()[:max_len]]
        for text in texts
    ]



In [15]:
vocab = build_vocab(df['review'].to_list(), max_features)
token_ids = tokenize(df['review'].to_list(), vocab)
# The per-review id lists have different lengths. np.array() on such ragged
# input raises on recent NumPy releases, so fill a 1-D object array (one
# Python list per element) explicitly.
train_x = np.empty(len(token_ids), dtype=object)
for row, ids in enumerate(token_ids):
    train_x[row] = ids

In [16]:
def collate_fn(data):
    """Collate ``(index, sequence)`` samples into ``(indices, left-padded batch)``.

    Every sequence is right-aligned in a zero-filled LongTensor whose width
    is the length of the longest sequence in the batch.
    """
    index, seqs = zip(*data)

    width = max(len(seq) for seq in seqs)
    batch = torch.zeros((len(seqs), width), dtype=torch.long)
    for row, seq in enumerate(seqs):
        # Zeros stay in front; the ids fill the tail of the row.
        batch[row, width - len(seq):] = torch.LongTensor(seq)

    return index, batch


In [17]:
class CLRPDataset(Dataset):
    """Dataset over pre-tokenized sequences; each item is ``(index, sequence)``.

    Args:
        seqs: indexable collection of token-id sequences.
        maxlen: cap applied to the lengths reported by :meth:`get_keys`
            (the stored sequences themselves are not truncated here).
    """

    def __init__(self, seqs, maxlen):
        # Use the sequences that were passed in rather than the notebook
        # global `train_x`, so the dataset works for any input.
        self.seqs = seqs
        self.maxlen = maxlen
        #self.targets =  df['sentiment'].to_list()

    def __len__(self):
        """Number of stored sequences."""
        return len(self.seqs)

    def get_keys(self):
        """Return an int32 array of sequence lengths, each capped at ``self.maxlen``."""
        lens = np.fromiter(
            tqdm(((min(self.maxlen, len(c))) for c in self.seqs), desc='generate lens'),
            dtype=np.int32)
        return lens

    def __getitem__(self, idx):
        """Return the position together with the sequence stored at it."""
        return idx, self.seqs[idx]

In [18]:
# Length-bucketed loading over train_x: print the padded width of every batch.
max_len = 250
train_dataset = CLRPDataset(train_x, maxlen=max_len)
train_sampler = BucketSampler(
    train_dataset,
    train_dataset.get_keys(),
    bucket_size=64,
    batch_size=64,
)
train_loader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=16,
    num_workers=0,
    collate_fn=collate_fn,
)
for i, data in enumerate(train_loader):
    # data is (indices, padded batch); dimension 1 is the padded sequence length.
    print(data[1].size(1))
    print('********************************')