# Text preprocessing

In [28]:
import pandas as pd
from pathlib import Path
import numpy as np 
from functools import partial
from spacy.symbols import ORTH
from concurrent.futures import ProcessPoolExecutor
import spacy
import re
import collections
from tqdm import tqdm_notebook
import html

In [2]:
from exp.nb_11a import *

In [3]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In this notebook, we will go through the text processing that we will use to create the data used to train the models. (Most of the code is directly taken from fastai's deep learning course : [Deep Learning from the foundations](https://course.fast.ai/part2) )

## Data structures 

The idea is to build the data up step by step until it is ready to be used for training our models. The steps are:
1. Load the data into a list of items (for text, this can be a list of text chunks, for example)
2. Split the data into training and validation datasets
3. For each dataset, process the elements (numericalize the text, ...) and label them (e.g. with the sentiment for sentiment analysis)

In [4]:
def compose(x, funcs, *args, order_key='_order', **kwargs):
    """
    Pipe `x` through every callable in `funcs` and return the final value.

    The callables are run in ascending order of their `order_key` attribute
    (a callable without that attribute counts as 0). `**kwargs` is forwarded
    to every call; the extra positional `*args` are accepted but not forwarded.
    """
    def _rank(fn):
        return getattr(fn, order_key, 0)

    result = x
    for fn in sorted(listify(funcs), key=_rank):
        result = fn(result, **kwargs)
    return result

We first need a list data structure : **ListContainer**

In [5]:
class ListContainer():
    """
    A simple wrapper around a sequence of items.

    On top of whatever indexing the wrapped `items` supports natively, it
    accepts a sequence of positions or a boolean mask as index and then
    returns a plain list of the selected items.
    """
    def __init__(self, items): self.items = items

    def __getitem__(self, idx):
        try: return self.items[idx]
        except TypeError:
            # `items` cannot be indexed by `idx` directly (e.g. a list indexed by a list/tensor).
            # An empty selection has no first element to inspect: it selects nothing.
            if len(idx) == 0: return []
            if isinstance(idx[0],bool):
                assert len(idx)==len(self) # bool mask
                return [o for m,o in zip(idx,self.items) if m]
            return [self.items[i] for i in idx]

    def __len__(self): return len(self.items)
    def __iter__(self): return iter(self.items)
    def __setitem__(self, i, o): self.items[i] = o
    def __delitem__(self, i): del(self.items[i])

    def __repr__(self):
        # Show at most the first 10 items; mark the truncation with '...'.
        res = f'{self.__class__.__name__} ({len(self)} items)\n{self.items[:10]}'
        if len(self)>10: res = res[:-1]+ '...]'
        return res

**ItemList** is a ListContainer that can transform its elements before they are accessed. Transforms are mainly used for data augmentation (on images, for example), which we will not use in this project.

In [6]:
class ItemList(ListContainer):
    """
    A ListContainer whose items are passed through `get` and then through the
    transforms `tfms` every time they are accessed by index.
    """
    def __init__(self, items, path='.', tfms=None):
        super().__init__(items)
        self.path = Path(path)
        self.tfms = tfms

    def __repr__(self):
        return f'{super().__repr__()}\nPath: {self.path}'

    def new(self, items, cls=None):
        # Build a sibling list (same path and transforms) around other items.
        target = self.__class__ if cls is None else cls
        return target(items, self.path, tfms=self.tfms)

    def get(self, i):
        # Hook for subclasses; by default the stored item is returned as is.
        return i

    def _get(self, i):
        return compose(self.get(i), self.tfms)

    def __getitem__(self, idx):
        picked = super().__getitem__(idx)
        if not isinstance(picked, list):
            return self._get(picked)
        return [self._get(o) for o in picked]

A **SplitData** object will contain two ItemLists, one for the training data and the other for validation data. We can create one from an ItemList and a splitting function using the class method *split_by_func*

In [7]:
def split_by_func(items, f):
    """
    Partition `items` according to the splitting function `f`.

    Returns a pair `(rejected, accepted)`: items for which `f` gives False,
    then items for which `f` gives True. Items for which `f` gives anything
    else (e.g. `None`) are dropped.
    """
    flags = [f(item) for item in items]
    rejected, accepted = [], []
    for item, flag in zip(items, flags):
        if flag == False:
            rejected.append(item)
        if flag == True:
            accepted.append(item)
    return rejected, accepted

class SplitData():
    """
    Holds a training list and a validation list of items.

    Attributes that are not found on the object itself are looked up on the
    training list.
    """
    def __init__(self, train, valid):
        self.train = train
        self.valid = valid

    def __getattr__(self, k):
        return getattr(self.train, k)

    # Needed so that a pickled SplitData can be loaded back without recursion errors.
    def __setstate__(self, data):
        self.__dict__.update(data)

    @classmethod
    def split_by_func(cls, il, f):
        # The first list returned by the splitter becomes `train`, the second `valid`.
        first_items, second_items = split_by_func(il.items, f)
        return cls(il.new(first_items), il.new(second_items))

    def __repr__(self):
        return f'{self.__class__.__name__}\nTrain: {self.train}\nValid: {self.valid}\n'

In [8]:
class Processor():
    """
    Base class for processors; the default `process` hands the items back untouched.
    """
    def process(self, items):
        return items

**LabeledData** is roughly the equivalent of the PyTorch dataset: it contains the data x and the targets y, and you can index it to get the tuple $(x_i, y_i)$. You can also attach processors for x and y, which we will use to preprocess the text (clean, tokenize and numericalize).

In [9]:
def _label_by_func(ds, f, cls=ItemList):
    """
    Build a new list (of type `cls`, sharing the path of `ds`) that holds the
    label `f(item)` of every item in `ds`.
    """
    labels = [f(item) for item in ds.items]
    return cls(labels, path=ds.path)


class LabeledData():
    """
    Pairs a processed input list `x` with a processed label list `y`.

    Indexing returns the tuple `(x[idx], y[idx])`; `x_obj` / `y_obj` undo the
    processors to recover a readable version of an item.
    """
    def process(self, il, proc):
        # Run the processor(s) over the raw items and wrap the result in a list of the same kind.
        return il.new(compose(il.items, proc))

    def __init__(self, x, y, proc_x=None, proc_y=None):
        processed_x = self.process(x, proc_x)
        processed_y = self.process(y, proc_y)
        self.x, self.y = processed_x, processed_y
        self.proc_x, self.proc_y = proc_x, proc_y

    def __repr__(self):
        return f'{self.__class__.__name__}\nx: {self.x}\ny: {self.y}\n'

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)

    def x_obj(self, idx):
        return self.obj(self.x, idx, self.proc_x)

    def y_obj(self, idx):
        return self.obj(self.y, idx, self.proc_y)

    def obj(self, items, idx, procs):
        # A single position (int or 0-dim LongTensor) is undone item-wise, anything else batch-wise.
        single = isinstance(idx, int) or (isinstance(idx,torch.LongTensor) and not idx.ndim)
        item = items[idx]
        # Processors are undone in the reverse of the order in which they were listed.
        for proc in reversed(listify(procs)):
            if single:
                item = proc.deproc1(item)
            else:
                item = proc.deprocess(item)
        return item

    @classmethod
    def label_by_func(cls, il, f, proc_x=None, proc_y=None):
        labels = _label_by_func(il, f)
        return cls(il, labels, proc_x=proc_x, proc_y=proc_y)


In [10]:
def label_by_func(sd, f, proc_x=None, proc_y=None):
    """
    Turn split data `sd` into split labeled data: the train and valid parts are
    each labeled with the function `f` and processed with `proc_x` / `proc_y`.
    """
    labeled = [
        LabeledData.label_by_func(part, f, proc_x=proc_x, proc_y=proc_y)
        for part in (sd.train, sd.valid)
    ]
    return SplitData(labeled[0], labeled[1])

## Tweets

We first need our ItemList for the tweets which take the list of all tweets contained in the df (or csv file)

In [11]:
class TextList(ItemList) :
    """
    An ItemList of texts read from one column of a DataFrame or CSV file.
    """
    @classmethod
    def from_df(cls, df, text_col):
        # Items are the raw values of the `text_col` column.
        return cls(df[text_col].values)

    @classmethod
    def from_csv(cls,  path, text_col):
        frame = pd.read_csv(path)
        return cls.from_df(frame, text_col)

In [12]:
# Load every value of the 'tweet' column of the training CSV into a TextList.
tl = TextList.from_csv('data/train_full_m_shuffled.csv', 'tweet')

In [13]:
tl

TextList (2500000 items)
['<number> words , <number> seconds , <number> moment : " thank you god . " rt this and be thankful not today , but everyday ! \n'
 'crime scene investigators : uncovering the truth ( blazers <neutralface> ine of duty describes crime scene investigators , in \n'
 "gosh this is it . the end of our baguio trip . i 'll miss this place , see you next year my beloved baguio city . \n"
 "<hashtag> # teamfollowback kate middleton 's wedding day hairdresser reveals his secret assistant ( people magazine share \n"
 'hope everyone has a good <number>/<number> . ill be at work being my little sober self <hashtag> # gettinthatmoney \n'
 '<user> please follow me . <repeat> ( \n' 'that te <smile> t i just got ( > _ \n'
 '<user> follow me please justin i love you <heart> \n'
 'now hiring : front office administrative assistant at adecco ( westminster , co adecco has an immediate nee . <repeat> <url> <hashtag> # jobs \n'
 "<user> pretty please follow me cher ! i follow <user> 

We then need to split it into two datasets by giving it a random splitter which will, for every tweet, put it into the training set with probability $1-pctg$ or in the validation set with probability $pctg$

In [14]:
def random_splitter(item, pctg=0.2):
    """
    Return True with probability `pctg`, False otherwise.

    `item` is ignored; one uniform sample in [0, 1) is drawn per call.
    """
    draw = np.random.uniform(0, 1)
    return bool(draw < pctg)

In [15]:
# Each tweet goes to the validation list with probability 0.2, otherwise to the training list.
sd = SplitData.split_by_func(tl, partial(random_splitter, pctg=0.2))

In [16]:
sd

SplitData
Train: TextList (2000253 items)
['<number> words , <number> seconds , <number> moment : " thank you god . " rt this and be thankful not today , but everyday ! \n', 'crime scene investigators : uncovering the truth ( blazers <neutralface> ine of duty describes crime scene investigators , in \n', "gosh this is it . the end of our baguio trip . i 'll miss this place , see you next year my beloved baguio city . \n", "<hashtag> # teamfollowback kate middleton 's wedding day hairdresser reveals his secret assistant ( people magazine share \n", 'hope everyone has a good <number>/<number> . ill be at work being my little sober self <hashtag> # gettinthatmoney \n', '<user> please follow me . <repeat> ( \n', 'that te <smile> t i just got ( > _ \n', '<user> follow me please justin i love you <heart> \n', 'now hiring : front office administrative assistant at adecco ( westminster , co adecco has an immediate nee . <repeat> <url> <hashtag> # jobs \n', 'danny glover ( black americans of ac

### Tokenizing

Now we need to process and tokenize the text. This will go as follows :
- We first process the text non-tokenized with some preprocessing rules
- We then tokenize the text to have a list of tokens (strings)
- And finally we process the list of tokens with some postprocessing rules

The TokenizeProcessor will apply the whole process to chunks of data in parallel in order to make it faster 

**Preprocessing**

In [17]:
# Special tokens. TK_REP / TK_WREP mark character / word repetitions, TK_UP / TK_MAJ mark
# all-caps / capitalized tokens, BOS / EOS are added around every token list.
# UNK and PAD are presumably the unknown-token and padding tokens.
UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxrep xxwrep xxup xxmaj".split()

def sub_br(t):
    "Turn every HTML line-break tag (`<br>`, `<br/>`, `<br />`, any letter case) into a newline"
    return re.sub(r'<\s*br\s*/?>', "\n", t, flags=re.IGNORECASE)

def spec_add_spaces(t):
    "Surround every / and # with one space on each side"
    slash_or_hash = re.compile(r'([/#])')
    return slash_or_hash.sub(lambda m: f' {m.group(1)} ', t)

def rm_useless_spaces(t):
    "Collapse every run of two or more spaces into a single space"
    space_run = re.compile(' {2,}')
    return space_run.sub(' ', t)

def replace_rep(t):
    """
    Replace character-level repetitions: `cccc` -> ` TK_REP 4 c `.

    Only runs of at least four identical non-whitespace characters are
    replaced; the emitted count is the full length of the run.
    """
    def _replace_rep(m: "re.Match") -> str:
        # group 1 is the first character, group 2 the run of repeats that follows it
        c,cc = m.groups()
        return f' {TK_REP} {len(cc)+1} {c} '
    re_rep = re.compile(r'(\S)(\1{3,})')
    return re_rep.sub(_replace_rep, t)
    
def replace_wrep(t):
    """
    Replace word-level repetitions: `word word word word ` -> ` TK_WREP 4 word  `.

    A "word" here is a run of word characters together with the non-word
    characters that follow it; only sequences of at least four identical
    such units are replaced.
    """
    def _replace_wrep(m: "re.Match") -> str:
        # group 1 is the first word (with its trailing separator), group 2 the repeats that follow
        c,cc = m.groups()
        return f' {TK_WREP} {len(cc.split())+1} {c} '
    re_wrep = re.compile(r'(\b\w+\W+)(\1{3,})')
    return re_wrep.sub(_replace_wrep, t)

def fixup_text(x):
    "Repair assorted artefacts (broken HTML entities, escaped newlines/quotes, `<unk>`, `@.@`, ...) seen in documents"
    # Applied strictly in this order: each replacement sees the output of the previous ones.
    substitutions = (
        ('#39;', "'"),
        ('amp;', '&'),
        ('#146;', "'"),
        ('nbsp;', ' '),
        ('#36;', '$'),
        ('\\n', "\n"),
        ('quot;', "'"),
        ('<br />', "\n"),
        ('\\"', '"'),
        ('<unk>', UNK),
        (' @.@ ', '.'),
        (' @-@ ', '-'),
        ('\\', ' \\ '),
    )
    for old, new in substitutions:
        x = x.replace(old, new)
    # Finally decode the remaining HTML entities and squeeze runs of spaces.
    return re.sub(r'  +', ' ', html.unescape(x))
    
# Rules applied to the raw text before tokenization, in list order.
# NOTE(review): `sub_br` runs after `spec_add_spaces`, which has already put spaces around the `/`
# of a tag such as `<br/>`; the pattern in `sub_br` then no longer matches it. Confirm the intended order.
default_pre_rules = [fixup_text, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces, sub_br]
# Special tokens registered with the tokenizer and put at the front of the vocabulary.
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]

**Postprocessing**

In [18]:
def replace_all_caps(x):
    "Lower-case every ALL-CAPS token longer than one character and put `TK_UP` in front of it."
    out = []
    for tok in x:
        if tok.isupper() and len(tok) > 1:
            out.extend((TK_UP, tok.lower()))
        else:
            out.append(tok)
    return out

def deal_caps(x):
    "Lower-case every token, drop empty ones, and put `TK_MAJ` in front of tokens that were Capitalized."
    out = []
    for tok in x:
        if tok != '':
            capitalized = tok[0].isupper() and len(tok) > 1 and tok[1:].islower()
            if capitalized:
                out.append(TK_MAJ)
            out.append(tok.lower())
    return out

def add_eos_bos(x):
    "Wrap the token list `x` between the `BOS` and `EOS` tokens."
    return [BOS] + x + [EOS]

# Rules applied to each token list after tokenization, in list order.
# NOTE(review): `deal_caps` lower-cases every token before `replace_all_caps` runs, so in this
# order `replace_all_caps` effectively never finds an upper-case token. Confirm whether it should come first.
default_post_rules = [deal_caps, replace_all_caps, add_eos_bos]

In [19]:
def parallel(func, arr, max_workers=4):
    """
    Apply `func` to every `(index, element)` pair of `arr` and return the list of results.

    With `max_workers` < 2 the calls run sequentially in the current process,
    otherwise they are distributed over a `ProcessPoolExecutor`. Both paths
    show the same progress bar and always return a list (empty for an empty
    `arr`), in the order of `arr`.
    """
    jobs = enumerate(arr)
    if max_workers < 2:
        return list(tqdm_notebook(map(func, jobs), total=len(arr)))
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        return list(tqdm_notebook(ex.map(func, jobs), total=len(arr)))

In [20]:
class TokenizeProcessor(Processor):
    """
    Turns raw texts into lists of string tokens.

    Each text goes through `pre_rules`, is tokenized with a blank spaCy
    tokenizer for `lang`, and the resulting token list goes through
    `post_rules`. Texts are handled in chunks of `chunksize`, spread over
    `max_workers` workers.
    """
    def __init__(self, lang="en", chunksize=2000, pre_rules=None, post_rules=None, max_workers=4): 
        self.chunksize,self.max_workers = chunksize,max_workers
        self.tokenizer = spacy.blank(lang).tokenizer
        # Keep the special tokens in one piece instead of letting the tokenizer split them.
        for w in default_spec_tok:
            self.tokenizer.add_special_case(w, [{ORTH: w}])
        self.pre_rules  = default_pre_rules  if pre_rules  is None else pre_rules
        self.post_rules = default_post_rules if post_rules is None else post_rules

    def proc_chunk(self, args):
        # `args` is the (index, chunk) pair produced by `parallel`; the index is not used.
        _,chunk = args
        chunk = [compose(t, self.pre_rules) for t in chunk]
        docs = [[d.text for d in doc] for doc in self.tokenizer.pipe(chunk)]
        docs = [compose(t, self.post_rules) for t in docs]
        return docs

    def __call__(self, items): 
        chunks = [items[i: i+self.chunksize] for i in (range(0, len(items), self.chunksize))]
        toks = parallel(self.proc_chunk, chunks, max_workers=self.max_workers)
        # Flatten the per-chunk results in one pass (`sum(toks, [])` re-copies the list for every chunk).
        return [doc for chunk_docs in (toks or []) for doc in chunk_docs]
    
    # `proc_chunk` expects an (index, chunk) pair, so wrap the single item accordingly.
    def proc1(self, item): return self.proc_chunk((0, [item]))[0]
    
    def deprocess(self, toks): return [self.deproc1(tok) for tok in toks]
    def deproc1(self, tok):    return " ".join(tok)

Example

In [21]:
# Tokenizer with the default language, chunk size, rules and number of workers.
tp = TokenizeProcessor()

In [22]:
tl[0]

'<number> words , <number> seconds , <number> moment : " thank you god . " rt this and be thankful not today , but everyday ! \n'

In [23]:
# Tokenize the first 1000 tweets and show the tokens of the first one, bullet-separated (first 400 characters).
' • '.join(tp(tl[:1000])[0])[:400]

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




'xxbos • < • number • > • words • , • < • number • > • seconds • , • < • number • > • moment • : • " • thank • you • god • . • " • rt • this • and • be • thankful • not • today • , • but • everyday • ! • \n • xxeos'

### Numericalizing 

A deep learning model obviously cannot take strings as input, so we need to map each token to an integer, again using a processor:

In [24]:
class NumericalizeProcessor(Processor):
    """
    Maps lists of tokens to lists of integer ids and back.

    If no `vocab` is given, it is built on the first call from the (at most)
    `max_vocab` most frequent tokens that occur at least `min_freq` times,
    with the tokens of `default_spec_tok` placed in front.
    """
    def __init__(self, vocab=None, max_vocab=60000, min_freq=2): 
        self.vocab,self.max_vocab,self.min_freq = vocab,max_vocab,min_freq
    
    def __call__(self, items):
        # The vocab is defined on the first use.
        if self.vocab is None:
            freq = collections.Counter(p for o in items for p in o)
            self.vocab = [o for o,c in freq.most_common(self.max_vocab) if c >= self.min_freq]
            # Insert in reverse so the special tokens end up at the front in their listed order.
            for o in reversed(default_spec_tok):
                if o in self.vocab: self.vocab.remove(o)
                self.vocab.insert(0, o)
        if getattr(self, 'otoi', None) is None:
            # Tokens missing from the vocab map to id 0, i.e. the first vocab entry.
            self.otoi = collections.defaultdict(int,{v:k for k,v in enumerate(self.vocab)}) 
        return [self.proc1(o) for o in items]
    def proc1(self, item):  return [self.otoi[o] for o in item]
    
    def deprocess(self, idxs):
        assert self.vocab is not None
        return [self.deproc1(idx) for idx in idxs]
    def deproc1(self, idx): return [self.vocab[i] for i in idx]

## Language model data

For the language model, the input/output of the model are not simple processed tweets and labels, it is a bit more complicated than that.

So the idea of a language model is to learn how to "speak" the language and in order to do that, given a sentence $x = (x_1, x_2, x_3, ..., x_t)$ where a $x_i$ is the i-th word of the sentence, it will learn to predict the vector $y = (y_1, y_2, y_3, ...,y_t)$ where $y_t = x_{t+1}$. In other words it needs to predict at each time step $t$ the next word $x_{t+1}$. Now of course we need batches of those, so we get a matrix of sequences : 

$
    x = 
    \begin{bmatrix} 
        x_{11} & x_{12} & \dots & x_{1bptt} \\
        \vdots & \ddots & \\
        x_{b1} &  x_{b2} & \dots & x_{bbptt} 
    \end{bmatrix}
    ,  y = 
    \begin{bmatrix} 
        x_{12} & x_{13} & \dots & x_{1(bptt+1)} \\
        \vdots & \ddots & \\
        x_{b2} &  x_{b3} & \dots & x_{b(bptt+1)} 
    \end{bmatrix}
$



where $b$ is the batch size (bs) and $bptt$ is the back-propagation-through-time length. We also need consecutive batches to be contiguous: row $i$ of a batch must continue row $i$ of the previous batch, i.e. let $b_j$ and $b_{j+1}$ be two consecutive batches, then:

if $(b_j)_i = (x_1, x_2, ..., x_{bptt})$ then $(b_{j+1})_i = (x_{bptt+1}, x_{bptt+2},..., x_{2bptt})$. 

Now, we only have tweets; the idea is to concatenate all of them into one full stream of tokens. So we initially have

$t = [ [t_{11}, t_{12},..., t_{1m_1}], [t_{21}, t_{22},..., t_{2m_2}], ... , [t_{T1}, t_{T2},..., t_{Tm_T}] ]$

where $t$ is the list of tweets, $t_{ij}$ is the j-th token of the i-th tweet, $m_i$ is the length of the i-th tweet and $T$ is the total number of tweets. We then concatenate them to create the stream 

$stream = [t_{11}, t_{12},..., t_{1m_1}, t_{21}, t_{22},..., t_{2m_2}, ... , t_{T1}, t_{T2},..., t_{Tm_T}]$

we can change notation to ease the explanations :

$stream = [s_{1}, s_{2}, ...,  s_{M}]$

we now split the text into $bs$ number of sequences to get the batched data:

$
    \begin{bmatrix} 
        s_{1} & s_{2} & \dots & s_{sl} \\
        s_{sl+1} & s_{sl+2} & \dots & s_{2sl} \\
        \vdots & \ddots & \\
        s_{((bs-1)sl) + 1} &  s_{((bs-1)sl) + 2} & \dots & s_{(bs)(sl)} 
    \end{bmatrix}
$

we then split this matrix vertically every $bptt$ tokens in order to get our $n = \frac{sl}{bptt} = \frac{M}{bptt \times bs}$ batches, where $sl = \frac{M}{bs}$ is the length of each row:

$
    \begin{bmatrix} 
        s_{1}  & \dots & s_{bptt} \\
        s_{sl+1} & \dots & s_{sl + bptt} \\
        \vdots & \ddots & \\
        s_{((bs-1)sl) + 1}  & \dots & s_{((bs-1)sl) + bptt} 
    \end{bmatrix}
    \begin{bmatrix} 
        s_{bptt + 1} & \dots & s_{2bptt} \\
        s_{sl + bptt + 1} & \dots & s_{sl + 2bptt} \\
        \vdots & \ddots & \\
        s_{((bs-1)sl) + bptt + 1} & \dots & s_{((bs-1)sl) + 2bptt} 
    \end{bmatrix}
    ...
    \begin{bmatrix} 
        s_{(n-1)bptt + 1} & \dots & s_{n \cdot bptt} \\
        s_{sl + (n-1)bptt + 1} & \dots & s_{sl + n \cdot bptt} \\
        \vdots & \ddots & \\
        s_{((bs-1)sl) + (n-1)bptt + 1} & \dots & s_{((bs-1)sl) + n \cdot bptt} 
    \end{bmatrix}
$


For the target batches, it is the same, only the elements are shifted by one as explained above.

The implementation is done using the **LM_PreLoader** class and the PyTorch **DataLoader** class. Additionally, we can shuffle the tweets before creating the stream in order to randomize the training.

In [25]:
class LM_PreLoader():
    """
    Serves (input, target) sequence pairs for language modelling.

    All the token sequences in `data.x` are concatenated into one stream, which
    is cut into `bs` rows of `n_batch` tokens each. Item `idx` is a window of up
    to `bptt` tokens from row `idx % bs`, paired with the same window shifted
    by one token.
    """
    def __init__(self, data, bs=64, bptt=70, shuffle=False):
        self.data = data
        self.bs = bs
        self.bptt = bptt
        self.shuffle = shuffle
        n_tokens = sum(len(t) for t in data.x)
        # Number of tokens in each of the `bs` rows.
        self.n_batch = n_tokens // bs
        self.batchify()

    def __len__(self):
        windows_per_row = (self.n_batch - 1) // self.bptt
        return windows_per_row * self.bs

    def __getitem__(self, idx):
        row = self.batched_data[idx % self.bs]
        start = (idx // self.bs) * self.bptt
        stop = start + self.bptt
        # The target is the input window shifted one token to the right.
        return row[start:stop], row[start + 1:stop + 1]

    def batchify(self):
        texts = self.data.x
        if self.shuffle:
            texts = texts[torch.randperm(len(texts))]
        stream = torch.cat([tensor(t) for t in texts])
        # Drop the tail that does not fill a complete row.
        usable = self.n_batch * self.bs
        self.batched_data = stream[:usable].view(self.bs, self.n_batch)

def get_lm_dls(train_ds, valid_ds, bs, bptt, **kwargs):
    """
    Build the language-model DataLoaders and return them as `(train, valid)`.

    The training stream is shuffled and batched by `bs`; the validation stream
    is not shuffled and is batched by `2*bs`. `**kwargs` goes to both DataLoaders.
    """
    train_dl = DataLoader(LM_PreLoader(train_ds, bs, bptt, shuffle=True), batch_size=bs, **kwargs)
    valid_dl = DataLoader(LM_PreLoader(valid_ds, bs, bptt, shuffle=False), batch_size=2*bs, **kwargs)
    return (train_dl, valid_dl)


## Classifier data 

For classification, the data preparation is much simpler. Each data tuple $(x,y)$ is simply a numericalized tweet $x$ and its label $y$ (0 or 1). It gets a bit trickier when we have to create batches of those: the tweets obviously won't all have the same length, so we must pad them to a common length in order to put them in a batch.
What we do is, every time we get a batch of tweets, measure the length of the longest tweet and pad the others to that length.

One situation we would like to avoid is having long and short tweets in the same batch, since this would mean a lot of padding. To address this, we use a **Sampler** that is given to the **DataLoader** and which can change the order in which the DataLoader iterates over the data. We then use the **collate_fn** argument of the **DataLoader** to pad the tweets in a batch.

In [2]:
class Sampler():
    """
    Yields the positions `0 .. len(ds)-1` in consecutive batches of `bs`,
    in random order when `shuffle` is set.
    """
    def __init__(self, ds, bs, shuffle=False):
        self.n = len(ds)
        self.bs = bs
        self.shuffle = shuffle

    def __iter__(self):
        # A fresh order is drawn every time iteration starts.
        if self.shuffle:
            self.idxs = torch.randperm(self.n)
        else:
            self.idxs = torch.arange(self.n)
        for start in range(0, self.n, self.bs):
            yield self.idxs[start:start + self.bs]

The **SortSampler** implements what we just mentioned and will be used for the validation dataloader

In [4]:
class SortSampler(Sampler):
    """
    Iterates over the positions of `data_source` from the largest `key` value to the smallest.
    """
    def __init__(self, data_source, key):
        self.data_source = data_source
        self.key = key

    def __len__(self):
        return len(self.data_source)

    def __iter__(self):
        positions = list(range(len(self.data_source)))
        positions.sort(key=self.key, reverse=True)
        return iter(positions)

For the training dataloader we also want to have some randomization in the training order which is done using the **SortishSampler**

In [3]:
class SortishSampler(Sampler):
    """
    Iterates over the positions of `data_source` in a random but roughly sorted order.

    The positions are shuffled, cut into megabatches of `50*bs`, and each
    megabatch is sorted by decreasing `key`. The result is cut into batches of
    `bs`; the batch whose first element has the largest key is moved to the
    front, and the batches between the first and the last one are shuffled.
    """
    def __init__(self, data_source, key, bs):
        self.data_source,self.key,self.bs = data_source,key,bs

    def __len__(self) -> int: return len(self.data_source)

    def __iter__(self):
        n = len(self.data_source)
        # Nothing to sample: `torch.cat` would fail on an empty list of megabatches.
        if n == 0: return iter(())
        idxs = torch.randperm(n)
        mega_size = self.bs*50
        megabatches = [idxs[i:i+mega_size] for i in range(0, n, mega_size)]
        sorted_idx = torch.cat([tensor(sorted(s, key=self.key, reverse=True)) for s in megabatches])
        batches = [sorted_idx[i:i+self.bs] for i in range(0, len(sorted_idx), self.bs)]
        # We also want the biggest batch at the beginning and the smallest one at the end for memory reasons
        max_idx = int(torch.argmax(tensor([self.key(ck[0]) for ck in batches])))  # find the chunk with the largest key,
        batches[0],batches[max_idx] = batches[max_idx],batches[0]                 # then make sure it goes first.
        # Only the batches strictly between the first and the last are shuffled;
        # with one or two batches there is nothing in between to shuffle.
        if len(batches) > 2:
            middle = [batches[i+1] for i in torch.randperm(len(batches)-2)]
            batches = [batches[0], *middle, batches[-1]]
        return iter(torch.cat(batches))

In [7]:
def pad_collate(samples, pad_idx=1, pad_first=False):
    """
    Collate `(sequence, label)` samples into one padded batch.

    Returns a pair `(x, y)`: `x` is a long tensor of shape
    `(len(samples), longest sequence)` in which shorter sequences are filled
    with `pad_idx` (after the sequence, or before it when `pad_first` is set),
    and `y` is a tensor of the labels.
    """
    lengths = [len(s[0]) for s in samples]
    max_len = max(lengths)
    res = torch.zeros(len(samples), max_len).long() + pad_idx
    for i,(s,n) in enumerate(zip(samples, lengths)):
        # Explicit bounds: a negative-length slice (`-n:`) would select the whole row when n == 0.
        start = max_len - n if pad_first else 0
        res[i, start:start+n] = LongTensor(s[0])
    return res, tensor([s[1] for s in samples])

In [5]:
def get_clas_dls(train_ds, valid_ds, bs, **kwargs):
    """
    Build the classification DataLoaders and return them as `(train, valid)`.

    Training batches (size `bs`) are drawn with a SortishSampler, validation
    batches (size `2*bs`) with a SortSampler, both keyed on the length of the
    inputs. Batches are padded by `pad_collate`; `**kwargs` goes to both DataLoaders.
    """
    def _train_len(t):
        return len(train_ds.x[t])

    def _valid_len(t):
        return len(valid_ds.x[t])

    train_sampler = SortishSampler(train_ds.x, key=_train_len, bs=bs)
    valid_sampler = SortSampler(valid_ds.x, key=_valid_len)
    train_dl = DataLoader(train_ds, batch_size=bs, sampler=train_sampler, collate_fn=pad_collate, **kwargs)
    valid_dl = DataLoader(valid_ds, batch_size=bs*2, sampler=valid_sampler, collate_fn=pad_collate, **kwargs)
    return (train_dl, valid_dl)

## Databunch

For simplicity we use a class Databunch (similar to the fastai DataBunch) which will contain the train and valid dataloaders and the vocab. We then create two subclasses for the language model data and the classification data

In [None]:
class Databunch() :
    """
    Container of the dataloaders for the training and validation datasets, plus the vocab.

    Arguments:
        train_dl: The training dataloader (must implement __iter__ method)
        valid_dl: The validation dataloader (must implement __iter__ method)
        vocab: The vocabulary that goes with the data

    """
    def __init__(self, train_dl, valid_dl, vocab) :
        self.train_dl = train_dl
        self.valid_dl = valid_dl
        self.vocab = vocab
    
    @property
    def train_ds(self): return self.train_dl.dataset
        
    @property
    def valid_ds(self): return self.valid_dl.dataset  
    
    def save(self, path) :
        """Pickle the tuple `(train_dl, valid_dl, vocab)` to the file `path`."""
        import pickle
        # The context manager makes sure the file is flushed and closed even if pickling fails.
        with open(path, 'wb') as f :
            pickle.dump((self.train_dl, self.valid_dl, self.vocab), f)


In [None]:
class LMDatabunch(Databunch) :
    """
    Databunch holding the language-model dataloaders.
    """

    @classmethod
    def from_csv(cls, path, text_col, pctg, bs=64, bptt=70, vocab=None) :
        """
        Build the language-model data from the `text_col` column of the CSV file `path`.

        Each text goes to the validation set with probability `pctg`. The texts
        are tokenized and numericalized (with `vocab` if given, otherwise with a
        vocab built from the data) and served in batches of `bs` rows of `bptt` tokens.
        """
        tl = TextList.from_csv(path, text_col)
        # Honour the caller's validation fraction (it used to be hard-coded to 0.2).
        sd = SplitData.split_by_func(tl, partial(random_splitter, pctg=pctg))
        proc_tok,proc_num = TokenizeProcessor(max_workers=8),NumericalizeProcessor(vocab=vocab)
        # A language model needs no real labels: every text gets the dummy label 0.
        ll = label_by_func(sd, lambda x: 0, proc_x = [proc_tok,proc_num])
        train_dl, valid_dl = get_lm_dls(ll.train, ll.valid, bs, bptt)
        return cls(train_dl, valid_dl, proc_num.vocab)


In [None]:
class ClasDatabunch(Databunch) :
    """
    Databunch holding the classification dataloaders.
    """

    def save(self, path) :
        """Pickle the tuple `(train dataset, valid dataset, vocab, training batch size)` to the file `path`."""
        import pickle
        with open(path, 'wb') as f :
            pickle.dump((self.train_dl.dataset, self.valid_dl.dataset, self.vocab, self.train_dl.batch_size), f)
    
    @classmethod
    def from_csv(cls, path, text_col, label_col, pctg, bs=64, vocab=None) :
        """
        Build the classification data from the CSV file `path`.

        Texts come from `text_col` and labels from `label_col`. Each text goes
        to the validation set with probability `pctg`. The texts are tokenized
        and numericalized (with `vocab` if given, otherwise with a vocab built
        from the data) and served in padded batches of `bs`.
        """
        df = pd.read_csv(path)
        tl = TextList.from_df(df, text_col)
        # Honour the caller's validation fraction (it used to be hard-coded to 0.2).
        sd = SplitData.split_by_func(tl, partial(random_splitter, pctg=pctg))
        # Text -> label lookup built in one pass; if a text occurs several times its last label wins.
        tweet_to_label = dict(zip(df[text_col].values, df[label_col].values))
        proc_tok,proc_num = TokenizeProcessor(max_workers=8),NumericalizeProcessor(vocab=vocab)
        ll = label_by_func(sd, lambda x: tweet_to_label[x], proc_x = [proc_tok,proc_num])
        train_dl, valid_dl = get_clas_dls(ll.train, ll.valid, bs)
        return cls(train_dl, valid_dl, proc_num.vocab)

In [None]:
def load_data(path, data_type=LMDatabunch) :
    """
    Loads the databunch stored in path

    For `LMDatabunch` the file holds `(train_dl, valid_dl, vocab)`; for any other
    `data_type` it holds `(train_ds, valid_ds, vocab, bs)` and the classification
    dataloaders are rebuilt from the datasets.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    import pickle
    with open(path, 'rb') as f :
        payload = pickle.load(f)
    if data_type == LMDatabunch :
        train_dl, valid_dl, vocab = payload
        return data_type(train_dl, valid_dl, vocab)
    train_ds, valid_ds, vocab, bs = payload
    train_dl, valid_dl = get_clas_dls(train_ds, valid_ds, bs)
    # The third constructor argument is the vocab (the batch size used to be passed by mistake).
    return data_type(train_dl, valid_dl, vocab)
