In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.model import fit
from fastai.dataset import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle

In [2]:
# bs is the batch size; bptt is presumably the back-propagation-through-time window — TODO confirm.
# Both are passed to LanguageModelData.from_text_files when the language-model data is built.
bs,bptt = 64,70

## Language modeling

### Data

In [3]:
# Dataset root. Sub-paths are appended to it directly in f-strings, so it must end with '/'.
#PATH='/data2/datasets/part1/arxiv/'
PATH='data/arxiv-twitterbot/'

# df_mb: labelled CSV; its `tweeted` column is used further down to derive the yes/no labels.
#df_mb = pd.read_csv(f'{PATH}arxiv.csv')
df_mb = pd.read_csv(f'{PATH}brundage_bot.csv') # You can download from https://drive.google.com/file/d/0B34BjUTAgwm6SzdPWDAtVG1vWVU/
# df_all: the frame whose rows are written under all/ for language-model training.
# NOTE(review): read_pickle can execute arbitrary code — only load a file you trust.
df_all = pd.read_pickle(f'{PATH}all_arxiv.pickle')

In [4]:
def get_txt(df):
    """Build one text per row: '<CAT> {category} <SUMM> {summary} <TITLE> {title}'.

    Dots and hyphens are removed from the category first (e.g. 'cs.CV' -> 'csCV').
    The removal uses a character translation table rather than
    `str.replace(r'[\\.\\-]', '')`: that call depends on the default value of
    `regex`, which became False in pandas 2.0 and would then match the pattern
    literally and strip nothing.

    Args:
        df: DataFrame with string columns `category`, `summary` and `title`.

    Returns:
        A Series of strings aligned with `df`'s index.
    """
    strip_dot_hyphen = str.maketrans('', '', '.-')
    category = df.category.str.translate(strip_dot_hyphen)
    return '<CAT> ' + category + ' <SUMM> ' + df.summary + ' <TITLE> ' + df.title
# Add the combined text column to both frames (this mutates df_mb and df_all in place).
df_mb['txt'] = get_txt(df_mb)
df_all['txt'] = get_txt(df_all)
# n is the number of rows in df_all; the trailing bare `n` displays it as the cell output.
n=len(df_all); n

49370

In [5]:
# Create the output tree under PATH: yes/no class folders for the train and
# validation splits, the all/ folders for language-model text, and models/.
for subdir in ('trn/yes', 'val/yes', 'trn/no', 'val/no', 'all/trn', 'all/val', 'models'):
    os.makedirs(f'{PATH}{subdir}', exist_ok=True)

In [5]:
# Write every row of df_all to its own text file under all/trn or all/val.
# A row goes to 'val' when random.random() <= 0.1 (roughly 10% of rows). No seed is
# set in this cell, so the split differs from run to run.
# NOTE(review): files from a previous run are not removed, so re-running this cell
# can leave the same document in both all/trn and all/val.
for (i,(_,r)) in enumerate(df_all.iterrows()):
    dset = 'trn' if random.random()>0.1 else 'val'
    # Context manager: each handle is flushed and closed before the next file is opened.
    with open(f'{PATH}all/{dset}/{i}.txt', 'w') as f:
        f.write(r['txt'])

In [6]:
# Write every row of df_mb to {trn,val}/{yes,no}/{i}.txt; the label folder comes from
# the truthiness of the row's `tweeted` value.
# A row goes to 'val' when random.random() <= 0.1 (roughly 10% of rows). No seed is
# set in this cell, so the split differs from run to run.
# NOTE(review): files from a previous run are not removed, so re-running this cell
# can leave the same document in both the trn and val trees.
for (i,(_,r)) in enumerate(df_mb.iterrows()):
    lbl = 'yes' if r.tweeted else 'no'
    dset = 'trn' if random.random()>0.1 else 'val'
    # Context manager: each handle is flushed and closed before the next file is opened.
    with open(f'{PATH}{dset}/{lbl}/{i}.txt', 'w') as f:
        f.write(r['txt'])

In [5]:
from spacy.symbols import ORTH

# Load the spaCy English pipeline; only its tokenizer is used by my_spacy_tok.
my_tok = spacy.load('en')

# Register each markup tag as a tokenizer special case so it is emitted as a
# single token instead of being split on the angle brackets.
for special in ('<SUMM>', '<CAT>', '<TITLE>', '<BR />', '<BR>'):
    my_tok.tokenizer.add_special_case(special, [{ORTH: special}])

def my_spacy_tok(x):
    """Tokenize the string `x` with `my_tok`'s tokenizer and return the token texts as a list."""
    tokens = my_tok.tokenizer(x)
    return [t.text for t in tokens]

In [6]:
# Text field: lower-cases the input and tokenizes it with the custom spaCy tokenizer.
TEXT = data.Field(lower=True, tokenize=my_spacy_tok)
# Sub-folders of all/ for each split; note that 'test' points at the validation folder.
FILES = dict(train='trn', validation='val', test='val')
#md = LanguageModelData(f'{PATH}all/', TEXT, **FILES, bs=bs, bptt=bptt, min_freq=10) # This does not work.
md = LanguageModelData.from_text_files(f'{PATH}all/', TEXT, **FILES, bs=bs, bptt=bptt, min_freq=10)
# Persist the field (including the vocabulary built above) so the classifier section
# can reload it. The context manager guarantees the pickle is flushed and closed.
with open(f'{PATH}models/TEXT.pkl','wb') as f:
    pickle.dump(TEXT, f)

In [7]:
# Tuple of: number of training batches, md.nt (presumably the vocabulary size — it matches
# the Embedding(18029, ...) in the model printout), number of training examples, and the
# token count of the first training example.
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(2003, 18029, 1, 8979358)

In [8]:
# First 12 entries of the index-to-string vocabulary; index 0 is '<unk>' and index 1 is '<pad>'.
TEXT.vocab.itos[:12]

['<unk>', '<pad>', 'the', ',', '.', 'of', '-', 'and', 'a', 'to', 'in', 'we']

In [9]:
# Preview the first 150 tokens of the training text, joined with single spaces.
' '.join(md.trn_ds[0].text[:150])

'<cat> cscv <summ> airborne laser scanning ( lidar ) point clouds over large forested areas can be processed to segment individual trees and subsequently extract tree - level information . existing segmentation procedures typically detect more than 90 % of overstory trees , yet they barely detect 60 % of understory trees because of the occlusion effect of higher canopy layers . although understory trees provide limited financial value , they are an essential component of ecosystem functioning by offering habitat for numerous wildlife species and influencing stand development . here we model the occlusion effect in terms of point density . we estimate the fractions of points representing different canopy layers ( one overstory and multiple understory ) and also pinpoint the required density for reasonable tree segmentation ( where accuracy plateaus ) . we show that at a density of ~170 pt / m - sqr understory trees'

### Train

In [10]:
# Model sizes; they match the printed architecture: Embedding(18029, 200) and three LSTM layers
# of widths 200->500, 500->500, 500->200.
em_sz = 200
nh = 500
nl = 3
# Adam with betas=(0.7, 0.99); partial() leaves the remaining arguments to be supplied by the caller.
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [11]:
# Build the language-model learner from the data object using the sizes defined above.
learner = md.get_model(opt_fn, em_sz, nh, nl,
    dropout=0.05, dropouth=0.1, dropouti=0.05, dropoute=0.02, wdrop=0.2)
# Alternative dropout settings kept for reference (not active):
# dropout=0.4, dropouth=0.3, dropouti=0.65, dropoute=0.1, wdrop=0.5
#                dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05)
# NOTE(review): reg_fn is presumably an extra regularisation term added to the loss and
# clip presumably the gradient-clipping threshold — confirm against fastai.
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learner.clip=0.3

In [12]:
# First training pass: learning rate 3e-3, one cycle, wds=1e-6 (presumably weight decay).
learner.fit(3e-3, 1, wds=1e-6)

epoch      trn_loss   val_loss                                
    0      4.500181   4.395005  



[4.395005]

In [13]:
# The commented-out call is the full schedule (3 cycles with cycle_mult=2, 7 epochs in total);
# the active call runs a single 1-epoch cycle as a quick check.
#learner.fit(3e-3, 3, wds=1e-6, cycle_len=1, cycle_mult=2) # 1+2+4=7 epochs
learner.fit(3e-3, 1, wds=1e-6, cycle_len=1, cycle_mult=2) # quick (1epoch)

epoch      trn_loss   val_loss                                
    0      4.258032   4.178511  



[4.1785107]

In [15]:
# Checkpoint the encoder weights under the name 'adam2_enc'.
learner.save_encoder('adam2_enc')

In [16]:
# The commented-out call is the full schedule (10 cycles of 5 epochs); the active call runs one
# 5-epoch cycle. cycle_save_name is the name under which the cycle's weights are saved.
#learner.fit(3e-3, 10, wds=1e-6, cycle_len=5, cycle_save_name='adam3_10') # 10*5=50 epochs
learner.fit(3e-3, 1, wds=1e-6, cycle_len=5, cycle_save_name='adam3_10') # quick (5epochs)

    1      4.132666   4.039716                                
    2      4.063084   3.970557                                
    3      3.990158   3.928183                                
    4      3.966207   3.920478                                



[3.9204779]

In [17]:
# Checkpoint the encoder (model[0]) under the name 'adam3_10_enc'.
learner.save_encoder('adam3_10_enc')  # save_model(self.model[0], self.get_model_path(name))

In [12]:
# Restore the encoder checkpoint saved as 'adam3_10_enc'.
# NOTE(review): the execution counter drops back to In [12] here, which suggests the kernel was
# restarted and the setup cells re-run before this cell; a top-to-bottom run would not need it.
learner.load_encoder('adam3_10_enc')

In [13]:
# Display the architecture: element 0 is the RNN encoder, element 1 the linear decoder.
learner.model # (encoder, decoder)

SequentialRNN(
  (0): RNN_Encoder(
    (encoder): Embedding(18029, 200, padding_idx=1)
    (encoder_with_dropout): EmbeddingDropout(
      (embed): Embedding(18029, 200, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDrop(
        (module): LSTM(200, 500, dropout=0.1)
      )
      (1): WeightDrop(
        (module): LSTM(500, 500, dropout=0.1)
      )
      (2): WeightDrop(
        (module): LSTM(500, 200, dropout=0.1)
      )
    )
    (dropouti): LockedDropout(
    )
    (dropouths): ModuleList(
      (0): LockedDropout(
      )
      (1): LockedDropout(
      )
      (2): LockedDropout(
      )
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=200, out_features=18029)
    (dropout): LockedDropout(
    )
  )
)

In [14]:
# The commented-out call is the full schedule (8 cycles of 10 epochs); the active call runs one
# 10-epoch cycle and saves it under the name 'adam3_5'.
#learner.fit(3e-3, 8, wds=1e-6, cycle_len=10, cycle_save_name='adam3_5') # 8*10=80 epochs
learner.fit(3e-3, 1, wds=1e-6, cycle_len=10, cycle_save_name='adam3_5') # quick (10 epochs)

epoch      trn_loss   val_loss                                
    0      4.093057   4.007401  
    1      4.052145   3.967418                                
    2      4.026157   3.938377                                
    3      3.965393   3.907334                                
    4      3.957945   3.878078                                
    5      3.90114    3.853514                                
    6      3.874079   3.832427                                
    7      3.83595    3.816233                                
    8      3.807765   3.810198                                
    9      3.824142   3.809294                                



[3.8092942]

In [13]:
# One further 20-epoch cycle, saved under the name 'adam3_20'.
learner.fit(3e-3, 1, wds=1e-6, cycle_len=20, cycle_save_name='adam3_20')

epoch      trn_loss   val_loss                                
    0      3.808775   3.809474  
    1      3.986442   3.900541                                
    2      3.978463   3.896425                                
    3      3.95704    3.885686                                
    4      3.954956   3.870238                                
    5      3.929249   3.858495                                
    6      3.917366   3.846637                                
    7      3.883471   3.833989                                
    8      3.860409   3.821439                                
    9      3.858275   3.810348                                
    10     3.820453   3.793987                                
    11     3.794936   3.786509                                
    12     3.80208    3.777726                                
    13     3.775804   3.76812                                 
    14     3.742349   3.761266                                
    15     3.799282   

[3.7479005]

In [14]:
# Checkpoint the encoder as 'adam3_20_enc'; the classifier section loads this name via m3.load_encoder.
learner.save_encoder('adam3_20_enc')

In [15]:
# Save the learner's model under the name 'adam3_20' (reloaded by learner.load in the next cell).
learner.save('adam3_20')

In [12]:
# Reload the 'adam3_20' checkpoint so the sampling cells below can run without retraining.
learner.load('adam3_20')

### Test

In [13]:
def proc_str(s):
    """Tokenize `s` with TEXT's tokenizer, then run TEXT's preprocessing on the tokens.

    Example: "<CAT> csni <SUMM> algorithms that" => ['<cat>', 'csni', '<summ>', 'algorithms', 'that']
    """
    tokens = TEXT.tokenize(s)
    return TEXT.preprocess(tokens)
def num_str(s):
    """Numericalize `s`: preprocess it with proc_str and map the tokens to vocabulary indices.

    The token list is wrapped in a one-element batch before being handed to TEXT.numericalize.
    Example: "<CAT> csni <SUMM> algorithms that" => [23, 802, 25, 79, 14]
    """
    batch = [proc_str(s)]
    return TEXT.numericalize(batch)

In [14]:
# Short alias for the trained language model; it is passed to sample_model below.
m=learner.model # fastai.lm_rnn.SequentialRNN

In [16]:
# Example prompt: category tag followed by the start of a summary.
# The sample_model calls below pass their prompts inline rather than using this variable.
s="""<CAT> cscv <SUMM> algorithms that"""

In [17]:
def sample_model(m, s, l=50):
    """Greedily continue the prompt `s` with model `m`, printing the generated tokens.

    The prompt is numericalized with num_str and fed through the model once; after
    that, the chosen token is fed back one step at a time. At each step the two
    highest-scoring vocabulary indices for the last position are taken, and the
    best one is used unless it is index 0 ('<unk>'), in which case the runner-up
    is used. Generation stops after `l` tokens or when '<eos>' is produced.

    Args:
        m: the language model; m[0] is its encoder, whose `bs` attribute is
            temporarily set to 1 for single-sequence generation.
        s: prompt string.
        l: maximum number of tokens to generate.

    Side effects:
        Prints '...' followed by the generated tokens (space separated, no
        trailing newline), switches `m` to eval mode and resets its state. The
        encoder batch size is restored to the global `bs` on exit, including when
        generation raises or is interrupted, so that a failed sample does not
        leave the model configured for batch size 1.
    """
    t = num_str(s)
    m[0].bs=1
    try:
        m.eval()
        m.reset()
        res,*_ = m(t)
        print('...', end='')

        for _ in range(l):
            n=res[-1].topk(2)[1] # indices of the two highest-scoring tokens
            n = n[1] if n.data[0]==0 else n[0] # TEXT.vocab.itos[0] = '<unk>'
            word = TEXT.vocab.itos[n.data[0]]
            print(word, end=' ')
            if word=='<eos>': break
            res,*_ = m(n[0].unsqueeze(0)) # n[0]: torch.Size([1]), n[0].unsqueeze(0): torch.Size([1, 1])
    finally:
        m[0].bs=bs

In [25]:
# Summary continuation conditioned on the 'csni' category tag.
sample_model(m,"<CAT> csni <SUMM> algorithms that")

...are able to perform a large - scale network of mobile devices are still a challenging task . in this paper , we propose a novel approach to address the problem of network traffic analysis . we propose a novel network architecture that can be used to model the network 

In [26]:
# Same prompt with the 'cscv' category tag, to compare how the category steers the summary.
sample_model(m,"<CAT> cscv <SUMM> algorithms that")

...are able to detect and segment objects in a scene are a key component of many computer vision applications . however , the existing methods for object detection are based on a single - view object detection method . in this paper , we propose a novel approach to detect 

In [27]:
# Title generation: the prompt already contains a stub summary and the <TITLE> tag,
# so the model continues the title starting from "on".
sample_model(m,"<CAT> cscv <SUMM> algorithms. <TITLE> on ")

...the ability to learn from a single image <eos> 

In [28]:
# Same title prompt with the 'csni' category tag.
sample_model(m,"<CAT> csni <SUMM> algorithms. <TITLE> on ")

...the performance of wireless networks <eos> 

In [29]:
# Title generation for 'cscv' with the title starting from "towards".
sample_model(m,"<CAT> cscv <SUMM> algorithms. <TITLE> towards ")

...a robust and robust image segmentation method <eos> 

In [30]:
# Title generation for 'csni' with the title starting from "towards".
sample_model(m,"<CAT> csni <SUMM> algorithms. <TITLE> towards ")

...a scalable and scalable network architecture for    wireless sensor networks <eos> 

### Classification (tweeted vs. not tweeted)

In [43]:
# Reload the text field (and its vocabulary) saved during language-model data preparation,
# so the classifier uses the same token indices as the pre-trained encoder.
# NOTE(review): unpickling executes arbitrary code — only load a TEXT.pkl you created yourself.
# The context manager closes the file handle as soon as loading finishes.
with open(f'{PATH}models/TEXT.pkl','rb') as f:
    TEXT = pickle.load(f)

In [44]:
class ArxivDataset(torchtext.data.Dataset):
    """torchtext dataset of text files labelled 'yes' / 'no' by their sub-directory.

    `path` must contain `yes/` and `no/` folders of `*.txt` files. Each file becomes
    one example with a `text` field (the file contents) and a `label` field (the
    folder name).
    """
    def __init__(self, path, text_field, label_field, **kwargs):
        """Load every `*.txt` under `path/yes` and `path/no`.

        Args:
            path: directory holding the `yes` and `no` sub-folders.
            text_field: field used to process the file contents.
            label_field: field used to process the 'yes' / 'no' label.
            **kwargs: forwarded unchanged to the parent Dataset constructor.
        """
        fields = [('text', text_field), ('label', label_field)]
        examples = []
        for label in ['yes', 'no']:
            for fname in glob(os.path.join(path, label, '*.txt')):
                # Read the whole file. readline() would keep only the text up to the
                # first newline and silently drop the rest of a multi-line document
                # (for this corpus, possibly part of the summary and the title).
                # For single-line files the result is identical.
                with open(fname, 'r') as f: text = f.read()
                examples.append(data.Example.fromlist([text, label], fields))
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        """Sort key for examples: the length of the example's `text`."""
        return len(ex.text)

    @classmethod
    def splits(cls, text_field, label_field, root='.data',
               train='train', test='test', **kwargs):
        """Create the train and test datasets via the parent class's `splits`.

        `root` is passed as the parent's first positional argument and the
        validation split is disabled (`validation=None`); `train` and `test` name
        the sub-directories for each split.
        """
        return super().splits(
            root, text_field=text_field, label_field=label_field,
            train=train, validation=None, test=test, **kwargs)

In [47]:
# Label field: non-sequential, i.e. one label value per example rather than a token sequence.
ARX_LABEL = data.Field(sequential=False)
# Build the train/test datasets from PATH, using the trn/ and val/ folders written earlier.
splits = ArxivDataset.splits(TEXT, ARX_LABEL, PATH, train='trn', test='val') # dataset splits

In [53]:
# Wrap the dataset splits in a fastai TextData object with batch size bs; md2.get_model builds the classifier from it.
md2 = TextData.from_splits(PATH, splits, bs)

In [9]:
#            dropout=0.3, dropouti=0.4, wdrop=0.3, dropoute=0.05, dropouth=0.2)

In [57]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

def prec_at_6(preds, targs, min_prec=0.6, pos_cls=2):
    """Print and return the recall at the first PR-curve point with precision >= `min_prec`.

    Despite the name, the value returned is a recall, not a precision: the function
    builds the precision/recall curve for class `pos_cls` and returns the first
    entry of `recall` (in the order produced by precision_recall_curve) whose
    precision reaches `min_prec`.

    Args:
        preds: 2-D array of per-class scores; column `pos_cls` is used as the score.
        targs: array of target class indices; `targs == pos_cls` marks the positives.
        min_prec: precision threshold. Defaults to 0.6, the value previously hard-coded.
        pos_cls: index of the positive class. Defaults to 2, the value previously
            hard-coded. NOTE(review): presumably the index of 'yes' in ARX_LABEL's
            vocabulary — confirm.

    Returns:
        The selected recall value (also printed).
    """
    precision, recall, _ = precision_recall_curve(targs==pos_cls, preds[:,pos_cls])
    # Select once and reuse the value for both the print and the return.
    rec = recall[precision>=min_prec][0]
    print(rec)
    return rec

In [58]:
# dropout=0.4, dropouth=0.3, dropouti=0.65, dropoute=0.1, wdrop=0.5
m3 = md2.get_model(opt_fn, 1500, bptt, emb_sz=em_sz, n_hid=nh, n_layers=nl, 
           dropout=0.1, dropouti=0.65, wdrop=0.5, dropoute=0.1, dropouth=0.3)
m3.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
m3.clip=25.

In [61]:
# Initialise the classifier's encoder from the language-model checkpoint 'adam3_20_enc'.
m3.load_encoder(f'adam3_20_enc') # I had to copy ${PATH}/all/models/adam3_20_enc.h5 to ${PATH}/models/adam3_20_enc.h5 ...
# Three learning rates, smallest first. NOTE(review): presumably one per layer group
# (discriminative learning rates) — confirm.
lrs=np.array([1e-4,1e-3,1e-2])

In [62]:
# Two-stage fine-tuning: first train with the model frozen up to the last layer group
# (freeze_to(-1)) at half the learning rates, then unfreeze everything and train one more cycle.
m3.freeze_to(-1)
m3.fit(lrs/2, 1, metrics=[accuracy])
m3.unfreeze()
m3.fit(lrs, 1, metrics=[accuracy], cycle_len=1)

epoch      trn_loss   val_loss   accuracy                   
    0      1.166928   1.204889   0.264898  



epoch      trn_loss   val_loss   accuracy                   
    0      1.017439   1.172166   0.729651  



[1.1721665, 0.7296511627906976]

In [63]:
# Two cycles of four epochs each (8 epochs in the log below).
# NOTE(review): the save name 'imdb2' looks like a leftover from another notebook; it is also
# reused by the later fit call, so the two runs share the same checkpoint name.
m3.fit(lrs, 2, metrics=[accuracy], cycle_len=4, cycle_save_name='imdb2')

epoch      trn_loss   val_loss   accuracy                    
    0      1.0045     1.027345   0.735828  
    1      1.001093   1.098683   0.739826                    
    2      0.983971   1.007466   0.746003                    
    3      0.9753     1.098899   0.748183                    
    4      1.00253    1.216124   0.763445                    
    5      0.991026   1.153211   0.765625                    
    6      0.991722   1.077172   0.764535                    
    7      0.978311   1.115443   0.752907                    



[1.1154425, 0.752906976744186]

In [64]:
# Evaluate prec_at_6 on the model's predictions and targets. The value appears twice in the
# output because prec_at_6 prints it and the cell also displays the return value.
prec_at_6(*m3.predict_with_targs())

0.48903878583473864


0.48903878583473864

In [65]:
# Four more cycles of two epochs each (8 epochs in the log below), saved under the same
# 'imdb2' name as the previous run.
m3.fit(lrs, 4, metrics=[accuracy], cycle_len=2, cycle_save_name='imdb2')

epoch      trn_loss   val_loss   accuracy                    
    0      0.973965   0.978446   0.771802  
    1      0.976245   1.028608   0.760538                    
    2      0.980103   1.074136   0.744549                    
    3      0.973563   1.08856    0.740916                    
    4      0.96651    1.103024   0.746366                    
    5      0.98549    1.127944   0.742369                    
    6      0.987562   1.133797   0.755087                    
    7      0.987033   1.149447   0.759811                    



[1.149447, 0.7598110465116279]

In [66]:
# Re-evaluate prec_at_6 after the additional training cycles.
prec_at_6(*m3.predict_with_targs())

0.49409780775716694


0.49409780775716694