In [None]:
from local.torch_basics import *
from local.test import *
from local.core import *
from local.layers import *
from local.data.all import *
from local.notebook.showdoc import show_doc
from local.optimizer import *
from local.learner import *
from local.metrics import *
from local.text.core import *
from local.text.data import *
from local.text.models.core import *
from local.text.models.awdlstm import *
from local.callback.rnn import *
from local.callback.all import *

In [None]:
# all_slow

# Integration test on Wikitext-2

> Training a Language Model on WT2

## Data

In [None]:
# Locate the dataset identified by URLs.WIKITEXT_TINY; `path` is the folder from which
# train.txt / valid.txt / test.txt are read in the cells below.
# NOTE(review): presumably untar_data downloads and extracts the archive on first use — confirm.
path = untar_data(URLs.WIKITEXT_TINY)

The dataset comes with all the articles concatenated. We split them to be able to shuffle at the beginning of each epoch.

In [None]:
def istitle(line):
    "Whether `line` is an article heading of the form ` = Title = ` (no `=` allowed inside the title)."
    # Sub-headings such as ` = = Section = = ` contain extra `=` signs and therefore do not match.
    return re.search(r'^ = [^=]* = $', line) is not None

def read_file(filename):
    "Read `filename` and return an `L` of articles, each one a list of tokens obtained by splitting on single spaces."
    with open(filename, encoding='utf8') as f:
        lines = f.readlines()
    articles, pending = L(), []
    # Lines at or beyond this index have no two-line look-ahead available.
    lookahead_limit = len(lines) - 2
    for idx, raw in enumerate(lines):
        # Swap the dataset's '<unk>' marker for the library's UNK token.
        pending.append(raw.replace('<unk>', UNK))
        # An article ends when the next line is the ' \n' separator and the one after it is a title.
        next_is_new_article = (
            idx < lookahead_limit
            and lines[idx+1] == ' \n'
            and istitle(lines[idx+2])
        )
        if next_is_new_article:
            articles.append(''.join(pending).split(' '))
            pending = []
    # Whatever remains after the last boundary is the final article.
    articles.append(''.join(pending).split(' '))
    return articles

Then we put our list of tokenized texts together in an `LMDataLoader`. It will return tuples of sequences of length `seq_len`, the second sequence being the first one shifted by one position to the right.

In [None]:
# Parse each split into a collection of articles; every article is a list of
# space-separated tokens (see read_file above).
trn_txt = read_file(path/'train.txt')
val_txt = read_file(path/'valid.txt')
tst_txt = read_file(path/'test.txt')

In [None]:
# Stack the articles with the validation ones first, so the split cell further down can
# address them as the leading len(val_txt) rows of the frame.
# NOTE(review): articles generally differ in length; recent NumPy releases may refuse to build
# arrays from ragged sequences — confirm this still runs on the NumPy version in use.
all_texts = np.concatenate([val_txt, trn_txt, tst_txt])
df = pd.DataFrame({'texts':all_texts})
df.head()

Unnamed: 0,texts
0,"[, \n, =, Homarus, gammarus, =, \n, \n, Homarus, gammarus, ,, known, as, the, European, lobster, or, common, lobster, ,, is, a, species, of, xxunk, lobster, from, the, eastern, Atlantic, Ocean, ,, Mediterranean, Sea, and, parts, of, the, Black, Sea, ., It, is, closely, related, to, the, American, lobster, ,, H., americanus, ., It, may, grow, to, a, length, of, 60, cm, (, 24, in, ), and, a, mass, of, 6, kilograms, (, 13, lb, ), ,, and, bears, a, conspicuous, pair, of, claws, ., In, life, ,, the, lobsters, are, blue, ,, only, becoming, "", lobster, red, "", on, ...]"
1,"[, \n, =, Frank, xxunk, =, \n, \n, Air, Vice, Marshal, Frank, xxunk, ,, CB, ,, CBE, (, 15, July, 1914, –, 23, December, 1976, ), was, a, senior, commander, in, the, Royal, Australian, Air, Force, (, RAAF, ), ., Born, and, educated, in, Tasmania, ,, he, joined, the, RAAF, as, an, air, cadet, in, January, 1934, ., He, specialised, in, flying, instruction, and, navigation, before, the, outbreak, of, World, War, II, ., In, April, 1941, ,, he, became, commanding, officer, of, No., 2, Squadron, ,, which, operated, Lockheed, xxunk, ., The, squadron, was, deployed, to, Dutch, Timor, in, December, ..."
2,"[, \n, =, M, @-@, 82, (, Michigan, highway, ), =, \n, \n, M, @-@, 82, is, a, state, trunkline, in, the, Lower, Peninsula, in, the, US, state, of, Michigan, that, travels, between, xxunk, and, Howard, City, ., The, section, between, xxunk, and, Howard, City, travels, through, xxunk, and, along, the, southern, edge, of, xxunk, National, Forest, ., The, current, version, of, M, @-@, 82, is, actually, the, second, in, the, state, ;, the, first, usage, appeared, in, the, Upper, Peninsula, by, 1919, ., The, Lower, Peninsula, routing, has, been, in, use, since, the, 1920s, ., Various, extensions,..."
3,"[, \n, =, xxunk, xxunk, =, \n, \n, xxunk, xxunk, (, xxunk, xxunk, ,, xxunk, xxunk, ), is, a, fictional, character, in, the, xxunk, manga, and, anime, series, created, by, xxunk, xxunk, ., In, the, anime, and, manga, ,, xxunk, is, a, ninja, affiliated, with, the, village, of, xxunk, ., He, is, a, member, of, Team, 10, ,, a, group, of, ninja, consisting, of, himself, ,, xxunk, xxunk, ,, xxunk, xxunk, ,, and, team, leader, xxunk, xxunk, ., xxunk, is, portrayed, as, a, lazy, character, ,, unwilling, to, apply, his, prodigious, intelligence, ;, xxunk, has, noted, that, he, likes, xxunk, ...]"
4,"[, \n, =, Meridian, ,, Mississippi, =, \n, \n, Meridian, is, the, sixth, largest, city, in, the, state, of, Mississippi, ,, in, the, United, States, ., It, is, the, county, seat, of, Lauderdale, County, and, the, principal, city, of, the, Meridian, ,, Mississippi, xxunk, Statistical, Area, ., Along, major, highways, ,, the, city, is, 93, mi, (, 150, km, ), east, of, Jackson, ,, Mississippi, ;, 154, mi, (, xxunk, km, ), west, of, Birmingham, ,, Alabama, ;, 202, mi, (, 325, km, ), northeast, of, New, Orleans, ,, Louisiana, ;, and, 231, mi, (, 372, km, ), southeast, of, ...]"


In [None]:
#df_tok,count = tokenize_df(df, ['texts'])

In [None]:
# Count every token across all rows of `df` (validation, train and test articles).
# A generator expression feeds Counter directly, so no temporary list holding every
# token of the corpus is built.
count = Counter(tok for article in df["texts"].values for tok in article)
# NOTE(review): make_vocab presumably applies frequency/size cut-offs and adds special tokens — confirm.
vocab = make_vocab(count)

In [None]:
# Rows of `df` are ordered validation, train, test, so:
#   splits[0] (training)   -> every row after the validation articles (train AND test texts)
#   splits[1] (validation) -> the first len(val_txt) rows
splits = [list(range(len(val_txt), len(df))), list(range(len(val_txt)))]
# Reuse the vocab built in the previous cell instead of calling make_vocab a second time, so the
# numericalizer and the model (sized with len(vocab)) share one vocabulary object.
tfm = Numericalize(vocab)

In [None]:
# Wrap the token lists in a DataSource with `tfm` as its only transform; the subsets defined
# by `splits` are accessed below as dsrc.train and dsrc.valid.
dsrc = DataSource(df["texts"].values, [tfm], splits=splits)

In [None]:
# bs: training batch size (the validation loader uses twice that); sl: value passed as seq_len.
bs,sl = 104,72
# Only the training loader is shuffled. Both loaders pass Cuda() as their after_batch step
# and use 8 workers.
# NOTE(review): Cuda() presumably requires a GPU to be available — confirm before running on CPU.
train_dl = LMDataLoader(dsrc.train, bs=bs,   seq_len=sl, after_batch=[Cuda()], shuffle=True, num_workers=8)
valid_dl = LMDataLoader(dsrc.valid, bs=2*bs, seq_len=sl, after_batch=[Cuda()], num_workers=8)

In [None]:
# Bundle the training and validation loaders, then display a sample batch as a sanity check.
dbch = DataBunch(train_dl, valid_dl)
dbch.show_batch()

Unnamed: 0,text
0,"\n = French cruiser Sully = \n \n The French cruiser Sully was an armored cruiser of the Gloire class that was built for the French Navy in the early 1900s . She was named in honor of Maximilien de xxunk , Duke of Sully , trusted minister of King Henry IV . The ship struck a rock in xxunk Long Bay , French Indochina in 1905 , only eight months"
1,"2005 . The song was written by band members Beyoncé , Kelly Rowland and Michelle Williams along with Rodney "" xxunk "" Jerkins , Ricky "" Ric Rude "" Lewis and Robert Waller , with Beyoncé , Rude and Jerkins handling its production . An R & B ballad talking about a woman 's desire to cater to the male love interest of her life , "" Cater 2 U "" contains"
2,"first time in life . \n Pokiri was D 'Cruz 's breakthrough film in Telugu . In June 2006 , Trade analyst Sridhar xxunk said that the Andhra Pradesh trade felt that her glamour , screen presence , and on @-@ screen chemistry with Mahesh worked to the film 's advantage . xxunk called her the "" new pin @-@ up girl of Telugu cinema "" . Talking about being typecast after"
3,"Victoria appointed her president of the Queen Victoria Jubilee Institute for xxunk in Scotland , the beginning of the district nurse system , which was to xxunk health care for the rural poor and sick in Britain . She was also interested in general improvements in standards of nursing . \n Like many of her Rothschild relatives she was also deeply involved with the welfare of young working @-@ class women of"
4,"Resources confirmed a cougar sighting in Michigan 's Upper Peninsula . Typically , extreme @-@ range sightings of cougars involve young males , which can travel great distances to establish ranges away from established males ; all four confirmed cougar kills in Iowa since 2000 involved males . \n On April 14 , 2008 , police shot and killed a cougar on the north side of Chicago , Illinois . DNA tests"
5,"expected to be completed for many decades . \n Chess playing programs xxunk and Deep Thought defeated chess masters in 1989 . Both were developed by Carnegie xxunk University ; Deep Thought development paved the way for the Deep Blue . \n \n = = = The money returns : the fifth generation project = = = \n \n In 1981 , the Japanese Ministry of International Trade and Industry set aside"
6,"successful independence from Spain . The city served as the main port for the Texas Navy during the Texas Revolution , and later served as the capital of the Republic of Texas . \n During the 19th century , Galveston became a major U.S. commercial center and one of the largest ports in the United States . It was devastated by the 1900 Galveston Hurricane , whose effects included flooding and a"
7,"architectural style is Georgian . In the Mendip district , the greatest concentrations of these cluster around the cathedral and xxunk in Wells and in Glastonbury . North Somerset features bridges and piers along with a selection of Manor houses . The Sedgemoor district has many buildings related to trade and commerce centered on Bridgwater ; while in South Somerset xxunk , xxunk and xxunk predominate . Taunton Deane includes the defensive"
8,the small harbours at Port @-@ en @-@ Bessin and xxunk @-@ sur @-@ Mer . Most shipments were brought in over the beaches until the port of Cherbourg was cleared of mines and obstructions on 16 July . The most important use of the Mulberry harbour was the unloading of heavy machinery that could not be brought across the beaches . Artificial xxunk ( xxunk ) sheltered hundreds of ships during
9,"to add a musical tone within the poem . \n In terms of poetic meter , Keats relies on xxunk throughout his 1819 odes and in just over 8 % of his lines within "" Ode to a Nightingale "" , including line 12 : \n and line 25 : \n To Walter Jackson Bate , the use of spondees in lines 31 – 34 creates a feeling of slow flight ,"


## Model

In [None]:
# Work on a copy so the update below does not modify awd_lstm_lm_config itself.
config = awd_lstm_lm_config.copy()
# Override the five `*_p` entries (presumably dropout probabilities — confirm against AWD_LSTM).
config.update({'input_p': 0.6, 'output_p': 0.4, 'weight_p': 0.5, 'embed_p': 0.1, 'hidden_p': 0.2})
# The vocabulary size is passed as the second argument, so the model is sized from `vocab`.
model = get_language_model(AWD_LSTM, len(vocab), config=config)

In [None]:
# Optimizer factory: Adam with wd=0.1 and eps=1e-7 pre-bound.
opt_func = partial(Adam, wd=0.1, eps=1e-7)
# Callback factories: MixedPrecision with clip=0.1 and RNNTrainer with alpha=2, beta=1.
# NOTE(review): clip is presumably a gradient-clipping threshold and alpha/beta the
# activation-regularization weights — confirm in the callback definitions.
cb_funcs = [partial(MixedPrecision, clip=0.1), partial(RNNTrainer, alpha=2, beta=1)]

In [None]:
# Assemble the learner from the data, model, loss, optimizer factory and callback factories;
# accuracy and perplexity are the metrics reported in the training table.
learn = Learner(dbch, model, loss_func=CrossEntropyLossFlat(), opt_func=opt_func, cb_funcs=cb_funcs, metrics=[accuracy, Perplexity()])

In [None]:
# Quick integration check: a single one-cycle epoch. The 90-epoch run with the same settings
# is kept commented out in the last cell.
# NOTE(review): 5e-3 is presumably the peak learning rate of the cycle — confirm in fit_one_cycle.
learn.fit_one_cycle(1, 5e-3, moms=(0.8,0.7,0.8), div=10)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,7.002256,6.007985,0.170773,406.662903,00:27


Full training

In [None]:
#learn.fit_one_cycle(90, 5e-3, moms=(0.8,0.7,0.8), div=10)