In [1]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

In [2]:
from fastai.text.all import *

In [3]:
print(torch.backends.mps.is_built()) # Apple M-series metal-performance-shaders-framework
print(torch.backends.mps.is_available()) # Apple M-series metal-performance-shaders-framework

mps_device = default_device()
print(mps_device)

True
True
mps


In [5]:
path = untar_data(URLs.IMDB) # https://docs.fast.ai/data.external.html

In [6]:
path.ls()
(path/'train').ls()

(#5) [Path('/Users/kristianbotnen/.fastai/data/imdb/train/.DS_Store'),Path('/Users/kristianbotnen/.fastai/data/imdb/train/neg'),Path('/Users/kristianbotnen/.fastai/data/imdb/train/pos'),Path('/Users/kristianbotnen/.fastai/data/imdb/train/unsupBow.feat'),Path('/Users/kristianbotnen/.fastai/data/imdb/train/labeledBow.feat')]

<img src="imdb_finderview.png" alt="IMDB dataset on disk" width="200"/>
<img src="imdb_observationexample.png" alt="IMDB dataset example" height="200"/>

In [7]:
import shutil
#from pathlib import Path

def create_subset(src, dest, num_samples=256):
    dest.mkdir(parents=True, exist_ok=True)
    files = list(src.glob('*'))[:num_samples]
    for file in files:
        shutil.copy(file, dest/file.name)


train_unsup = path/'unsup'
train_pos = path/'train'/'pos'
train_neg = path/'train'/'neg'
test_pos = path/'test'/'pos'
test_neg = path/'test'/'neg'

# Create subset directories
top_datapath = path.parent
subset_path = top_datapath/'subset'

(subset_path/'unsup').mkdir(parents=True, exist_ok=True)
(subset_path/'train'/'pos').mkdir(parents=True, exist_ok=True)
(subset_path/'train'/'neg').mkdir(parents=True, exist_ok=True)
(subset_path/'test'/'pos').mkdir(parents=True, exist_ok=True)
(subset_path/'test'/'neg').mkdir(parents=True, exist_ok=True)

# Copy files to subset directories
create_subset(train_unsup, subset_path/'unsup')
create_subset(train_pos, subset_path/'train'/'pos')
create_subset(train_neg, subset_path/'train'/'neg')
create_subset(test_pos, subset_path/'test'/'pos')
create_subset(test_neg, subset_path/'test'/'neg')

<img src="imdb_subset_finderview.png" alt="IMDB dataset on disk" width="200"/>

In [8]:
# Prepare the dataset. Both the training set and the validation set.
datablock = DataBlock(
    blocks=(TextBlock.from_folder(subset_path), CategoryBlock), # Input is text, Output is categories (positive / negative).
    get_items=get_text_files, # Get text files in path recursively, only in folders, if specified.
    splitter=GrandparentSplitter(valid_name='test'), # Split items from the grand parent folder names (train_name and valid_name).
    get_y=parent_label, # Label item with the parent folder name.
)

dataloaders = datablock.dataloaders(subset_path, bs=16, device=mps_device) # https://docs.fast.ai/data.transforms.html

In [9]:
datablock.summary(subset_path)

Setting-up type transforms pipelines
Collecting items from /Users/kristianbotnen/.fastai/data/subset
Found 1536 items
2 datasets of sizes 512,512
Setting up Pipeline: Tokenizer -> Numericalize
Setting up Pipeline: parent_label -> Categorize -- {'vocab': None, 'sort': True, 'add_na': False}

Building one sample
  Pipeline: Tokenizer -> Numericalize
    starting from
      /Users/kristianbotnen/.fastai/data/subset/train/neg/1821_4.txt
    applying Tokenizer gives
      ['xxbos', 'xxmaj', 'working', 'with', 'one', 'of', 'the', 'best', 'xxmaj', 'shakespeare', 'sources', ',', 'this', 'film', 'manages', 'to', 'be', 'creditable', 'to', 'it', "'s", 'source', ',', 'whilst', 'still', 'appealing', 'to', 'a', 'wider', 'audience', '.', '\n\n', 'xxmaj', 'branagh', 'steals', 'the', 'film', 'from', 'under', 'xxmaj', 'fishburne', "'s", 'nose', ',', 'and', 'there', "'s", 'a', 'talented', 'cast', 'on', 'good', 'form', '.']
    applying Numericalize gives
      TensorText of size 54
  Pipeline: parent_lab

In [10]:
dataloaders.show_batch(max_n=3)

Unnamed: 0,text,category
0,"xxbos xxmaj that word ' true ' in this film 's title got my alarm xxunk xxunk . xxmaj they rang xxunk when a title card xxunk to xxmaj america 's xxmaj civil xxmaj war as the ' war xxmaj between the xxmaj states ' ( the xxunk preferred by die - hard xxunk ) . xxmaj jesse xxmaj james -- thief , slave - xxunk and murderer -- is described as a quiet , gentle farm boy . \n\n xxmaj how dishonest is this movie ? xxmaj there is xxup no mention of slavery , far less of the documented fact that xxmaj jesse xxmaj james 's poor xxunk mother owned xxunk before the war , and that xxmaj jesse and his brother xxmaj frank actively fought to xxunk slavery . xxmaj according to this movie , all those xxmaj civil xxmaj war soldiers were really fighting to decide",neg
1,"xxbos xxmaj this 1996 movie was the first adaptation of xxmaj jane xxmaj eyre that i ever watched and when i did so i was xxunk by it . xxmaj so much of the novel had been left out and i considered xxmaj william xxmaj hurt to be terribly miscast as xxmaj rochester . xxmaj since then i have watched all the other noteworthy adaptations of the novel , the three short versions of ' xxunk , ' 70 and ' 97 and the three mini series of ' xxunk , ' xxunk and 2006 , and i have noticed that there are worse adaptations and worse xxmaj xxunk . \n\n xxmaj this is without doubt the most exquisite xxmaj jane xxmaj eyre adaptation as far as cinematography is concerned . xxmaj director xxmaj franco xxmaj xxunk xxunk in beautiful long shots of snow falling from a winter sky ,",neg
2,"xxbos xxmaj at the xxunk of the ' celebrity xxmaj big xxmaj brother ' racism row in 2007 ( involving xxmaj xxunk xxmaj xxunk and the late xxmaj xxunk xxmaj goody ) , i condemned on an internet forum those ' xxunk . ' fans who xxunk the show , after years of xxunk ' racist ' ' 70 's sitcoms such as ' xxunk & xxmaj chips ' & ' love xxmaj xxunk xxmaj xxunk ' . i thought they were being hypocritical , and said so . ' it xxmaj ai n't xxmaj half xxmaj hot xxmaj mum ' was then thrown into the argument , with some pointing out it had starred an xxmaj english actor xxunk - up . xxmaj well , yes , but xxmaj michael xxmaj bates had lived in xxmaj india as a boy , and spoke xxmaj xxunk xxunk . xxmaj the",pos


In [11]:
print(type(dataloaders))
print(len(dataloaders))
print(len(dataloaders.train_ds), len(dataloaders.valid_ds))

for i, sample in enumerate(dataloaders.train_ds):
    print(sample)
    if i == 2:
        break

<class 'fastai.data.core.DataLoaders'>
2
512 512
(TensorText([   2,    8,  739,   30,   44,   14,    9,  138,    8, 3284, 5455,
              11,   20,   32,  866,   15,   43,    0,   15,   18,   23, 3134,
              11, 1661,  150, 3073,   15,   13,    0,  313,   10,   25,    8,
               0, 1483,    9,   32,   53,  454,    8,    0,   23, 3883,   11,
              12,   56,   23,   13, 1352,  184,   36,   68,  711,   10]), TensorCategory(0))
(TensorText([   2,    8,   88,   71, 5475,    7,   19,   11,    9,  218,  677,
             152,   17, 2834,   12,   19,  305,    9,   27,  199,  653,   15,
             126,   10,  206,   11,   47, 6665,   15,  117, 5475,    7, 1241,
              12,    5,  156,   19,   10,    8, 1795,   87,   11,  179,  115,
             677,  173, 2941,  208,  125,   47, 2083,    9,   98,   44,   11,
              19,  490,   11, 2096,    0,    5,  156,   72,    8,  172,   11,
              91,    8,  559,  330,   16, 3120,   14, 6666,    9, 1067,   22

## Train and tune our model

In [12]:
# Train and tune our model.
learn = text_classifier_learner(dataloaders, AWD_LSTM, drop_mult=0.5, metrics=accuracy)

  wgts = torch.load(wgts_fname, map_location = lambda storage,loc: storage)


In [13]:
learn.fine_tune(4, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.604538,0.605831,0.673828,00:30


epoch,train_loss,valid_loss,accuracy,time
0,0.470686,0.502423,0.767578,00:27
1,0.38016,0.396072,0.832031,00:20
2,0.293321,0.376568,0.833984,00:20
3,0.205844,0.353899,0.841797,00:20


In [14]:
learn.show_results()

Unnamed: 0,text,category,category_
0,"xxbos i really wanted to be able to give this film a 10 . xxmaj i 've long thought it was my favorite of the four modern live - action xxmaj batman films to date ( and maybe it still will be -- i have yet to watch the xxmaj schumacher films again ) . xxmaj i 'm also starting to become concerned about whether xxmaj i 'm somehow xxunk being xxunk . xxmaj you see , i always liked the xxmaj schumacher films . xxmaj as far as i can remember , they were either xxunk or xxunk to me . xxmaj but the conventional wisdom is that the two xxmaj tim xxmaj burton directed films are far superior . i had serious problems with the first xxmaj burton xxmaj batman this time around -- i ended up giving it a 7 - xxunk xxunk as i might ,",pos,pos
1,"xxbos i wrote this as a two part review . xxmaj part two has spoilers . \n\n xxmaj part 1 : \n\n xxmaj no , this is n't that one about the sex with car xxunk . xxmaj this is the one about racism in xxup l.a . xxmaj you know , the one where everybody is a racist , and race is the topic on everybody 's mind at all times . xxmaj race . \n\n xxmaj its like the movie has a form of xxunk xxunk where race is the constant theme . xxmaj race . xxmaj racist . xxmaj racism . xxmaj race xxmaj relations . xxmaj xxunk race . \n\n xxmaj paul xxmaj xxunk made a movie which took the structure of xxmaj magnolia , which was used to show the xxunk of people who are xxunk connected , and then screwed it into a xxunk",neg,neg
2,"xxbos a film that tends to get buried under xxunk and xxunk - xxmaj it 's a remake ! xxmaj doris xxmaj day is in it ! xxmaj she sings ! - xxmaj hitchcock 's second crack at ' the xxmaj man xxmaj who xxmaj knew xxmaj too xxmaj much ' is his most under - rated film , and arguably a fully xxunk masterpiece in its own right . \n\n xxmaj this is , in more ways than one , xxmaj doris xxmaj day 's film . xxmaj not only does she give the finest performance of her career , more than holding her own against xxmaj james xxmaj stewart , but the whole film is subtly structured around her character rather than his . xxmaj this is , after all , a film in which music is both xxunk and plot device . xxmaj what better casting than",pos,pos
3,"xxbos xxmaj wrestlemania 2 is the only xxmaj xxunk xxunk to be held at three different locations , and xxmaj while it was an interesting idea , it did n't really work . xxmaj there are only really two matches that really struck out , with the rest being decent , or most of them , pretty terrible . xxmaj there are some entertaining celebrity 's on hand , like xxmaj susan xxmaj saint xxmaj james , xxmaj ray xxmaj charles and xxmaj xxunk xxmaj xxunk , but the experience was a waste of time for the most part . xxmaj the xxmaj british xxmaj xxunk xxmaj vs xxmaj the xxmaj dream xxmaj team match , is worth the price of admission itself , but you can honestly see that anywhere . \n\n xxmaj matches . \n\n xxmaj xxunk xxmaj xxunk . \n\n xxmaj paul xxmaj xxunk xxmaj vs xxmaj",neg,neg
4,"xxbos xxmaj xxunk xxmaj xxunk has become famous to the world after his marvelous production xxup the xxup xxunk . xxmaj movie fans got to know the style of the director who introduced himself as one among the post war new xxunk , an aristocrat who developed his individual free thinking and , xxunk , expressed them as an artist . xxmaj however , when applied to this movie , xxup morte a xxup venezia based upon the novel by xxmaj thomas xxmaj mann , it 's a slightly different story . \n\n xxmaj the entire film is , at first view , so unique , so psychological and so much influenced by the various thoughts of an artist ( both director and main character xxmaj gustav von xxmaj xxunk ) that it seems to be "" unwatchable "" for many viewers . xxmaj therefore , such opinions about the",pos,pos
5,"xxbos xxmaj talk xxmaj radio sees a man somewhat accidentally stumble through life , indeed the xxmaj american xxmaj dream , from whatever xxunk - standard and everyday job he has in a store ; to xxunk of a local radio show before going right the way through to the same job only later xxunk nationwide . xxmaj it 's a role he adopts out of his own xxunk and natural mannerisms , a xxunk mad approach to freedom of speech as he attacks just about everyone and everything , even those that often call up to agree with him or compliment him . xxmaj his role as a man that xxunk on all things good , evil , right , wrong , political , religious , moral and immoral is something that people seem to take to in one form ; that of ' it 's entertaining and worth",pos,pos
6,"xxbos "" man of the xxmaj year "" tells the story of xxmaj tom xxmaj dobbs ( robin xxmaj williams ) a political comedian ( like xxmaj jon xxmaj stewart or xxmaj stephen xxmaj xxunk ) who has his own television show . xxmaj on his show he talks about all sorts of things but his main focus are political issues which he is very xxunk about . xxmaj one day on his show , a fan from the audience raises the idea that xxmaj dobbs should run for xxmaj president of the xxmaj united xxmaj states . xxmaj after that episode aired , millions xxunk to the web to create various xxunk and voice their opinions on why xxmaj dobbs would make a great candidate for the xxmaj president for the xxmaj united xxmaj states . a few weeks later , xxmaj dobbs decides to run for xxmaj president",pos,pos
7,"xxbos xxmaj yeah , a long time ago it turned into a tourist attraction . xxmaj now it 's a prison again . xxmaj kind of . xxmaj well , it 's more like an xxunk mixed together with a junior high school but there are lots of guys running around wearing orange xxunk , so i guess in that way it 's like a prison . xxmaj not really though . xxmaj when xxmaj xxunk , xxmaj steven xxmaj seagal 's character , is being admitted into prison , he 's standing xxunk in line and wanders over to a different line so he can talk to his friend , like he 's in line for the security check at the xxunk . xxmaj then before too long he and his friend are throwing punches , xxunk around a couple of security xxunk . \n\n xxmaj let me tell",neg,pos
8,"xxbos xxmaj this is an excellent example of an xxunk bad b - movie . xxmaj there are worse movies than this one ( titanic for example ) , but this definitely shares the pile of steaming crap movies . \n\n xxup ok this was apparently shot in xxmaj kansas xxmaj city , which explains why everyone is so lame . xxmaj the main guy looks like xxmaj steve xxmaj guttenberg , and is even more lame than him ! i did n't even think that was possible ! xxmaj in fact , him and the main girl in the movie are responsible for the xxup worst xxup drama xxup ever ! xxmaj its not just that there acting was w xxrep 9 a y over - dramatic , well actually it was , of course the script was terrible which combines for a deadly one - two punch in",neg,neg


In [16]:
# Use our model by passing it a review.
category,_,probs = learn.predict("I really liked that movie")

print(f"This is a: {category}.")
print(f"Probability it's a positive: {probs[1]:.4f}")

category,_,probs = learn.predict("I did not like that movie, it was awful. It was the worst thing I have ever seen")

print(f"This is a: {category}.")
print(f"Probability it's a positive: {probs[1]:.4f}")

This is a: pos.
Probability it's a positive: 0.8641


This is a: neg.
Probability it's a positive: 0.3807


<img src="cpu_gpu_belastning.png" alt="CPU og GPU belastning" width="800"/>

## ULMFiT

<img src="ulmfit.png" alt="ULMFiT process" width="800"/>

In [17]:
dataloaders_lm = TextDataLoaders.from_folder(subset_path/'unsup', is_lm=True, valid_pct=0.1)

In [18]:
dataloaders_lm.show_batch(max_n=3)

Unnamed: 0,text,text_
0,"xxbos xxmaj this xxunk is the worst show i have ever seen on xxup tv . xxmaj ever . xxmaj and i watch a lot of xxup tv . xxmaj it basically deals with a bunch of trashy , low class and xxunk xxunk women xxunk for the "" love "" of xxunk up xxunk xxmaj xxunk xxmaj xxunk . i hope most people watch it for its xxunk or even xxunk","xxmaj this xxunk is the worst show i have ever seen on xxup tv . xxmaj ever . xxmaj and i watch a lot of xxup tv . xxmaj it basically deals with a bunch of trashy , low class and xxunk xxunk women xxunk for the "" love "" of xxunk up xxunk xxmaj xxunk xxmaj xxunk . i hope most people watch it for its xxunk or even xxunk xxunk"
1,"xxunk without easily being xxunk . \n\n i wonder how many more times i will watch "" the xxmaj age of xxmaj xxunk "" before i xxunk being exposed to xxmaj hollywood 's xxunk century xxunk , such as "" xxunk xxmaj day "" or "" wild , xxmaj wild xxmaj west "" . xxmaj all i know is that xxmaj ellen xxmaj olenska ( as one of my favorite cinematic xxunk","without easily being xxunk . \n\n i wonder how many more times i will watch "" the xxmaj age of xxmaj xxunk "" before i xxunk being exposed to xxmaj hollywood 's xxunk century xxunk , such as "" xxunk xxmaj day "" or "" wild , xxmaj wild xxmaj west "" . xxmaj all i know is that xxmaj ellen xxmaj olenska ( as one of my favorite cinematic xxunk )"
2,films . \n\n xxmaj my favourite director is xxmaj john xxmaj xxunk xxmaj my favourite actor is xxmaj chow yun - fat . i like films with xxunk xxunk xxunk . i like long action sequences . i like a slight bit of martial arts but not too much . xxmaj good photography . xxmaj the sense that the director gave a damn about the film . xxmaj the sense that the,. \n\n xxmaj my favourite director is xxmaj john xxmaj xxunk xxmaj my favourite actor is xxmaj chow yun - fat . i like films with xxunk xxunk xxunk . i like long action sequences . i like a slight bit of martial arts but not too much . xxmaj good photography . xxmaj the sense that the director gave a damn about the film . xxmaj the sense that the actors


In [19]:
llm_learn = language_model_learner(dataloaders_lm, AWD_LSTM, metrics=[accuracy, Perplexity()], path=subset_path/'unsup', wd=0.1)

  wgts = torch.load(wgts_fname, map_location = lambda storage,loc: storage)


In [20]:
llm_learn.fit_one_cycle(4, 1e-2) # 0.01 | https://iconof.com/1cycle-learning-rate-policy/

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.261617,3.936362,0.268338,51.231895,00:05
1,4.099275,3.833188,0.271326,46.209633,00:03
2,3.956356,3.793528,0.272511,44.412815,00:03
3,3.847576,3.783887,0.271635,43.986694,00:03


In [21]:
llm_learn.save('4epoch')
# llm_learn = llm_learn.load('1epoch')

Path('/Users/kristianbotnen/.fastai/data/subset/unsup/models/4epoch.pth')

In [22]:
llm_learn.unfreeze()
llm_learn.fit_one_cycle(10, 1e-3) # 0.001 | https://iconof.com/1cycle-learning-rate-policy/

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.659177,3.765779,0.275541,43.197353,00:03
1,3.619452,3.734732,0.279622,41.876816,00:03
2,3.555174,3.709059,0.279731,40.815399,00:03
3,3.468799,3.706579,0.285298,40.714272,00:03
4,3.385066,3.70383,0.285549,40.602505,00:03
5,3.296635,3.73615,0.280691,41.936234,00:03
6,3.206934,3.744673,0.280599,42.295174,00:03
7,3.123483,3.761913,0.278713,43.030655,00:03
8,3.06274,3.762731,0.279681,43.065891,00:03
9,3.013705,3.763774,0.279372,43.110836,00:03


In [23]:
llm_learn.save_encoder('10epoch_finetuned')

In [30]:
print(llm_learn.predict("The man is a good", 50, temperature=0.75))

The man is a good guy and has a lot of passion for merit , love and love . He has a great time and finds some good friends and family , but he has no great experience in it . He has a lot of have to do with some sort of


In [31]:
the_best_review_starts_with = "I liked this movie because: "
n_words = 40
n_sentences = 2
preds = [llm_learn.predict(the_best_review_starts_with, n_words, temperature=0.75) 
         for _ in range(n_sentences)]

In [32]:
print(preds)

['i liked this movie because : " it \'s so hard to believe that this movie was made by a filmmaker . It was a matter of fact , but because of the length , it was not a movie or a movie .', "i liked this movie because : i have a great idea of how this film could deal with family and family . It is a fascinating topic , because it has a lot of love and it 's not really quite an appropriate thing for"]


<img src="ulmfit.png" alt="ULMFiT process" width="800"/>

## Skip this part?

<img src="nevralt_nettverk.png" alt="Nevralt nettverk" width="600"/>

In [33]:
dataloaders_classifier = TextDataLoaders.from_folder(subset_path, valid='test', text_vocab=dataloaders_lm.vocab)

In [34]:
learn_2pass = text_classifier_learner(dataloaders_classifier, AWD_LSTM, drop_mult=0.5, metrics=accuracy)

  wgts = torch.load(wgts_fname, map_location = lambda storage,loc: storage)


In [35]:
encoder_path = subset_path/'unsup/models'
learn_2pass = learn_2pass.load_encoder(encoder_path/'10epoch_finetuned')

  wgts = torch.load(join_path_file(file,self.path/self.model_dir, ext='.pth'), map_location=device)


In [36]:
learn_2pass.fit_one_cycle(1, 2e-2) # 0.02 | https://iconof.com/1cycle-learning-rate-policy/

epoch,train_loss,valid_loss,accuracy,time
0,0.58931,0.628925,0.599609,00:16


In [None]:
#print(slice(1e-2/(2.6**4),1e-2))
#print(slice(5e-3/(2.6**4),5e-3))
#print(slice(1e-3/(2.6**4),1e-3))

In [37]:
learn_2pass.freeze_to(-2) # Last two layers
learn_2pass.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2)) # epoch, lr group 0 (body), lr group 1 (head)

epoch,train_loss,valid_loss,accuracy,time
0,0.464235,0.574443,0.697266,00:06


In [38]:
learn_2pass.freeze_to(-3) # Last three layers
learn_2pass.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3)) # epoch, lr group 0 (body), lr group 1 (head)

epoch,train_loss,valid_loss,accuracy,time
0,0.335465,0.50359,0.75,00:09


In [39]:
learn_2pass.unfreeze() # All layers
learn_2pass.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3)) # epoch, lr group 0 (body), lr group 1 (head)

epoch,train_loss,valid_loss,accuracy,time
0,0.23793,0.440052,0.791016,00:10
1,0.218334,0.41825,0.814453,00:10


In [40]:
# Use our model by passing it a review.
category,_,probs = learn_2pass.predict("I really liked that movie")

print(f"This is a: {category}.")
print(f"Probability it's a positive: {probs[1]:.4f}")

category,_,probs = learn_2pass.predict("I did not like that movie, it was awful")

print(f"This is a: {category}.")
print(f"Probability it's a positive: {probs[1]:.4f}")

This is a: pos.
Probability it's a positive: 0.9358


This is a: neg.
Probability it's a positive: 0.1404
