In [None]:
# Adapted from this guide: https://towardsdatascience.com/fine-tuning-the-bart-large-model-for-text-summarization-3c69e4c04582

In [None]:
##Import and prepare the data

!pip install ohmeow-blurr -q
!pip install bert-score -q

import pandas as pd
from fastai.text.all import *
from transformers import *
from blurr.data.all import *
from blurr.modeling.all import *

[K     |████████████████████████████████| 91 kB 5.3 MB/s 
[K     |████████████████████████████████| 1.2 MB 33.7 MB/s 
[K     |████████████████████████████████| 43 kB 2.0 MB/s 
[K     |████████████████████████████████| 298 kB 68.9 MB/s 
[K     |████████████████████████████████| 189 kB 60.9 MB/s 
[K     |████████████████████████████████| 3.3 MB 63.9 MB/s 
[K     |████████████████████████████████| 56 kB 5.9 MB/s 
[K     |████████████████████████████████| 61 kB 558 kB/s 
[K     |████████████████████████████████| 596 kB 69.4 MB/s 
[K     |████████████████████████████████| 3.3 MB 83.5 MB/s 
[K     |████████████████████████████████| 895 kB 60.2 MB/s 
[K     |████████████████████████████████| 132 kB 76.9 MB/s 
[K     |████████████████████████████████| 1.1 MB 50.2 MB/s 
[K     |████████████████████████████████| 243 kB 82.8 MB/s 
[K     |████████████████████████████████| 271 kB 90.5 MB/s 
[K     |████████████████████████████████| 160 kB 81.7 MB/s 
[K     |███████████████████████

In [None]:
# `google.colab` is provided by the Colab runtime itself, so it is imported
# directly; pip-installing it is redundant there.
from google.colab import files

# Prompt for a local file and keep the upload result for the loading cell,
# which looks the file up by name.
uploaded = files.upload()



Saving Datafiniti_Hotel_Reviews.csv to Datafiniti_Hotel_Reviews.csv


In [None]:
import io

# Parse the uploaded CSV from memory, skipping malformed rows instead of raising.
# `on_bad_lines='skip'` is the supported spelling of the old `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv(io.BytesIO(uploaded['Datafiniti_Hotel_Reviews.csv']), on_bad_lines='skip')

# Drop rows with a missing value in ANY column, then renumber rows from 0.
# `drop=True` avoids materialising the old index as a throw-away column.
# NOTE(review): dropna() runs before the column selection, so rows are also lost
# for gaps in columns that are never used — confirm this is intended.
df = df.dropna().reset_index(drop=True)

# Select part of data we want to keep (explicit copy so the assignment below
# works on an independent frame rather than a slice of the full table)
df = df[['reviews.title', 'reviews.text']].copy()

# Clean text: strip newline characters with a literal (non-regex) replacement
df['reviews.text'] = df['reviews.text'].str.replace('\n', '', regex=False)

# Select only part of it (makes testing faster)
reviews = df.head(150)
reviews.head()

Unnamed: 0,reviews.title,reviews.text
0,Best romantic vacation ever!!!!,Our experience at Rancho Valencia was absolutely perfect from beginning to end!!!! We felt special and very happy during our stayed. I would come back in a heart beat!!!
1,Sweet sweet serenity,Amazing place. Everyone was extremely warm and welcoming. We've stayed at some top notch places and this is definitely in our top 2. Great for a romantic getaway or take the kids along as we did. Had a couple stuffed animals waiting for our girls upon arrival. Can't wait to go back.
2,Amazing Property and Experience,"We booked a 3 night stay at Rancho Valencia to play some tennis, since it is one of the highest rated tennis resorts in America. This place is really over the top from a luxury standpoint and overall experience. The villas are really perfect, the staff is great, attention to details (includes fresh squeezed orange juice each morning), restaurants, bar and room service amazing, and the tennis program was really impressive as well. We will want to come back here again."
3,"Never again...beware, if you want sleep.",Currently in bed writing this for the past hr 1/2 there have been dogs barking and squealing call the front desk to advise basically to be told there's nothing they can do. 315.00 and I can't sleep.
4,ALWAYS GREAT STAY...,I live in Md and the Aloft is my Home away from home...we stayed 1 night 7-7-16 ...Staff is great ! Especially Olivia who was Extra special because she remembered me by my voice over the phone ...which tells me she is very alert and pays attention to the customer their needs.AND SHE DID ! Thumbs up... More


In [None]:
##Import the model 

In [None]:
# Checkpoint to fine-tune; the call below is unpacked into the architecture
# name, config, tokenizer and model objects used throughout the notebook.
pretrained_model_name = "facebook/bart-large-cnn"
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(
    pretrained_model_name,
    model_cls=BartForConditionalGeneration,
)

# Text-generation settings attached to the batch transform (one option per line).
summarization_gen_kwargs = {
    'max_length': 400,
    'min_length': 2,
    'do_sample': False,
    'early_stopping': True,
    'num_beams': 4,
    'temperature': 1.0,
    'top_k': 50,
    'top_p': 1.0,
    'repetition_penalty': 1.0,
    'bad_words_ids': None,
    'bos_token_id': 0,
    'pad_token_id': 1,
    'eos_token_id': 2,
    'length_penalty': 2.0,
    'no_repeat_ngram_size': 3,
    'encoder_no_repeat_ngram_size': 0,
    'num_return_sequences': 1,
    'decoder_start_token_id': 2,
    'use_cache': True,
    'num_beam_groups': 1,
    'diversity_penalty': 0.0,
    'output_attentions': False,
    'output_hidden_states': False,
    'output_scores': False,
    'return_dict_in_generate': False,
    'forced_bos_token_id': 0,
    'forced_eos_token_id': 2,
    'remove_invalid_values': False,
}

# Create mini-batch and define parameters
hf_batch_tfm = HF_Seq2SeqBeforeBatchTransform(
    hf_arch, hf_config, hf_tokenizer, hf_model,
    task='summarization',
    text_gen_kwargs=summarization_gen_kwargs,
)

# Prepare data for training: review text is the input, review title the target,
# with a random train/validation split and mini-batches of 4.
blocks = (HF_Seq2SeqBlock(before_batch_tfm=hf_batch_tfm), noop)
dblock = DataBlock(
    blocks=blocks,
    get_x=ColReader('reviews.text'),
    get_y=ColReader('reviews.title'),
    splitter=RandomSplitter(),
)
dls = dblock.dataloaders(reviews, batch_size=4)

Downloading:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

In [None]:
##Define performance metrics

In [None]:
# Metrics computed during validation: which scores to request from each metric
# ('compute_kwargs') and which of the returned values to report ('returns').
seq2seq_metrics = {
    'rouge': {
        'compute_kwargs': {'rouge_types': ["rouge1", "rouge2", "rougeL"], 'use_stemmer': True},
        'returns': ["rouge1", "rouge2", "rougeL"],
    },
    'bertscore': {
        # The reviews and titles are English, so BERTScore must be told 'en'
        # (it was 'fr', which scores the text with the wrong language setting).
        'compute_kwargs': {'lang': 'en'},
        'returns': ["precision", "recall", "f1"],
    },
}

# Model: wrap the Hugging Face model for use with a fastai Learner
model = HF_BaseModelWrapper(hf_model)
learn_cbs = [HF_BaseModelCallback]
fit_cbs = [HF_Seq2SeqMetricsCallback(custom_metrics=seq2seq_metrics)]

# Specify training: ranger optimizer, flattened cross-entropy loss,
# architecture-aware parameter groups, mixed precision
learn = Learner(
    dls, model,
    opt_func=ranger,
    loss_func=CrossEntropyLossFlat(),
    cbs=learn_cbs,
    splitter=partial(seq2seq_splitter, arch=hf_arch),
).to_fp16()

# Create optimizer with default hyper-parameters, then freeze so that only the
# last parameter group is trained
learn.create_opt()
learn.freeze()

# Training: 10 epochs on the one-cycle schedule; the metrics callback is
# attached for this fit only
learn.fit_one_cycle(10, lr_max=3e-5, cbs=fit_cbs)

Downloading:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.93k [00:00<?, ?B/s]

epoch,train_loss,valid_loss,rouge1,rouge2,rougeL,bertscore_precision,bertscore_recall,bertscore_f1,time
0,7.306691,7.291654,0.081358,0.018182,0.062886,0.606304,0.681205,0.640762,00:35
1,6.519368,5.754456,0.095573,0.044444,0.087457,0.649204,0.692784,0.66931,00:09
2,5.489066,3.916157,0.148594,0.061905,0.148622,0.68244,0.703391,0.691322,00:08
3,4.092442,2.57983,0.159392,0.1,0.162857,0.712842,0.713495,0.712015,00:07
4,3.026924,2.400629,0.16159,0.1,0.164762,0.719901,0.722446,0.720141,00:07
5,2.306442,2.3968,0.143155,0.061905,0.133619,0.692136,0.704806,0.697541,00:07
6,1.820355,2.444309,0.114785,0.066667,0.114688,0.704545,0.711809,0.707202,00:07
7,1.492552,2.573729,0.132639,0.066667,0.132418,0.700808,0.711499,0.705012,00:08
8,1.293742,2.539728,0.161831,0.1,0.163147,0.71122,0.715143,0.711951,00:07
9,1.170886,2.5377,0.161831,0.1,0.163147,0.71168,0.715919,0.712579,00:07


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

In [None]:
# Show the text of the first review (row label 0) as a reference input.
df.loc[0, 'reviews.text']

'Our experience at Rancho Valencia was absolutely perfect from beginning to end!!!! We felt special and very happy during our stayed. I would come back in a heart beat!!!'

In [None]:
# Generate a title for the first review using the generation settings already
# configured on the batch transform.
# NOTE(review): passing ad-hoc kwargs here (previously `early_stopping=False,
# num_return_sequences=1`) appears to replace, not merge with, those configured
# settings, which let generation run on into long runs of punctuation —
# verify against blurr's `blurr_generate` implementation.
sample_review = df['reviews.text'][0]
outputs = learn.blurr_generate(sample_review)

# Print every returned sequence with a 1-based counter.
for idx, o in enumerate(outputs):
    print(f'=== Prediction {idx+1} ===\n{o}\n')

=== Prediction 1 ===
 I would come back to Rancho Valencia in a heart beat!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"!!!!!!!!"!!!!!!!"!!!!!!!!!"......!!!!"!!!!!!!!!!"......!!!!!!!!!!"!!!!!!"!!!!"........!!!!".....!!!!".......!!!!!!! "!!!!!!!!!"



In [None]:
# Sanity check: push one mini-batch through the wrapped model.
b = dls.one_batch()
xb = b[0]  # first element of the batch is what the model is called with
preds = learn.model(xb)

# Display how many entries the output holds, its first entry, and the shape of
# its second entry.
(len(preds), preds[0], preds[1].shape)

(4,
 tensor(1.1299, device='cuda:0', grad_fn=<NllLossBackward0>),
 torch.Size([4, 23, 50264]))

In [None]:
# Tabulate input text, target title and predicted title for up to 10 samples.
learn.show_results(
    learner=learn,
    max_n=10,
)


Unnamed: 0,text,target,prediction
0,"Loved this place! If you are looking for a chain hotel, THIS IS NOT for you. If you are looking for different, unique, friendly, fun...then this is it. We chose the little room with Queen bed, no windows..and it was GREAT. Best bed I've slept in while traveling in a long long time. Although there are internal hotel noises every once and a while..it was by no means disturbing or loud. Location was awesome! We mainly hung out on Decatour street/Jackson square/French market..but even bourbon street was a short walk. Hotel staff is very knowledgeable about places to eat and things to do. We didn't drive here so no idea about parking or valet. Staff was extremely friendly. It's just a super little gem in a crazy town.",Hotel with Personality!,Best bed I've slept in while traveling in a long long time.
1,"Bad: The bed wasn't that comfy, which was surprising given all of the other amazing details. Sheets were rough/stiff- not cozy. Could be easily fixed, though!. Good: This was by far one of the best hotel experiences I've ever had. Check-in and check-out was super fast, staff was phenomenal, and the room was very intuitively set-up with thoughtful details technology. It was clear that Virgin isn't just a typical hotel. The chamber set-up of the room allowed for privacy comfort, even in a relatively small space. Snacks mini-bar had 7-11 prices. Everything was super clean, well serviced and wonderful. In short: STAY HERE!",Impeccable customer service overall experience,"Great Staff, Amazing Experience"
2,Would be nice if the duvet were in a duvet cover as I just get tangled between the flat sheets and the comforter Gym could really do with some more variety in terms of machinery - Its impossible to do much leg work as there is no squat rack with a bar - or even a smith machine which has multiple purposes Super helpful staff 24/7 - very accommodating of my unusual hours Room service menu is amazing Views are amazing Cerise Bar on the roof is epic and the bartenders are amazing I will be coming back and SOON!,Honestly one of the best stays I've ever had (and I travel a lot!),Amazing hotel and experience
3,"Awesome hotel. Virgin really took the time to figure out all things at are frustrating about staying in a hotel and fixed them - kudos!!! The hotel will become a regular haunt in the future. From free wifi, free cocktails, a mini bar you're not terrified to touch, not to mention the no check out time, they really are way ahead of the rest. Oh by the way did I mention the really cool room... Awesome! Highly recommend the hotel. From young teens to my parents in their 70's the hotel met our every need.",Awesome hotel.,"Free wifi, free cocktails, a mini bar you're not terrified to touch..."
