# 1. load training data

In [None]:
# https://drive.google.com/uc?export=download&id=1ZH3owx7KFZn36q0rEelKyn4PpUq_0H6H
from urllib.request import urlopen
from zipfile import ZipFile

zipurl = 'https://drive.google.com/uc?export=download&id=1ZH3owx7KFZn36q0rEelKyn4PpUq_0H6H'
    # Download the file from the URL
zipresp = urlopen(zipurl)
    # Create a new file on the hard drive
tempzip = open("/tmp/tempfile.zip", "wb")
    # Write the contents of the downloaded file into the new file
tempzip.write(zipresp.read())
    # Close the newly-created file
tempzip.close()
    # Re-open the newly-created file with ZipFile()
zf = ZipFile("/tmp/tempfile.zip")
    # Extract its contents into <extraction_path>
    # note that extractall will automatically create the path
zf.extractall(path = './')
    # close the ZipFile instance
zf.close()

# 2. all the source code is bundled to the cell below

In [1]:
#@title ⠀ {display-mode: "form"}

def stylize():
    "Handle dark mode"
    display(HTML('''
    <style>
    :root {
        --table_bg: #EBF8FF;
    }
    html[theme=dark] {
        --colab-primary-text-color: #d5d5d5;
        --table_bg: #2A4365;
    }
    .jupyter-widgets {
        color: var(--colab-primary-text-color);
    }
    table {
        border-collapse: collapse !important;
    }
    td {
        text-align:left !important;
        border: solid var(--table_bg) !important;
        border-width: 1px 0 !important;
        padding: 6px !important;
    }
    tr:nth-child(even) {
        background-color: var(--table_bg) !important;
    }
    .table_odd {
        background-color: var(--table_bg) !important;
        margin: 0 !important;
    }
    .table_even {
        border: solid var(--table_bg) !important;
        border-width: 1px 0 !important;
        margin: 0 !important;
    }
    .jupyter-widgets {
        margin: 6px;
    }
    .widget-html-content {
        font-size: var(--colab-chrome-font-size) !important;
        line-height: 1.24 !important;
    }
    </style>'''))

def print_html(x):
    "Better printing"
    x = x.replace('\n', '<br>')
    display(HTML(x))
        
# Check we use GPU
import torch
from IPython.display import display, HTML, Javascript, clear_output

# quick check bypass
# if not torch.cuda.is_available(): # cuda
# if torch.cuda.is_available(): # cpu
if not torch.cuda.is_available():
    print_html('Error: GPU was not found\n1/ click on the "Runtime" menu and "Change runtime type"\n'\
          '2/ set "Hardware accelerator" to "GPU" and click "save"\n3/ click on the "Runtime" menu, then "Run all" (below error should disappear)')
    raise ValueError('No GPU available')
else:
    # colab requires special handling
    try:
        import google.colab
        IN_COLAB = True
    except:
        IN_COLAB = False

    # Install dependencies (mainly for colab)
    if IN_COLAB:
        !pip install transformers
        !pip install torch wandb -qq
        !curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
        !sudo apt-get install git-lfs

    # dataset
    import os
    import json
    import numpy as np
    from torch.utils.data import Dataset
    from transformers import GPT2Tokenizer
    import pickle
    import random
    import sys
    import time
    import torch.nn.functional as F
    from tqdm import tnrange, tqdm
    from types import SimpleNamespace
    from datetime import datetime
    import numpy as np
    from transformers import GPT2Config, GPT2LMHeadModel,AdamW, GPT2Tokenizer, get_linear_schedule_with_warmup
    
    from torch.nn import CrossEntropyLoss
    import torch.nn.functional as F
    from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
    

    def add_special_tokens():
        """ Returns GPT2 tokenizer after adding separator and padding tokens """
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        special_tokens = {'pad_token':'<|pad|>','sep_token':'<|sep|>'}
        num_add_toks = tokenizer.add_special_tokens(special_tokens)
        return tokenizer

    def set_seed(args):
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        if args.n_gpu > 0:
            torch.cuda.manual_seed_all(args.seed)


    def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
        """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
            Args:
                logits: logits distribution shape (vocabulary size)
                top_k > 0: keep only top k tokens with highest probability (top-k filtering).
                top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                    Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
            From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
        """
        assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
        top_k = min(top_k, logits.size(-1))  # Safety check
        if top_k > 0:
            # Remove all tokens with a probability less than the last token of the top-k
            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
            logits[indices_to_remove] = filter_value

        if top_p > 0.0:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

            # Remove tokens with cumulative probability above the threshold
            sorted_indices_to_remove = cumulative_probs > top_p
            # Shift the indices to the right to keep also the first token above the threshold
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0

            indices_to_remove = sorted_indices[sorted_indices_to_remove]
            logits[indices_to_remove] = filter_value
        return logits


    def sample_seq(model, context, length, device, temperature=1, top_k=0, top_p=0.0):
        """ Generates a sequence of tokens 
            Args:
                model: gpt/gpt2 model
                context: tokenized text using gpt/gpt2 tokenizer
                length: length of generated sequence.
                device: torch.device object.
                temperature >0: used to control the randomness of predictions by scaling the logits before applying softmax.
                top_k > 0: keep only top k tokens with highest probability (top-k filtering).
                top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
        """

        context = torch.tensor(context, dtype=torch.long, device=device)
        context = context.unsqueeze(0)
        generated = context
        with torch.no_grad():  
            for _ in tnrange(length):
                inputs = {'input_ids': generated}
                outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
                next_token_logits = outputs[0][0, -1, :] / temperature
                filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
        return generated


    def beam_search(model, context, length, beam_size, device, temperature=1):
        """ Generate sequence using beam search https://machinelearningmastery.com/beam-search-decoder-natural-language-processing/
            Args:
                model: gpt/gpt2 model
                context: tokenized text using gpt/gpt2 tokenizer
                length: length of generated sequence.
                beam_size: >=1 and <= total_no_of_tokens
                device: torch.device object.
                temperature >0: used to control the randomness of predictions by scaling the logits before applying softmax.
        """
        context = torch.tensor(context, dtype=torch.long, device=device)
        context = context.unsqueeze(0)
        with torch.no_grad():  
            inputs = {'input_ids': context}
            outputs = model(**inputs) 
            next_token_logits = outputs[0][0, -1, :] / temperature
            next_token_probs = F.softmax(next_token_logits)
            scores, indices = torch.topk(next_token_probs, beam_size)
            indices = indices.tolist()
            sequences = [[c] for c in indices]
            for _ in tnrange(length-1):
                logits = torch.zeros(beam_size*len(next_token_logits))
                for j in range(len(sequences)):
                    new_generated = torch.cat((context,torch.tensor([sequences[j]], dtype=torch.long, device=device)),dim=1)
                    inputs = {'input_ids': new_generated}
                    outputs = model(**inputs) 
                    next_token_logits = outputs[0][0, -1, :] / temperature
                    next_token_probs = F.softmax(next_token_logits)
                    start, stop = j*len(next_token_logits), (j+1)*len(next_token_logits)
                    logits[start:stop] = scores[j]*next_token_probs
                scores, new_logits_indices = torch.topk(logits,beam_size)
                logits = (new_logits_indices%50259).tolist()
                for j in range(len(sequences)):
                    sequences[j] = sequences[j]+[logits[j]]
        return scores, sequences


    def generate_beam_sample(data, tokenizer, model, num=1, length=100, beam_size=3, device=torch.device('cuda')):
        """ Generate summaries for "num" number of articles using beam search.
            Args:
                data = GPT21024Dataset object
                tokenizer = gpt/gpt2 tokenizer
                num = number of articles for which summaries has to be generated
        """
        for i in range(num):
            sample = data[i]
            idx = sample['sum_idx']
            context = sample['article'][:idx].tolist()
            summary = sample['article'][idx+1:][:100].tolist()
            scores, sequences = beam_search(model, context, length, beam_size, device)
            print('new_article', end='\n\n')
            print(tokenizer.decode(context[:-1]), end='\n\n')
            print('actual_summary', end='\n\n')
            print(tokenizer.decode(summary), end='\n\n')
            for i in range(len(sequences)):
                text = tokenizer.convert_ids_to_tokens(sequences[i],skip_special_tokens=True)
                text = tokenizer.convert_tokens_to_string(text)  
                print("generated_summary-{} and Score is {}.".format(i+1, scores[i]), end='\n\n')
                print(text, end='\n\n')


    def generate_sample(data, tokenizer, model, num=1, eval_step=False, length=100, temperature=1, top_k=10, top_p=0.5, device=torch.device('cuda')):
        """ Generate summaries for "num" number of articles.
            Args:
                data = GPT21024Dataset object
                tokenizer = gpt/gpt2 tokenizer
                model = gpt/gpt2 model
                num = number of articles for which summaries has to be generated
                eval_step = can be True/False, checks generating during evaluation or not
        """
        for i in range(num):
            sample = data[i]
            idx = sample['sum_idx']
            context = sample['article'][:idx].tolist()
            summary = sample['article'][idx+1:][:100].tolist()
            generated_text = sample_seq(model, context, length, device, temperature, top_k, top_p)
            generated_text = generated_text[0, len(context):].tolist()
            text = tokenizer.convert_ids_to_tokens(generated_text,skip_special_tokens=True)
            text = tokenizer.convert_tokens_to_string(text)
            if eval_step==False:
                print('new_article', end='\n\n')
                print(tokenizer.decode(context), end='\n\n')
                print("generated_summary", end='\n\n')
                print(text, end='\n\n')
                print('actual_summary', end='\n\n')
                print(tokenizer.decode(summary), end='\n\n')
            else:
                print(tokenizer.decode(context), end='\n\n')
                print("generated_summary", end='\n\n')

    # dataset
    class GPT21024Dataset(Dataset):

        def __init__(self, root_dir, ids_file, mode='train',length=None):
            self.root_dir = root_dir
            self.tokenizer = add_special_tokens()

            # with open(ids_file,'r') as f:
                # if mode=='train':
                #     self.idxs = np.array(json.load(f)['train_ids'])
                # elif mode=='valid':
                #     self.idxs = np.array(json.load(f)['valid_ids'])
                # elif mode=='test':
                #     self.idxs = np.array(json.load(f)['test_ids'])

                # self.idxs = self.idxs -min(self.idxs)

            self.idxs = os.listdir(root_dir)
            self.mode = mode
            if len == None:
                self.len = len(self.idxs)
            else:
                self.len = length

        def __len__(self):
            return self.len

        def __getitem__(self,idx):

            if self.mode=='valid':
                idx = self.idxs[-idx]
            elif self.mode=='test':
                idx = self.idxs[-idx-self.len]   #assuming valid and test set of same sizes
            else:
                idx = self.idxs[idx]
            # file_name = os.path.join(self.root_dir,str(idx)+".json")
            file_name = os.path.join(self.root_dir,str(idx))
            with open(file_name,'r') as f:
                  data = json.load(f)
            text = self.tokenizer.encode(self.tokenizer.pad_token)*1024
            content = data['article'] + self.tokenizer.encode(self.tokenizer.sep_token) + data['abstract']
            text[:len(content)] = content
            text = torch.tensor(text)
            sample = {'article': text, 'sum_idx': len(data['article'])}
            return sample
        
    def train(args, model, tokenizer, train_dataset, valid_dataset, ignore_index):
        """ Trains GPT2 model and logs necessary details.
            Args:
                args: dict that contains all the necessary information passed by user while training
                model: finetuned gpt/gpt2 model
                tokenizer: GPT/GPT2 tokenizer
                train_dataset: GPT21024Dataset object for training data
                ignore_index: token not considered in loss calculation
        """
        # writer = SummaryWriter('./logs')
        train_sampler = RandomSampler(train_dataset)
        train_dl = DataLoader(train_dataset,sampler=train_sampler,batch_size=args.batch_size,num_workers=args.num_workers)
        loss_fct = CrossEntropyLoss(ignore_index=ignore_index) #ignores padding token for loss calculation
        optimizer = AdamW(model.parameters(),lr=args.lr)
        scheduler = scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps = 80000)
        # WarmupLinearSchedule(optimizer,100,80000)


        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = tnrange(int(args.num_train_epochs), desc="Epoch")
        set_seed(args)
        for _ in train_iterator:
            epoch_iterator = tqdm(train_dl, desc="Training")
            for step, batch in enumerate(epoch_iterator):
                inputs, labels = torch.tensor(batch['article']), torch.tensor(batch['article'])
                inputs = inputs.to(args.device)
                labels = labels.to(args.device)
                model.train()
                logits = model(inputs)[0]
                idx = batch['sum_idx'].item() # index of separator token
                # only consider loss on reference summary just like seq2seq models
                shift_logits = logits[..., idx:-1, :].contiguous()
                shift_labels = labels[..., idx+1:].contiguous()
                loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
                loss = loss/args.gradient_accumulation_steps
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1
                    # writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    # writer.add_scalar('loss', (tr_loss - logging_loss)/args.gradient_accumulation_steps, global_step)
                    logging_loss = tr_loss
                    print("loss:", loss.item(), end='\n\n')
                    if (step + 1)/args.gradient_accumulation_steps == 1.0:
                        print('After 1st update: ', end='\n\n')
                        generate_sample(valid_dataset, tokenizer, model, num=2, eval_step=False,device=args.device)


                if (step + 1) % (10*args.gradient_accumulation_steps) == 0:
                    results = evaluate(args, model, valid_dataset, ignore_index, global_step)
                    # for key, value in results.items():
                    #     writer.add_scalar('eval_{}'.format(key), value, global_step)
                    print('After', global_step+1,'updates: ', end='\n\n')
                    generate_sample(valid_dataset, tokenizer, model, num=2, eval_step=True,device=args.device)



    def evaluate(args, model, eval_dataset, ignore_index, global_step=None):
        """ Returns perplexity score on validation dataset.
            Args:
                args: dict that contains all the necessary information passed by user while training
                model: finetuned gpt/gpt2 model
                eval_dataset: GPT21024Dataset object for validation data
                global_step: no. of times gradients have backpropagated
                ignore_index: token not considered in loss calculation
        """
        if not os.path.exists(args.output_dir):
            os.mkdir(args.output_dir)
        eval_output_dir = args.output_dir

        results = {}
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.batch_size)
        loss_fct = CrossEntropyLoss(ignore_index=ignore_index) #ignores padding token for loss calculation

        eval_loss = 0.0
        nb_eval_steps = 0
        model.eval()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            inputs, labels = torch.tensor(batch['article']).to(args.device), torch.tensor(batch['article']).to(args.device)

            with torch.no_grad():
                logits = model(inputs)[0]
                idx = batch['sum_idx'].item() # index of separator token
                # only consider loss on reference summary just like seq2seq models
                shift_logits = logits[..., idx:-1, :].contiguous()
                shift_labels = labels[..., idx+1:].contiguous()
                lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
                eval_loss += lm_loss.mean().item()
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        perplexity = torch.exp(torch.tensor(eval_loss))

        result = {
            "perplexity": perplexity
        }
        print("perplexity:", perplexity.item())

        if global_step:
            output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
            with open(output_eval_file, "a") as f:
                for key in sorted(result.keys()):
                    f.write('\n\n')
                    f.write("time = %s, %s = %s, step = %s\n" % (datetime.now().strftime("%d/%m/%Y %H:%M:%S"), key, str(result[key]), str(global_step)))
        return result


# 3. training loop
# all parameters are stored in argsDict {}
# model is saved at the end of training

In [None]:
# train
argsDict = { 'lr': 5e-5, 'seed': 42, 'n_gpu': 1, 'gradient_accumulation_steps': 32, 'batch_size': 1,
        'num_workers': 4, 'device': torch.device('cuda'), 'num_train_epochs': 1, 'output_dir': 'output',
        'model_dir': 'weights', 'fp16': True, 'fp16_opt_level': 'O0', 'max_grad_norm': 1.0,
        'root_dir': 'CNN/gpt2_1024_data', 'ids_file': 'CNN/ids.json'
       }
args = SimpleNamespace(**argsDict)


train_data = GPT21024Dataset('CNN/gpt2_1024_data','CNN/ids.json',mode='train',length=3000) #training on only 3000 datasets
valid_data = GPT21024Dataset('CNN/gpt2_1024_data','CNN/ids.json',mode='valid',length=500)  #validation on only 500 datasets
tokenizer = add_special_tokens()
ignore_idx = tokenizer.pad_token_id
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))
model.to(args.device)

start = time.time()
train(args, model, tokenizer, train_data, valid_data, ignore_idx)
print('total time: ', (time.time()-start)/60, " minutes", end='\n\n')

print('Saving trained model...')
model_file = os.path.join('model_{}_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.bin'.format(args.fp16_opt_level,3000,args.num_train_epochs))
config_file = os.path.join('config_{}_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.json'.format(args.fp16_opt_level,3000,args.num_train_epochs))
torch.save(model.state_dict(), model_file)
model.config.to_json_file(config_file)

# ############################################################
# 4. play with model, 
# can jump to this step if training is already done
# ############################################################

In [12]:
argsDictTest = { 'seed': 42,
        'num_workers': 4, 'device': torch.device('cuda'), 'num_train_epochs': 1, 'output_dir': 'output',
        'model_dir': 'weights',
        'root_dir': 'CNN/gpt2_1024_data', 'ids_file': 'CNN/ids.json'
       }
argsTest = SimpleNamespace(**argsDictTest)

In [7]:
from transformers import GPT2Config, GPT2LMHeadModel

# using the same validation and training data as during training
tokenizer = add_special_tokens()
# train_data = GPT21024Dataset(args.root_dir,args.ids_file,mode='train',length=3000)
# valid_data = GPT21024Dataset(args.root_dir,args.ids_file,mode='valid',length=500)
test_data = GPT21024Dataset(argsTest.root_dir,argsTest.ids_file,mode='test',length=500)


In [13]:
model_file = "weights/model_O0_data3000_trained_after_5_epochs_only_sum_loss_ignr_pad.bin"
config_file = "weights/config_O0_data3000_trained_after_5_epochs_only_sum_loss_ignr_pad.json"

config = GPT2Config.from_json_file(config_file)
model = GPT2LMHeadModel(config)
state_dict = torch.load(model_file)
model.load_state_dict(state_dict)
model.eval()
model.to(argsTest.device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [15]:
generate_sample(test_data, tokenizer, model, num=2, length=100, temperature=1, top_k=10, top_p=0.5, device=argsTest.device)

  for _ in tnrange(length):


  0%|          | 0/100 [00:00<?, ?it/s]

new_article

-LRB- CNN -RRB- -- The number of birth defects in China are on the rise and the rate has nearly doubled in the past decade in Beijing and several provinces, a state-run newspaper reported Tuesday. The increase is mainly due to improved diagnostic techniques and monitoring capability, as well as women waiting until they are older to have children, China Daily reported, citing the Beijing municipal health bureau. Environmental pollution could also be a factor, the newspaper said, quoting Caijing magazine. The rate of birth defects in Beijing last year was 170 per 10,000 births, nearly twice the rate in 1997, when it was 90 birth defects per 10,000 births, China Daily reported, using figures from the health bureau. The most common birth defects in Beijing's Shunyi district are congenital heart disease, excessive fingers or toes, cleft lip or palate, and neural tube defects, a regional reproductive health officer told China Daily. A rise in birth defects was also seen in the p

  0%|          | 0/100 [00:00<?, ?it/s]

new_article

Goma, Democratic Republic of Congo -LRB- CNN -RRB- -- The United Nations and Congolese army are sending additional soldiers to the troubled regional capital of Goma, a U.N. spokesman said Wednesday. `` Troops are being deployed at the town entrance, on the road that goes to Rutshuru, '' said Alexander Essome, MONUSCO regional spokesperson. MONUSCO, the U.N. peacekeeping mission in Democratic Republic of Congo, `` has been reinforced with extra troops in the area, '' Essome said. Rutshuru, 60 kilometers -LRB- 37 miles -RRB- north of Goma, was taken by M23 rebels on July 1, but they retreated within days and the Congolese regular army moved back into the town Wednesday. The Congolese army is dispatching a U.S. trained elite battalion to Goma, where most shops were closed Wednesday because of threats from M23 to take Goma. The M23, linked to Laurent Nkunda's CNDP rebel group, still controls the town of Bunagana. Goma is in on the border with Rwanda on Lake Kivu.

generated_su

In [16]:
generate_beam_sample(test_data, tokenizer, model, num=2, length=100, beam_size=3, device=argsTest.device)

  next_token_probs = F.softmax(next_token_logits)
  for _ in tnrange(length-1):


  0%|          | 0/99 [00:00<?, ?it/s]

  next_token_probs = F.softmax(next_token_logits)


new_article

-LRB- CNN -RRB- -- The number of birth defects in China are on the rise and the rate has nearly doubled in the past decade in Beijing and several provinces, a state-run newspaper reported Tuesday. The increase is mainly due to improved diagnostic techniques and monitoring capability, as well as women waiting until they are older to have children, China Daily reported, citing the Beijing municipal health bureau. Environmental pollution could also be a factor, the newspaper said, quoting Caijing magazine. The rate of birth defects in Beijing last year was 170 per 10,000 births, nearly twice the rate in 1997, when it was 90 birth defects per 10,000 births, China Daily reported, using figures from the health bureau. The most common birth defects in Beijing's Shunyi district are congenital heart disease, excessive fingers or toes, cleft lip or palate, and neural tube defects, a regional reproductive health officer told China Daily. A rise in birth defects was also seen in the p

  0%|          | 0/99 [00:00<?, ?it/s]

new_article

Goma, Democratic Republic of Congo -LRB- CNN -RRB- -- The United Nations and Congolese army are sending additional soldiers to the troubled regional capital of Goma, a U.N. spokesman said Wednesday. `` Troops are being deployed at the town entrance, on the road that goes to Rutshuru, '' said Alexander Essome, MONUSCO regional spokesperson. MONUSCO, the U.N. peacekeeping mission in Democratic Republic of Congo, `` has been reinforced with extra troops in the area, '' Essome said. Rutshuru, 60 kilometers -LRB- 37 miles -RRB- north of Goma, was taken by M23 rebels on July 1, but they retreated within days and the Congolese regular army moved back into the town Wednesday. The Congolese army is dispatching a U.S. trained elite battalion to Goma, where most shops were closed Wednesday because of threats from M23 to take Goma. The M23, linked to Laurent Nkunda's CNDP rebel group, still controls the town of Bunagana. Goma is in on the border with Rwanda on Lake Kivu

actual_summar

## Download An Article Given A Query

In [17]:
from googlesearch import search
def sentences_from_query(query):
    # Get url
    if query.startswith("http"):
        url = query
    else:
        url = search(query, num_results=1)[0]
    print(url)
    page = requests.get(url).text
    soup = BeautifulSoup(page)
    # Get text from all <p> tags.
    p_tags = soup.find_all('p')
    # Get the text from each of the "p" tags and strip surrounding whitespace.
    p_tags_text = " ".join([tag.get_text().strip() for tag in p_tags])
    return p_tags_text

ModuleNotFoundError: No module named 'googlesearch'

In [None]:
article = sentences_from_query("neural embedding")
article = tokenizer.encode(article)[:900]

In [None]:
generated_text = sample_seq(model, article, 50, args.device, temperature=1, top_k=10, top_p=0.5)
generated_text = generated_text[0, len(article):].tolist()
text = tokenizer.convert_ids_to_tokens(generated_text,skip_special_tokens=True)
text = tokenizer.convert_tokens_to_string(text)

In [None]:
print("Article: \n")
print(tokenizer.decode(article))
print("------------------------------------------------------------ \n")
print("Generated Summary: \n")
print(text)