### Import Libraries

In [1]:
from IPython.display import clear_output

In [2]:
!pip install transformers
!pip install pdfplumber
!pip install Pillow==9.0.0
clear_output()

In [3]:
from transformers import pipeline
import pdfplumber
import re
from google.colab import drive

In [4]:
drive.mount('/content/drive/')

Mounted at /content/drive/


### Extract PDF

In [5]:
def extract_pdf(directory):
    all_text = ''
    with pdfplumber.open(directory) as pdf:
            for pdf_page in pdf.pages:
                single_page_text = pdf_page.extract_text()
                all_text = all_text + '\n' + single_page_text
        
    return all_text

In [71]:
doc = '/content/drive/MyDrive/Online learning and its problems in the Covid-19 emergency period.pdf'
text = extract_pdf(doc)

### Chuck Text

In [72]:
text = text.replace('.', '.<eos>')
text = text.replace('?', '?<eos>')
text = text.replace('!', '!<eos>')
text = re.sub('\\+n', ' ', text)
text = re.sub('\n'," ",text) # Remove every '\n'
text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))',' ',text) # Remove every URL
text = re.sub(':', ' ', text)
text = re.sub(';', ' ', text)
text = re.sub(',', ' ', text)
text = re.sub('\\+', ' ', text)
text = re.sub('  +', ' ', text) # Remove extra spaces
sentences = text.split('<eos>')

In [82]:
max_chunk = 200
current_chunk = 0 
chunks = []
for sentence in sentences:
    if len(chunks) == current_chunk + 1: 
        if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
            chunks[current_chunk].extend(sentence.split(' '))
        else:
            current_chunk += 1
            chunks.append(sentence.split(' '))
    else:
        print(current_chunk)
        chunks.append(sentence.split(' '))

for chunk_id in range(len(chunks)):
    chunks[chunk_id] = ' '.join(chunks[chunk_id])

0


In [83]:
len(chunks)

36

In [84]:
summarizer = pipeline('summarization')

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


In [62]:
result = summarizer(chunks, max_length=500, min_length=30, do_sample=False)

Your max_length is set to 500, but you input_length is only 276. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=138)
Your max_length is set to 500, but you input_length is only 250. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=125)
Your max_length is set to 500, but you input_length is only 262. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=131)
Your max_length is set to 500, but you input_length is only 191. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=95)
Your max_length is set to 500, but you input_length is only 212. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=106)
Your max_length is set to 500, but you input_length is only 260. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=130)
Your max_length is set to 500, but you input_length is only 260. You mi

In [89]:
result

[{'summary_text': ' Online learning and its problems in the Covid-19 emergency period . Available online at Jurnal Prima Edukasia 8 (1) 2020 86-95 . Rimba Hamid * Izlan Sentryo Sakka Hasan Universitas Halu Oleo .'},
 {'summary_text': ' Data collection techniques were those using open and closed questionnaires with 316 students of 2017 2018 and 2019 classes . Results of the study showed that PGSD FKIP UHO students in the online learning process concentrated on 3 main regencies/cities namely Kendari City Muna Regency and Konawe Selatan Regency .'},
 {'summary_text': ' The Corona Virus Disease (Covid-19) pandemic or plague has struck more than 200 countries in the world and has given its own challenges for educational institutions especially higher education . This condition requires citizens to stay at home work worship and study at home .'},
 {'summary_text': ' The level of Covid-19 spread has rapidly made every sector of human life feel its effects immediately . This condition forces e

In [90]:
summary_text = ' '.join([summ['summary_text'] for summ in result])

In [91]:
summary_text

" Online learning and its problems in the Covid-19 emergency period . Available online at Jurnal Prima Edukasia 8 (1) 2020 86-95 . Rimba Hamid * Izlan Sentryo Sakka Hasan Universitas Halu Oleo .  Data collection techniques were those using open and closed questionnaires with 316 students of 2017 2018 and 2019 classes . Results of the study showed that PGSD FKIP UHO students in the online learning process concentrated on 3 main regencies/cities namely Kendari City Muna Regency and Konawe Selatan Regency .  The Corona Virus Disease (Covid-19) pandemic or plague has struck more than 200 countries in the world and has given its own challenges for educational institutions especially higher education . This condition requires citizens to stay at home work worship and study at home .  The level of Covid-19 spread has rapidly made every sector of human life feel its effects immediately . This condition forces educational institutions including LPTK to make breakthroughs related to learning met

In [92]:
print(f"Word counts before summarization: {len(text)}")
print(f"Word counts after summarization: {len(summary_text)}")

Word counts before summarization: 41774
Word counts after summarization: 10211


### Predict

In [67]:
from IPython.display import clear_output
!pip install transformers
clear_output()

In [68]:
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import torch

In [69]:
def get_answer_using_bert(question, reference_text):
    # Load fine-tuned model for QA
    bert_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

    # Load Vocab as well
    bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

    # Apply bert_tokenizer on input text
    input_ids = bert_tokenizer.encode(question, reference_text, padding='longest', truncation=True, max_length=50, add_special_tokens=True)
    input_tokens = bert_tokenizer.convert_ids_to_tokens(input_ids)

    # Search index of first [SEP] token
    sep_location = input_ids.index(bert_tokenizer.sep_token_id)
    first_seg_len, second_seg_len = sep_location + 1, len(input_ids) - (sep_location + 1)
    seg_embedding = [0] * first_seg_len + [1] * second_seg_len

    # Run our example on model
    model_scores = bert_model(torch.tensor([input_ids]), token_type_ids=torch.tensor([seg_embedding]))
    ans_start_loc, ans_end_loc = torch.argmax(model_scores[0]), torch.argmax(model_scores[1])
    result = ' '.join(input_tokens[ans_start_loc:ans_end_loc + 1])

    # Return final result
    result = result.replace(' ##', '')
    return result

In [70]:
if __name__=="__main__":
    reference_text = summary_text
    question = input('Your question? \n \n')
    print('\n')
    print(get_answer_using_bert(question, reference_text))

Your question? 
 
what is the result of this research?




Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


online learning and its problems in the covid - 19 emergency period


### Conclusion
After hundreds of empirical research, this summarizer works well with this kinds of hyperparameters:
- max_chunk = 200
- don't add complex regex formula
- inference runtime: around 2 minutes