### Import Libraries

In [1]:
from IPython.display import clear_output

In [2]:
!pip install transformers
!pip install pdfplumber
!pip install Pillow==9.0.0
clear_output()

In [3]:
from transformers import pipeline
import pdfplumber
import re
from google.colab import drive

In [4]:
drive.mount('/content/drive/')

Mounted at /content/drive/


### Extract PDF

In [5]:
def extract_pdf(directory):
    all_text = ''
    with pdfplumber.open(directory) as pdf:
            for pdf_page in pdf.pages:
                single_page_text = pdf_page.extract_text()
                all_text = all_text + '\n' + single_page_text
        
    return all_text

In [74]:
doc = '/content/drive/MyDrive/Online learning and its problems in the Covid-19 emergency period.pdf'
text = extract_pdf(doc)

### Chuck Text

In [50]:
text = text.replace('.', '.<eos>')
text = text.replace('?', '?<eos>')
text = text.replace('!', '!<eos>')
text = re.sub('\\+n', ' ', text)
text = re.sub('\n'," ",text) # Remove every '\n'
text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))',' ',text) # Remove every URL
text = re.sub(':', ' ', text)
text = re.sub(';', ' ', text)
text = re.sub(',', ' ', text)
text = re.sub('\\+', ' ', text)
text = re.sub('  +', ' ', text) # Remove extra spaces
sentences = text.split('<eos>')

In [59]:
max_chunk = 100
current_chunk = 0 
chunks = []
for sentence in sentences:
    if len(chunks) == current_chunk + 1: 
        if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
            chunks[current_chunk].extend(sentence.split(' '))
        else:
            current_chunk += 1
            chunks.append(sentence.split(' '))
    else:
        print(current_chunk)
        chunks.append(sentence.split(' '))

for chunk_id in range(len(chunks)):
    chunks[chunk_id] = ' '.join(chunks[chunk_id])

0


In [60]:
len(chunks)

76

In [61]:
summarizer = pipeline('summarization')

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


In [62]:
result = summarizer(chunks, max_length=500, min_length=30, do_sample=False)

Your max_length is set to 500, but you input_length is only 105. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Your max_length is set to 500, but you input_length is only 127. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)
Your max_length is set to 500, but you input_length is only 106. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)
Your max_length is set to 500, but you input_length is only 38. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)
Your max_length is set to 500, but you input_length is only 137. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=68)
Your max_length is set to 500, but you input_length is only 136. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=68)
Your max_length is set to 500, but you input_length is only 126. You might co

In [73]:
result[20]['summary_text']

' Data collection related to distribution of 316 PGSD FKIP UHO students during the online learning process in Covid-19 period spread across 17 Regencies / Cities in Southeast Sulawesi . Figure 1 shows the distribution of students at each location of domicile .'

In [75]:
summary_text = ' '.join([summ['summary_text'] for summ in result])

In [76]:
summary_text

' Online learning and its problems in the Covid-19 emergency period Rimba Hamid . Available online at Jurnal Prima Edu-asia 8 (1) 2020 86-95 .  This research aims to obtain an in-depth description of distribution of students of PGSD FKIP UHO . It aims to get a description about (1) distribution of the students of (1), (2) infrastructure support for the effectiveness of online learning in the Covid-19 period and (3) students\' perceptions about online learning .  Data collection techniques were those using open and closed questionnaires with 316 students of 2017 2018 and 2019 classes who filled out questionnaires online . Data obtained from students in the form of . raw data collected online and converted in Excel format .  The data were then processed based on the focus of this study . An in-depth descriptive quantitative and qualitative analysis was carried out . Based on the results of the processed data an analysis was conducted .  The study showed that PGSD FKIP UHO students in the

In [None]:
!pip install transformers
!pip install pdfplumber
clear_output()

In [77]:
print(f"Word counts before summarization: {len(text)}")
print(f"Word counts after summarization: {len(summary_text)}")

Word counts before summarization: 42650
Word counts after summarization: 19417


### Predict

In [78]:
from IPython.display import clear_output
!pip install transformers
clear_output()

In [79]:
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import torch

In [80]:
def get_answer_using_bert(question, reference_text):
    # Load fine-tuned model for QA
    bert_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

    # Load Vocab as well
    bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

    # Apply bert_tokenizer on input text
    input_ids = bert_tokenizer.encode(question, reference_text, padding='longest', truncation=True, max_length=50, add_special_tokens=True)
    input_tokens = bert_tokenizer.convert_ids_to_tokens(input_ids)

    # Search index of first [SEP] token
    sep_location = input_ids.index(bert_tokenizer.sep_token_id)
    first_seg_len, second_seg_len = sep_location + 1, len(input_ids) - (sep_location + 1)
    seg_embedding = [0] * first_seg_len + [1] * second_seg_len

    # Run our example on model
    model_scores = bert_model(torch.tensor([input_ids]), token_type_ids=torch.tensor([seg_embedding]))
    ans_start_loc, ans_end_loc = torch.argmax(model_scores[0]), torch.argmax(model_scores[1])
    result = ' '.join(input_tokens[ans_start_loc:ans_end_loc + 1])

    # Return final result
    result = result.replace(' ##', '')
    return result

In [86]:
if __name__=="__main__":
    reference_text = summary_text
    question = input('Your question? \n \n')
    print('\n')
    print(get_answer_using_bert(question, reference_text))

Your question? 
 
what is the result of this research?




Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


online learning and its problems in the covid - 19 emergency period
