# **TITLE GENERATION FROM SIOP ABSTRACTS MODEL**

# Install the Libraries to Train a T5 Model

In [None]:
!pip install transformers --quiet 
!pip install sentencepiece --quiet

[K     |████████████████████████████████| 4.4 MB 14.5 MB/s 
[K     |████████████████████████████████| 596 kB 84.3 MB/s 
[K     |████████████████████████████████| 6.6 MB 66.8 MB/s 
[K     |████████████████████████████████| 101 kB 13.2 MB/s 
[K     |████████████████████████████████| 1.2 MB 14.9 MB/s 
[?25h

# Set the model to use the GPU when Possible for Faster Training

In [None]:
from torch import cuda
# Default to the CPU and switch to the GPU only when PyTorch can see one.
device = 'cpu'
if cuda.is_available():
    device = 'cuda'
print(device)

cuda


# Import the Required Libraries

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
# T5 tokenizer (leverages SentencePiece and Unicode normalization).
# Change the checkpoint to 't5-base' or 't5-large' for potentially better results.
tokenizer = T5Tokenizer.from_pretrained(
    't5-small',
)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


# Download SIOP Title Abstract Data to the Colab Session

In [None]:
!gdown --id 1XjxrldwDhTpcRqYPl8XcK09graZW43Jq

Downloading...
From: https://drive.google.com/uc?id=1XjxrldwDhTpcRqYPl8XcK09graZW43Jq
To: /content/Copy of PastSIOPprograms.xlsx
100% 1.40M/1.40M [00:00<00:00, 172MB/s]


# Load in SIOP Data into a Dataframe

In [None]:
import pandas as pd
# Read the downloaded workbook and keep only the two columns used for training.
df = (
    pd.read_excel("Copy of PastSIOPprograms.xlsx")
    .loc[:, ["title", "abstract"]]
)
df.head()

Unnamed: 0,title,abstract
0,PREDICTING ONLINE COURSE-TAKING BEHAVIOR: THE ...,Our purpose is to understand why learners choo...
1,DETERMINANTS OF MOTIVATION TO LEARN IN ALTERNA...,This naturally occurring quasi-experiment exam...
2,ENHANCING E-LEARNING EFFECTIVENESS THROUGH LEA...,This study examines the role of two learner en...
3,DISCRIMINANT VALIDITY,In this study we examined the structure and di...
4,"THE RELATION BETWEEN PRACTICE EFFECTS, SCALE P...",This study examines the efficacy of test and t...


# Create Function to Preprocess Text

In [None]:
import re
# Clean Text
def preprocessText(text):
    """Clean one abstract before the ``summarize: `` prefix is attached.

    Steps, in order: keep only the text after the last ``--``, drop
    parenthesised spans, drop double quotes, replace ``?`` with a space,
    put a space after every ``.`` and collapse whitespace runs.

    Args:
        text: Raw abstract. ``None`` and NaN (an empty spreadsheet cell as
            loaded by pandas) are treated as an empty abstract; any other
            non-string value is converted with ``str``.

    Returns:
        The cleaned, single-spaced string (``""`` for missing input).
    """
    # Missing cells arrive as None/NaN; NaN is the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Keep only what follows the last "--" separator.
    text = text.split("--")[-1].strip()
    # Remove content in parentheses, e.g. "(Study 1)".
    text = re.sub(r'\([^)]*\)', '', text)
    # Remove double quotes.
    text = text.replace('"', '')
    text = text.replace("?", " ")
    # Guarantee a space after every period. Note this also splits decimals ("0.5" -> "0. 5").
    text = text.replace(".", ". ")
    # Collapse whitespace runs; split() also trims both ends.
    return " ".join(text.split())

# Creating Training Examples

In [None]:
# Targets are the titles; inputs are the cleaned abstracts carrying the "summarize: " prefix.
summaries = df['title']
articles = ['summarize: ' + preprocessText(abstract) for abstract in df['abstract']]

# Create a Function to Load the Dataset

In [None]:
# Creating a custom dataset for reading the dataframe and loading it into the dataloader to pass it to the neural network at a later stage for finetuning the model and to prepare it for predictions

class CustomDataset(Dataset):
    """Map-style dataset pairing each source text with its target text, tokenized on access.

    Args:
        articles: Source texts (model inputs), one per example.
        summaries: Target texts (titles), aligned one-to-one with ``articles``.
        tokenizer: Callable tokenizer accepting a list of strings plus
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``,
            and returning ``input_ids`` and ``attention_mask``.
        source_len: Length every source sequence is padded/truncated to.
        summ_len: Length every target sequence is padded/truncated to.

    Raises:
        ValueError: If ``articles`` and ``summaries`` differ in length.
    """

    def __init__(self, articles,summaries, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.source_len = source_len
        self.summ_len = summ_len
        # Materialise as lists so __getitem__ indexes by position. A pandas Series
        # indexes by label and fails once its index is no longer 0..n-1.
        self.text = list(summaries)
        self.ctext = list(articles)
        if len(self.text) != len(self.ctext):
            raise ValueError(
                "articles and summaries must have the same length, got "
                + str(len(self.ctext)) + " and " + str(len(self.text))
            )

    def __len__(self):
        return len(self.text)

    def _encode(self, text, max_length):
        """Whitespace-normalise ``text`` and tokenize it to exactly ``max_length`` positions."""
        text = ' '.join(str(text).split())
        return self.tokenizer([text], max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')

    def __getitem__(self, index):
        """Return the tokenized source/target pair at position ``index`` as 1-D long tensors."""
        source = self._encode(self.ctext[index], self.source_len)
        target = self._encode(self.text[index], self.summ_len)

        # The tokenizer returns a batch of one; take row 0 rather than squeeze(),
        # which would also drop the sequence axis when max_length is 1.
        target_ids = target['input_ids'][0].to(dtype=torch.long)

        return {
            'source_ids': source['input_ids'][0].to(dtype=torch.long),
            'source_mask': source['attention_mask'][0].to(dtype=torch.long),
            'target_ids': target_ids,
            # Kept for backward compatibility: same values as 'target_ids'.
            'target_ids_y': target_ids
        }

In [None]:
# Tokenized lengths for inputs/targets and the number of examples per batch.
MAX_ARTICLE_LEN = 512
MAX_SUMMARY_LEN = 32
TRAIN_BATCH_SIZE = 4

training_set = CustomDataset(
    articles=articles,
    summaries=summaries,
    tokenizer=tokenizer,
    source_len=MAX_ARTICLE_LEN,
    summ_len=MAX_SUMMARY_LEN,
)

train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)

# Single shuffled DataLoader over the training set; it feeds the training function later on.
training_loader = DataLoader(training_set, **train_params)

# Create a Training Function

In [None]:
# Creating the training function. It is called once per epoch from the main training loop.
# The model is put into train mode, then we enumerate over the training loader and pass each batch to the model.

def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one pass over ``loader``, taking an optimizer step per batch.

    Args:
        epoch: Index of the current epoch (not used inside the function).
        tokenizer: Supplies ``pad_token_id`` so padded positions can be excluded from the loss.
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``;
            the first element of its output is used as the loss.
        device: Device the batch tensors are moved to.
        loader: Sized iterable of batches with keys ``'source_ids'``,
            ``'source_mask'`` and ``'target_ids'``.
        optimizer: Optimizer over the model's parameters.

    Returns:
        None. Progress is printed every 100 batches.
    """
    model.train()
    total_steps = len(loader)
    for step, batch in enumerate(loader):
        ids = batch['source_ids'].to(device, dtype=torch.long)
        mask = batch['source_mask'].to(device, dtype=torch.long)

        # Clone before masking: .to() can return the batch tensor itself, and the
        # in-place write below must not alter the caller's data.
        labels = batch['target_ids'].to(device, dtype=torch.long).clone()
        # -100 marks padded positions as ignored by the loss.
        labels[labels == tokenizer.pad_token_id] = -100

        # Pass the FULL target as labels and no decoder_input_ids: T5 derives the decoder
        # inputs by shifting the labels right behind its start token. Shifting by hand as
        # well would drop the start token and never train the first target token.
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % 100 == 0:
            print('Completed ' + str(step) + " of " + str(total_steps))

# Create a Summarization Function

In [None]:
def getSummary(text):
    """Generate a title for ``text`` using the notebook-level ``model`` and ``tokenizer``.

    Args:
        text: Abstract to generate a title for; the "summarize: " prefix is added here.

    Returns:
        The decoded title with special tokens (such as ``<pad>``) stripped.
    """
    # Generate in eval mode so dropout is disabled, then restore whichever mode the
    # model was in (it is in train mode when called between training epochs).
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            input_ids = tokenizer("summarize: " + text, return_tensors="pt").input_ids
            input_ids = input_ids.to(device)
            outputs = model.generate(input_ids, do_sample=True, num_beams=3, min_length=4, max_length=20)
    finally:
        model.train(was_training)
    # skip_special_tokens removes the leading decoder-start/pad token and the end marker.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Define the Learning Rate and Number of Training Epochs

In [None]:
# Number of passes over the training data.
TRAIN_EPOCHS = 15
# Step size handed to the optimizer.
LEARNING_RATE = 0.0001

# Load in the Pretrained Model

In [None]:
# Load the pretrained checkpoint and move it to the selected device in one expression.
# Change the checkpoint to "t5-base" or "t5-large" for potentially better results.
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

# Provide a Sample Abstract for Generating Titles During the Training Process

In [None]:
# Sample abstract; a title is generated from it before training and after every epoch.
text="This research investigates how the perceived balance between cognition and emotion in others contributes to fear of other agents – “scary minds”.  Specifically, we explore the hypothesis that a threatening target with an unbalanced mind—asymmetric capacities for cognition (e.g., self-control and reasoning) and emotion (e.g., sensations and emotions)—is associated with greater fear of that target.  In four studies, targets with a perceived imbalance between capacities for cognition and emotion were rated as more frightening than those with relatively more matched levels of cognition and emotion.  These effects were observed using samples of scary monsters (Studies 1 & 2), scary animals (Studies 2 & 3),  and infected humans (Study 4).  We also find experimental evidence that imbalanced targets are perceived as less controllable, which mediates the effect of imbalance on fear.  These results suggest that the perceived degree of mismatch between cognition and experience predicts fear among scary targets, and highlight mind perception as an important part of the appraisal process of emotion."

In [None]:
# Adam tunes every weight of the network at the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Title produced for the sample abstract before any fine-tuning.
summ = getSummary(text)
print(summ)

# Fine-tune, regenerating the sample title after each epoch to watch progress.
for epoch in range(TRAIN_EPOCHS):
    print("Epoch: ", epoch)
    train(
        epoch=epoch,
        tokenizer=tokenizer,
        model=model,
        device=device,
        loader=training_loader,
        optimizer=optimizer,
    )
    summ = getSummary(text)
    print(summ)

<pad> I-O psychologists can translate a single input text into a single input text to
Epoch:  0
Completed 0 of 2583
Completed 100 of 2583


KeyboardInterrupt: ignored

# Save the Trained Model

In [None]:
# Write the fine-tuned model into a local directory.
model.save_pretrained(save_directory="title_generation_model")

# Load the Trained Model




In [None]:
# Reload the model from the saved directory and place it on the selected device.
model = T5ForConditionalGeneration.from_pretrained("title_generation_model").to(device)

# Try out Your Fine-Tuned Model

In [None]:
def getSummary(text):
    """Generate a title for ``text`` using the notebook-level ``model`` and ``tokenizer``.

    Args:
        text: Abstract to generate a title for; the "summarize: " prefix is added here.

    Returns:
        The decoded title with special tokens (such as ``<pad>``) stripped.
    """
    # Generate in eval mode so dropout is disabled, then restore the model's previous mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            input_ids = tokenizer("summarize: " + text, return_tensors="pt").input_ids
            input_ids = input_ids.to(device)
            outputs = model.generate(input_ids, do_sample=True, num_beams=3, min_length=4, max_length=20)
    finally:
        model.train(was_training)
    # skip_special_tokens removes the leading decoder-start/pad token and the end marker.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Abstract that is passed to getSummary in the final cell.
text="Attendees learn how natural language processing algorithms can translate a single input text into a variety of suggestions to help I-O psychologists address creative research tasks. Attendees will (a) understand how abstractive summarization models generally work, (b) learn how abstractive summarization models can be applied to tasks requiring creativity, (c) train a model to suggest SIOP titles from abstracts, and (d) clarify questions/misconceptions about using neural network text analysis models."

In [None]:
# Generate a title for the abstract in `text`; as the cell's last expression, the result is displayed.
getSummary(text)