In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.1-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 5.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 56.1 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 44.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.6 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyy

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Dataset

import numpy as np
import pandas as pd
import tensorflow as tf

import json

from spacy.lang.en import English

In [None]:
# Example abstracts that are loaded into a DataFrame and used as prediction inputs further down
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/skimlit_example_abstracts.json
# Module providing the Tokenizer and LabelEncoder classes imported below
!wget https://raw.githubusercontent.com/vishalrk1/pytorch/main/Pytorch_Helper.py

# General helper module; of the names imported below only unzip_data is called in the visible part of this notebook
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

# importing helper function
from helper_functions import create_tensorboard_callback, plot_loss_curves, pred_and_plot, unzip_data, walk_through_dir
from Pytorch_Helper import Tokenizer, LabelEncoder

--2022-01-30 15:54:05--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/skimlit_example_abstracts.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6737 (6.6K) [text/plain]
Saving to: ‘skimlit_example_abstracts.json’


2022-01-30 15:54:05 (56.7 MB/s) - ‘skimlit_example_abstracts.json’ saved [6737/6737]

--2022-01-30 15:54:05--  https://raw.githubusercontent.com/vishalrk1/pytorch/main/Pytorch_Helper.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11016 (11K) [text/plain]
Saving to: ‘Pytorch_H

In [None]:
# Restore the saved tokenizer and label encoder from their JSON files.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — no mount call is
# visible in this part of the notebook; confirm before running on a fresh runtime.
tokenizer = Tokenizer.load(fp='/content/drive/MyDrive/Datasets/SkimLit/skimlit-pytorch-1/tokenizer.json')
label_encoder = LabelEncoder.load(fp='/content/drive/MyDrive/Datasets/SkimLit/skimlit-pytorch-1/label_encoder.json')

In [None]:
# Downloading glove embeddings files
!wget http://nlp.stanford.edu/data/glove.6B.zip
unzip_data('/content/glove.6B.zip')

def load_glove_embeddings(embeddings_file):
    """Load word vectors from a whitespace-separated text file.

    Each non-empty line is expected to hold a token followed by its vector
    components.

    Args:
        embeddings_file: path of the embeddings text file.

    Returns:
        dict mapping each token to a 1-D float32 numpy array.
    """
    embeddings = {}
    # Explicit encoding so the result does not depend on the platform default.
    with open(embeddings_file, "r", encoding="utf-8") as fp:
        for line in fp:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

def make_embeddings_matrix(embeddings, word_index, embedding_dim):
    """Build the weight matrix for an Embedding layer.

    Row ``i`` holds the vector of the token that ``word_index`` maps to ``i``;
    tokens without an entry in ``embeddings`` keep an all-zero row.
    """
    matrix = np.zeros((len(word_index), embedding_dim))
    for token, row in word_index.items():
        vector = embeddings.get(token)
        if vector is None:
            continue
        matrix[row] = vector
    return matrix

# Width of the GloVe vectors; also selects which glove.6B.<dim>d.txt file is read below
EMBEDDING_DIM = 300
# Not used in this cell
HIDDEN_DIM = 128

# Create embeddings
embeddings_file = '/content/glove.6B.{0}d.txt'.format(EMBEDDING_DIM)
glove_embeddings = load_glove_embeddings(embeddings_file=embeddings_file)

# One row per entry of the tokenizer's token -> index mapping
embedding_matrix = make_embeddings_matrix(
    embeddings=glove_embeddings, word_index=tokenizer.token_to_index,
    embedding_dim=EMBEDDING_DIM)

print (f"<Embeddings(words={embedding_matrix.shape[0]}, dim={embedding_matrix.shape[1]})>")

In [None]:
def gather_last_relevant_hidden(hiddens, seq_lens):
    """Pick, for every batch row, the hidden state at position ``seq_len - 1``.

    ``hiddens`` is indexed as ``hiddens[batch_index, time_index]`` and
    ``seq_lens`` supplies one length per batch row; the selected states are
    stacked along a new leading dimension.
    """
    last_positions = seq_lens.long().detach().cpu().numpy() - 1
    selected = [hiddens[row, position] for row, position in enumerate(last_positions)]
    return torch.stack(selected)

In [None]:
class SkimlitModel(nn.Module):
    """Sentence classifier combining a BiLSTM text encoder with positional features.

    The text branch embeds the token ids, runs a bidirectional LSTM and takes
    the output at the last non-padded position. The one-hot line number and
    one-hot total-lines vectors each go through their own linear layer. The
    three branches are concatenated, passed through dropout and projected to
    ``num_classes`` logits.

    Args:
        embedding_dim: width of the token embeddings.
        vocab_size: number of rows in the embedding table.
        hidden_dim: LSTM hidden size per direction.
        n_layers: number of stacked LSTM layers.
        linear_output: output width of the text branch's linear layer.
        num_classes: number of output logits.
        pretrained_embeddings: optional numpy array of shape
            ``(vocab_size, embedding_dim)`` used as the initial embedding weights.
        padding_idx: token index treated as padding by the embedding layer.
        line_num_depth: width of the one-hot line-number input (default 20).
        total_lines_depth: width of the one-hot total-lines input (default 24).
    """

    def __init__(self, embedding_dim, vocab_size, hidden_dim, n_layers, linear_output, num_classes, pretrained_embeddings=None, padding_idx=0, line_num_depth=20, total_lines_depth=24):
        super(SkimlitModel, self).__init__()

        # Initializing embeddings. padding_idx is now honoured in both branches;
        # previously it was dropped when no pretrained weights were supplied.
        if pretrained_embeddings is None:
            self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=padding_idx)
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, _weight=pretrained_embeddings, padding_idx=padding_idx)

        # LSTM layers
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, batch_first=True, bidirectional=True)

        # FC layers (the LSTM is bidirectional, hence 2 * hidden_dim inputs)
        self.fc_text = nn.Linear(2*hidden_dim, linear_output)

        self.fc_line_num = nn.Linear(line_num_depth, 64)
        self.fc_total_line = nn.Linear(total_lines_depth, 64)

        self.fc_final = nn.Linear((64+64+linear_output), num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, inputs):
        """Return class logits for a batch.

        Args:
            inputs: ``(token_ids, seq_lens, line_nums, total_lines)`` as
                produced by the dataset's ``collate_fn``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding unnormalised logits.
        """
        x_in, seq_lens, line_nums, total_lines = inputs
        x_in = self.embeddings(x_in)

        # RNN outputs; the final (h_n, c_n) states are not needed
        out, _ = self.lstm1(x_in)
        x_1 = gather_last_relevant_hidden(hiddens=out, seq_lens=seq_lens)

        # FC layers output
        x_1 = F.relu(self.fc_text(x_1))
        x_2 = F.relu(self.fc_line_num(line_nums))
        x_3 = F.relu(self.fc_total_line(total_lines))

        x = torch.cat((x_1, x_2, x_3), dim=1)
        x = self.dropout(x)
        x = self.fc_final(x)
        return x

In [None]:
vocab_size = len(tokenizer)
num_classes = len(label_encoder)
print(num_classes)

class_names = label_encoder.class_to_index.keys()
class_names

5


dict_keys(['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS'])

In [None]:
# Reuse EMBEDDING_DIM / HIDDEN_DIM so the architecture stays in sync with the embedding matrix built earlier.
model = SkimlitModel(embedding_dim=EMBEDDING_DIM, vocab_size=vocab_size, hidden_dim=HIDDEN_DIM, n_layers=3, linear_output=128, num_classes=num_classes, pretrained_embeddings=embedding_matrix)

# torch.load unpickles the file, so only point it at checkpoints you trust.
model.load_state_dict(torch.load('/content/drive/MyDrive/Datasets/SkimLit/skimlit-pytorch-1/skimlit-model-final-1.pt', map_location='cpu'))

In [None]:
model

## Preparing Data for predictions

In [None]:
with open("skimlit_example_abstracts.json", "r") as f:
    example_abstracts = json.load(f)

abstracts = pd.DataFrame(example_abstracts)
abstracts.head()

Unnamed: 0,abstract,source,details
0,This RCT examined the efficacy of a manualized...,https://pubmed.ncbi.nlm.nih.gov/20232240/,RCT of a manualized social treatment for high-...
1,Postpartum depression (PPD) is the most preval...,https://pubmed.ncbi.nlm.nih.gov/28012571/,Formatting removed (can be used to compare mod...
2,"Mental illness, including depression, anxiety ...",https://pubmed.ncbi.nlm.nih.gov/28942748/,Effect of nutrition on mental health
3,Hepatitis C virus (HCV) and alcoholic liver di...,https://pubmed.ncbi.nlm.nih.gov/22244707/,Baclofen promotes alcohol abstinence in alcoho...


In [None]:
abstracts.abstract[1]

"Postpartum depression (PPD) is the most prevalent mood disorder associated with childbirth. No single cause of PPD has been identified, however the increased risk of nutritional deficiencies incurred through the high nutritional requirements of pregnancy may play a role in the pathology of depressive symptoms. Three nutritional interventions have drawn particular interest as possible non-invasive and cost-effective prevention and/or treatment strategies for PPD; omega-3 (n-3) long chain polyunsaturated fatty acids (LCPUFA), vitamin D and overall diet. We searched for meta-analyses of randomised controlled trials (RCT's) of nutritional interventions during the perinatal period with PPD as an outcome, and checked for any trials published subsequently to the meta-analyses. Fish oil: Eleven RCT's of prenatal fish oil supplementation RCT's show null and positive effects on PPD symptoms. Vitamin D: no relevant RCT's were identified, however seven observational studies of maternal vitamin D 

In [None]:
# setup English sentence parser
nlp = English()

# create sentence splitting pipeline object
# NOTE(review): create_pipe(...) followed by add_pipe(<component object>) is the spaCy v2
# calling convention; spaCy v3 expects nlp.add_pipe("sentencizer") — confirm the installed version.
sentencizer = nlp.create_pipe("sentencizer")

# add sentence splitting pipeline object to sentence parser
nlp.add_pipe(sentencizer)

# create "doc" of parsed sequences, change index for a different abstract
doc = nlp(abstracts.abstract[1]) 

# return detected sentences from doc in string type (not spaCy token type)
abstract_lines = [str(sent) for sent in list(doc.sents)] 

abstract_lines

['Postpartum depression (PPD) is the most prevalent mood disorder associated with childbirth.',
 'No single cause of PPD has been identified, however the increased risk of nutritional deficiencies incurred through the high nutritional requirements of pregnancy may play a role in the pathology of depressive symptoms.',
 'Three nutritional interventions have drawn particular interest as possible non-invasive and cost-effective prevention and/or treatment strategies for PPD; omega-3 (n-3) long chain polyunsaturated fatty acids (LCPUFA), vitamin D and overall diet.',
 "We searched for meta-analyses of randomised controlled trials (RCT's) of nutritional interventions during the perinatal period with PPD as an outcome, and checked for any trials published subsequently to the meta-analyses.",
 "Fish oil: Eleven RCT's of prenatal fish oil supplementation RCT's show null and positive effects on PPD symptoms.",
 "Vitamin D: no relevant RCT's were identified, however seven observational studies o

In [None]:
# Get total number of lines
total_lines_in_sample = len(abstract_lines)

# Go through each line in abstract and create a list of dictionaries containing features for each line
sample_lines = []
for i, line in enumerate(abstract_lines):
    sample_dict = {}
    sample_dict["text"] = str(line)
    sample_dict["line_number"] = i
    sample_dict["total_lines"] = total_lines_in_sample - 1
    sample_lines.append(sample_dict)

sample_lines

[{'line_number': 0,
  'text': 'Postpartum depression (PPD) is the most prevalent mood disorder associated with childbirth.',
  'total_lines': 9},
 {'line_number': 1,
  'text': 'No single cause of PPD has been identified, however the increased risk of nutritional deficiencies incurred through the high nutritional requirements of pregnancy may play a role in the pathology of depressive symptoms.',
  'total_lines': 9},
 {'line_number': 2,
  'text': 'Three nutritional interventions have drawn particular interest as possible non-invasive and cost-effective prevention and/or treatment strategies for PPD; omega-3 (n-3) long chain polyunsaturated fatty acids (LCPUFA), vitamin D and overall diet.',
  'total_lines': 9},
 {'line_number': 3,
  'text': "We searched for meta-analyses of randomised controlled trials (RCT's) of nutritional interventions during the perinatal period with PPD as an outcome, and checked for any trials published subsequently to the meta-analyses.",
  'total_lines': 9},
 {'

In [None]:
df = pd.DataFrame(sample_lines)
df

Unnamed: 0,text,line_number,total_lines
0,Postpartum depression (PPD) is the most preval...,0,9
1,"No single cause of PPD has been identified, ho...",1,9
2,Three nutritional interventions have drawn par...,2,9
3,We searched for meta-analyses of randomised co...,3,9
4,Fish oil: Eleven RCT's of prenatal fish oil su...,4,9
5,"Vitamin D: no relevant RCT's were identified, ...",5,9
6,Diet: Two Australian RCT's with dietary advice...,6,9
7,"With the exception of fish oil, few RCT's with...",7,9
8,Further research is needed to determine whethe...,8,9
9,Given the prevalence of PPD and ease of admini...,9,9


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

nltk.download("stopwords")
STOPWORDS = stopwords.words("english")
porter = PorterStemmer()

def preprocess(text, stopwords=STOPWORDS):
    """Normalise one line of text before tokenisation.

    Steps, in order: lowercase, drop stop words, drop parenthesised spans,
    replace every run of non-alphanumeric characters with a single space,
    collapse repeated spaces and strip.

    Args:
        text: raw input string.
        stopwords: iterable of words to remove; an empty list disables the
            stop-word step.

    Returns:
        The cleaned string.
    """
    # Lower
    text = text.lower()

    # Remove stopwords. The guard matters: with no stop words the pattern would
    # degenerate to r"\b()\b\s*", which deletes the whitespace after every word.
    if stopwords:
        # re.escape keeps regex metacharacters inside a stop word literal.
        alternatives = r"|".join(re.escape(word) for word in stopwords)
        pattern = re.compile(r"\b(" + alternatives + r")\b\s*")
        text = pattern.sub("", text)

    # Remove words in parenthesis
    text = re.sub(r"\([^)]*\)", "", text)

    # Spacing and filters
    text = re.sub(r"([-;.,!?<=>])", r" \1 ", text)
    text = re.sub("[^A-Za-z0-9]+", " ", text) # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()

    return text

In [None]:
df.text = df.text.apply(preprocess)
df

Unnamed: 0,text,line_number,total_lines
0,postpartum depression prevalent mood disorder ...,0,9
1,single cause ppd identified however increased ...,1,9
2,three nutritional interventions drawn particul...,2,9
3,searched meta analyses randomised controlled t...,3,9
4,fish oil eleven rct prenatal fish oil suppleme...,4,9
5,vitamin relevant rct identified however seven ...,5,9
6,diet two australian rct dietary advice interve...,6,9
7,exception fish oil rct nutritional interventio...,7,9
8,research needed determine whether nutritional ...,8,9
9,given prevalence ppd ease administering ppd me...,9,9


In [None]:
text_seq = tokenizer.texts_to_sequences(texts=df['text'])

In [None]:
def pad_sequences(sequences, max_seq_len=0):
    """Right-pad every sequence with zeros to a common width.

    The width is the longest sequence length, or ``max_seq_len`` when that is
    larger. Returns a float numpy array of shape ``(len(sequences), width)``.
    """
    longest = max(len(seq) for seq in sequences)
    width = max(max_seq_len, longest)
    padded = np.zeros((len(sequences), width))
    for row, seq in enumerate(sequences):
        padded[row][:len(seq)] = seq
    return padded

In [None]:
class SkimlitDataset(Dataset):
  """Dataset of tokenised abstract lines plus their positional features.

  Each sample is ``[token_ids, len(token_ids), line_number, total_lines]``.
  The positional values are stored as plain integer indices and are only
  turned into one-hot vectors per batch inside ``collate_fn``.
  """

  def __init__(self, text_seq, line_num, total_line):
    self.text_seq = text_seq
    # Despite the attribute names these hold raw indices, not one-hot vectors.
    self.line_num_one_hot = line_num
    self.total_line_one_hot = total_line

  def __len__(self):
    return len(self.text_seq)

  def __str__(self):
    return f"<Dataset(N={len(self)})>"

  def __getitem__(self, index):
    X = self.text_seq[index]
    line_num = self.line_num_one_hot[index]
    total_line = self.total_line_one_hot[index]
    return [X, len(X), line_num, total_line]

  @staticmethod
  def _one_hot(indices, depth):
    """Return a float32 ``(len(indices), depth)`` one-hot tensor.

    Indices outside ``[0, depth)`` produce an all-zero row, which is the
    behaviour of the ``tf.one_hot`` call this replaces.
    """
    idx = torch.as_tensor(np.asarray(indices, dtype=np.int64))
    encoded = torch.zeros((idx.shape[0], depth), dtype=torch.float32)
    in_range = (idx >= 0) & (idx < depth)
    rows = torch.nonzero(in_range, as_tuple=True)[0]
    encoded[rows, idx[rows]] = 1.0
    return encoded

  def collate_fn(self, batch):
    """Turn a list of samples into model-ready tensors.

    Returns:
      ``(token_ids, seq_lens, line_nums, total_lines)``: int64 padded token
      ids, int64 sequence lengths, and float32 one-hot tensors of width 20
      and 24 respectively.
    """
    # Unzip the samples column-wise. The previous np.array(batch) relied on
    # NumPy building an object array from ragged nested lists, which recent
    # NumPy releases reject.
    text_seq, seq_lens, line_nums, total_lines = zip(*batch)

    # padding inputs to the longest sequence in the batch
    pad_text_seq = pad_sequences(sequences=text_seq)

    # converting line nums / total lines into one-hot encoding
    line_nums = self._one_hot(line_nums, depth=20)
    total_lines = self._one_hot(total_lines, depth=24)

    # converting inputs to tensors
    pad_text_seq = torch.as_tensor(pad_text_seq, dtype=torch.long)
    seq_lens = torch.as_tensor(np.asarray(seq_lens, dtype=np.int64))

    return pad_text_seq, seq_lens, line_nums, total_lines

  def create_dataloader(self, batch_size, shuffle=False, drop_last=False):
    """Wrap this dataset in a DataLoader that batches through ``collate_fn``."""
    # Pinned memory only helps host-to-GPU copies, so request it only when CUDA is present.
    dataloader = DataLoader(dataset=self, batch_size=batch_size, collate_fn=self.collate_fn, shuffle=shuffle, drop_last=drop_last, pin_memory=torch.cuda.is_available())
    return dataloader


In [None]:
dataset = SkimlitDataset(text_seq=text_seq, line_num=df['line_number'], total_line=df['total_lines'])

In [None]:
dataloader = dataset.create_dataloader(batch_size=2)

In [None]:
batch_text_seq, batch_seq_len, batch_line_num, batch_total_line = next(iter(dataloader))
batch_line_num.shape, batch_total_line.shape, batch_line_num 



(torch.Size([2, 20]),
 torch.Size([2, 24]),
 tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0.],
         [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0.]]))

In [None]:
batch_text_seq

tensor([[ 1253,   217,  1854,  1151,   510,    33,  4943,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  142,   503,  4580,   449,    95,    57,    26,  1186,  6554, 10680,
            56,  1186,  1863,   466,    51,  1860,   460,  2884,   567,    98]])

In [None]:
from torch._C import dtype
from tqdm.notebook import tqdm

def model_prediction(model, dataloader):
  """Run the model over every batch and return class probabilities.

  Args:
    model: module called as ``model(batch)`` that returns logits of shape
      ``(batch, classes)``.
    dataloader: iterable yielding the batches to feed to the model.

  Returns:
    numpy array with one row of softmax probabilities per sample.

  Raises:
    ValueError: from ``np.vstack`` when the dataloader yields no batches.
  """
  # Set model to eval mode
  model.eval()
  y_probs = []
  # Inference only: no_grad avoids building the autograd graph for every batch.
  with torch.no_grad():
    for batch in dataloader:
      # Forward pass w/ inputs
      z = model(batch)
      # Store outputs
      y_probs.extend(F.softmax(z, dim=1).cpu().numpy())
  return np.vstack(y_probs)

In [None]:
y_pred = model_prediction(model, dataloader)
y_pred



array([[3.2443214e-02, 5.7085557e-04, 2.1981153e-04, 9.6675080e-01,
        1.5307498e-05],
       [3.0264378e-01, 6.2012380e-01, 6.0995789e-03, 6.1218541e-02,
        9.9143842e-03],
       [3.8561532e-01, 4.3435767e-01, 5.6032207e-02, 8.0500022e-02,
        4.3494876e-02],
       [2.2097282e-02, 3.8963822e-03, 9.3239123e-01, 2.3200387e-02,
        1.8414730e-02],
       [5.7903677e-04, 4.2306773e-02, 7.0190453e-03, 8.4013293e-05,
        9.5001113e-01],
       [2.0131609e-01, 4.4898337e-01, 2.2759380e-02, 7.9234265e-02,
        2.4770696e-01],
       [2.2660716e-02, 9.6102292e-03, 8.9921057e-01, 2.1540524e-02,
        4.6977911e-02],
       [1.4795629e-02, 2.2826396e-01, 3.2464132e-01, 8.8621080e-03,
        4.2343706e-01],
       [1.8425023e-02, 9.7312939e-01, 2.0880327e-03, 4.3930989e-03,
        1.9644513e-03],
       [1.1512598e-01, 4.4469628e-01, 3.3758843e-01, 4.5346171e-02,
        5.7243101e-02]], dtype=float32)

In [None]:
pred = y_pred.argmax(axis=1)
pred = label_encoder.decode(pred)

In [None]:
# Visualize abstract lines and predicted sequence labels
for i, line in enumerate(abstract_lines):
    print(f"{pred[i]}: {line}")

OBJECTIVE: Postpartum depression (PPD) is the most prevalent mood disorder associated with childbirth.
CONCLUSIONS: No single cause of PPD has been identified, however the increased risk of nutritional deficiencies incurred through the high nutritional requirements of pregnancy may play a role in the pathology of depressive symptoms.
CONCLUSIONS: Three nutritional interventions have drawn particular interest as possible non-invasive and cost-effective prevention and/or treatment strategies for PPD; omega-3 (n-3) long chain polyunsaturated fatty acids (LCPUFA), vitamin D and overall diet.
METHODS: We searched for meta-analyses of randomised controlled trials (RCT's) of nutritional interventions during the perinatal period with PPD as an outcome, and checked for any trials published subsequently to the meta-analyses.
RESULTS: Fish oil: Eleven RCT's of prenatal fish oil supplementation RCT's show null and positive effects on PPD symptoms.
CONCLUSIONS: Vitamin D: no relevant RCT's were ide

# Creating the final prediction function

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# nltk.download("stopwords")
# STOPWORDS = stopwords.words("english")
# porter = PorterStemmer()

def download_stopwords():
  """Return the NLTK English stop-word list and a Porter stemmer.

  The stop-word corpus is downloaded only when it is not already installed,
  so repeated calls do not touch the network.

  Returns:
    ``(STOPWORDS, porter)``: list of stop words and a ``PorterStemmer`` instance.
  """
  try:
    STOPWORDS = stopwords.words("english")
  except LookupError:
    # Corpus missing locally: fetch it once, then read it.
    nltk.download("stopwords")
    STOPWORDS = stopwords.words("english")
  porter = PorterStemmer()
  return STOPWORDS, porter

def preprocess(text, stopwords=None):
    """Normalise one line of text before tokenisation.

    Steps, in order: lowercase, drop stop words, drop parenthesised spans,
    replace every run of non-alphanumeric characters with a single space,
    collapse repeated spaces and strip.

    Args:
        text: raw input string.
        stopwords: iterable of words to remove. ``None`` (the default) loads
            the NLTK English list through ``download_stopwords()``; an empty
            list disables the stop-word step.

    Returns:
        The cleaned string.
    """
    # Resolve the default lazily so defining this function does not depend on
    # a STOPWORDS global created by some other cell.
    if stopwords is None:
        stopwords, _ = download_stopwords()

    # Lower
    text = text.lower()

    # Remove stopwords. The guard matters: with no stop words the pattern would
    # degenerate to r"\b()\b\s*", which deletes the whitespace after every word.
    if stopwords:
        # re.escape keeps regex metacharacters inside a stop word literal.
        alternatives = r"|".join(re.escape(word) for word in stopwords)
        pattern = re.compile(r"\b(" + alternatives + r")\b\s*")
        text = pattern.sub("", text)

    # Remove words in parenthesis
    text = re.sub(r"\([^)]*\)", "", text)

    # Spacing and filters
    text = re.sub(r"([-;.,!?<=>])", r" \1 ", text)
    text = re.sub("[^A-Za-z0-9]+", " ", text) # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()

    return text

In [None]:
def load_glove_embeddings(embeddings_file):
    """Load word vectors from a whitespace-separated text file.

    Each non-empty line is expected to hold a token followed by its vector
    components.

    Args:
        embeddings_file: path of the embeddings text file.

    Returns:
        dict mapping each token to a 1-D float32 numpy array.
    """
    embeddings = {}
    # Explicit encoding so the result does not depend on the platform default.
    with open(embeddings_file, "r", encoding="utf-8") as fp:
        for line in fp:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

def make_embeddings_matrix(embeddings, word_index, embedding_dim):
    """Build the weight matrix for an Embedding layer.

    Row ``i`` holds the vector of the token that ``word_index`` maps to ``i``;
    tokens without an entry in ``embeddings`` keep an all-zero row.
    """
    matrix = np.zeros((len(word_index), embedding_dim))
    for token, row in word_index.items():
        vector = embeddings.get(token)
        if vector is None:
            continue
        matrix[row] = vector
    return matrix

def get_embeddings(embedding_file_path, tokenizer, embedding_dim):
  """Load the embeddings file and build the matrix for the tokenizer's vocabulary.

  Args:
    embedding_file_path: path of the embeddings text file to read.
    tokenizer: object exposing a ``token_to_index`` mapping.
    embedding_dim: width of the vectors stored in the file.

  Returns:
    Whatever ``make_embeddings_matrix`` returns for that vocabulary.
  """
  # Read the file named by the argument; the previous version ignored it and
  # used the notebook-level ``embeddings_file`` variable instead.
  glove_embeddings = load_glove_embeddings(embeddings_file=embedding_file_path)
  embedding_matrix = make_embeddings_matrix(embeddings=glove_embeddings, word_index=tokenizer.token_to_index, embedding_dim=embedding_dim)
  return embedding_matrix

In [None]:
from spacy.lang.en import English

def spacy_function(abstract):
  """Split an abstract into sentences with spaCy's rule-based sentencizer.

  Args:
    abstract: the abstract as a single string.

  Returns:
    List of sentence strings in document order.
  """
  # Local import: only needed to read the installed spaCy version.
  import spacy

  # setup English sentence parser
  nlp = English()

  # add the sentence splitting component; the call differs between major versions
  if int(spacy.__version__.split(".")[0]) >= 3:
    # spaCy v3+: built-in components are added by their string name
    nlp.add_pipe("sentencizer")
  else:
    # spaCy v2: create the component object first, then add it
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

  # create "doc" of parsed sequences
  doc = nlp(abstract)

  # return detected sentences from doc in string type (not spaCy token type)
  abstract_lines = [str(sent) for sent in doc.sents]

  return abstract_lines
    
# ---------------------------------------------------------------------------------------------------------------------------

def model_prediction(model, dataloader):
  """Run the model over every batch and return class probabilities.

  Args:
    model: module called as ``model(batch)`` that returns logits of shape
      ``(batch, classes)``.
    dataloader: iterable yielding the batches to feed to the model.

  Returns:
    numpy array with one row of softmax probabilities per sample.

  Raises:
    ValueError: from ``np.vstack`` when the dataloader yields no batches.
  """
  # Set model to eval mode
  model.eval()
  y_probs = []
  # Inference only: no_grad avoids building the autograd graph for every batch.
  with torch.no_grad():
    for batch in dataloader:
      # Forward pass w/ inputs
      z = model(batch)
      # Store outputs
      y_probs.extend(F.softmax(z, dim=1).cpu().numpy())
  return np.vstack(y_probs)

# ---------------------------------------------------------------------------------------------------------------------------

def make_predictions(text, embeding_path, model_path, tokenizer, label_encoder):
  """Label every sentence of an abstract with the trained Skimlit model.

  Args:
    text: the abstract as a single string.
    embeding_path: path of the embeddings text file used to build the embedding matrix.
    model_path: path of the saved model ``state_dict``.
    tokenizer: object providing ``texts_to_sequences`` and ``__len__``.
    label_encoder: object providing ``decode`` and ``__len__``.

  Returns:
    ``(abstract_lines, pred)``: the detected sentences and the decoded label
    for each of them. Both are empty when no sentence is detected.
  """
  # getting all lines separated from abstract
  abstract_lines = spacy_function(text)

  # Nothing to classify; building the DataFrame below would fail on an empty list.
  if not abstract_lines:
    return abstract_lines, []

  # "total_lines" is stored as the index of the last line (count minus one)
  last_line_index = len(abstract_lines) - 1

  # one feature dictionary per line of the abstract
  sample_lines = [
    {"text": str(line), "line_number": i, "total_lines": last_line_index}
    for i, line in enumerate(abstract_lines)
  ]

  # converting sample line list into pandas Dataframe
  df = pd.DataFrame(sample_lines)

  # getting stopwords (the stemmer returned alongside is not used here)
  STOPWORDS, _ = download_stopwords()

  # applying preprocessing function to lines
  df.text = df.text.apply(lambda x: preprocess(x, STOPWORDS))

  # converting texts into numerical sequences
  text_seq = tokenizer.texts_to_sequences(texts=df['text'])

  # creating Dataset and dataloader
  dataset = SkimlitDataset(text_seq=text_seq, line_num=df['line_number'], total_line=df['total_lines'])
  dataloader = dataset.create_dataloader(batch_size=2)

  # Preparing embeddings
  embedding_matrix = get_embeddings(embeding_path, tokenizer, 300)

  # creating model
  model = SkimlitModel(embedding_dim=300, vocab_size=len(tokenizer), hidden_dim=128, n_layers=3, linear_output=128, num_classes=len(label_encoder), pretrained_embeddings=embedding_matrix)

  # loading model weights from the path the caller passed in (previously a
  # hard-coded path was used and model_path was ignored).
  # torch.load unpickles the file, so only point it at checkpoints you trust.
  model.load_state_dict(torch.load(model_path, map_location='cpu'))

  # setting model into evaluation mode
  model.eval()

  # getting predictions
  y_pred = model_prediction(model, dataloader)

  # converting predictions into label class
  pred = label_encoder.decode(y_pred.argmax(axis=1))

  return abstract_lines, pred

# Test.1

In [None]:
tokenizer = Tokenizer.load(fp='/content/drive/MyDrive/Datasets/SkimLit/skimlit-pytorch-1/tokenizer.json')
label_encoder = LabelEncoder.load(fp='/content/drive/MyDrive/Datasets/SkimLit/skimlit-pytorch-1/label_encoder.json')

abstract_lines, pred = make_predictions(
    abstracts.abstract[1], 
    '/content/glove.6B.300d.txt', 
    '/content/drive/MyDrive/Datasets/SkimLit/skimlit-pytorch-1/skimlit-model-final-1.pt', 
    tokenizer, 
    label_encoder,
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [None]:
# Visualize abstract lines and predicted sequence labels
for i, line in enumerate(abstract_lines):
    print(f"{pred[i]}: {line}")

OBJECTIVE: Postpartum depression (PPD) is the most prevalent mood disorder associated with childbirth.
CONCLUSIONS: No single cause of PPD has been identified, however the increased risk of nutritional deficiencies incurred through the high nutritional requirements of pregnancy may play a role in the pathology of depressive symptoms.
CONCLUSIONS: Three nutritional interventions have drawn particular interest as possible non-invasive and cost-effective prevention and/or treatment strategies for PPD; omega-3 (n-3) long chain polyunsaturated fatty acids (LCPUFA), vitamin D and overall diet.
METHODS: We searched for meta-analyses of randomised controlled trials (RCT's) of nutritional interventions during the perinatal period with PPD as an outcome, and checked for any trials published subsequently to the meta-analyses.
RESULTS: Fish oil: Eleven RCT's of prenatal fish oil supplementation RCT's show null and positive effects on PPD symptoms.
CONCLUSIONS: Vitamin D: no relevant RCT's were ide

# Test.2

In [None]:
abstract_lines, pred = make_predictions(
    abstracts.abstract[0], 
    '/content/glove.6B.300d.txt', 
    '/content/drive/MyDrive/Datasets/SkimLit/skimlit-pytorch-1/skimlit-model-final-1.pt', 
    tokenizer, 
    label_encoder,
)

print(abstracts.abstract[0])

This RCT examined the efficacy of a manualized social intervention for children with HFASDs. Participants were randomly assigned to treatment or wait-list conditions. Treatment included instruction and therapeutic activities targeting social skills, face-emotion recognition, interest expansion, and interpretation of non-literal language. A response-cost program was applied to reduce problem behaviors and foster skills acquisition. Significant treatment effects were found for five of seven primary outcome measures (parent ratings and direct child measures). Secondary measures based on staff ratings (treatment group only) corroborated gains reported by parents. High levels of parent, child and staff satisfaction were reported, along with high levels of treatment fidelity. Standardized effect size estimates were primarily in the medium and large ranges and favored the treatment group.




In [None]:
# Visualize abstract lines and predicted sequence labels
for i, line in enumerate(abstract_lines):
    print(f"{pred[i]}: {line}")

METHODS: This RCT examined the efficacy of a manualized social intervention for children with HFASDs.
METHODS: Participants were randomly assigned to treatment or wait-list conditions.
METHODS: Treatment included instruction and therapeutic activities targeting social skills, face-emotion recognition, interest expansion, and interpretation of non-literal language.
BACKGROUND: A response-cost program was applied to reduce problem behaviors and foster skills acquisition.
RESULTS: Significant treatment effects were found for five of seven primary outcome measures (parent ratings and direct child measures).
METHODS: Secondary measures based on staff ratings (treatment group only) corroborated gains reported by parents.
BACKGROUND: High levels of parent, child and staff satisfaction were reported, along with high levels of treatment fidelity.
RESULTS: Standardized effect size estimates were primarily in the medium and large ranges and favored the treatment group.


# Test.3

In [None]:
abstract_lines, pred = make_predictions(
    abstracts.abstract[2], 
    '/content/glove.6B.300d.txt', 
    '/content/drive/MyDrive/Datasets/SkimLit/skimlit-pytorch-1/skimlit-model-final-1.pt', 
    tokenizer, 
    label_encoder,
)

abstracts.abstract[2]



'Mental illness, including depression, anxiety and bipolar disorder, accounts for a significant proportion of global disability and poses a substantial social, economic and heath burden. Treatment is presently dominated by pharmacotherapy, such as antidepressants, and psychotherapy, such as cognitive behavioural therapy; however, such treatments avert less than half of the disease burden, suggesting that additional strategies are needed to prevent and treat mental disorders. There are now consistent mechanistic, observational and interventional data to suggest diet quality may be a modifiable risk factor for mental illness. This review provides an overview of the nutritional psychiatry field. It includes a discussion of the neurobiological mechanisms likely modulated by diet, the use of dietary and nutraceutical interventions in mental disorders, and recommendations for further research. Potential biological pathways related to mental disorders include inflammation, oxidative stress, t

In [None]:
# Visualize abstract lines and predicted sequence labels
for i, line in enumerate(abstract_lines):
    print(f"{pred[i]}: {line}")

BACKGROUND: Mental illness, including depression, anxiety and bipolar disorder, accounts for a significant proportion of global disability and poses a substantial social, economic and heath burden.
BACKGROUND: Treatment is presently dominated by pharmacotherapy, such as antidepressants, and psychotherapy, such as cognitive behavioural therapy; however, such treatments avert less than half of the disease burden, suggesting that additional strategies are needed to prevent and treat mental disorders.
CONCLUSIONS: There are now consistent mechanistic, observational and interventional data to suggest diet quality may be a modifiable risk factor for mental illness.
OBJECTIVE: This review provides an overview of the nutritional psychiatry field.
BACKGROUND: It includes a discussion of the neurobiological mechanisms likely modulated by diet, the use of dietary and nutraceutical interventions in mental disorders, and recommendations for further research.
CONCLUSIONS: Potential biological pathwa

# Transformers Model Predictions

In [None]:
from transformers import BertModel, BertTokenizerFast

In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [None]:
class SkimlitDataset(Dataset):
  """Abstract sentences together with their positional features.

  Each item is ``[text, line_number, total_lines]``. ``collate_fn`` turns a list
  of items into the ``(tokenized_text, line_nums, total_lines)`` triple that is
  unpacked by ``model_prediction``.

  Args:
    text_seq: sequence of sentence strings (list or pandas Series).
    line_num: sequence of integer line positions, aligned with ``text_seq``.
    total_lines: sequence of integer line totals, aligned with ``text_seq``.
    tokenizer: callable invoked as
      ``tokenizer(list_of_str, return_tensors='pt', max_length=..., padding='max_length', truncation=True)``.
    line_num_depth: width of the one-hot encoding of ``line_num``. The default
      (20) equals the input width of ``SkimlitModel.fc_line_num``.
    total_lines_depth: width of the one-hot encoding of ``total_lines``. The
      default (24) equals the input width of ``SkimlitModel.fc_total_line``.
    max_length: token length every sentence is padded/truncated to.
  """

  def __init__(self, text_seq, line_num, total_lines, tokenizer,
               line_num_depth=20, total_lines_depth=24, max_length=128):
    self.text_seq = text_seq
    self.line_num = line_num
    self.total_lines = total_lines
    self.tokenizer = tokenizer
    self.line_num_depth = line_num_depth
    self.total_lines_depth = total_lines_depth
    self.max_length = max_length

  def __len__(self):
    return len(self.text_seq)

  def __str__(self):
    return f"<Dataset(N={len(self)})>"

  @staticmethod
  def _item_at(seq, index):
    """Return the element at *position* ``index``.

    ``[]`` on a pandas Series is label-based, so a Series whose index is not
    0..N-1 (e.g. after filtering) would raise or return the wrong row for the
    positional indices a DataLoader sampler produces.
    """
    return seq.iloc[index] if hasattr(seq, 'iloc') else seq[index]

  @staticmethod
  def _one_hot(indices, depth):
    """One-hot encode integer ``indices`` into a float32 ``(len(indices), depth)`` tensor.

    Indices outside ``[0, depth)`` produce an all-zero row (the behavior of
    ``tf.one_hot``, which this replaces) instead of raising.
    """
    idx = torch.as_tensor([int(i) for i in indices], dtype=torch.long)
    encoded = torch.zeros((idx.shape[0], depth), dtype=torch.float32)
    in_range = (idx >= 0) & (idx < depth)
    rows = torch.nonzero(in_range, as_tuple=True)[0]
    encoded[rows, idx[rows]] = 1.0
    return encoded

  def __getitem__(self, index):
    text = self._item_at(self.text_seq, index)
    line_num = self._item_at(self.line_num, index)
    total_lines = self._item_at(self.total_lines, index)

    return [text, line_num, total_lines]

  def collate_fn(self, batch):
    """Turn a list of ``[text, line_num, total_lines]`` items into model inputs.

    Returns:
      ``(tokenized_text, line_nums, total_lines)`` where ``tokenized_text`` is
      whatever the tokenizer returns and the other two are float32 one-hot
      tensors of width ``line_num_depth`` / ``total_lines_depth``.
    """
    # Split the columns without going through np.array: a mixed str/int array
    # would coerce the integer features to strings.
    texts, line_nums, total_lines = zip(*batch)

    # converting line nums / total lines into one-hot encodings
    line_nums = self._one_hot(line_nums, self.line_num_depth)
    total_lines = self._one_hot(total_lines, self.total_lines_depth)

    # tokenizing text inputs (every sentence padded/truncated to max_length)
    tokenized_text = self.tokenizer(
      [str(text) for text in texts],
      return_tensors='pt',
      max_length=self.max_length,
      padding='max_length',
      truncation=True,
    )

    return tokenized_text, line_nums, total_lines

  def create_dataloader(self, batch_size, shuffle=False, drop_last=False):
    """Build a DataLoader over this dataset that batches with ``collate_fn``."""
    return DataLoader(
      dataset=self,
      batch_size=batch_size,
      collate_fn=self.collate_fn,
      shuffle=shuffle,
      drop_last=drop_last,
      # Pinned host memory only helps host-to-GPU copies; skip it on CPU-only machines
      pin_memory=torch.cuda.is_available(),
    )

In [None]:
# Inspect the per-sentence feature frame (columns: text, line_number, total_lines)
df

Unnamed: 0,text,line_number,total_lines
0,Postpartum depression (PPD) is the most preval...,0,9
1,"No single cause of PPD has been identified, ho...",1,9
2,Three nutritional interventions have drawn par...,2,9
3,We searched for meta-analyses of randomised co...,3,9
4,Fish oil: Eleven RCT's of prenatal fish oil su...,4,9
5,"Vitamin D: no relevant RCT's were identified, ...",5,9
6,Diet: Two Australian RCT's with dietary advice...,6,9
7,"With the exception of fish oil, few RCT's with...",7,9
8,Further research is needed to determine whethe...,8,9
9,Given the prevalence of PPD and ease of admini...,9,9


In [None]:
# Wrap the feature columns of df in a SkimlitDataset; tokenization happens later, per batch, in collate_fn
dataset = SkimlitDataset(text_seq=df['text'], line_num=df['line_number'], total_lines=df['total_lines'], tokenizer=tokenizer)

In [None]:
# Two sentences per batch; shuffle and drop_last keep their False defaults
dataloader = dataset.create_dataloader(batch_size=2)

In [None]:
# Pull the first batch: the (tokenized_text, line_nums, total_lines) tuple built by collate_fn
batch = next(iter(dataloader))

In [None]:
# Display the batch to eyeball the tokenizer output and the two one-hot feature tensors
batch

# Model

In [None]:
class SkimlitModel(nn.Module):
  """Sentence classifier that fuses a BERT encoding with positional features.

  The pooled output of 'bert-base-cased' and the two positional inputs
  (widths 20 and 24) are each projected by their own linear layer,
  concatenated, and passed through a hidden layer to a ``num_classes``-way
  linear classifier. ``forward`` returns the raw classifier output; no softmax
  is applied here.

  Args:
    num_classes: number of output classes.
    freeze_bert: when True, every BERT parameter gets ``requires_grad = False``.
  """

  def __init__(self, num_classes, freeze_bert=True):
    super().__init__()

    # Pretrained BERT backbone
    self.base_model = BertModel.from_pretrained('bert-base-cased')

    # Optionally exclude the backbone weights from gradient computation
    if freeze_bert:
      for weight in self.base_model.parameters():
        weight.requires_grad = False

    text_width, position_width = 512, 64

    # One projection per input branch
    self.fc_text = nn.Linear(768, text_width)

    self.fc_line_num = nn.Linear(20, position_width)
    self.fc_total_line = nn.Linear(24, position_width)

    # Head applied to the concatenated branches
    self.fc_final = nn.Linear(position_width + position_width + text_width, 128)
    self.classifier = nn.Linear(128, num_classes)
    self.dropout = nn.Dropout(0.3)

  def forward(self, text_inputs, line_nums, total_lines):
    """Compute class scores for a batch.

    Args:
      text_inputs: mapping of keyword arguments forwarded to the BERT model.
      line_nums: tensor whose last dimension is 20 (input of ``fc_line_num``).
      total_lines: tensor whose last dimension is 24 (input of ``fc_total_line``).
    """
    # Pooled sentence representation from BERT
    pooled = self.base_model(**text_inputs)['pooler_output']

    # Project each branch separately
    text_branch = F.relu(self.fc_text(pooled))
    line_branch = F.relu(self.fc_line_num(line_nums))
    total_branch = F.relu(self.fc_total_line(total_lines))

    # Fuse along the feature dimension, then classify
    fused = self.dropout(torch.cat((text_branch, line_branch, total_branch), dim=1))
    hidden = self.dropout(F.relu(self.fc_final(fused)))
    return self.classifier(hidden)

In [None]:
# Build the classifier with the BERT backbone frozen. `num_classes` is not defined in this
# section; it has to come from an earlier cell.
model = SkimlitModel(num_classes=num_classes, freeze_bert=True)
model

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SkimlitModel(
  (base_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [None]:
# Restore the trained weights, mapping all tensors to the CPU.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load('/content/drive/MyDrive/Datasets/SkimLit/skimlit-pytorch-transformers-1.pt', map_location='cpu'))

<All keys matched successfully>

In [None]:
from torch._C import dtype
from tqdm.notebook import tqdm

def model_prediction(model, dataloader):
  """Run inference over ``dataloader`` and return class probabilities.

  Args:
    model: module called as ``model(inputs, line_nums, total_lines)``; its
      output is soft-maxed over dimension 1.
    dataloader: iterable yielding ``(text_seq, line_nums, total_lines)``
      batches, where ``text_seq`` supports ``['input_ids']`` and
      ``['attention_mask']``.

  Returns:
    numpy array of softmax probabilities, one row per sample, stacked in the
    order the dataloader yields them.

  Raises:
    ValueError: if the dataloader yields no batches (``np.vstack`` of an empty list).
  """
  # Set model to eval mode
  model.eval()
  y_probs = []

  # Inference only: skip autograd bookkeeping so no graph is kept for the trainable layers
  with torch.no_grad():
    for batch in dataloader:
      text_seq, line_nums, total_lines = batch

      # Only these two tokenizer outputs are forwarded to the model
      inputs = {'input_ids': text_seq['input_ids'], 'attention_mask': text_seq['attention_mask']}

      z = model(inputs, line_nums, total_lines)

      # Store per-sample probability rows
      y_probs.extend(F.softmax(z, dim=1).cpu().numpy())

  return np.vstack(y_probs)

In [None]:
# Softmax class probabilities for every sentence served by the dataloader (one row per sentence)
y_pred = model_prediction(model, dataloader)

In [None]:
# Most probable class index per sentence, then map the indices back to label names for display
pred = y_pred.argmax(axis=1)
label_encoder.decode(pred)

['BACKGROUND',
 'BACKGROUND',
 'BACKGROUND',
 'METHODS',
 'METHODS',
 'RESULTS',
 'RESULTS',
 'CONCLUSIONS',
 'CONCLUSIONS',
 'CONCLUSIONS']

In [None]:
from spacy.lang.en import English

def spacy_function(abstract):
  """Split an abstract into sentences with spaCy's sentencizer component.

  Args:
    abstract: the abstract as a single string.

  Returns:
    list of str, one entry per detected sentence, in document order.
  """
  # setup English sentence parser
  nlp = English()

  # add the sentence splitting component to the parser.
  # spaCy v3 registers built-in components by name; spaCy v2 rejects a string
  # with ValueError and needs the component object built by create_pipe.
  try:
    nlp.add_pipe("sentencizer")
  except ValueError:
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

  # create "doc" of parsed sequences
  doc = nlp(abstract)

  # return detected sentences from doc in string type (not spaCy token type)
  return [str(sent) for sent in doc.sents]
    
# ---------------------------------------------------------------------------------------------------------------------------

def model_prediction(model, dataloader):
  """Run inference over ``dataloader`` and return class probabilities.

  This cell redefines ``model_prediction``, shadowing the definition from the
  earlier cell.

  Args:
    model: module called as ``model(inputs, line_nums, total_lines)``; its
      output is soft-maxed over dimension 1.
    dataloader: iterable yielding ``(text_seq, line_nums, total_lines)``
      batches, where ``text_seq`` supports ``['input_ids']`` and
      ``['attention_mask']``.

  Returns:
    numpy array of softmax probabilities, one row per sample, stacked in the
    order the dataloader yields them.

  Raises:
    ValueError: if the dataloader yields no batches (``np.vstack`` of an empty list).
  """
  # Set model to eval mode
  model.eval()
  y_probs = []

  # Inference only: skip autograd bookkeeping so no graph is kept for the trainable layers
  with torch.no_grad():
    for batch in dataloader:
      text_seq, line_nums, total_lines = batch

      # Only these two tokenizer outputs are forwarded to the model
      inputs = {'input_ids': text_seq['input_ids'], 'attention_mask': text_seq['attention_mask']}

      z = model(inputs, line_nums, total_lines)

      # Store per-sample probability rows
      y_probs.extend(F.softmax(z, dim=1).cpu().numpy())

  return np.vstack(y_probs)

# ---------------------------------------------------------------------------------------------------------------------------

def make_predictions(text, model_path, tokenizer, label_encoder, batch_size=2):
  """Label every sentence of an abstract with its predicted section.

  Args:
    text: the abstract as a single string.
    model_path: checkpoint file passed to ``torch.load``; its state dict is
      loaded into a freshly built ``SkimlitModel``.
    tokenizer: tokenizer handed to ``SkimlitDataset``.
    label_encoder: object supporting ``len()`` (number of classes) and
      ``decode(indices)``.
    batch_size: number of sentences per inference batch (default 2).

  Returns:
    ``(abstract_lines, pred)``: the sentences and the result of
    ``label_encoder.decode`` on the arg-max class indices. For text with no
    sentences both elements are empty lists and no model is loaded.
  """
  # getting all lines separated from abstract
  abstract_lines = spacy_function(text)

  # Nothing to classify: an empty frame would have no 'text' column, so return
  # before building the dataset or loading the checkpoint.
  if not abstract_lines:
    return abstract_lines, []

  # "total_lines" holds the index of the last line (line count minus one)
  last_line_index = len(abstract_lines) - 1

  # One feature record per line of the abstract
  sample_lines = [
    {"text": str(line), "line_number": i, "total_lines": last_line_index}
    for i, line in enumerate(abstract_lines)
  ]

  # converting sample line list into pandas Dataframe
  df = pd.DataFrame(sample_lines)

  # creating Dataset
  dataset = SkimlitDataset(text_seq=df['text'], line_num=df['line_number'], total_lines=df['total_lines'], tokenizer=tokenizer)

  # creating dataloader
  dataloader = dataset.create_dataloader(batch_size=batch_size)

  # creating model
  model = SkimlitModel(num_classes=len(label_encoder))

  # loading model weight
  # NOTE: torch.load unpickles the file; only pass checkpoints from a trusted source.
  model.load_state_dict(torch.load(model_path, map_location='cpu'))

  # setting model into evaluation mode
  model.eval()

  # getting predictions
  y_pred = model_prediction(model, dataloader)

  # converting predictions into label class
  pred = label_encoder.decode(y_pred.argmax(axis=1))

  return abstract_lines, pred

In [None]:
# Run the full pipeline (sentence split -> features -> model -> labels) on the abstract at
# index 2, loading the weights from the Drive checkpoint.

abstract_lines, pred = make_predictions(
    abstracts.abstract[2],
    '/content/drive/MyDrive/Datasets/SkimLit/skimlit-pytorch-transformers-1.pt', 
    tokenizer, 
    label_encoder,
)

# Echo the raw abstract so it can be compared with the labelled lines
abstracts.abstract[2]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


'Mental illness, including depression, anxiety and bipolar disorder, accounts for a significant proportion of global disability and poses a substantial social, economic and heath burden. Treatment is presently dominated by pharmacotherapy, such as antidepressants, and psychotherapy, such as cognitive behavioural therapy; however, such treatments avert less than half of the disease burden, suggesting that additional strategies are needed to prevent and treat mental disorders. There are now consistent mechanistic, observational and interventional data to suggest diet quality may be a modifiable risk factor for mental illness. This review provides an overview of the nutritional psychiatry field. It includes a discussion of the neurobiological mechanisms likely modulated by diet, the use of dietary and nutraceutical interventions in mental disorders, and recommendations for further research. Potential biological pathways related to mental disorders include inflammation, oxidative stress, t

In [None]:
# Print every sentence of the abstract prefixed with its predicted section label
for position, sentence in enumerate(abstract_lines):
    print(f"{pred[position]}: {sentence}")

BACKGROUND: Mental illness, including depression, anxiety and bipolar disorder, accounts for a significant proportion of global disability and poses a substantial social, economic and heath burden.
BACKGROUND: Treatment is presently dominated by pharmacotherapy, such as antidepressants, and psychotherapy, such as cognitive behavioural therapy; however, such treatments avert less than half of the disease burden, suggesting that additional strategies are needed to prevent and treat mental disorders.
BACKGROUND: There are now consistent mechanistic, observational and interventional data to suggest diet quality may be a modifiable risk factor for mental illness.
OBJECTIVE: This review provides an overview of the nutritional psychiatry field.
OBJECTIVE: It includes a discussion of the neurobiological mechanisms likely modulated by diet, the use of dietary and nutraceutical interventions in mental disorders, and recommendations for further research.
CONCLUSIONS: Potential biological pathways