In [1]:
import tensorflow as tf
import tensorflow_hub as hub

In [2]:
import tensorflow as tf
import tensorflow_text as text
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

## BERT Model

* Model with Bert embedding, character and positional embedding (https://github.com/vishalrk1/SkimLit/blob/main/Skimlit_BERT_Model.ipynb)

In [3]:
load_option = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')

In [5]:
skimlit_bert_model = tf.keras.models.load_model('Models/Skimlit_BertModel', options=load_option)

In [None]:
skimlit_bert_model.summary()

## Simple Model

* Model with custom token embedding, character embedding and positional embedding (https://github.com/vishalrk1/SkimLit/blob/main/skimlit_20K_final.ipynb)

In [20]:
skimlit_model = tf.keras.models.load_model('skimlit_tribrid_model')

In [21]:
skimlit_model.summary()

Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_inputs (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
char_vectorizer (TextVectorizat (None, None)         0           char_inputs[0][0]                
__________________________________________________________________________________________________
token_inputs (InputLayer)       [(None,)]            0                                            
__________________________________________________________________________________________________
char_embed (Embedding)          (None, None, 25)     1750        char_vectorizer[0][0]            
____________________________________________________________________________________________

## Making Predictions with Model

* Split it into sentences (lines).
* split into characters
* find number of each line
* find number of total lines

In [22]:
import json
import pandas as pd

In [23]:
with open("skimlit_example_abstracts.json", "r") as f:
    example_abstracts = json.load(f)

In [24]:
abstracts = pd.DataFrame(example_abstracts)
abstracts.head()

Unnamed: 0,abstract,source,details
0,This RCT examined the efficacy of a manualized...,https://pubmed.ncbi.nlm.nih.gov/20232240/,RCT of a manualized social treatment for high-...
1,Postpartum depression (PPD) is the most preval...,https://pubmed.ncbi.nlm.nih.gov/28012571/,Formatting removed (can be used to compare mod...
2,"Mental illness, including depression, anxiety ...",https://pubmed.ncbi.nlm.nih.gov/28942748/,Effect of nutrition on mental health
3,Hepatitis C virus (HCV) and alcoholic liver di...,https://pubmed.ncbi.nlm.nih.gov/22244707/,Baclofen promotes alcohol abstinence in alcoho...


In [25]:
from spacy.lang.en import English

# setup English sentence parser
nlp = English()

# create sentence splitting pipeline object
sentencizer = nlp.create_pipe("sentencizer")

# add sentence splitting pipeline object to sentence parser
nlp.add_pipe('sentencizer')

# create "doc" of parsed sequences, change index for a different abstract
doc = nlp(example_abstracts[0]["abstract"]) 

# return detected sentences from doc in string type (not spaCy token type)
abstract_lines = [str(sent) for sent in list(doc.sents)] 

abstract_lines

['This RCT examined the efficacy of a manualized social intervention for children with HFASDs.',
 'Participants were randomly assigned to treatment or wait-list conditions.',
 'Treatment included instruction and therapeutic activities targeting social skills, face-emotion recognition, interest expansion, and interpretation of non-literal language.',
 'A response-cost program was applied to reduce problem behaviors and foster skills acquisition.',
 'Significant treatment effects were found for five of seven primary outcome measures (parent ratings and direct child measures).',
 'Secondary measures based on staff ratings (treatment group only) corroborated gains reported by parents.',
 'High levels of parent, child and staff satisfaction were reported, along with high levels of treatment fidelity.',
 'Standardized effect size estimates were primarily in the medium and large ranges and favored the treatment group.']

In [26]:
# Get total number of lines
total_lines_in_sample = len(abstract_lines)

# Go through each line in abstract and create a list of dictionaries containing features for each line
sample_lines = []
for i, line in enumerate(abstract_lines):
    sample_dict = {}
    sample_dict["text"] = str(line)
    sample_dict["line_number"] = i
    sample_dict["total_lines"] = total_lines_in_sample - 1
    sample_lines.append(sample_dict)

sample_lines

[{'text': 'This RCT examined the efficacy of a manualized social intervention for children with HFASDs.',
  'line_number': 0,
  'total_lines': 7},
 {'text': 'Participants were randomly assigned to treatment or wait-list conditions.',
  'line_number': 1,
  'total_lines': 7},
 {'text': 'Treatment included instruction and therapeutic activities targeting social skills, face-emotion recognition, interest expansion, and interpretation of non-literal language.',
  'line_number': 2,
  'total_lines': 7},
 {'text': 'A response-cost program was applied to reduce problem behaviors and foster skills acquisition.',
  'line_number': 3,
  'total_lines': 7},
 {'text': 'Significant treatment effects were found for five of seven primary outcome measures (parent ratings and direct child measures).',
  'line_number': 4,
  'total_lines': 7},
 {'text': 'Secondary measures based on staff ratings (treatment group only) corroborated gains reported by parents.',
  'line_number': 5,
  'total_lines': 7},
 {'text'

In [27]:
# Get all line_number values from sample abstract
test_abstract_line_numbers = [line["line_number"] for line in sample_lines]

# One-hot encode to same depth as training data, so model accepts right input shape
test_abstract_line_numbers_one_hot = tf.one_hot(test_abstract_line_numbers, depth=15) 
test_abstract_line_numbers_one_hot

<tf.Tensor: shape=(8, 15), dtype=float32, numpy=
array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]],
      dtype=float32)>

In [28]:
# Get all total_lines values from sample abstract
test_abstract_total_lines = [line["total_lines"] for line in sample_lines]
# One-hot encode to same depth as training data, so model accepts right input shape
test_abstract_total_lines_one_hot = tf.one_hot(test_abstract_total_lines, depth=20)
test_abstract_total_lines_one_hot

<tf.Tensor: shape=(8, 20), dtype=float32, numpy=
array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]], dtype=float32)>

In [29]:
def split_chars(text):
    return ' '.join(list(text))

In [30]:
# Split abstract lines into characters
abstract_chars = [split_chars(sentence) for sentence in abstract_lines]

In [31]:
# Make predictions on sample abstract features
test_abstract_pred_probs = skimlit_model.predict(x=(test_abstract_line_numbers_one_hot,
                                                   test_abstract_total_lines_one_hot,
                                                   tf.constant(abstract_lines),
                                                   tf.constant(abstract_chars)))
test_abstract_pred_probs

array([[0.24005985, 0.12260367, 0.02247092, 0.57987505, 0.03499049],
       [0.05617492, 0.02073206, 0.75388527, 0.07419971, 0.09500796],
       [0.12344755, 0.06010269, 0.45917484, 0.2546255 , 0.10264946],
       [0.0825512 , 0.14257629, 0.5514248 , 0.07865825, 0.14478949],
       [0.06460248, 0.1236739 , 0.3602884 , 0.05729885, 0.3941363 ],
       [0.0306172 , 0.12271825, 0.5668775 , 0.0439826 , 0.2358045 ],
       [0.02591554, 0.17573084, 0.0667953 , 0.02152464, 0.7100337 ],
       [0.02250943, 0.1462235 , 0.19884332, 0.03152658, 0.6008972 ]],
      dtype=float32)

In [32]:
test_abstract_preds = tf.argmax(test_abstract_pred_probs, axis=1)
test_abstract_preds

<tf.Tensor: shape=(8,), dtype=int64, numpy=array([3, 2, 2, 2, 4, 2, 4, 4], dtype=int64)>

In [33]:
classes = ['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS']

In [34]:
# Turn prediction class integers into string class names
test_abstract_pred_classes = [classes[i] for i in test_abstract_preds]
test_abstract_pred_classes

['OBJECTIVE',
 'METHODS',
 'METHODS',
 'METHODS',
 'RESULTS',
 'METHODS',
 'RESULTS',
 'RESULTS']

In [35]:
# Visualize abstract lines and predicted sequence labels
for i, line in enumerate(abstract_lines):
    print(f"{test_abstract_pred_classes[i]}: {line}")

OBJECTIVE: This RCT examined the efficacy of a manualized social intervention for children with HFASDs.
METHODS: Participants were randomly assigned to treatment or wait-list conditions.
METHODS: Treatment included instruction and therapeutic activities targeting social skills, face-emotion recognition, interest expansion, and interpretation of non-literal language.
METHODS: A response-cost program was applied to reduce problem behaviors and foster skills acquisition.
RESULTS: Significant treatment effects were found for five of seven primary outcome measures (parent ratings and direct child measures).
METHODS: Secondary measures based on staff ratings (treatment group only) corroborated gains reported by parents.
RESULTS: High levels of parent, child and staff satisfaction were reported, along with high levels of treatment fidelity.
RESULTS: Standardized effect size estimates were primarily in the medium and large ranges and favored the treatment group.


# Final Function for predictions

* Putting all the above steps in one function

In [36]:
from spacy.lang.en import English

def spacy_function(abstract):
    
    # setup English sentence parser
    nlp = English()

    # create sentence splitting pipeline object
    sentencizer = nlp.create_pipe("sentencizer")

    # add sentence splitting pipeline object to sentence parser
    nlp.add_pipe('sentencizer')
    
    # create "doc" of parsed sequences, change index for a different abstract
    doc = nlp(abstract) 

    # return detected sentences from doc in string type (not spaCy token type)
    abstract_lines = [str(sent) for sent in list(doc.sents)]
    
    return abstract_lines
    
# ---------------------------------------------------------------------------------------------------------------------------

def split_chars(text):
    return ' '.join(list(text))

# ---------------------------------------------------------------------------------------------------------------------------

def make_predictions(text):
    
    classes = ['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS']
    abstract_lines = list()
    
    abstract_lines = spacy_function(text)
    
    
    # Get total number of lines
    total_lines_in_sample = len(abstract_lines)

    # Go through each line in abstract and create a list of dictionaries containing features for each line
    sample_lines = []
    for i, line in enumerate(abstract_lines):
        sample_dict = {}
        sample_dict["text"] = str(line)
        sample_dict["line_number"] = i
        sample_dict["total_lines"] = total_lines_in_sample - 1
        sample_lines.append(sample_dict)
    
    # Get all line_number values from sample abstract
    test_abstract_line_numbers = [line["line_number"] for line in sample_lines]

    # One-hot encode to same depth as training data, so model accepts right input shape
    test_abstract_line_numbers_one_hot = tf.one_hot(test_abstract_line_numbers, depth=15)
    
    # Get all total_lines values from sample abstract
    test_abstract_total_lines = [line["total_lines"] for line in sample_lines]
    
    # One-hot encode to same depth as training data, so model accepts right input shape
    test_abstract_total_lines_one_hot = tf.one_hot(test_abstract_total_lines, depth=20)
    
    # Split abstract lines into characters
    abstract_chars = [split_chars(sentence) for sentence in abstract_lines]
    
    # Make predictions on sample abstract features
    test_abstract_pred_probs = skimlit_bert_model.predict(x=(test_abstract_line_numbers_one_hot,
                                                       test_abstract_total_lines_one_hot,
                                                       tf.constant(abstract_lines),
                                                       tf.constant(abstract_chars)))
    
    test_abstract_preds = tf.argmax(test_abstract_pred_probs, axis=1)
    
    test_abstract_pred_classes = [classes[i] for i in test_abstract_preds]

    return (test_abstract_pred_classes, abstract_lines)

In [37]:
example_abstracts[1]['abstract']

"Postpartum depression (PPD) is the most prevalent mood disorder associated with childbirth. No single cause of PPD has been identified, however the increased risk of nutritional deficiencies incurred through the high nutritional requirements of pregnancy may play a role in the pathology of depressive symptoms. Three nutritional interventions have drawn particular interest as possible non-invasive and cost-effective prevention and/or treatment strategies for PPD; omega-3 (n-3) long chain polyunsaturated fatty acids (LCPUFA), vitamin D and overall diet. We searched for meta-analyses of randomised controlled trials (RCT's) of nutritional interventions during the perinatal period with PPD as an outcome, and checked for any trials published subsequently to the meta-analyses. Fish oil: Eleven RCT's of prenatal fish oil supplementation RCT's show null and positive effects on PPD symptoms. Vitamin D: no relevant RCT's were identified, however seven observational studies of maternal vitamin D 

In [47]:
text = example_abstracts[3]['abstract']
pred, lines = make_predictions(text)

In [48]:
pred

['BACKGROUND',
 'BACKGROUND',
 'BACKGROUND',
 'BACKGROUND',
 'OBJECTIVE',
 'METHODS',
 'METHODS',
 'METHODS',
 'RESULTS',
 'RESULTS',
 'CONCLUSIONS',
 'CONCLUSIONS']

In [49]:
for i, line in enumerate(abstract_lines):
    print(f"{pred[i]}: {line}")

BACKGROUND: This RCT examined the efficacy of a manualized social intervention for children with HFASDs.
BACKGROUND: Participants were randomly assigned to treatment or wait-list conditions.
BACKGROUND: Treatment included instruction and therapeutic activities targeting social skills, face-emotion recognition, interest expansion, and interpretation of non-literal language.
BACKGROUND: A response-cost program was applied to reduce problem behaviors and foster skills acquisition.
OBJECTIVE: Significant treatment effects were found for five of seven primary outcome measures (parent ratings and direct child measures).
METHODS: Secondary measures based on staff ratings (treatment group only) corroborated gains reported by parents.
METHODS: High levels of parent, child and staff satisfaction were reported, along with high levels of treatment fidelity.
METHODS: Standardized effect size estimates were primarily in the medium and large ranges and favored the treatment group.


In [50]:
objective = ''
background = ''
method = ''
conclusion = ''
result = ''
for i, line in enumerate(lines):
    if pred[i] == 'OBJECTIVE':
        objective = objective + line
        
    elif pred[i] == 'BACKGROUND':
        background = background + line
        
    elif pred[i] == 'METHODS':
        method = method + line
        
    elif pred[i] == 'RESULTS':
        result = result + line
        
    elif pred[i] == 'CONCLUSIONS':
        conclusion = conclusion + line


In [51]:
objective

"The goal of this post-hoc analysis was to explore baclofen's effect in a subgroup of alcohol-dependent HCV-infected cirrhotic patients."

In [52]:
background

'Hepatitis C virus (HCV) and alcoholic liver disease (ALD), either alone or in combination, count for more than two thirds of all liver diseases in the Western world.There is no safe level of drinking in HCV-infected patients and the most effective goal for these patients is total abstinence.Baclofen, a GABA(B) receptor agonist, represents a promising pharmacotherapy for alcohol dependence (AD).Previously, we performed a randomized clinical trial (RCT), which demonstrated the safety and efficacy of baclofen in patients affected by AD and cirrhosis.'

In [53]:
method

'Any patient with HCV infection was selected for this analysis.Among the 84 subjects randomized in the main trial, 24 alcohol-dependent cirrhotic patients had a HCV infection; 12 received baclofen 10mg t.i.d.and 12 received placebo for 12-weeks.'

In [54]:
result

'With respect to the placebo group (3/12, 25.0%), a significantly higher number of patients who achieved and maintained total alcohol abstinence was found in the baclofen group (10/12, 83.3%; p=0.0123).Furthermore, in the baclofen group, compared to placebo, there was a significantly higher increase in albumin values from baseline (p=0.0132) and a trend toward a significant reduction in INR levels from baseline (p=0.0716).'

In [55]:
conclusion

'In conclusion, baclofen was safe and significantly more effective than placebo in promoting alcohol abstinence, and improving some Liver Function Tests (LFTs) (i.e. albumin, INR) in alcohol-dependent HCV-infected cirrhotic patients.Baclofen may represent a clinically relevant alcohol pharmacotherapy for these patients.'