### **Capstone Project: SkimLit**

In [3]:
# DL needs
import tensorflow as tf
import keras as kr

# Data needs
import pandas as pd
from sklearn.model_selection import train_test_split

# Numerical computation needs
import numpy as np

# plotting needs
import matplotlib.pyplot as plt
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# ensuring reproducibility
# NOTE(review): only TensorFlow's global seed is set in this cell; NumPy and
# Python's `random` module are not seeded here.
random_seed=42
tf.random.set_seed(random_seed)

import sys
# Make the directory holding the local helper module importable.
# NOTE(review): this is a hard-coded absolute path, so the import below depends
# on this machine's directory layout — consider a relative/configurable path.
sys.path.append('/home/rudraksha14/Desktop/RAY_RISE_ABOVE_YOURSELF/Programming/tensorflow/')

import important_functionalities as impf

**19. Creating an end-to-end pipeline that takes an abstract as input and outputs the classified text**

In [6]:
import re
import tensorflow_hub as hub


@kr.saving.register_keras_serializable(package="my_custom_package")
class UniversalEncodedLayer(tf.keras.layers.Layer):
    """Keras layer that wraps a non-trainable Universal Sentence Encoder hub layer.

    The class is registered with Keras under the ``my_custom_package`` package
    name via the decorator above.
    """

    def __init__(self, **kwargs):
        """Pass ``kwargs`` to the base ``Layer`` and create the wrapped hub layer."""
        super().__init__(**kwargs)
        # Keyword options for the hub layer, gathered in one place.
        hub_layer_options = dict(
            input_shape=[],
            dtype=tf.string,
            trainable=False,  # default=False,
            name='USE',
        )
        self.use_layer = hub.KerasLayer(
            "https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/universal-sentence-encoder/2",
            **hub_layer_options,
        )

    def call(self, inputs):
        """Run ``inputs`` through the wrapped hub layer and return its output."""
        encoded = self.use_layer(inputs)
        return encoded

class SkimLit:
    """End-to-end pipeline: read an abstract, label each sentence, group by label.

    The pipeline splits an abstract file into sentence-like lines, builds the
    four model inputs (sentences, character sequences, one-hot line numbers and
    one-hot total line count), predicts one class per line and assembles a
    report in which the lines are grouped under their predicted section.
    """

    # Order in which sections appear in the report. This is the reading order of
    # an abstract and deliberately differs from `class_names`, whose order is
    # used to map predicted indices to labels.
    _SECTION_ORDER = ('BACKGROUND', 'OBJECTIVE', 'METHODS', 'RESULTS', 'CONCLUSIONS')

    def __init__(self, model_file='models/best_model.keras'):
        """Load the saved Keras model.

        Args:
            model_file: Path of the saved model passed to
                ``tf.keras.models.load_model``.
        """
        # Index i of the model's argmax output is reported as class_names[i].
        self.class_names = ['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS']
        # NOTE(review): the key 'USE' is the name given to the inner hub layer,
        # not the name of UniversalEncodedLayer itself; loading presumably
        # relies on the class being registered via register_keras_serializable
        # — confirm before changing this mapping.
        self.model = tf.keras.models.load_model(
            model_file,
            custom_objects={'USE': UniversalEncodedLayer},
        )

    def split_chars(self, text):
        """Return ``text`` with a single space inserted between every character."""
        return " ".join(text)

    def replace_numbers_and_strip(self, text):
        """Replace each number in ``text`` with ``'@'`` and strip outer whitespace.

        A number is a run of digits optionally followed by a dot and more
        digits, so ``'10.5'`` becomes a single ``'@'``.
        """
        return re.sub(r'\d+(\.\d+)?', '@', text).strip()

    def get_abstract_lines(self, filename):
        """Read ``filename`` and split its content into sentence-like lines.

        A split happens at whitespace that is preceded by ``'.'`` and followed
        by an uppercase ASCII letter. Segments that are empty or contain only
        whitespace are dropped, so an empty file yields ``[]``.

        Args:
            filename: Path of the text file holding the abstract (read as UTF-8).

        Returns:
            List of line strings (not stripped of surrounding whitespace).
        """
        # Explicit encoding so the result does not depend on the platform default.
        with open(filename, 'r', encoding='utf-8') as file:
            data = file.read()

        # New line whenever a '.' is followed by whitespace and an uppercase letter.
        segments = re.split(r'(?<=[.])\s+(?=[A-Z])', data)
        return [segment for segment in segments if segment.strip()]

    def preprocess(self, lines, seq_len_line_nums=15, seq_len_total_lines=20):
        """Turn raw lines into the four tensors fed to the model.

        Args:
            lines: List of line strings.
            seq_len_line_nums: Depth of the one-hot encoding of the line index.
            seq_len_total_lines: Depth of the one-hot encoding of the line count.

        Returns:
            Tuple ``(sentences, chars, line_numbers, total_lines)``: two string
            tensors with one entry per line and two float32 one-hot tensors
            with one row per line.
        """
        cleaned = [self.replace_numbers_and_strip(line) for line in lines]
        num_lines = len(cleaned)

        test_sentences = tf.constant(cleaned, dtype=tf.string)
        test_chars = tf.constant([self.split_chars(line) for line in cleaned], dtype=tf.string)

        # One vectorised one-hot call per feature; indices >= depth produce an
        # all-zero row (tf.one_hot behaviour), exactly as with per-line calls.
        test_line_numbers = tf.one_hot(tf.range(num_lines), depth=seq_len_line_nums, dtype=tf.float32)
        # NOTE(review): the line count itself (len(lines)) is encoded here —
        # confirm this matches how total_lines was encoded for training.
        test_total_lines = tf.one_hot(tf.fill([num_lines], num_lines), depth=seq_len_total_lines, dtype=tf.float32)
        return test_sentences, test_chars, test_line_numbers, test_total_lines

    def classify(self, test_input):
        """Predict with the loaded model and return the argmax index per row."""
        predictions = self.model.predict(test_input)
        pred_labels = tf.argmax(predictions, axis=1)
        return pred_labels

    def skim_abstract(self, filename, output_file='output.txt'):
        """Classify every line of an abstract and build the grouped report.

        The report is printed, written to ``output_file`` and returned.
        Sections with no lines are omitted. If the file contains no lines the
        model is not called and the report is the empty string.

        Args:
            filename: Path of the text file holding the abstract.
            output_file: Path the report is written to (UTF-8). Pass ``None``
                to skip writing. Defaults to ``'output.txt'``.

        Returns:
            The report string.
        """
        lines = self.get_abstract_lines(filename)
        grouped = {class_name: [] for class_name in self.class_names}

        if lines:
            labels = self.classify(self.preprocess(lines))
            for line_num, line in enumerate(lines):
                # int() accepts both a scalar tensor and a plain Python int.
                label = self.class_names[int(labels[line_num])]
                grouped[label].append(line.strip())

        output = ""
        for section in self._SECTION_ORDER:
            if grouped[section]:
                # Every line is followed by one space, including the last one.
                body = "".join(part + ' ' for part in grouped[section])
                output += f"{section}: {body}\n\n"

        print(output)

        if output_file is not None:
            with open(output_file, 'w', encoding='utf-8') as file:
                file.write(output)

        return output

# Build the pipeline with its default model path and skim 'abstract.txt';
# skim_abstract prints the grouped abstract and writes it to 'output.txt'.
sk = SkimLit()
output = sk.skim_abstract('abstract.txt')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 542ms/step
BACKGROUND: Preclinical and clinical studies show that the GABA(B) receptor agonist baclofen may represent a pharmacotherapy for alcohol dependence (AD). However, the mechanisms by which baclofen affects drinking are not well characterized; thus this pilot study investigated possible baclofen's biobehavioral mechanisms. 

METHODS: The design was a double-blind controlled randomized human laboratory pilot study. Fourteen non-treatment seeking alcohol-dependent heavy drinking subjects received either baclofen 10mg t.i.d. or an active placebo (cyproheptadine 2mg t.i.d., to control for sedation) for a 7-day period. At day 8, participants performed an alcohol cue-reactivity (CR) followed by an alcohol self-administration (ASA). 

RESULTS: The main results were a significant effect of baclofen for increasing stimulation (p=.001) and sedation (p<.01). Furthermore, when drinking during the ASA and the 2 days before was an

***-- END --***