In [1]:
import numpy as np # Linear algebra
# Data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
from pandas import DataFrame
import xml.etree.ElementTree as ET # Reading xml files
# For plotting
import matplotlib.pyplot as plt
import pydot
import pydotplus
import graphviz
from keras.utils.vis_utils import plot_model
from keras.utils import plot_model
from sklearn.manifold import TSNE
# For Modelling
import tensorflow as tf
from tensorflow.keras import layers, models, preprocessing, callbacks, optimizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, Input, Add
from tensorflow.keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from tensorflow.keras.layers import concatenate
from keras.metrics import categorical_accuracy
# For Pre-processing
import string
from string import digits
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import re
# Other useful modules
import h5py
from statistics import mode
import os
import datetime
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [4]:
def csv_row(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    nct_text = ""
    sum_text = ""
    model_text = ""
    ph_text = ""
    title_text = ""
    start_date = ""
    completion_date = ""
    # Only iterates through Phase 2 and 3 studies
    for ph in root.iter('phase'):
        ph_text = ph.text
        if (ph_text == "Phase 2" or ph_text == "Phase 3" or ph_text == "Phase 1"):
            #This bit finds all roots with nct_id which is a sub_root to id_info
            for nct in root.findall('id_info'):
                nctId_text = nct.find('nct_id').text
                nct_text =nctId_text
            # This bit finds the brief summary text
            for s in root.findall('brief_summary'):
                summary_text = s.find('textblock').text
                sum_text= summary_text
                sum_text = sum_text.replace('\r\n', '') # Replaces newline with a whitespace
                sum_text = re.sub(' +',' ',sum_text) # Compresses multiple whitespaces to only one
                #print("Summary Text:", sum_text)
            # Get's the official title for the study
            for t in root.iter('brief_title'):
                title_text = t.text
            # This get's the type of intervention_model
            for y in root.iter('intervention_model'):
                model_text = y.text
            if(root.find('start_date') == None):
                return None 
            for s in root.iter('start_date'):
                start_date = s.text
            if(root.find('primary_completion_date') == None):
                return None    
            for c in root.iter('primary_completion_date'):
                completion_date = c.text
    total_text = "\"" + nct_text + "\"" + ";" + "\"" + sum_text + "\"" + ";"  + "\"" + title_text + "\"" + ";"  +  "\"" + model_text + "\""+ ";"  + "\"" + start_date + "\""+ ";"  + "\"" + completion_date + "\""
    # This functions returns a text with Nct_Id, brief_summary, title and type of intervention model on the form we intended
    return total_text
print(csv_row("Downloads\\search_result (4)\\NCT00005606.xml"))

"NCT00005606";"
 RATIONALE: Peripheral blood lymphocyte therapy may be effective in the treatment and prevention of Epstein-Barr virus infection following transplantation. PURPOSE: Phase II trial to study the effectiveness of peripheral blood lymphocyte therapy in treating and preventing lymphoproliferative disorders in patients who have Epstein-Barr virus infection following transplantation. ";"Peripheral Blood Lymphocyte Therapy to Prevent Lymphoproliferative Disorders Caused by Epstein-Barr Virus in Patients Who Have Undergone Transplantation";"";"February 2000";"September 2003"


In [5]:
rdir = "Downloads\\search_result (4)"# Folders in directory where the all the xml files are placed
with open('train_data.csv', 'w', encoding="utf-8") as csvfile: 
    for root, dirs, files in os.walk(rdir):
        for filename in files:
            name = os.path.join(root, filename)
            data = csv_row(name)
            if(data):
                csvfile.write(data) #Writes total_text into a row in to train_data.csv
                csvfile.write("\n") # Skips to next line and do the same

In [23]:
# Earlier we saw that the returned text from our function was seperated by ';', so we use this as seperator when reading in the files
df = pd.read_csv("train_data.csv", sep=';', header=None,error_bad_lines=False, warn_bad_lines=False)
# Give the data sets appropiate column names
df.columns =  ['Nct_id', 'Summary', 'Title','Model','Start Date','End Date']
# We drop all the observations containing NaN's (missing values)
train = df.dropna()
duration = []
monthToNum = {"January": 1, "February":2, "March":3, "April":4, "May":5, "June":6, "July":7, "August":8, "September":9, "October":10, "November":11, "December":12}
for i,row in train.iterrows():
    if("\r\n" in row['Summary']):
        row['Summary'] = row['Summary'].replace("\r\n", '')

    sd = row['Start Date'].split(' ')
    if(len(sd) == 2):
        sdl = monthToNum[sd[0]]*30 + 365*int(sd[1]) + 1
    else:
        sdl = monthToNum[sd[0]]*30 + int(sd[1].replace(",", "")) + int(sd[2])*365
    ed = row['End Date'].split(' ')
    if(len(ed) == 2):
        edl = monthToNum[ed[0]]*30 + 365*int(ed[1]) + 1
    else:
        edl = monthToNum[ed[0]]*30 + int(ed[1].replace(",", "")) + int(ed[2])*365
    duration.append(edl - sdl)
train.insert(4,"Duration", duration)

print(train)

           Nct_id                                            Summary  \
0     NCT00000479   The purpose of this study is to evaluate the ...   
1     NCT00001277   Observational Phase: Patients whose parathyro...   
4     NCT00001566   This is a single arm study. The tumor specime...   
5     NCT00001637   Diseases such as leukemia, lymphoma, and mult...   
6     NCT00001806   In 1997, the Genetics Department of the NCI M...   
...           ...                                                ...   
9967  NCT05502458   To investigate the effects of perioperative a...   
9969  NCT05553808   This study is a sub-study of the master proto...   
9970  NCT05591456   Accelerated hypofractionated 1 week post-mast...   
9971  NCT05614518   The goal of this clinical trial was to assess...   
9972  NCT05622357   The goal of this clinical trial is to investi...   

                                                  Title  \
0     Women's Health Study (WHS): A Randomized Trial...   
1              St

In [24]:
#one hot encoding for the duration results
print(Y_train.quantile([0.25,0.5,0.75]))
short = []
mediumShort = []
mediumLong = []
long = []

for i,row in train.iterrows():
    duration = row['Duration']
    if(duration < 794):
        short.append(1)
        mediumShort.append(0)
        mediumLong.append(0)
        long.append(0)
    elif(duration < 1245):
        short.append(0)
        mediumShort.append(1)
        mediumLong.append(0)
        long.append(0)
    elif(duration < 1915):
        short.append(0)
        mediumShort.append(0)
        mediumLong.append(1)
        long.append(0)
    else:
        short.append(0)
        mediumShort.append(0)
        mediumLong.append(0)
        long.append(1)
train.insert(7,"Short", short)
train.insert(8,"Relatively Short", mediumShort)
train.insert(9,"Relatively Long", mediumLong)
train.insert(10,"Long", long)

X = train.drop(['Model','Start Date', 'End Date','Duration','Short','Relatively Short','Relatively Long','Long'], axis=1)
Y = train.drop(['Nct_id','Summary','Title', 'Start Date', 'End Date','Model','Duration'], axis = 1)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2)

      Duration  short  Relatively Short  Relatively Long  Long
0.25     797.0    0.0               0.0              0.0   0.0
0.50    1245.0    0.0               0.0              0.0   0.0
0.75    1915.0    0.0               0.0              1.0   1.0


In [26]:
print(Y_train.head()) # Prints the first 5 rows of the data
print(X_train.head())
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)


      Short  Relatively Short  Relatively Long  Long
7189      0                 0                0     1
3258      1                 0                0     0
2997      1                 0                0     0
7698      0                 0                1     0
5614      0                 0                0     1
           Nct_id                                            Summary  \
7189  NCT01731951   This pilot clinical trial studies how well im...   
3258  NCT00412880   Open label, uncontrolled Phase II trial to as...   
2997  NCT00362583   Primary objectives: - To confirm the efficacy...   
7698  NCT02002312   To determine the efficacy of multiple doses L...   
5614  NCT01079780   RATIONALE: Drugs used in chemotherapy, such a...   

                                                  Title  
7189  Imetelstat Sodium in Treating Participants Wit...  
3258            BI 2536 Second Line Monotherapy in SCLC  
2997  Efficacy and Safety of Intranasal Fentanyl in ...  
7698  Phase II St

In [27]:
import nltk
nltk.download("wordnet")
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [28]:
# This needs to be download for the lemmatization (converting to base form)
def text_cleaner(dataframe_org):
    dataframe = dataframe_org.copy()
    columns = ['Summary', 'Title']
    for col in columns:
        dataframe[col] = dataframe[col].str.translate(str.maketrans(' ', ' ', string.punctuation)) # Remove punctuation
        dataframe[col] = dataframe[col].str.translate(str.maketrans(' ', ' ', '\n')) # Remove newlines
        dataframe[col] =dataframe[col].str.translate(str.maketrans(' ', ' ', digits)) # Remove digits
        dataframe[col] =dataframe[col].apply(lambda tweet: re.sub(r'([a-z])([A-Z])',r'\1 \2',tweet)) # Split combined words
        dataframe[col] =dataframe[col].str.lower() # Convert to lowercase
        dataframe[col] =dataframe[col].str.split() # Split each sentence using delimiter
    # This part is for converting to base form
    lemmatizer = WordNetLemmatizer()
    sum_l=[]
    tit_l = []
    for y in tqdm(dataframe[columns[0]]): # tqdm is just a progress bar, an this loop only looks at summaries
        sum_new=[]
        for x in y: # Looks at words in every summary text
            z=lemmatizer.lemmatize(x)
            z=lemmatizer.lemmatize(z,'v') # The v specifies that it is in doubt of example a word is a noun or verb, it would consider it a verb.
            sum_new.append(z)
        y = sum_new
        sum_l.append(y)
    for w in tqdm(dataframe[columns[1]]): # Looks at titles
        tit_new=[]
        for x in w: # Every word in the titles
            z=lemmatizer.lemmatize(x)
            z=lemmatizer.lemmatize(z,'v')
            tit_new.append(z)
        w = tit_new
        tit_l.append(w)
    # This will join the words into strings as in the original data, just pre-processed and put into list
    sum_l2 = []
    for col in sum_l:
        col = ' '.join(col)
        sum_l2.append(col)
    tit_l2 = []
    for col in tit_l:
        col = ' '.join(col)
        tit_l2.append(col)
    # Data obtained after Lemmatization is in array form, and is converted to Dataframe in the next step.
    sum_data=pd.DataFrame(np.array(sum_l2), index=dataframe.index,columns={columns[0]})
    tit_data=pd.DataFrame(np.array(tit_l2), index=dataframe.index,columns={columns[1]})
    frames = [sum_data, tit_data]
    merged = pd.concat(frames, axis=1)
    return merged
def create_tok(train_data, MAX_FEATURES):
    clean_data = text_cleaner(train_data)
    tokenizer_sum = text.Tokenizer(num_words=MAX_FEATURES) # Keep the 20.000 most frequent words
    tokenizer_tit =  text.Tokenizer(num_words=MAX_FEATURES)
    # Summary Text
    summary_list = clean_data['Summary']
    tokenizer_sum.fit_on_texts(list(summary_list)) # Builds the word index
    #Title Text
    title_list = clean_data['Title'] # Text from Title
    tokenizer_tit.fit_on_texts(list(title_list))
    return tokenizer_sum, tokenizer_tit
def pre_process(dataframe, tokenizer, col, MAXLEN):
    clean_data = text_cleaner(dataframe)
    tokenized_list = tokenizer.texts_to_sequences(clean_data[col])
    X = sequence.pad_sequences(tokenized_list, maxlen=MAXLEN)
    return X

In [29]:
MAX_FEATURES = 20000 # Size of vocabluary
MAXLEN = 220 # Size of each text sequence, you can tune this depending on the mean length of you text sequences
tok_sum, tok_tit = create_tok(X_train,MAX_FEATURES )
# The following are used for model.fit
X_sum = pre_process(X_train, tok_sum, 'Summary', MAXLEN)
X_tit = pre_process(X_train, tok_tit, 'Title', MAXLEN)
print(X_sum)
print(X_tit)
# This is used for prediction
X_sum_test = pre_process(X_test, tok_sum, 'Summary', MAXLEN)
X_tit_test = pre_process(X_test, tok_tit, 'Title', MAXLEN)
list_classes = ["Short", "Relatively Short", "Relatively Long", "Long"] # The 4 categories
y = Y_train[list_classes].values
# y_test is used for model.evaluate later on
y_test = Y_test[list_classes].values

100%|████████████████████████████████████████████████████████████████████████████| 6137/6137 [00:02<00:00, 2119.49it/s]
100%|███████████████████████████████████████████████████████████████████████████| 6137/6137 [00:00<00:00, 16822.39it/s]
100%|████████████████████████████████████████████████████████████████████████████| 6137/6137 [00:01<00:00, 3163.16it/s]
100%|███████████████████████████████████████████████████████████████████████████| 6137/6137 [00:00<00:00, 16808.80it/s]
100%|████████████████████████████████████████████████████████████████████████████| 6137/6137 [00:01<00:00, 3166.33it/s]
100%|███████████████████████████████████████████████████████████████████████████| 6137/6137 [00:00<00:00, 16785.91it/s]


[[   0    0    0 ...   16   14   32]
 [   0    0    0 ... 7193  875   11]
 [   0    0    0 ... 3153 1297   95]
 ...
 [   0    0    0 ...  967   79  476]
 [   0    0    0 ...   97    3   10]
 [   0    0    0 ...   18    7 7192]]
[[   0    0    0 ...    8  567  657]
 [   0    0    0 ...  175    1  548]
 [   0    0    0 ...  568  142 1496]
 ...
 [   0    0    0 ...    6   61    2]
 [   0    0    0 ...   20   16  103]
 [   0    0    0 ...  132   20 5614]]


100%|████████████████████████████████████████████████████████████████████████████| 1535/1535 [00:00<00:00, 3168.11it/s]
100%|███████████████████████████████████████████████████████████████████████████| 1535/1535 [00:00<00:00, 15988.96it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1535/1535 [00:00<00:00, 3144.10it/s]
100%|███████████████████████████████████████████████████████████████████████████| 1535/1535 [00:00<00:00, 16680.89it/s]


In [30]:
def get_con_model():
    embed_size = 50 # How big  each word vector should be
    inp_sum = Input(shape=(MAXLEN, ))
    inp_title = Input(shape=(MAXLEN, ))
    total_inp = concatenate([inp_sum, inp_title]) # Merge the 2 inputs
    embed_layer = Embedding(MAX_FEATURES, embed_size)(total_inp)
    lstm_layer = LSTM(50)(embed_layer)
    layer1 = Dropout(0.1)(lstm_layer) # Regularization method, has the effect of reducing overfitting
    layer2 = Dense(50, activation="relu")(layer1) # The relu function can return very large values
    layer3 =  Dropout(0.1)(layer2) # Again regularization
    layer4 =BatchNormalization()(layer3) # Maintains the mean activation close to 0 and the activation standard deviation close to 1
    layer5 = Dense(4, activation="softmax")(layer4) # Only outputs values between 0 and 1, this is the final layer
    model_con = Model(inputs=[inp_sum,inp_title], outputs=layer5)
    model_con.compile(loss='categorical_crossentropy', # This is the loss function, and this type of function is used when solving categorical classification
                    optimizer='rmsprop', # Algorithm that update network weights iterative based in training data
                    metrics=['accuracy']) # This is our statistical measure
    return model_con
con_model = get_con_model()
# Gets informations about the layers in the model, including output, input and number of parameters:
con_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 220)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 220)]        0           []                               
                                                                                                  
 concatenate (Concatenate)      (None, 440)          0           ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 embedding (Embedding)          (None, 440, 50)      1000000     ['concatenate[0][0]']        

In [31]:
batch_size = 32 # number of samples that will be propagated through the network.
epochs = 10 # Number of passes over the entire data set
file_path="weights_base.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min') # Verbose means that it prints acc and loss
early = EarlyStopping(monitor="val_loss", mode="min", patience=3) 
#EarlyStopping should only be includede when tuning your model
callbacks_list = [checkpoint, early]
history = con_model.fit([X_sum, X_tit], y, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list, verbose=2) # Model fit

Epoch 1/10

Epoch 1: val_loss improved from inf to 1.38064, saving model to weights_base.hdf5
173/173 - 13s - loss: 1.3741 - accuracy: 0.2901 - val_loss: 1.3806 - val_accuracy: 0.2687 - 13s/epoch - 74ms/step
Epoch 2/10

Epoch 2: val_loss improved from 1.38064 to 1.36502, saving model to weights_base.hdf5
173/173 - 11s - loss: 1.3034 - accuracy: 0.3699 - val_loss: 1.3650 - val_accuracy: 0.2801 - 11s/epoch - 64ms/step
Epoch 3/10

Epoch 3: val_loss improved from 1.36502 to 1.35626, saving model to weights_base.hdf5
173/173 - 11s - loss: 1.2284 - accuracy: 0.4367 - val_loss: 1.3563 - val_accuracy: 0.3208 - 11s/epoch - 64ms/step
Epoch 4/10

Epoch 4: val_loss did not improve from 1.35626
173/173 - 11s - loss: 1.1295 - accuracy: 0.5010 - val_loss: 1.3725 - val_accuracy: 0.3143 - 11s/epoch - 63ms/step
Epoch 5/10

Epoch 5: val_loss did not improve from 1.35626
173/173 - 11s - loss: 1.0334 - accuracy: 0.5684 - val_loss: 1.5863 - val_accuracy: 0.2834 - 11s/epoch - 64ms/step
Epoch 6/10

Epoch 6: v

In [None]:
con_model.load_weights(file_path)
con_model.evaluate([X_sum_test, X_tit_test], y_test, verbose=2) # Returns loss value and the metric specified, so in this case, model accuracy