In [39]:
import numpy as np # Linear algebra
# Data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
from pandas import DataFrame
import xml.etree.ElementTree as ET # Reading xml files
# For plotting
import matplotlib.pyplot as plt
import pydot
import pydotplus
import graphviz
from keras.utils.vis_utils import plot_model
from keras.utils import plot_model
from sklearn.manifold import TSNE
# For Modelling
import tensorflow as tf
from tensorflow.keras import layers, models, preprocessing, callbacks, optimizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, Input, Add
from tensorflow.keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from tensorflow.keras.layers import concatenate
from keras.metrics import categorical_accuracy
# For Pre-processing
import string
from string import digits
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import re
# Other useful modules
import h5py
from statistics import mode
import os
import datetime
import warnings
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
warnings.filterwarnings('ignore')

In [36]:
def csv_row(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    nct_text = ""
    sum_text = ""
    model_text = ""
    ph_text = ""
    title_text = ""
    start_date = ""
    completion_date = ""
    # Only iterates through Phase 2 and 3 studies
    for ph in root.iter('phase'):
        ph_text = ph.text
        if (ph_text == "Phase 2" or ph_text == "Phase 3" or ph_text == "Phase 1"):
            #This bit finds all roots with nct_id which is a sub_root to id_info
            for nct in root.findall('id_info'):
                nctId_text = nct.find('nct_id').text
                nct_text =nctId_text
            # This bit finds the brief summary text
            for s in root.findall('brief_summary'):
                summary_text = s.find('textblock').text
                sum_text= summary_text
                sum_text = sum_text.replace('\r\n', '') # Replaces newline with a whitespace
                sum_text = re.sub(' +',' ',sum_text) # Compresses multiple whitespaces to only one
                #print("Summary Text:", sum_text)
            # Get's the official title for the study
            for t in root.iter('brief_title'):
                title_text = t.text
            # This get's the type of intervention_model
            for y in root.iter('intervention_model'):
                model_text = y.text
            if(root.find('start_date') == None):
                return None 
            for s in root.iter('start_date'):
                start_date = s.text
            if(root.find('primary_completion_date') == None):
                return None    
            for c in root.iter('primary_completion_date'):
                completion_date = c.text
    total_text = "\"" + nct_text + "\"" + ";" + "\"" + sum_text + "\"" + ";"  + "\"" + title_text + "\"" + ";"  +  "\"" + model_text + "\""+ ";"  + "\"" + start_date + "\""+ ";"  + "\"" + completion_date + "\""
    # This functions returns a text with Nct_Id, brief_summary, title and type of intervention model on the form we intended
    return total_text
print(csv_row("Downloads\\search_result\\NCT00001328.xml"))

"NCT00001328";"
 Malignant brain tumors are responsible for a significant amount of deaths in children and adults. Even with advances in surgery, radiation therapy, and chemotherapy, many patients diagnosed with a malignant brain tumor survive only months to weeks. In an attempt to improve the prognosis for these patients, researchers have developed a new approach to brain tumor therapy. This approach makes use of DNA technology to transfer genes sensitive to therapy into the cells of the tumor. Infections with the herpes simplex virus can cause cold sores in the area of the mouth. A drug called ganciclovir (Cytovene) can kill the virus. Ganciclovir is effective because the herpes virus contains a gene (Herpes-Thymidine Kinase TK gene) that is sensitive to the drug. Researchers have been able to separate this gene from the virus. Using DNA technology, researchers hope to transfer and implant the TK gene into tumor cells making them sensitive to ganciclovir. In theory, giving patients g

In [29]:
rdir = "Downloads\\search_result"# Folders in directory where the all the xml files are placed
with open('train_data.csv', 'w', encoding="utf-8") as csvfile: 
    for root, dirs, files in os.walk(rdir):
        for filename in files:
            name = os.path.join(root, filename)
            data = csv_row(name)
            if(data):
                csvfile.write(data) #Writes total_text into a row in to train_data.csv
                csvfile.write("\n") # Skips to next line and do the same
            

In [58]:
# Earlier we saw that the returned text from our function was seperated by ';', so we use this as seperator when reading in the files
df = pd.read_csv("train_data.csv", sep=';', header=None,error_bad_lines=False, warn_bad_lines=False)
# Give the data sets appropiate column names
df.columns =  ['Nct_id', 'Summary', 'Title','Model','Start Date','End Date']
# We drop all the observations containing NaN's (missing values)
train = df.dropna()
duration = []
monthToNum = {"January": 1, "February":2, "March":3, "April":4, "May":5, "June":6, "July":7, "August":8, "September":9, "October":10, "November":11, "December":12}
for i,row in train.iterrows():
    if("\r\n" in row['Summary']):
        row['Summary'] = row['Summary'].replace("\r\n", '')

    sd = row['Start Date'].split(' ')
    if(len(sd) == 2):
        sdl = monthToNum[sd[0]]*30 + 365*int(sd[1]) + 1
    else:
        sdl = monthToNum[sd[0]]*30 + int(sd[1].replace(",", "")) + int(sd[2])*365
    ed = row['End Date'].split(' ')
    if(len(ed) == 2):
        edl = monthToNum[ed[0]]*30 + 365*int(ed[1]) + 1
    else:
        edl = monthToNum[ed[0]]*30 + int(ed[1].replace(",", "")) + int(ed[2])*365
    duration.append(edl - sdl)
train.insert(4,"Duration", duration)

print(train)

           Nct_id                                            Summary  \
3     NCT00001806   In 1997, the Genetics Department of the NCI M...   
5     NCT00002463   RATIONALE: Drugs used in chemotherapy use dif...   
10    NCT00002484   RATIONALE: Radiation therapy uses high-energy...   
22    NCT00002527   RATIONALE: Chemoprevention therapy is the use...   
23    NCT00002528   RATIONALE: Removing axillary lymph nodes may ...   
...           ...                                                ...   
9970  NCT05520372   This is a retrospective study that included 6...   
9972  NCT05536362   The Study Showed that combining clonidine and...   
9973  NCT05590650   The goal of this clinical trial is to learn a...   
9974  NCT05591456   Accelerated hypofractionated 1 week post-mast...   
9975  NCT05622357   The goal of this clinical trial is to investi...   

                                                  Title  \
3       Methods in Education for Breast Cancer Genetics   
5     Combination

In [60]:
X = train.drop(['Model','Start Date', 'End Date'], axis=1)
Y = train.drop(['Nct_id','Summary','Title', 'Start Date', 'End Date','Model'], axis = 1)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2)

In [75]:
print(Y_train.head()) # Prints the first 5 rows of the data
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

      Duration
7444      3075
3181      2525
7252      1957
2838       670
7518       855
(6483, 4)
(6483, 1)
(1621, 4)
(1621, 1)


In [82]:
print(Y_train.quantile([0.25,0.5,0.75]))
for i in Y_train:
    if(i < )

      Duration
0.25     760.0
0.50    1174.0
0.75    1790.0


NameError: name 'Y_train' is not defined

In [63]:
import nltk
nltk.download("wordnet")
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [64]:
# This needs to be download for the lemmatization (converting to base form)
def text_cleaner(dataframe_org):
    dataframe = dataframe_org.copy()
    columns = ['Summary', 'Title']
    for col in columns:
        dataframe[col] = dataframe[col].str.translate(str.maketrans(' ', ' ', string.punctuation)) # Remove punctuation
        dataframe[col] = dataframe[col].str.translate(str.maketrans(' ', ' ', '\n')) # Remove newlines
        dataframe[col] =dataframe[col].str.translate(str.maketrans(' ', ' ', digits)) # Remove digits
        dataframe[col] =dataframe[col].apply(lambda tweet: re.sub(r'([a-z])([A-Z])',r'\1 \2',tweet)) # Split combined words
        dataframe[col] =dataframe[col].str.lower() # Convert to lowercase
        dataframe[col] =dataframe[col].str.split() # Split each sentence using delimiter
    # This part is for converting to base form
    lemmatizer = WordNetLemmatizer()
    sum_l=[]
    tit_l = []
    for y in tqdm(dataframe[columns[0]]): # tqdm is just a progress bar, an this loop only looks at summaries
        sum_new=[]
        for x in y: # Looks at words in every summary text
            z=lemmatizer.lemmatize(x)
            z=lemmatizer.lemmatize(z,'v') # The v specifies that it is in doubt of example a word is a noun or verb, it would consider it a verb.
            sum_new.append(z)
        y = sum_new
        sum_l.append(y)
    for w in tqdm(dataframe[columns[1]]): # Looks at titles
        tit_new=[]
        for x in w: # Every word in the titles
            z=lemmatizer.lemmatize(x)
            z=lemmatizer.lemmatize(z,'v')
            tit_new.append(z)
        w = tit_new
        tit_l.append(w)
    # This will join the words into strings as in the original data, just pre-processed and put into list
    sum_l2 = []
    for col in sum_l:
        col = ' '.join(col)
        sum_l2.append(col)
    tit_l2 = []
    for col in tit_l:
        col = ' '.join(col)
        tit_l2.append(col)
    # Data obtained after Lemmatization is in array form, and is converted to Dataframe in the next step.
    sum_data=pd.DataFrame(np.array(sum_l2), index=dataframe.index,columns={columns[0]})
    tit_data=pd.DataFrame(np.array(tit_l2), index=dataframe.index,columns={columns[1]})
    frames = [sum_data, tit_data]
    merged = pd.concat(frames, axis=1)
    return merged
def create_tok(train_data, MAX_FEATURES):
    clean_data = text_cleaner(train_data)
    tokenizer_sum = text.Tokenizer(num_words=MAX_FEATURES) # Keep the 20.000 most frequent words
    tokenizer_tit =  text.Tokenizer(num_words=MAX_FEATURES)
    # Summary Text
    summary_list = clean_data['Summary']
    tokenizer_sum.fit_on_texts(list(summary_list)) # Builds the word index
    #Title Text
    title_list = clean_data['Title'] # Text from Title
    tokenizer_tit.fit_on_texts(list(title_list))
    return tokenizer_sum, tokenizer_tit
def pre_process(dataframe, tokenizer, col, MAXLEN):
    clean_data = text_cleaner(dataframe)
    tokenized_list = tokenizer.texts_to_sequences(clean_data[col])
    X = sequence.pad_sequences(tokenized_list, maxlen=MAXLEN)
    return X

In [74]:
MAX_FEATURES = 20000 # Size of vocabluary
MAXLEN = 220 # Size of each text sequence, you can tune this depending on the mean length of you text sequences
tok_sum, tok_tit = create_tok(X_train,MAX_FEATURES )
# The following are used for model.fit
X_sum = pre_process(X_train, tok_sum, 'Summary', MAXLEN)
X_tit = pre_process(X_train, tok_tit, 'Title', MAXLEN)
print(X_sum)
print(X_tit)
# This is used for prediction
X_sum_test = pre_process(X_test, tok_sum, 'Summary', MAXLEN)
X_tit_test = pre_process(X_test, tok_tit, 'Title', MAXLEN)
list_classes = ["Model_Crossover Assignment", "Model_Other", "Model_Parallel Assignment", "Model_Single Group Assignment"] # The 4 categories
y = Y_train
print(Y_train)
# y_test is used for model.evaluate later on
y_test = Y_test

100%|████████████████████████████████████████████████████████████████████████████| 6483/6483 [00:02<00:00, 2605.48it/s]
100%|███████████████████████████████████████████████████████████████████████████| 6483/6483 [00:00<00:00, 13006.90it/s]
100%|████████████████████████████████████████████████████████████████████████████| 6483/6483 [00:02<00:00, 2585.50it/s]
100%|███████████████████████████████████████████████████████████████████████████| 6483/6483 [00:00<00:00, 14002.20it/s]
100%|████████████████████████████████████████████████████████████████████████████| 6483/6483 [00:02<00:00, 2740.59it/s]
100%|████████████████████████████████████████████████████████████████████████████| 6483/6483 [00:01<00:00, 5785.83it/s]


[[  0   0   0 ...  34  36   9]
 [  0   0   0 ...   1  10  27]
 [  0   0   0 ... 310  36   9]
 ...
 [  0   0   0 ...  89   2  23]
 [  0   0   0 ... 237  36   9]
 [  0   0   0 ... 473 156   9]]
[[  0   0   0 ...  11  13   3]
 [  0   0   0 ...   2  18  15]
 [  0   0   0 ... 284  13   3]
 ...
 [  0   0   0 ...  52  18  15]
 [  0   0   0 ...  91  13   3]
 [  0   0   0 ...  14  43   3]]


100%|████████████████████████████████████████████████████████████████████████████| 1621/1621 [00:00<00:00, 2262.44it/s]
100%|███████████████████████████████████████████████████████████████████████████| 1621/1621 [00:00<00:00, 11638.60it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1621/1621 [00:00<00:00, 2347.68it/s]
100%|███████████████████████████████████████████████████████████████████████████| 1621/1621 [00:00<00:00, 11357.18it/s]

      Duration
7444      3075
3181      2525
7252      1957
2838       670
7518       855
...        ...
6851      1095
4192      2529
8593      1260
1971       455
6915      2340

[6483 rows x 1 columns]





In [72]:
def get_con_model():
    embed_size = 50 # How big  each word vector should be
    inp_sum = Input(shape=(MAXLEN, ))
    inp_title = Input(shape=(MAXLEN, ))
    total_inp = concatenate([inp_sum, inp_title]) # Merge the 2 inputs
    embed_layer = Embedding(MAX_FEATURES, embed_size)(total_inp)
    lstm_layer = LSTM(50)(embed_layer)
    layer1 = Dropout(0.1)(lstm_layer) # Regularization method, has the effect of reducing overfitting
    layer2 = Dense(50, activation="relu")(layer1) # The relu function can return very large values
    layer3 =  Dropout(0.1)(layer2) # Again regularization
    layer4 =BatchNormalization()(layer3) # Maintains the mean activation close to 0 and the activation standard deviation close to 1
    layer5 = Dense(1, activation="softmax")(layer4) # Only outputs values between 0 and 1, this is the final layer
    model_con = Model(inputs=[inp_sum,inp_title], outputs=layer5)
    model_con.compile(loss='categorical_crossentropy', # This is the loss function, and this type of function is used when solving categorical classification
                    optimizer='rmsprop', # Algorithm that update network weights iterative based in training data
                    metrics=["accuracy"]) # This is our statistical measure
    return model_con
con_model = get_con_model()
# Gets informations about the layers in the model, including output, input and number of parameters:
con_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_9 (InputLayer)           [(None, 220)]        0           []                               
                                                                                                  
 input_10 (InputLayer)          [(None, 220)]        0           []                               
                                                                                                  
 concatenate_4 (Concatenate)    (None, 440)          0           ['input_9[0][0]',                
                                                                  'input_10[0][0]']               
                                                                                                  
 embedding_4 (Embedding)        (None, 440, 50)      1000000     ['concatenate_4[0][0]']    

In [73]:
batch_size = 32 # number of samples that will be propagated through the network.
epochs = 10 # Number of passes over the entire data set
file_path="weights_base.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min') # Verbose means that it prints acc and loss
early = EarlyStopping(monitor="val_loss", mode="min", patience=3) 
#EarlyStopping should only be includede when tuning your model
callbacks_list = [checkpoint, early]
history = con_model.fit([X_sum, X_tit], y, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list, verbose=2) # Model fit

Epoch 1/10

Epoch 1: val_loss improved from inf to 0.00000, saving model to weights_base.hdf5
183/183 - 22s - loss: 0.0000e+00 - accuracy: 0.0000e+00 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00 - 22s/epoch - 119ms/step
Epoch 2/10

Epoch 2: val_loss did not improve from 0.00000
183/183 - 18s - loss: 0.0000e+00 - accuracy: 0.0000e+00 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00 - 18s/epoch - 98ms/step
Epoch 3/10

Epoch 3: val_loss did not improve from 0.00000
183/183 - 22s - loss: 0.0000e+00 - accuracy: 0.0000e+00 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00 - 22s/epoch - 118ms/step
Epoch 4/10

Epoch 4: val_loss did not improve from 0.00000
183/183 - 29s - loss: 0.0000e+00 - accuracy: 0.0000e+00 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00 - 29s/epoch - 156ms/step
