## Models and data for the first Manuscript

### Import from the python files data_loader and model

In [None]:
from data_loader import *
from model import *

In [None]:
from tensorflow.keras.initializers import glorot_normal, glorot_uniform, he_normal, he_uniform, Constant


In [None]:
def reset_seeds():
    """
        Re-seeds the random number sources used in this notebook (fixed values) for reproducibility.

        Sets the PYTHONHASHSEED environment variable, the TensorFlow seed and the NumPy seed, then prints
        a confirmation line.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up; confirm that setting it
    # here has the intended effect for the running kernel.
    os.environ['PYTHONHASHSEED'] = '15'
    tf.random.set_seed(12)
    np.random.seed(9)
    print('Reset Seeds')

In [None]:
# Seed the random number generators before any of the shuffling done further down.
reset_seeds()

### Set the paths for the token, noonan, and non noonan files

In [None]:
# Relative locations of the tokenizer JSON and of the two patient CSV files.
token_path = '../tokenizer/token_all.json'
path_non_noonan = "../data/combined_non_noonan.csv"
path_noonan = "../data/noonan_r3.csv"

In [None]:
# Load both cohorts. sort_data is not defined in this file (presumably it comes from data_loader).
# attribute_names is assigned by both calls, so it keeps the value returned by the second one.
n_array, attribute_names, n_string_list, n_patient_list = sort_data(path_noonan)
nn_array, attribute_names, nn_string_list, nn_patient_list = sort_data(path_non_noonan)

In [None]:
# Convert the patient strings of both cohorts to lists of ints using the token file at token_path.
# indx_words, word_indx and token_path are reassigned by each call; the second call's values remain.
n_int_list, indx_words, word_indx, token_path = string_to_ints(n_string_list, token_path)
nn_int_list, indx_words, word_indx, token_path = string_to_ints(nn_string_list, token_path)

In [None]:
def translate_to_string(indx_words, original):
    """
    Converts a sequence of token ids back into a space separated string.
    Args:
        indx_words - mapping (or any container indexable by token id) from id to word
        original - iterable of token ids
    Returns:
        translated - (string) the words in order, each one followed by a single space, so a non-empty
                     result ends with a trailing space and an empty input gives ""
    """
    # join builds the string in one pass instead of re-allocating it on every +=
    return "".join(indx_words[num] + " " for num in original)

In [None]:
def length_filter(int_list, length, lineend_index):
    """
    Takes a list of list of ints and checks their lengths after discounting the lineend term. Returns the indices
    of lists of greater or equal length.

    Exactly one occurrence of lineend_index is discounted per list (the same effect as removing the first
    occurrence). A list that does not contain lineend_index at all is measured as-is instead of raising
    ValueError, and the input lists are neither copied nor modified.
    Args:
        int_list - (list) list of lists of ints
        length - (int) minimum length a list must have after the lineend term is discounted
        lineend_index - (int) token id of the lineend term
    Returns:
        filtered - (list) indices into int_list of the lists that are long enough
    """
    filtered = []
    for index, item in enumerate(int_list):
        # Discount a single lineend token when present; absent tokens no longer crash the filter.
        effective_length = len(item) - (1 if lineend_index in item else 0)
        if effective_length >= length:
            filtered.append(index)
    print("original count: {len1}\t filtered count: {len2}".format(len1=len(int_list), len2=len(filtered)))
    return filtered

### Do the filtering for length, minimum is 10 words (not including gender or lineend)

In [None]:
# Token id of the "lineend" separator in the tokenizer vocabulary.
lineend_index = word_indx["lineend"]
# Indices of patients whose sequence passes length_filter with a minimum of 11
# (presumably 10 words plus the gender token, per the heading above — confirm).
n_filtered_index = length_filter(n_int_list, 11, lineend_index)
nn_filtered_index = length_filter(nn_int_list, 11, lineend_index)
# Select the surviving patients' token lists by index.
n_features = np.array(n_int_list, dtype=object)[n_filtered_index]
nn_features = np.array(nn_int_list, dtype=object)[nn_filtered_index]

In [None]:
# Apply the same index selection to the patient ids so they stay aligned with the filtered features.
n_patient_array = np.array(n_patient_list)[n_filtered_index]
nn_patient_array = np.array(nn_patient_list)[nn_filtered_index]
print(n_patient_array.shape)
print(nn_patient_array.shape)

In [None]:
# Shuffle the Noonan cohort; features and patient ids are indexed with the same permutation.
indices = np.arange(n_features.shape[0])
np.random.shuffle(indices)
n_features = n_features[indices]
n_patient_array = n_patient_array[indices]

# Same for the non-Noonan cohort, with its own permutation.
indices = np.arange(nn_features.shape[0])
np.random.shuffle(indices)
nn_features = nn_features[indices]
nn_patient_array = nn_patient_array[indices]

### Split data into k folds

In [None]:
# Vocabulary size later passed to make_model: one more than the number of entries in indx_words.
num_words = len(indx_words)+ 1
print("the shapes are %s for noonan and %s for non noonan" %(n_features.shape[0], nn_features.shape[0]))

# k_fold is defined outside this file. Later cells index its result as [split][fold], using
# split 0, 1 and 2 as the train, validation and test indices respectively.
k = 7
k_fold_noonan = k_fold(k, n_features.shape[0])
k_fold_non_noonan = k_fold(k, nn_features.shape[0])
# Filled by the training loop with one val_auc history per cross-validation fold.
cross_validation = []

In [None]:
# Padded sequence length and embedding dimension passed to make_model.
MAX_SEQUENCE_LENGTH = 1000
embedding_size = 128

In [None]:
# Weight initializers that share one fixed seed. Within this file they are only referenced by the
# commented-out `inits` table in the next cell.
seed = 9
g_norm = glorot_normal(seed = seed)
g_unif = glorot_uniform(seed = seed)
he_norm = he_normal(seed = seed)
he_unif = he_uniform(seed = seed)

In [None]:
# inits = {
#          'ReLU-glorot_normal': ('relu', g_norm),
#          'ReLU-glorot_uniform': ('relu', g_unif),
#          'ReLU-he_normal': ('relu', he_norm),
#          'ReLU-he_uniform': ('relu', he_unif),
#          'PReLU-glorot_normal': ('prelu', g_norm),
#          'PReLU-glorot_uniform': ('prelu', g_unif),
#          'PReLU-he_normal': ('prelu', he_norm),
#          'PReLU-he_uniform': ('prelu', he_unif)
#          }
# init = inits['ReLU-he_uniform'][0]

In [None]:
def make_model(MAX_SEQUENCE_LENGTH, num_words, embedding_size, model_type):
    """
    Builds and compiles a binary classifier over padded sequences of token ids.

    Every variant shares the same input layer, embedding layer and single-unit sigmoid output; only the layers
    in between depend on model_type. The keras session is cleared before the model is built.
    Args:
        MAX_SEQUENCE_LENGTH - (int) length of each padded input sequence
        num_words - (int) input dimension of the embedding layer
        embedding_size - (int) output dimension of the embedding layer
        model_type - (string) one of 'conv', 'lstm', 'bigru', 'gru' or 'dense'
    Returns:
        model - compiled keras model (binary cross-entropy loss, adam optimizer)
    Raises:
        ValueError - if model_type is not one of the supported names
    """
    # Fail fast: previously an unknown type fell through every branch and crashed on an unbound drop_1.
    supported_types = ('conv', 'lstm', 'bigru', 'gru', 'dense')
    if model_type not in supported_types:
        raise ValueError("unknown model_type {got!r}; expected one of {expected}".format(
            got=model_type, expected=supported_types))

    keras.backend.clear_session()
    print("making model")
    print("numwords is %s training length is all" %(num_words))

    # shared layers
    sequence_input = keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = keras.layers.Embedding(num_words, embedding_size, input_length=MAX_SEQUENCE_LENGTH)(sequence_input)

    if model_type == 'conv':
        # Conv1D is referenced unqualified; presumably it is provided by one of the star imports.
        l_conv = Conv1D(128, 3, activation='relu')(embedded_sequences)
        l_pool = keras.layers.GlobalMaxPooling1D()(l_conv)
        dense1 = keras.layers.Dense(64, activation='relu')(l_pool)
        drop_1 = keras.layers.Dropout(0.5)(dense1)
        dense1 = keras.layers.Dense(16, activation='relu')(drop_1)
        drop_1 = keras.layers.Dropout(0.5)(dense1)

    elif model_type == 'lstm':
        l_lstm = keras.layers.LSTM(64, return_sequences=True, dropout=0.1)(embedded_sequences)
        l_pool = keras.layers.GlobalMaxPooling1D()(l_lstm)
        dense1 = keras.layers.Dense(32, activation='relu')(l_pool)
        drop_1 = keras.layers.Dropout(0.5)(dense1)

    elif model_type == 'bigru':
        l_bigru = keras.layers.Bidirectional(keras.layers.GRU(64, return_sequences=True, dropout=0.1))(embedded_sequences)
        l_pool = keras.layers.GlobalMaxPooling1D()(l_bigru)
        dense1 = keras.layers.Dense(32, activation='relu')(l_pool)
        drop_1 = keras.layers.Dropout(0.5)(dense1)

    elif model_type == 'gru':
        l_gru = keras.layers.GRU(128, return_sequences=True, dropout=0.1)(embedded_sequences)
        l_pool = keras.layers.GlobalMaxPooling1D()(l_gru)
        dense1 = keras.layers.Dense(32, activation='relu')(l_pool)
        drop_1 = keras.layers.Dropout(0.5)(dense1)

    else:  # 'dense'
        flatten = keras.layers.Flatten()(embedded_sequences)
        dense1 = keras.layers.Dense(16, activation='relu')(flatten)
        drop_1 = keras.layers.Dropout(0.2)(dense1)

    preds_1 = keras.layers.Dense(1, activation='sigmoid')(drop_1)

    model = keras.Model(sequence_input, outputs=preds_1)
    # The AUC metric is named explicitly so the 'val_auc' key read from the training history does not
    # depend on keras' automatic name numbering.
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall(),
                           tf.keras.metrics.AUC(curve="PR", name='auc')])
    model.summary()
    return model

In [None]:
def shuffle_three(features, labels, patients):
    """
    Shuffles three row-aligned arrays with one shared random permutation.
    Args:
        features - (np.ndarray) features array
        labels - (np.ndarray) labels corresponding to the features
        patients - (np.ndarray) patient id corresponding to the features
    Returns:
        features - (np.ndarray) features in the shuffled order
        labels - (np.ndarray) labels in the same shuffled order
        patients - (np.ndarray) patient ids in the same shuffled order
    """
    row_count = features.shape[0]
    order = np.arange(row_count)
    # One permutation, drawn from the global NumPy RNG, is applied to all three arrays.
    np.random.shuffle(order)
    return tuple(array[order] for array in (features, labels, patients))

In [None]:
class savePredict(keras.callbacks.Callback):
    """
    Callback to save predictions after each epoch

    At the end of every epoch the model predicts on the stored validation input and the predictions are
    written to "<output_path>_<epoch>_val.csv" together with the true label and patient id. Optionally a
    precision/recall table and a precision/recall plot are written as well.
    Args:
        validation_generator - input passed to model.predict at the end of each epoch
        validation_labels - labels aligned with the predictions
        validation_patients - patient ids aligned with the predictions
        output_path - (string) path prefix for the csv files
        pic_path - (string) path prefix for the plot files
        save_plot - (bool) also write "<pic_path>_<epoch>_pr.pdf" via plot_pr_recall
        save_pr_recall - (bool) also write "<output_path>_<epoch>_pr.csv" via calculate_pr_recall/save_chart
    """
    def __init__(self, validation_generator, validation_labels, validation_patients, output_path, pic_path, save_plot=False, save_pr_recall=False):
        # Let the keras base class set up its own state before adding ours.
        super().__init__()
        self.out = []
        self.output_path = output_path
        self.validation_generator = validation_generator
        self.validation_labels = validation_labels
        self.validation_patients = validation_patients
        self.pic_path = pic_path
        self.save_plot = save_plot
        self.save_pr_recall = save_pr_recall

    def on_epoch_end(self, epoch, logs=None):
        output = self.model.predict(self.validation_generator, verbose=2)
        if self.save_pr_recall:
            analysis = calculate_pr_recall(output, self.validation_labels, threshold=.01)
            analysisdir = "{output_path}_{epochNum}_pr.csv".format(output_path=self.output_path, epochNum=epoch)
            save_chart(analysis, analysisdir)

        valdir = "{output_path}_{epochNum}_val.csv".format(output_path=self.output_path, epochNum=epoch)
        with open(valdir, 'w', newline='\n', encoding="ISO-8859-1") as csvfile:
            record_writer = csv.writer(csvfile, delimiter=',')
            attribute_names = ['prediction', 'actual', 'patient']
            record_writer.writerow(attribute_names)
            # Labels and patients are still indexed explicitly so a length mismatch raises instead of
            # being silently truncated.
            for i, prediction in enumerate(output):
                row = [prediction[0], self.validation_labels[i], self.validation_patients[i]]
                record_writer.writerow(row)

        if self.save_plot:
            chartdir = "{chartdir}_{epochNum}_pr.pdf".format(chartdir=self.pic_path, epochNum=epoch)
            # The plot is drawn from the csv that was just written.
            plot_pr_recall(valdir, chartdir)

### Resampling and creating static folds

In [None]:
# Build, once, the train/validation/test arrays for each of the k folds so that every model type is
# trained on identical data. Each entry of resampled_k is
# [[x_train, y_train, train_patients], [x_validation, ...], [x_test, ...]].
resampled_k = []
for i in range(k): 
    n_features_train_index = k_fold_noonan[0][i]
    nn_features_train_index = k_fold_non_noonan[0][i]
    n_features_validation_index = k_fold_noonan[1][i]
    nn_features_validation_index = k_fold_non_noonan[1][i]
    n_features_test_index = k_fold_noonan[2][i]
    nn_features_test_index = k_fold_non_noonan[2][i]
    
    # Training split: at most 100 controls per Noonan case (the slice is shorter if fewer are available).
    # The shuffle is in place, so it also reorders the index container held by k_fold_noonan.
    np.random.shuffle(n_features_train_index)
    num_controls = 100 * len(n_features_train_index)
    train_controls_index = nn_features_train_index[0:num_controls]
    # x_train_index is not used again in this file.
    # NOTE(review): assumes the index containers are lists, so + concatenates — confirm.
    x_train_index = train_controls_index + n_features_train_index
    # Labels: 0 for controls, 1 for Noonan cases, in the same order as the concatenations below.
    y_train = np.array([0] * len(train_controls_index) + [1] * len(n_features_train_index))
    x_train =  np.concatenate((nn_features[train_controls_index], n_features[n_features_train_index]), axis=None)
    train_patients = np.concatenate((nn_patient_array[train_controls_index], n_patient_array[n_features_train_index]), axis=None)

    # Validation split: at most 1000 controls per Noonan case.
    np.random.shuffle(n_features_validation_index)
    num_controls = 1000 * len(n_features_validation_index)
    validation_controls_index = nn_features_validation_index[0:num_controls]
    x_validation_index = validation_controls_index + n_features_validation_index
    y_validation = np.array([0] * len(validation_controls_index) + [1] * len(n_features_validation_index))
    x_validation = np.concatenate((nn_features[validation_controls_index], n_features[n_features_validation_index]), axis=None)
    validation_patients = np.concatenate((nn_patient_array[validation_controls_index], n_patient_array[n_features_validation_index]), axis=None)

    # Test split: at most 1000 controls per Noonan case.
    np.random.shuffle(n_features_test_index)
    num_controls = 1000 * len(n_features_test_index)
    test_controls_index = nn_features_test_index[0:num_controls]
    x_test_index = test_controls_index + n_features_test_index
    y_test = np.array([0] * len(test_controls_index) + [1] * len(n_features_test_index))
    x_test = np.concatenate((nn_features[test_controls_index], n_features[n_features_test_index]), axis=None)
    test_patients = np.concatenate((nn_patient_array[test_controls_index], n_patient_array[n_features_test_index]), axis=None)
    
    # Pad or truncate every sequence at its end to exactly MAX_SEQUENCE_LENGTH tokens.
    x_train = pad_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
    x_validation = pad_sequences(x_validation, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
    x_test = pad_sequences(x_test, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
    
    # Mix controls and cases within each split; features, labels and patient ids are shuffled together.
    x_train, y_train, train_patients = shuffle_three(x_train, y_train, train_patients)
    x_validation, y_validation, validation_patients = shuffle_three(x_validation, y_validation, validation_patients)
    x_test, y_test, test_patients = shuffle_three(x_test, y_test, test_patients)
    resampled_k.append([[x_train, y_train, train_patients], [x_validation, y_validation, validation_patients], [x_test, y_test, test_patients]])
    
    print("shapes for data sets \n train: {train}, \t valiation: {validation}, \t test: {test}".format(train=x_train.shape[0], test=x_test.shape[0], validation=x_validation.shape[0]))

### This is just to extract the list of names

In [None]:
# final_train, final_validation, final_test = resampled_k[-1]
# train_df = pd.DataFrame(list(zip(final_train[2], final_train[1])), 
#                           columns=['patient_id', 'label'])
# train_df_non_noonan = train_df[train_df.label != 1]
# train_df_non_noonan.to_csv(index=False, path_or_buf="../models/20210908-160537_conv_gender/final_train_non_noonan.csv")

### To perform training

In [None]:
# Architectures to train; each name is one of the model_type values handled by make_model.
model_types = ['conv', 'dense', 'bigru', 'gru', 'lstm']

In [None]:
# For every architecture: run cross-validation on folds 0..k-2 (20 epochs each, validation AUC recorded
# per epoch), then on the last fold train a final model for the epoch count with the best mean
# validation AUC and evaluate it on that fold's test split.
for model_type in model_types:

    file_name = model_type + '_gender_final'
    reset_seeds()

    # Start every architecture with an empty history. Without this reset the best-epoch mean for the
    # second and later model types also averaged over the folds of the previously trained types.
    cross_validation = []

    # One timestamp shared by the three output directories, so they always carry the same stamp.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    logdir = "../logs/" + timestamp + "_" + file_name
    modeldir = '../models/' + timestamp + "_" + file_name
    chartdir = '../pics/' + timestamp + "_" + file_name
    os.mkdir(logdir)
    os.mkdir(modeldir)
    os.mkdir(chartdir)

    for i in range(k):
        # first: get the training validation and test sets
        train, validation, test = resampled_k[i]

        x_train = train[0]
        y_train = train[1]
        patient_train = train[2]
        x_validation = validation[0]
        y_validation = validation[1]
        patient_validation = validation[2]
        x_test = test[0]
        y_test = test[1]
        patient_test = test[2]

        # second: make the actual model
        model = make_model(MAX_SEQUENCE_LENGTH, num_words, embedding_size, model_type)
        display(keras.utils.plot_model(model, show_shapes=True))

        # third: make callbacks
        logpath = '{logdir}/fold_{foldNum}'.format(logdir=logdir, foldNum=i)
        modelpath = '{modeldir}/fold_{foldNum}.h5'.format(modeldir=modeldir, foldNum=i)
        picspath = '{chartdir}/fold_{foldNum}'.format(chartdir=chartdir, foldNum=i)
        outpath = '{modeldir}/fold_{foldNum}'.format(modeldir=modeldir, foldNum=i)

        tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logpath)

        callbacks = [
            ModelCheckpoint(modelpath, save_best_only=False,
            save_weights_only=False, monitor='val_auc', mode='max'),
            tensorboard_callback,
            savePredict(x_validation, y_validation, patient_validation, outpath, picspath)]

        # fourth: fit the data
        print("beginning training")

        if i == (k - 1):
            # Last fold: choose the epoch count from the mean per-epoch validation AUC of the earlier folds.
            mean_aucs = np.array(cross_validation).mean(axis=0)
            max_aucs = max(mean_aucs)
            max_auc_index = np.argmax(mean_aucs) + 1
            print("max_auc is {max} at epoch {epoch}".format(max=max_aucs, epoch=max_auc_index))

            modelpath = '{modeldir}/fold_{foldNum}_final.h5'.format(modeldir=modeldir, foldNum=i)
            picspath = '{chartdir}/fold_{foldNum}_final'.format(chartdir=chartdir, foldNum=i)
            outpath = '{modeldir}/fold_{foldNum}_final'.format(modeldir=modeldir, foldNum=i)

            # The final model is evaluated on the test split, with per-epoch PR tables and plots enabled.
            callbacks = [
                ModelCheckpoint(modelpath, save_best_only=False,
                save_weights_only=False, monitor='val_auc', mode='max'),
                tensorboard_callback,
                savePredict(x_test, y_test, patient_test, outpath, picspath, save_plot=True, save_pr_recall=True)]
            history = model.fit(x_train, y_train, validation_data=(x_test, y_test),
                epochs=max_auc_index, callbacks=callbacks, verbose=1, batch_size=200)

            # fifth: plot the cross_validation
            # Collect each cross-validation fold's prediction file for the best epoch. savePredict names
            # files with the epoch number it receives (presumably zero-based), hence the "- 1".
            strlist = listdir(modeldir)
            p = re.compile(r'fold_[0-9]*_{best_epoch}_val\.csv'.format(best_epoch=max_auc_index - 1))
            newlist = list(filter(p.match, strlist))
            best_epoch_dir = modeldir + "/best_epoch"
            os.mkdir(best_epoch_dir)
            for file in newlist:
                source = modeldir + '/' + file
                dest = best_epoch_dir + '/' + file
                copyfile(source, dest)
            final_plot = "{chartdir}/cross_validation_epoch_{best_epoch}.pdf".format(chartdir=chartdir, best_epoch=max_auc_index)
            plot_combined(best_epoch_dir, final_plot)

        else:
            history = model.fit(x_train, y_train, validation_data=(x_validation, y_validation),
                epochs=20, callbacks=callbacks, verbose=1, batch_size=200)
            print(history.history)
            cross_validation.append(history.history['val_auc'])

        keras.backend.clear_session()
        print(cross_validation)
        print(np.array(cross_validation).mean(axis=0))

# Analyzing Models

In [None]:
# Per-patient prediction file of a final fold to analyse. The commented-out alternatives point at the
# equivalent file from other training runs.
path_to_final_test = '../models/20210908-160537_conv_gender/fold_6_final_8_val.csv'
# path_to_final_test = '../models/20210908-201527_bigru_gender_final/fold_6_final_8_val.csv'
# path_to_final_test = '../models/20210908-191345_conv_gender_final/fold_6_final_8_val.csv'
# path_to_final_test = '../models/20210908-194758_dense_gender_final/fold_6_final_8_val.csv'
# path_to_final_test = '../models/20210908-214419_gru_gender_final/fold_6_final_8_val.csv'
# path_to_final_test = '../models/20210908-225146_lstm_gender_final/fold_6_final_8_val.csv'

## Looking at false positives

In [None]:
final_test = pd.read_csv(path_to_final_test)
final_test.head()
# Rows whose prediction is at or above the 0.84 threshold.
over_threshold = final_test[final_test['prediction'] >= .84]
# Of those, rows whose actual label is 0 are the false positives.
false_positives = over_threshold[over_threshold['actual'] == 0]
print(len(false_positives))
# false_positives.head()
# Show the highest-scoring false positives (cell output).
false_positives.sort_values(by=['prediction'], ascending=False).head()

In [None]:
list_id = list(false_positives['patient'])
# Look up each false positive's original patient string in the non-Noonan data. Patient ids are stored
# as strings in nn_patient_list, hence the str() conversion. The comprehension keeps the loop variable
# local, so the builtin id() is no longer shadowed in the notebook's global namespace.
patient_strings = [
    nn_string_list[nn_patient_list.index(str(patient_id))]
    for patient_id in list_id
]

In [None]:
# To display the id and terms of each patient
# for patient_string, patient_id in zip(patient_strings, list_id):
#     print(patient_id)
#     print(patient_string.split('lineend'))

In [None]:
# Lower-case keywords searched for in the diagnosis terms of the flagged patients.
of_interest = ['charge', 'alagille', 'williams', 'kabuki', 'syndrome']

In [None]:
# For each flagged patient, keep the 'lineend'-separated terms that contain any keyword of interest
# (case-insensitive substring match). Patients without a match get an empty list.
contains_syndrome = [
    [
        term
        for term in patient_string.split('lineend')
        if any(keyword in term.lower() for keyword in of_interest)
    ]
    for patient_string in patient_strings
]

In [None]:
# Print the id and matching terms of every patient that had at least one match.
for patient_id, syndrome_string in zip(list_id, contains_syndrome):
    if syndrome_string:
        print(patient_id)
        print(syndrome_string)

## Checking the predictive value of individual terms

In [None]:
def test_dx(model_paths, token_path, training_length=1000, dx_path='../data/noonan_r3.csv'):
    """
    Loads model for testing and predictions on dx
    Args:
        model_paths - (list) List of model file paths; must not be empty
        token_path - (string) Path to the token file
        training_length- (int) Number of words to be used
        dx_path - (string) Path to the csv file the DX strings are read from (via get_dx)
    Returns:
        prediction - (np.ndarray) Predictions for each DX, averaged over all models
        string_list - (list) Strings used for each DX, followed by the two gender-only strings
        int_list - (list) string_list tokenized
    Raises:
        ValueError - if model_paths is empty
    """
    model_paths = list(model_paths)
    # Fail before any file is read; previously an empty list only failed at the final average.
    if not model_paths:
        raise ValueError("model_paths must contain at least one model file path")

    _attribute_names, string_list = get_dx(dx_path)
    string_list = list(string_list)
    # The two gender-only strings are scored as well.
    string_list += ['Female lineend', 'Male lineend']
    int_list, _indx_words, _word_indx, _token_path = string_to_ints(string_list, token_path)
    features = pad_sequences(int_list, maxlen=training_length, padding='post', truncating='post')

    # One column of predictions per model, joined once after the loop.
    outputs = []
    for model_path in model_paths:
        model = load_model(model_path)
        outputs.append(model.predict(features, verbose=0))
    prediction = np.average(np.concatenate(outputs, axis=1), axis=1)
    return prediction, string_list, int_list

In [None]:
# Saved final model to score the individual DX strings with, and the csv the scores are written to.
path_to_final_model = '../models/20210908-160537_conv_gender/fold_6_final.h5'
output_file = '../models/20210908-160537_conv_gender/dx_predictions.csv'
model_paths = [path_to_final_model]

In [None]:
# Score every DX string and write one csv row per string: the string, its prediction and its token count.
predicts, dx_strings, ints = test_dx(model_paths, token_path)
with open(output_file, 'w', newline='\n', encoding="ISO-8859-1") as csvfile:
    record_writer = csv.writer(csvfile, delimiter=',')
    attribute_names = ['dx', 'prediction', 'length of string']
    record_writer.writerow(attribute_names)
    for i, dx_string in enumerate(dx_strings):
        row = [dx_string, predicts[i], len(ints[i])]
        record_writer.writerow(row)

## Ranking the individual terms for each detected true positive case

In [None]:
# Final model and its per-patient prediction file for the term-ranking analysis below.
path_to_final_model = '../models/20210908-160537_conv_gender/fold_6_final.h5'
path_to_final_test = '../models/20210908-160537_conv_gender/fold_6_final_8_val.csv'
final_test = pd.read_csv(path_to_final_test)
final_test.head()
# Rows whose prediction is at or above the 0.84 threshold.
over_threshold = final_test[final_test['prediction'] >= .84]

In [None]:
# Detected cases: rows over the threshold whose actual label is 1, highest prediction first.
true_positives = over_threshold[over_threshold['actual'] == 1]
print(len(true_positives))
true_positives = true_positives.sort_values(by=['prediction'], ascending=False)
# Position of each detected patient in the unfiltered Noonan data. Ids are stored as strings in
# n_patient_list, hence str(). The comprehension keeps the loop variable local, so the builtin id()
# is no longer shadowed in the notebook's global namespace.
patient_indices = [
    n_patient_list.index(str(patient_id))
    for patient_id in true_positives['patient']
]
detected_patients = n_array[patient_indices]
detected_patients_ints = list(np.array(n_int_list, dtype=object)[patient_indices])
detected_patients_features = pad_sequences(detected_patients_ints, maxlen=1000, padding='post', truncating='post')

#### Uncomment the block below to check false positives

In [None]:
# false_positives = over_threshold[over_threshold['actual'] == 0]
# print(len(false_positives))
# false_positives = false_positives.sort_values(by=['prediction'], ascending=False)
# patient_indices = []
# for id in false_positives['patient']:
#     patient_indices.append(nn_patient_list.index(str(id)))
# detected_patients = nn_array[patient_indices]
# detected_patients_ints = list(np.array(nn_int_list, dtype=object)[patient_indices])
# detected_patients_features = pad_sequences(detected_patients_ints, maxlen=1000, padding='post', truncating='post')

In [None]:
# Score the ablated strings with the saved final model. Previously this cell used whatever `model`
# happened to be left in the kernel, while path_to_final_model was defined but never loaded.
model = load_model(path_to_final_model)

# For every detected patient build one string per removed term: the full string ("None" removed) and
# one string for each distinct term with all occurrences of that term left out.
string_list = []
removed_list = []
patient_id_list = []
for patient in detected_patients:
    patient_features = np.array(patient)
    dx_names = patient_features[:, 3]
    # dict.fromkeys de-duplicates in first-seen order; iterating a set of strings gives an order that
    # is not stable between interpreter runs.
    uniques = ["None"] + list(dict.fromkeys(dx_names))
    patient_id = patient_features[0, 0]
    patient_id_list.extend([patient_id] * len(uniques))
    removed_list.extend(uniques)
    # Column 4 of the first row opens every string (presumably the gender — confirm against sort_data).
    prefix = patient_features[0][4] + " lineend "
    for item in uniques:
        kept_terms = "".join(dx_name + " lineend " for dx_name in dx_names if dx_name != item)
        string_list.append(prefix + kept_terms)
int_list, _indx_words, _word_indx, _token_path = string_to_ints(string_list, token_path)
new_features = pad_sequences(int_list, maxlen=1000, padding='post', truncating='post')

output = model.predict(new_features, verbose=0)
new_prediction = output

In [None]:
# Flatten the predictions to one value per ablated string, then compare each with the patient's
# original prediction (first matching row in true_positives), rounded to 5 decimals.
new_prediction_list = new_prediction.reshape(len(patient_id_list))
differences = []
for patient_id, new_value in zip(patient_id_list, new_prediction_list):
    patient_rows = true_positives[true_positives['patient'] == int(patient_id)]
    original_prediction = patient_rows['prediction'].values[0]
    difference = round(new_value - original_prediction, 5)
    differences.append(difference)

In [None]:
# One row per (patient, removed term). Sorting ascending by the difference puts, within each patient,
# the terms whose removal lowers the prediction the most first.
removed_df = pd.DataFrame(list(zip(patient_id_list, removed_list, new_prediction_list, differences)), 
                          columns=['patient_id', 'term_removed', 'prediction_post_removal', 'difference_in_prediction_score'])
removed_df = removed_df.sort_values(by=['patient_id', 'difference_in_prediction_score'], ascending=True)

### Saves removed_df to a csv

In [None]:
# Write the table without the DataFrame index column.
removed_df.to_csv(index=False, path_or_buf='../models/20210908-160537_conv_gender/removed2.csv')

## Biobank Samples

In [None]:
def custom_sort_data(path, num_examples=None):
    """
    Groups the rows of a csv file by patient and builds one description string per patient

    Consecutive rows with the same value in column 0 are treated as one patient. The description string is
    the gender (column 5 of the patient's first row) followed by every dx name (column 3), each followed by
    " lineend ". Blank rows are skipped, and a file without data rows yields empty results.
    Args:
        path - (string) Path to csv file
        num_examples - (int) Number of samples to load; None loads everything. The first patient is always
                       loaded, so values below 1 still return one patient.
    Returns:
        features - (np.ndarray) numpy array containing all the information of each patient
        attribute_names - (np.ndarray) numpy array containing the headers
        string_list - (list) list of strings descriptions
        patient_list - (list) list of patient ids
    """
    data = []
    string_list = []
    patient_list = []
    prev_id = None
    new_patient = None
    patient_dx_string = None
    patients_loaded = 0
    with open(path, newline='\n', encoding="ISO-8859-1") as csvfile:
        record_reader = csv.reader(csvfile, delimiter=',')
        # A completely empty file gives an empty header instead of raising StopIteration.
        attribute_names = next(record_reader, [])
        for row in record_reader:
            if not row:
                # csv.reader yields [] for blank lines; they carry no patient data.
                continue
            row_id = row[0]
            dx_name = row[3]
            gender = row[5]
            if prev_id is None:
                new_patient = []
                patient_dx_string = gender + " lineend "
                patients_loaded += 1
            elif prev_id != row_id:
                # A new patient starts here: stop if the limit is reached, otherwise store the previous one.
                if num_examples is not None and patients_loaded >= num_examples:
                    break
                data.append(new_patient)
                string_list.append(patient_dx_string)
                patient_list.append(prev_id)
                new_patient = []
                patient_dx_string = gender + " lineend "
                patients_loaded += 1
            new_patient.append(row)
            patient_dx_string = patient_dx_string + dx_name + " lineend "
            prev_id = row_id
        # Store the patient that was still being collected, if any row was read at all.
        if new_patient is not None:
            data.append(new_patient)
            string_list.append(patient_dx_string)
            patient_list.append(prev_id)
    print('{file} read'.format(file=path))
    return np.array(data, dtype=object), np.array(attribute_names), string_list, patient_list


In [None]:
# Model(s) and patient file used for the biobank predictions.
model_paths = ['../models/final_model/fold_6_final.h5']
patient_path = '../data/1275_BTM_non_noonan_r3.csv'
# model_path is not assigned until a later cell, so load the single entry of model_paths instead of
# relying on a name that does not exist yet.
model = load_model(model_paths[0])

In [None]:
def make_predict(model_paths, patient_path, token_path, training_length=1000):
    """
    Predicts every patient of a csv file with one or more saved models and averages the results
    Args:
        model_paths - (list) List of model file paths; must not be empty
        patient_path - (string) Path to the patient csv file (read with custom_sort_data)
        token_path - (string) Path to the token file
        training_length - (int) Length the tokenized sequences are padded/truncated to
    Returns:
        prediction - (np.ndarray) One prediction per patient, averaged over all models
        patient_list - (list) list of patient ids
        string_list - (list) list of patient description strings
        int_list - (list) string_list tokenized
    Raises:
        ValueError - if model_paths is empty
    """
    model_paths = list(model_paths)
    # Fail before any file is read; previously an empty list only failed at the final average.
    if not model_paths:
        raise ValueError("model_paths must contain at least one model file path")

    # load dataset
    _patient_array, _attribute_names, string_list, patient_list = custom_sort_data(patient_path)
    int_list, _indx_words, _word_indx, _token_path = string_to_ints(string_list, token_path)
    features = pad_sequences(int_list, maxlen=training_length, padding='post', truncating='post')

    # One column of predictions per model, joined once after the loop.
    outputs = []
    for model_path in model_paths:
        model = load_model(model_path)
        outputs.append(model.predict(features, verbose=0))
        print("prediction done for %s " %(model_path))
    prediction = np.average(np.concatenate(outputs, axis=1), axis=1)
    return prediction, patient_list, string_list, int_list

In [None]:
# Predict every patient in patient_path with the model(s) listed in model_paths.
predicts, patients, strings, ints = make_predict(model_paths, patient_path, token_path)

In [None]:
# One row per patient, highest prediction first.
predicted_df = pd.DataFrame(list(zip(patients, predicts, strings, ints)), 
                          columns= ['patient_id', 'prediction', 'string', 'ints'])
predicted_df = predicted_df.sort_values(by=['prediction'], ascending=False)

In [None]:
# Patients whose prediction is at or above the 0.84 threshold.
over_threshold = predicted_df[predicted_df['prediction'] >= .84]

In [None]:
# Display the flagged patients (cell output).
over_threshold

In [None]:
# Lower-case keywords searched for in the diagnosis terms of the flagged patients.
of_interest = ['charge', 'alagille', 'williams', 'kabuki', 'syndrome']

In [None]:
# Replaces the longer keyword list from the previous cell when both cells are run in order.
of_interest = ['syndrome']

In [None]:
patient_strings = list(over_threshold['string'])
list_id = list(over_threshold['patient_id'])
# For each flagged patient, the set of distinct 'lineend'-separated terms that contain any keyword of
# interest (case-insensitive substring match). Patients without a match get an empty set.
contains_syndrome = [
    {
        term
        for term in patient_string.split('lineend')
        if any(keyword in term.lower() for keyword in of_interest)
    }
    for patient_string in patient_strings
]

In [None]:
# Print every flagged patient that has at least one matching term, then the number of such patients.
count = 0
for patient_id, syndrome_string in zip(list_id, contains_syndrome):
    if not syndrome_string:
        continue
    count += 1
    print(patient_id)
    print(syndrome_string)
print(count)

In [None]:
# Destination csv for the per-patient predictions.
output_file = '../models/final_model/patient_predictions.csv'

In [None]:
# over_threshold_trimmed = over_threshold.drop('ints', 1)
# over_threshold_trimmed.to_csv(index=False, path_or_buf=output_file)

In [None]:
# Keep only the patient id and prediction columns for export. The columns are named explicitly because
# passing the axis positionally (drop('string', 1)) is rejected by pandas 2.x.
predicted_df_trimmed = predicted_df.drop(columns=['string', 'ints'])

In [None]:
# Display only: the sorted frame is not assigned, so predicted_df_trimmed itself is unchanged.
predicted_df_trimmed.sort_values(by=['prediction'], ascending=False)

In [None]:
# Write the trimmed predictions without the DataFrame index column.
predicted_df_trimmed.to_csv(index=False, path_or_buf=output_file)

### Loading samples from ID

In [None]:
# Table of patient ids to evaluate; later cells read its "label" and "pid" columns.
id_path = '../data/pseudo_prospective_result.csv'
id_df = pd.read_csv(id_path)

In [None]:
# Split the id table by label (1 and 0).
noonan_id_df = id_df[id_df["label"] == 1]
non_noonan_id_df = id_df[id_df["label"] == 0]

In [None]:
# Display the label-0 rows (cell output).
non_noonan_id_df

In [None]:
# Plain-list copies of the filtered, shuffled features and patient ids of both cohorts.
n_features_list = list(n_features)
nn_features_list = list(nn_features)
n_patients_list = list(n_patient_array)
nn_patients_list = list(nn_patient_array)

In [None]:
# Pair every patient id with its token sequence.
noonan_zipped = list(zip(n_patients_list, n_features_list))
non_noonan_zipped = list(zip(nn_patients_list, nn_features_list))

In [None]:
df_noonan = pd.DataFrame(noonan_zipped, columns=["pid", "sequence"])
df_non_noonan = pd.DataFrame(non_noonan_zipped, columns=["pid", "sequence"])
# Make pid numeric so it can be merged with id_df on "pid" (presumably numeric there — confirm).
df_noonan['pid'] = pd.to_numeric(df_noonan["pid"])
df_non_noonan['pid'] = pd.to_numeric(df_non_noonan["pid"])
print("converted to ints")

In [None]:
# Display the Noonan sequences (cell output).
df_noonan

In [None]:
# Attach the token sequence to every id that also appears in df_non_noonan (default merge on "pid").
# NOTE(review): only the non-Noonan sequences are merged; df_noonan and noonan_id_df are not used
# here — confirm this is intended.
merged_df = id_df.merge(df_non_noonan, on="pid")
merged_df

### import model

In [None]:
# Saved final model used to score the merged patients.
model_path = "../models/20210908-160537_conv_gender/fold_6_final.h5"

In [None]:
# Row-aligned lists taken from the merged table.
int_list = merged_df["sequence"].to_list()
labels = merged_df["label"].to_list()
patients = merged_df["pid"].to_list()

In [None]:
# Pad or truncate every sequence at its end to exactly MAX_SEQUENCE_LENGTH tokens.
MAX_SEQUENCE_LENGTH = 1000
padded_sequences = pad_sequences(int_list, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

In [None]:
# Load the saved model from model_path.
model = load_model(model_path)

In [None]:
# Predictions for the padded sequences; indexed below as output[i][0].
output = model.predict(padded_sequences)

In [None]:
# Build the precision/recall table with the external helper and save it as csv.
analysis = calculate_pr_recall(output, labels, threshold=.01)
analysisdir = "../models/20210908-160537_conv_gender/prelim_pr.csv"
save_chart(analysis, analysisdir)

In [None]:
# Write one csv row per patient: prediction, actual label and patient id.
valdir = "../models/20210908-160537_conv_gender/prelim_val.csv"
with open(valdir, 'w', newline='\n', encoding="ISO-8859-1") as csvfile:
    record_writer = csv.writer(csvfile, delimiter=',')
    attribute_names = ['prediction', 'actual', 'patient']
    record_writer.writerow(attribute_names)
    for i, prediction in enumerate(output):
        row = [prediction[0], labels[i], patients[i]]
        record_writer.writerow(row)

In [None]:
# Plot precision/recall from the csv at valdir into a pdf.
chartdir = "../models/20210908-160537_conv_gender/prelim_pr.pdf"
plot_pr_recall(valdir, chartdir)