In [None]:
#!python -m pip install -q git+https://github.com/wilhelm-lab/dlomix

In [None]:
#!python -m pip install wandb

# Preprocess the data

In [None]:
import pandas as pd
import re
import numpy as np
import csv

## Read the input data

In [None]:
# Locations of the two parquet inputs, relative to the notebook's working directory.
# NOTE(review): the first file name ends in "_l" (lowercase L) while the second ends in "_2";
# confirm that "_l" is not a typo for "_1".
input_file1 = "ptm/input_data/TUM_mod_citrullination_l.parquet"
input_file2 = "ptm/input_data/TUM_mod_citrullination_2.parquet"

## Create dataframe

In [None]:
# Load both parquet parts and stack them into a single frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it each
# part keeps its own row labels, so labels can repeat and later label-based
# operations (.at, .drop) become ambiguous.
tmp_data1 = pd.read_parquet(input_file1, engine='pyarrow')
tmp_data2 = pd.read_parquet(input_file2, engine='pyarrow')
data = pd.concat([tmp_data1, tmp_data2], ignore_index=True)

In [None]:
data.head(10)

## Divide data into HCD and CID

In [None]:
data_HCD = data[data['fragmentation']=='HCD']
data_CID = data[data['fragmentation']=='CID']

In [None]:
data_HCD.head(10)

## Sort by modified_sequence, then by andromeda_score from highest to lowest

In [None]:
data_HCD = data_HCD.sort_values(['modified_sequence', 'andromeda_score'], ascending=[True, False])

In [None]:
data_HCD

## Keep only the 5 highest-scoring rows for each modified_sequence

In [None]:
data_HCD_5 = data_HCD.sort_values(['modified_sequence', 'andromeda_score'], ascending=[True, False]).groupby('modified_sequence').head(5)

In [None]:
# Renumber the rows 0..n-1 so that index labels and row positions agree for the split below.
# Assigning the result (instead of inplace=True) keeps this cell re-runnable, and
# drop=True stops the old labels from being kept as an extra "index" column that
# would otherwise end up in the exported CSV files.
data_HCD_5 = data_HCD_5.reset_index(drop=True)

In [None]:
data_HCD_5

In [None]:
data_CID_5 = data_CID.sort_values(['modified_sequence', 'andromeda_score'], ascending=[True, False]).groupby('modified_sequence').head(5)

In [None]:
data_CID_5

## Calculate 10% of the dataframes to create a test set

In [None]:
ten_percent_HCD = int(len(data_HCD_5)*0.1)
ten_percent_HCD

In [None]:
ten_percent_CID = int(len(data_CID_5)*0.1)
ten_percent_CID

## Create TEST set

In [None]:
# Take roughly the first 10% of rows as the test set, but never cut through a peptide.
# data_HCD_5 is sorted by modified_sequence, so all rows of one sequence are adjacent;
# if the 10% mark falls inside such a run, the cut is moved to the end of that run so
# the same modified_sequence cannot land in both TEST and TRAIN_VAL.
sequences_HCD = data_HCD_5['modified_sequence'].to_numpy()
test_size_HCD = ten_percent_HCD
if 0 < test_size_HCD < len(sequences_HCD):
    last_test_sequence = sequences_HCD[test_size_HCD - 1]
    # position just past the last row that carries this sequence
    test_size_HCD = int(np.flatnonzero(sequences_HCD == last_test_sequence)[-1]) + 1
# .copy() so that later in-place edits of test_HCD do not act on a view of data_HCD_5
test_HCD = data_HCD_5.head(test_size_HCD).copy()
test_HCD

## Create TRAIN_VAL set

In [None]:
index_test_HCD = test_HCD.index
index_test_HCD

In [None]:
# Everything that is not in the test set becomes TRAIN_VAL.
# The test rows are dropped by their index labels directly; routing the labels through
# data_HCD_5.index[...] would treat them as positions, which is only correct when the
# index happens to be exactly 0..n-1.
train_val_HCD = data_HCD_5.drop(index=index_test_HCD)
train_val_HCD

## Check if TEST and TRAIN_VAL have intersecting values

## TODO: the maximal overlap is 5 rows. Move the overlapping sequence into the set that already holds the rest of the rows for that sequence.

In [None]:
pd.Series(list(set(train_val_HCD['modified_sequence']).intersection(set(test_HCD['modified_sequence']))))

## Change modification e.g R[UNIMOD:7] to r

In [None]:
def changeMod(input_data):
    """Rewrite bracketed modification tags in the ``modified_sequence`` column.

    Every residue that is directly followed by a ``[...]`` tag is rewritten:
    ``M[...]`` becomes ``M(ox)`` and any other residue becomes its lowercase
    letter (e.g. ``R[UNIMOD:7]`` -> ``r``). All modifications of a sequence are
    converted, not only the last one. A tag that has no residue letter in front
    of it (e.g. at the very start of the string) is simply removed.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame with a ``modified_sequence`` column. The column is overwritten
        in place; values that are not strings are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``input_data`` object, for call-and-assign convenience.
    """
    residue_with_tag = re.compile(r'([A-Za-z])\[.*?\]')
    leftover_tag = re.compile(r'\[.*?\]')

    def _replace_residue(match):
        residue = match.group(1)
        # Any tagged methionine is written as oxidised methionine; every other
        # tagged residue is marked by lowercasing its letter.
        return 'M(ox)' if residue == 'M' else residue.lower()

    def _rewrite(sequence):
        if not isinstance(sequence, str) or '[' not in sequence:
            return sequence
        rewritten = residue_with_tag.sub(_replace_residue, sequence)
        return leftover_tag.sub('', rewritten)

    # One pass over the column instead of a row-by-row iterrows()/.at loop.
    input_data['modified_sequence'] = input_data['modified_sequence'].map(_rewrite)
    return input_data
    
def find(s, ch):
    """Return, in ascending order, every position in ``s`` whose element equals ``ch``."""
    positions = []
    for position, symbol in enumerate(s):
        if symbol == ch:
            positions.append(position)
    return positions

In [None]:
# Drop peptides that carry a modification on K or Q first, while the bracketed tags
# are still present: changeMod removes every "[...]" tag, so a "K[" / "Q[" filter
# applied afterwards can no longer match anything.
# Both splits are converted so that TRAIN_VAL and TEST use the same sequence notation;
# .copy() keeps changeMod's in-place edit away from the frames they were sliced from.
unwanted_mod = r'[KQ]\['
keep_train_val = ~train_val_HCD['modified_sequence'].str.contains(unwanted_mod, na=False)
keep_test = ~test_HCD['modified_sequence'].str.contains(unwanted_mod, na=False)
train_val_HCD = changeMod(train_val_HCD[keep_train_val].copy())
test_HCD = changeMod(test_HCD[keep_test].copy())

## Remove specific modifications e.g Q, K

In [None]:
def removeChar(data,removeChar:str):
    """Drop the rows whose ``modified_sequence`` contains ``removeChar`` followed by ``[``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``modified_sequence`` column; it is not modified.
    removeChar : str
        Residue letter (used as a regular-expression fragment) whose bracketed
        occurrences should disqualify a row, e.g. ``'K'`` matches ``K[``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the matching rows, re-indexed from 0.
    """
    pattern = removeChar + "\\["
    # na=False: rows without a sequence (None/NaN) count as "no match" and are kept,
    # instead of putting NaN into the mask and breaking the ``~`` inversion below.
    has_modification = data['modified_sequence'].str.contains(pattern, na=False)
    return data[~has_modification].reset_index(drop=True)

In [None]:
# NOTE(review): removeChar looks for "K[" / "Q[", but test_HCD has already been passed
# through changeMod in an earlier cell, which removes every "[...]" tag. At this point
# the pattern therefore cannot match and these calls only reset the index; the filter
# has to run before changeMod to drop anything.
test_HCD = removeChar(test_HCD,'K')
test_HCD = removeChar(test_HCD,'Q')

## Rename columns

In [None]:
# Rename the columns of both splits in place, using one shared old-name -> new-name mapping.
column_names = {
    'intensities_raw': 'intensities',
    'modified_sequence': 'sequence',
    'collision_energy_aligned_normed': 'collision_energy',
    'precursor_charge_onehot': 'precursor_charge_onehot',
}
for split_frame in (train_val_HCD, test_HCD):
    split_frame.rename(columns=column_names, inplace=True)

## Change format of precursor_charge_onehot (and intensities) from [1 0 0 1 0] to [1, 0, 0, 1, 0]

In [None]:
# Replace each array cell by its comma-separated text form "[a, b, c]" in both splits.
# NOTE(review): np.array2string abbreviates arrays longer than NumPy's print threshold
# with "..." — confirm the stored arrays stay below that limit.
def _array_to_text(values):
    return np.array2string(values, separator=', ')

for split_frame in (train_val_HCD, test_HCD):
    for column in ('intensities', 'precursor_charge_onehot'):
        split_frame[column] = split_frame[column].apply(_array_to_text)

In [None]:
train_val_HCD

## Write TEST and TRAIN_VAL to .csv file

In [None]:
train_val_HCD

In [None]:
train_val_HCD.to_csv('ptm/output/train_val_hcd.csv', encoding='utf-8', index=False)
test_HCD.to_csv('ptm/output/test_hcd.csv', encoding='utf-8', index=False)

# Intensity Prediction

In [31]:
import numpy as np
import pandas as pd
import dlomix
from dlomix import constants, data, eval, layers, models, pipelines, reports, utils
print([x for x in dir(dlomix) if not x.startswith("_")])

[34m[1mwandb[0m: Thanks for trying out the Report API!
[34m[1mwandb[0m: For a tutorial, check out https://colab.research.google.com/drive/1CzyJx1nuOS4pdkXa2XPaRQyZdmFmLmXV
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Try out tab completion to see what's available.
[34m[1mwandb[0m:   ∟ everything:    `wr.<tab>`
[34m[1mwandb[0m:       ∟ panels:    `wr.panels.<tab>`
[34m[1mwandb[0m:       ∟ blocks:    `wr.blocks.<tab>`
[34m[1mwandb[0m:       ∟ helpers:   `wr.helpers.<tab>`
[34m[1mwandb[0m:       ∟ templates: `wr.templates.<tab>`
[34m[1mwandb[0m:       
[34m[1mwandb[0m: For bugs/feature requests, please create an issue on github: https://github.com/wandb/wandb/issues


['META_DATA', 'constants', 'data', 'eval', 'layers', 'losses', 'models', 'pipelines', 'reports', 'utils']


## 0. Import and Initialize Weights and Biases

In [1]:
import wandb
from wandb.keras import WandbCallback

In [2]:
# enter project name
project_name = 'dlomix_intensity'

## 1. Load Data

In [3]:
from dlomix.data import IntensityDataset

In [4]:
#TRAIN_DATAPATH = 'https://raw.githubusercontent.com/wilhelm-lab/dlomix-resources/tasks/intensity/example_datasets/Intensity/proteomeTools_train_val.csv'
# Read the file written by the preprocessing section, using the same relative path it
# was saved under, so the notebook does not depend on one machine's D:\ drive layout.
TRAIN_DATAPATH = 'ptm/output/train_val_hcd.csv'

BATCH_SIZE = 64

# 20% of the rows are held out for validation (val_ratio=0.2).
int_data = IntensityDataset(data_source=TRAIN_DATAPATH, seq_length=30, batch_size=BATCH_SIZE,
                            collision_energy_col='collision_energy', val_ratio=0.2, test=False)

In [None]:
"Training examples", BATCH_SIZE * len(int_data.train_data)

In [None]:
"Validation examples", BATCH_SIZE * len(int_data.val_data)

### 1.1 Load weights from HDF5 file

In [59]:
import h5py

In [60]:
# Open read-only: the following cells only inspect the stored weights, and 'r+' would
# allow the weights file to be modified by accident (and fails on read-only storage).
f = h5py.File('ptm/weights/weights_163_0.11385.hdf5', 'r')

In [61]:
# Walk the 'embedding' group stored under 'model_weights' and print the shape of every dataset in it.
group = f['model_weights']
data_file = group['embedding']
# NOTE: the loop variable reuses the name `group`, so after this cell `group` holds the
# last key yielded by data_file.keys() rather than the 'model_weights' group.
for group in data_file.keys() :
    for dset in data_file[group].keys():      
        arr = data_file[group][dset][:]
        print(arr.shape)
# `arr` is overwritten on every pass, so only the last dataset that was read is displayed.
arr

(23, 32)


array([[-1.21482657e-02, -1.65589547e-04,  2.12439969e-02,
         3.18754017e-02, -4.14248481e-02,  3.20397951e-02,
         2.74296105e-03,  1.45598641e-03, -3.54228169e-02,
        -9.24826134e-03,  2.08272427e-01,  1.87835265e-02,
         6.67113289e-02, -5.45682646e-02,  3.71077210e-02,
         1.95011846e-03,  5.13596169e-04, -8.23312551e-02,
         2.79431120e-02, -2.38665901e-02,  1.85046364e-02,
         1.72093976e-03,  4.31428887e-02, -4.19865474e-02,
        -3.72934788e-02, -2.45524086e-02, -1.23757534e-02,
        -1.85872838e-02,  4.19065133e-02, -6.21134862e-02,
         2.57181693e-02,  2.20089629e-02],
       [-1.61757041e-02, -9.22288920e-04,  2.45429222e-02,
        -4.87967581e-02,  1.10913478e-02,  2.60036271e-02,
        -1.09062074e-02,  4.08430491e-03,  3.70002575e-02,
         1.69252027e-02,  6.04478866e-02, -2.70478576e-02,
        -3.31291393e-03,  1.03691155e-02, -2.65134834e-02,
        -2.64446135e-04, -1.47339585e-03,  1.25516713e-01,
        -2.95

In [62]:
f.close()

## 2. Model

In [9]:
from dlomix.models import PrositIntensityPredictor
import keras
from keras import initializers
from keras.layers import Embedding

In [10]:
model = PrositIntensityPredictor(seq_length=30)

In [None]:
save_path = "./output/rtmodel.hdf5"
#model.build(input_shape = (22,16,30))
model.load_weights('ptm/weights/weights_163_0.11385.hdf5')

In [64]:
rand_uni_weights = np.random.uniform(-1, 1, size=(24, 16))
rand_uni_weights.shape

(24, 16)

In [None]:
embedding_layer = Embedding(
    24,
    16,
    trainable=True,
)
embedding_layer.build((1,))
embedding_layer.set_weights([rand_uni_weights])

In [None]:
# Swap in the freshly initialised embedding layer. The attribute must be spelled
# `embedding`: the misspelled `emdedding` only created a new, unused attribute and
# left the model's real embedding layer unchanged.
model.embedding = embedding_layer

In [None]:
#model.layers[1] = embedding_layer

In [None]:
model.layers[8].get_weights()[0]

In [None]:
for layer in model.layers:
    print(layer.name)

### 3.2 Create random uniform weight matrix

In [None]:
rand_uni_weights = np.random.uniform(-1, 1, size=(24, 16))
rand_uni_weights.shape

In [None]:
rand_uni_weights

In [None]:
model.layers[1].set_weights([rand_uni_weights])    

In [None]:
model.layers[1].get_weights()[0]

In [None]:
for layer in model.layers:
    print(layer.name, layer.inbound_nodes, layer.outbound_nodes)

### 3.3 assign weight matrix to embedding layer

In [65]:
# NOTE(review): the recorded output of this cell is a ValueError stating that the weights
# of a sub-model have not been created yet (the model was never built or called on inputs).
# Model.set_weights also expects one array per model weight, not a single embedding
# matrix — presumably this was meant to target only the embedding layer; confirm.
model.set_weights([rand_uni_weights])

ValueError: Weights for model 'sequential' have not yet been created. Weights are created when the model is first called on inputs or `build()` is called with an `input_shape`.

## 3. Training

In [None]:
import tensorflow as tf
from dlomix.losses import masked_spectral_distance, masked_pearson_correlation_distance
tf.get_logger().setLevel('ERROR')

In [None]:
# create the optimizer object (Adam with a learning rate of 0.001)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# compile the model with the optimizer, the masked spectral distance as loss, and
# MSE plus the masked Pearson correlation distance as additional metrics
model.compile(optimizer=optimizer,
              loss=masked_spectral_distance,
              metrics=['mse',masked_pearson_correlation_distance])

### 3.1 train model from scratch

In [None]:
wandb.init(project=project_name, name='walktrough')
history = model.fit(int_data.train_data,
                    validation_data=int_data.val_data,
                    epochs=2,callbacks=[WandbCallback(save_model=False)])

In [None]:
# Mark the run as finished
wandb.finish()

### 3.4 save model

In [None]:
save_path = "./output/rtmodel.hdf5"
model.save_weights(save_path)

In [None]:
# Rebuild the same architecture that was trained above and load the saved weights into it.
# RetentionTimePredictor is not imported anywhere in the visible notebook (NameError) and
# is not the model class whose weights were saved.
# NOTE(review): Keras may refuse to load HDF5 weights into a subclassed model that has not
# been built or called yet — if this raises, run the model on one batch before load_weights.
trained_model = PrositIntensityPredictor(seq_length=30)
trained_model.load_weights(save_path)

## 4. Testing and Reporting

In [None]:
# create the dataset object for test data

#TEST_DATAPATH = 'https://raw.githubusercontent.com/wilhelm-lab/dlomix-resources/tasks/intensity/example_datasets/Intensity/proteomeTools_test.csv'
# Read the test split from the same relative path the preprocessing section wrote it to,
# instead of an absolute path that only exists on one machine.
TEST_DATAPATH = 'ptm/output/test_hcd.csv'

test_int_data = IntensityDataset(data_source=TEST_DATAPATH,
                              seq_length=30, collision_energy_col='collision_energy',batch_size=32, test=True)

In [None]:
# use model.predict from keras directly on the testdata

predictions = model.predict(test_int_data.test_data)

In [None]:
from dlomix.reports import IntensityReport

# create a report object by passing the history object and plot different metrics
report = IntensityReport(output_path="./output", history=history)

In [None]:
# you can generate a complete report for intensity by calling generate_report
# the function takes the test dataset object and the predictions as arguments

report.generate_report(test_int_data, predictions)

In [None]:
# you can also manually see the results by calling other utility functions
from dlomix.reports.postprocessing import normalize_intensity_predictions


predictions_df = report.generate_intensity_results_df(test_int_data, predictions)
predictions_df.head()

In [None]:
predictions_acc = normalize_intensity_predictions(predictions_df)
predictions_acc.head()

In [None]:
predictions_acc['spectral_angle'].describe()

In [None]:
import seaborn as sns

sns.violinplot(predictions_acc['spectral_angle'])