# Tutorial for freezing the Prosit Model
This tutorial shows, how to freeze the Prosit Intensity Predictor model and only let the first and last layer remain trainable for refinement and transfer learning.

### Imports

In [1]:
import dlomix
from dlomix.models import PrositIntensityPredictor
import tensorflow as tf
import yaml
from dlomix.losses import masked_spectral_distance, masked_pearson_correlation_distance

2024-07-18 10:43:49.486426: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-18 10:43:49.486483: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-18 10:43:49.487966: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-18 10:43:49.495873: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm



Avaliable feature extractors are (use the key of the following dict and pass it to features_to_extract in the Dataset Class):
{
   "atom_count": "Atom count of PTM.",
   "delta_mass": "Delta mass of PTM.",
   "mod_gain": "Gain of atoms due to PTM.",
   "mod_loss": "Loss of atoms due to PTM.",
   "red_smiles": "Reduced SMILES representation of PTM."
}.
When writing your own feature extractor, you can either
    (1) use the FeatureExtractor class or
    (2) write a function that can be mapped to the Hugging Face dataset.
In both cases, you can access the parsed sequence information from the dataset using the following keys, which all provide python lists:
    - _parsed_sequence: parsed sequence
    - _n_term_mods: N-terminal modifications
    - _c_term_mods: C-terminal modifications



### Load a pretrained model

In [2]:
model_path = "/cmnfs/proj/bmpc_dlomix/models/baseline_models/noptm_baseline_full_bs1024/"
model = tf.keras.models.load_model(model_path + "85c6c918-4a2a-42e5-aab1-e666121c69a6.keras")
model.summary()

2024-07-18 10:43:55.436963: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NOT_INITIALIZED: initialization error
2024-07-18 10:43:55.437025: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:129] retrieving CUDA diagnostic information for host: minotaur.exbio.wzw.tum.de
2024-07-18 10:43:55.437038: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:136] hostname: minotaur.exbio.wzw.tum.de
2024-07-18 10:43:55.437217: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:159] libcuda reported version is: 550.90.7
2024-07-18 10:43:55.437262: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:163] kernel reported version is: 550.90.7
2024-07-18 10:43:55.437272: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:241] kernel version seems to match DSO: 550.90.7


Model: "prosit_intensity_predictor"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  928       
                                                                 
 sequential (Sequential)     (None, 30, 512)           1996800   
                                                                 
 sequential_1 (Sequential)   multiple                  4608      
                                                                 
 sequential_2 (Sequential)   (None, 29, 512)           1576806   
                                                                 
 encoder_att (AttentionLaye  multiple                  542       
 r)                                                              
                                                                 
 sequential_3 (Sequential)   multiple                  0         
                                        

##### Initialize the optimizer 

In [3]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

### Freeze the model

In [4]:
# function to freeze all layers except first and/or last layer
def freeze_model(model:dlomix.models.prosit.PrositIntensityPredictor,optimizer:tf.keras.optimizers, trainable_first_layer:bool = False, trainable_last_layer:bool = False, loss:dlomix.losses=masked_spectral_distance, metrics:list=[masked_pearson_correlation_distance]) -> None:
    ''' Freezes all layers of a PrositIntensityPredictor and keep first and/or last layer trainable.

    First setting the whole model to trainable because this attribute overshadows the trainable attribute of every sublayer.
    Then iterating through all sublayers and sets the trainable attribute of every layer to 'False', model is now frozen.
    Next, setting the trainable attribute of either the first embedding layer or the last time density layer to trainable.
    Finally, compile the model with the optimizer, loss, and metrics to make the changes take effect.

    Parameter
    ---------
    model                   : dlomix.models.prosit.PrositIntensityPredictor
                              The model to be frozen.
    optimizer               : tf.keras.optimizers
                              The optimizer is needed for compiling the model.
    trainable_first_layer   : bool
                              Whether the first layer should remain trainable.
    trainable_last_layer    : bool
                              Whether the last layer should remain trainable
    loss                    : dlomix.losses
                              The loss for compiling the model. 
                              default: masked_spectral_distance
    metrics                 : list[dlomix.losses]
                              The metrics for compiling the model.
                              default: [masked_pearson_correlation_distance] 
    --------

    '''

    model.trainable = True 
    for lay in model.layers:
        try:
            for sublay in lay.layers:
                sublay.trainable = False
        except (AttributeError):
            lay.trainable = False

    if (trainable_first_layer):
        first_layer = model.get_layer(name="embedding")
        first_layer.trainable = True

    if (trainable_last_layer):
        last_layer = model.regressor.get_layer(name = "time_dense")
        last_layer.trainable = True

    model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=metrics
    )

In [5]:
# function to print the trainable attribute of every layer
def check_trainability(model, sublayers = False):
    for lay in model.layers:
        if(sublayers):
            print()
            try:
                lay.layers
                print(f'Layer: {lay.name}, Trainable: {lay.trainable}')
                for lay2 in lay.layers:
                    print(f"Layer: {lay2.name}, Trainable: {lay2.trainable}")
            except(AttributeError):
                print(f'Layer: {lay.name}, Trainable: {lay.trainable}')
        else:
            print(f'Layer: {lay.name}, Trainable: {lay.trainable}')


*Freeze all layers except the first and the last layer:*

In [6]:
freeze_model(model, optimizer, trainable_first_layer=True, trainable_last_layer=True)
check_trainability(model, sublayers=True)


Layer: embedding, Trainable: True

Layer: sequential, Trainable: True
Layer: bidirectional, Trainable: False
Layer: dropout, Trainable: False
Layer: gru_1, Trainable: False
Layer: dropout_1, Trainable: False

Layer: sequential_1, Trainable: True
Layer: meta_in, Trainable: False
Layer: meta_dense, Trainable: False
Layer: meta_dense_do, Trainable: False

Layer: sequential_2, Trainable: True
Layer: decoder, Trainable: False
Layer: dropout_2, Trainable: False
Layer: decoder_attention_layer, Trainable: False

Layer: encoder_att, Trainable: False

Layer: sequential_3, Trainable: True
Layer: add_meta, Trainable: False
Layer: repeat, Trainable: False

Layer: sequential_4, Trainable: True
Layer: time_dense, Trainable: True
Layer: activation, Trainable: False
Layer: out, Trainable: False


### Prepare everything for training

##### Load the dataset and the PTM alphabet

In [7]:
from dlomix.data import load_processed_dataset
dataset = load_processed_dataset("/cmnfs/proj/bmpc_dlomix/datasets/processed/noptm_baseline_small_bs1024")

### Continue Training with frozen layers

In [8]:
freeze_model(model,optimizer, trainable_first_layer=True, trainable_last_layer=True)
check_trainability(model, sublayers=True)


Layer: embedding, Trainable: True

Layer: sequential, Trainable: True
Layer: bidirectional, Trainable: False
Layer: dropout, Trainable: False
Layer: gru_1, Trainable: False
Layer: dropout_1, Trainable: False

Layer: sequential_1, Trainable: True
Layer: meta_in, Trainable: False
Layer: meta_dense, Trainable: False
Layer: meta_dense_do, Trainable: False

Layer: sequential_2, Trainable: True
Layer: decoder, Trainable: False
Layer: dropout_2, Trainable: False
Layer: decoder_attention_layer, Trainable: False

Layer: encoder_att, Trainable: False

Layer: sequential_3, Trainable: True
Layer: add_meta, Trainable: False
Layer: repeat, Trainable: False

Layer: sequential_4, Trainable: True
Layer: time_dense, Trainable: True
Layer: activation, Trainable: False
Layer: out, Trainable: False


In [9]:
original_weights = model.get_weights()

In [10]:
# train again while only the first layer and the last layer are trainable
model.fit(
    dataset.tensor_train_data,
    validation_data=dataset.tensor_val_data,
    epochs=1
)



<keras.src.callbacks.History at 0x7f98081827a0>

In [11]:
# check which weights have changed
retrained_weights = model.get_weights()
for i, w in enumerate(zip(original_weights, retrained_weights)):
    print(f'weights {i} stayed the same: {(w[0]==w[1]).all()}')
    

weights 0 stayed the same: False
weights 1 stayed the same: True
weights 2 stayed the same: True
weights 3 stayed the same: True
weights 4 stayed the same: True
weights 5 stayed the same: True
weights 6 stayed the same: True
weights 7 stayed the same: True
weights 8 stayed the same: True
weights 9 stayed the same: True
weights 10 stayed the same: True
weights 11 stayed the same: True
weights 12 stayed the same: True
weights 13 stayed the same: True
weights 14 stayed the same: True
weights 15 stayed the same: True
weights 16 stayed the same: True
weights 17 stayed the same: True
weights 18 stayed the same: True
weights 19 stayed the same: False
weights 20 stayed the same: False


Two weight tensors changed for the last layer. Both tensors belong to the last time_dense layer:

In [13]:
print(retrained_weights[20]) # 6
print(len(retrained_weights[19])) # 512
print([len(x) for x in retrained_weights[19]]) # 6
print(512 * 6 + 6) # 3078
print(model.get_layer(name="sequential_4").summary())

[0.15428965 0.05034949 0.01139193 0.07319621 0.02805466 0.00192984]
512
[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 

##### Release the model again

In [13]:
def release_model(model:PrositIntensityPredictor, optimizer_config: dict = {"learning_rate":1e-4}, loss:dlomix.losses=masked_spectral_distance, metrics:list=[masked_pearson_correlation_distance]) -> None:
    '''Unfreezes all layers of a PrositIntensityPredictor model.

        Sets the trainable attribute of every layer to 'True'.
        Finally, compiles the model with the optimizer, loss, and metrics to make the changes take effect.

        Parameter
        ---------
        model                   : dlomix.models.prosit.PrositIntensityPredictor
                                The model to be unfrozen.
        optimizer_config        : dict
                                The initialization parameters for the new optimizer needed for compiling the model.
        loss                    : dlomix.losses
                                The loss for compiling the model.
                                default: masked_spectral_distance
        metrics                 : list[dlomix.losses]
                                The metrics for compiling the model.
                                default: [masked_pearson_correlation_distance] 
        --------
        '''
    model.trainable = True

    for lay in model.layers:
        try:
            for sublay in lay.layers:
                sublay.trainable = True
        except (AttributeError):
            lay.trainable = True

    new_optimizer = tf.keras.optimizers.Adam(**optimizer_config)
    model.compile(
        optimizer=new_optimizer,
        loss=loss,
        metrics=metrics
    )
     

     


In [16]:
release_model(model, optimizer_config = {"learning_rate": 1e-4})
check_trainability(model, sublayers=True)

model compiled

Layer: embedding, Trainable: True

Layer: sequential, Trainable: True
Layer: bidirectional, Trainable: True
Layer: dropout, Trainable: True
Layer: gru_1, Trainable: True
Layer: dropout_1, Trainable: True

Layer: sequential_1, Trainable: True
Layer: meta_in, Trainable: True
Layer: meta_dense, Trainable: True
Layer: meta_dense_do, Trainable: True

Layer: sequential_2, Trainable: True
Layer: decoder, Trainable: True
Layer: dropout_2, Trainable: True
Layer: decoder_attention_layer, Trainable: True

Layer: encoder_att, Trainable: True

Layer: sequential_3, Trainable: True
Layer: add_meta, Trainable: True
Layer: repeat, Trainable: True

Layer: sequential_4, Trainable: True
Layer: time_dense, Trainable: True
Layer: activation, Trainable: True
Layer: out, Trainable: True


In [17]:
weights = model.get_weights()

In [18]:
# train again while model is released again
model.fit(
    dataset.tensor_train_data,
    validation_data=dataset.tensor_val_data,
    epochs=1
)



<keras.src.callbacks.History at 0x7f7f60490b20>

In [19]:
# check which weights have changed
retrained_weights = model.get_weights()
for i, w in enumerate(zip(original_weights, retrained_weights)):
    print(f'weights {i} stayed the same: {(w[0]==w[1]).all()}')

weights 0 stayed the same: False
weights 1 stayed the same: False
weights 2 stayed the same: False
weights 3 stayed the same: False
weights 4 stayed the same: False
weights 5 stayed the same: False
weights 6 stayed the same: False
weights 7 stayed the same: False
weights 8 stayed the same: False
weights 9 stayed the same: False
weights 10 stayed the same: False
weights 11 stayed the same: False
weights 12 stayed the same: False
weights 13 stayed the same: False
weights 14 stayed the same: False
weights 15 stayed the same: False
weights 16 stayed the same: False
weights 17 stayed the same: False
weights 18 stayed the same: False
weights 19 stayed the same: False
weights 20 stayed the same: False
