In [None]:
import numpy as np 
import pandas as pd
# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import peptides

In [None]:
def create_dataframe(dictionary):
    """Build one feature table from a mapping of sequence -> feature values.

    Parameters
    ----------
    dictionary : mapping
        Any object exposing ``.items()`` that yields ``(sequence, values)``
        pairs, where ``values`` is an iterable of per-sequence feature values.

    Returns
    -------
    pandas.DataFrame
        One row per sequence: a ``"Sequence"`` column followed by
        ``"Value_1"``, ``"Value_2"``, ... in the order the values were
        enumerated. Rows follow the mapping's iteration order and carry a
        fresh 0..n-1 index. An empty mapping yields an empty frame that only
        has the ``"Sequence"`` column.
    """
    # Collect plain records and construct the frame once. Creating a one-row
    # DataFrame per sequence and concatenating them is much slower, and
    # pd.concat raises ValueError when there is nothing to concatenate.
    records = [
        {"Sequence": sequence, **{f"Value_{i+1}": v for i, v in enumerate(values)}}
        for sequence, values in dictionary.items()
    ]
    if not records:
        return pd.DataFrame(columns=["Sequence"])
    return pd.DataFrame(records)

In [None]:
def get_scaler(train_df):
    """Fit a min-max scaler on the feature columns of a training table.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Table with a ``"Sequence"`` column; every other column is passed to
        the scaler as a feature.

    Returns
    -------
    sklearn.preprocessing.MinMaxScaler
        Scaler created with ``feature_range=(-1, 1)`` and fitted on all
        non-``"Sequence"`` columns.

    Raises
    ------
    KeyError
        If ``train_df`` has no ``"Sequence"`` column (raised by ``drop``).
    """
    feature_values = train_df.drop(columns=["Sequence"]).to_numpy()
    # fit() hands back the scaler instance, so the fitted object is returned directly.
    return MinMaxScaler(feature_range=(-1, 1)).fit(feature_values)

In [None]:
def scale(df, scaler):
    """Apply a fitted scaler to the feature columns of a sequence table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``"Sequence"`` column; all other columns are scaled.
    scaler : object
        Fitted transformer exposing ``transform(ndarray) -> ndarray`` that
        returns one output column per input column.

    Returns
    -------
    pandas.DataFrame
        ``"Sequence"`` column first, followed by the scaled feature columns
        under their original names. The row index of ``df`` is preserved.

    Raises
    ------
    KeyError
        If ``df`` has no ``"Sequence"`` column.
    """
    seq_colmn = df[["Sequence"]]
    num_colmns = df.drop(columns=["Sequence"])
    scaled_num_data = scaler.transform(num_colmns.to_numpy())
    # Carry the input index over to the scaled values. With a default
    # RangeIndex on one side only, concat(axis=1) aligns on labels and
    # produces NaN-padded rows whenever df's index is not exactly 0..n-1.
    scaled_numerical_df = pd.DataFrame(
        scaled_num_data,
        columns=num_colmns.columns,
        index=num_colmns.index,
    )
    return pd.concat([seq_colmn, scaled_numerical_df], axis=1)

In [None]:
def dataframe_to_sequence_dict(df):
    """Convert a sequence table into a ``{sequence: float32 feature vector}`` dict.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``"Sequence"`` column; every other column must be
        convertible to ``float32``.

    Returns
    -------
    dict
        Maps each value of the ``"Sequence"`` column to a 1-D ``np.float32``
        array holding that row's remaining columns, in column order. If a
        sequence occurs more than once, the last row wins. The arrays are
        rows of one shared 2-D buffer.

    Raises
    ------
    KeyError
        If ``df`` has no ``"Sequence"`` column.
    """
    # Select features by column name rather than by position, so the result
    # does not depend on "Sequence" being the first column, and convert the
    # whole block once instead of building an object-dtype Series per row.
    features = np.ascontiguousarray(
        df.drop(columns=["Sequence"]).to_numpy(dtype=np.float32)
    )
    return dict(zip(df["Sequence"], features))

In [None]:
# Each setting keeps a value that already exists in the namespace and only
# falls back to the default below when the name has not been defined yet.
if 'chain' not in locals():
    chain = "paired"

if 'precision' not in locals():
    precision = "gene"

if 'base_path' not in locals():
    base_path = "../data/physicoProperties"

print(f"scale physico for {chain} {precision}")

In [None]:
def _load_physico(split, molecule):
    """Read one ``{split}_{chain}_{molecule}_{precision}_physico.npz`` archive into a dict.

    The archive is opened in a ``with`` block so its file handle is closed as
    soon as every member array has been read; the returned plain dict still
    offers ``.items()`` for the DataFrame-building step.
    """
    path = f"{base_path}/{split}_{chain}_{molecule}_{precision}_physico.npz"
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects,
    # which can execute code. Keep it only if the archives really contain
    # object arrays and come from a trusted source.
    with np.load(path, allow_pickle=True) as archive:
        return {key: archive[key] for key in archive.files}


train_epitope_physico = _load_physico("train", "epitope")
train_trb_physico = _load_physico("train", "TRB")

test_epitope_physico = _load_physico("test", "epitope")
test_trb_physico = _load_physico("test", "TRB")

validation_epitope_physico = _load_physico("validation", "epitope")
validation_trb_physico = _load_physico("validation", "TRB")

# TRA archives are only read in paired mode, where {chain} expands to "paired"
# and the path therefore matches the "<split>_paired_TRA_..." file names.
if chain == 'paired':
    train_tra_physico = _load_physico("train", "TRA")
    test_tra_physico = _load_physico("test", "TRA")
    validation_tra_physico = _load_physico("validation", "TRA")

In [None]:
# Epitope and TRB tables are built for every chain mode.
train_epitope_physico_df = create_dataframe(train_epitope_physico)
train_trb_physico_df = create_dataframe(train_trb_physico)

test_epitope_physico_df = create_dataframe(test_epitope_physico)
test_trb_physico_df = create_dataframe(test_trb_physico)

validation_epitope_physico_df = create_dataframe(validation_epitope_physico)
validation_trb_physico_df = create_dataframe(validation_trb_physico)

# TRA tables are only built in paired mode.
if chain == 'paired':
    train_tra_physico_df = create_dataframe(train_tra_physico)
    test_tra_physico_df = create_dataframe(test_tra_physico)
    validation_tra_physico_df = create_dataframe(validation_tra_physico)


In [None]:
# Show the unscaled training epitope feature table as this cell's output.
train_epitope_physico_df

In [None]:
# Show the unscaled training TRB feature table as this cell's output.
train_trb_physico_df

In [None]:
# One scaler per molecule type, each fitted on its training table only.
epitope_scaler = get_scaler(train_epitope_physico_df)
trb_scaler = get_scaler(train_trb_physico_df)

# The TRA scaler is only fitted in paired mode.
if chain == 'paired':
    tra_scaler = get_scaler(train_tra_physico_df)


In [None]:
# Number of feature columns every fitted scaler is expected to have seen.
# NOTE(review): the origin of 101 is not shown in this notebook; confirm it
# against the step that produces the *_physico.npz archives.
# The misspelled name is kept as-is in case other cells reference it.
number_of_pyhsico_features = 101

# Each check reports the observed and expected counts so that a failing run
# identifies which table is wrong.
assert epitope_scaler.n_features_in_ == number_of_pyhsico_features, (
    f"epitope scaler was fitted on {epitope_scaler.n_features_in_} features, "
    f"expected {number_of_pyhsico_features}"
)
if chain == 'paired':
    assert tra_scaler.n_features_in_ == number_of_pyhsico_features, (
        f"TRA scaler was fitted on {tra_scaler.n_features_in_} features, "
        f"expected {number_of_pyhsico_features}"
    )
assert trb_scaler.n_features_in_ == number_of_pyhsico_features, (
    f"TRB scaler was fitted on {trb_scaler.n_features_in_} features, "
    f"expected {number_of_pyhsico_features}"
)

In [None]:
# Scale the training epitope table with the epitope scaler, then show the
# result as this cell's output.
scaled_train_epitope = scale(train_epitope_physico_df, epitope_scaler)
scaled_train_epitope

In [None]:
# Test and validation tables are transformed with the same scaler objects as
# the training tables; nothing is re-fitted in this cell.
scaled_train_trb = scale(train_trb_physico_df, trb_scaler)

scaled_test_epitope = scale(test_epitope_physico_df, epitope_scaler)
scaled_test_trb = scale(test_trb_physico_df, trb_scaler)

scaled_validation_epitope = scale(validation_epitope_physico_df, epitope_scaler)
scaled_validation_trb = scale(validation_trb_physico_df, trb_scaler)

# TRA tables are only scaled in paired mode.
if chain == 'paired':
    scaled_train_tra = scale(train_tra_physico_df, tra_scaler)
    scaled_test_tra = scale(test_tra_physico_df, tra_scaler)
    scaled_validation_tra = scale(validation_tra_physico_df, tra_scaler)

In [None]:
# Show the scaled validation TRB table as this cell's output.
scaled_validation_trb

In [None]:
# Turn each scaled table back into a {sequence: feature vector} mapping.
scaled_train_epitope_dict = dataframe_to_sequence_dict(scaled_train_epitope)
scaled_train_trb_dict = dataframe_to_sequence_dict(scaled_train_trb)

scaled_test_epitope_dict = dataframe_to_sequence_dict(scaled_test_epitope)
scaled_test_trb_dict = dataframe_to_sequence_dict(scaled_test_trb)

scaled_validation_epitope_dict = dataframe_to_sequence_dict(scaled_validation_epitope)
scaled_validation_trb_dict = dataframe_to_sequence_dict(scaled_validation_trb)

# TRA mappings are only produced in paired mode.
if chain == 'paired':
    scaled_train_tra_dict = dataframe_to_sequence_dict(scaled_train_tra)
    scaled_test_tra_dict = dataframe_to_sequence_dict(scaled_test_tra)
    scaled_validation_tra_dict = dataframe_to_sequence_dict(scaled_validation_tra)

In [None]:
# Preview only the first few entries; echoing the whole mapping would print
# every sequence's feature vector into the cell output.
dict(list(scaled_train_epitope_dict.items())[:5])

In [None]:
# Write the scaled training features; each sequence becomes one named array
# in the archive.
np.savez(f"{base_path}/scaled_train_{chain}_epitope_{precision}_physico.npz", **scaled_train_epitope_dict)
np.savez(f"{base_path}/scaled_train_{chain}_TRB_{precision}_physico.npz", **scaled_train_trb_dict)

# The TRA archive is only written in paired mode.
if chain == 'paired':
    np.savez(f"{base_path}/scaled_train_paired_TRA_{precision}_physico.npz", **scaled_train_tra_dict)

In [None]:
# Write the scaled test features; each sequence becomes one named array in
# the archive.
np.savez(f"{base_path}/scaled_test_{chain}_epitope_{precision}_physico.npz", **scaled_test_epitope_dict)
np.savez(f"{base_path}/scaled_test_{chain}_TRB_{precision}_physico.npz", **scaled_test_trb_dict)

# The TRA archive is only written in paired mode.
if chain == 'paired':
    np.savez(f"{base_path}/scaled_test_paired_TRA_{precision}_physico.npz", **scaled_test_tra_dict)

In [None]:
# Write the scaled validation features; each sequence becomes one named array
# in the archive.
np.savez(f"{base_path}/scaled_validation_{chain}_epitope_{precision}_physico.npz", **scaled_validation_epitope_dict)
np.savez(f"{base_path}/scaled_validation_{chain}_TRB_{precision}_physico.npz", **scaled_validation_trb_dict)

# The TRA archive is only written in paired mode.
if chain == 'paired':
    np.savez(f"{base_path}/scaled_validation_paired_TRA_{precision}_physico.npz", **scaled_validation_tra_dict)