In [1]:
import numpy as np 
import pandas as pd
# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import peptides

In [2]:
def create_dataframe(dictionary):
    """Build one DataFrame with a row per sequence from a sequence -> values mapping.

    Args:
        dictionary: Mapping-like object exposing ``.items()`` that maps a
            sequence string to an iterable of per-sequence values.

    Returns:
        A DataFrame with a fresh 0..n-1 index, a "Sequence" column holding the
        keys, and columns "Value_1", "Value_2", ... holding the values in
        iteration order. An empty mapping yields an empty DataFrame that has
        only the "Sequence" column.
    """
    # Collect plain records and build the frame once; creating a one-row
    # DataFrame per sequence and concatenating them is far slower and raises
    # on empty input.
    records = [
        {"Sequence": sequence, **{f"Value_{i + 1}": v for i, v in enumerate(values)}}
        for sequence, values in dictionary.items()
    ]
    if not records:
        return pd.DataFrame(columns=["Sequence"])
    return pd.DataFrame(records)

In [3]:
def get_scaler(train_df):
    """Fit a MinMaxScaler with feature range (-1, 1) on the non-"Sequence" columns.

    Args:
        train_df: DataFrame with a "Sequence" column; every other column is
            passed to the scaler as a feature.

    Returns:
        The fitted ``MinMaxScaler``.
    """
    # Only the feature columns are fitted; "Sequence" is excluded by name.
    numerical_cols = train_df.drop(columns=["Sequence"])
    scaler = MinMaxScaler(feature_range=(-1, 1))
    # Fit on the underlying ndarray, matching what scale() passes to transform().
    return scaler.fit(numerical_cols.to_numpy())

In [4]:
def scale(df, scaler):
    """Apply a fitted scaler to every non-"Sequence" column of ``df``.

    Args:
        df: DataFrame with a "Sequence" column; all other columns are scaled.
        scaler: Fitted object exposing ``transform(ndarray) -> ndarray``.

    Returns:
        A new DataFrame with "Sequence" first, followed by the scaled columns
        under their original names, on the same index as ``df``.
    """
    seq_colmn = df[["Sequence"]]
    num_colmns = df.drop(columns=["Sequence"])
    scaled_num_data = scaler.transform(num_colmns.to_numpy())
    # Reuse df's index: concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and introduce NaNs) for any input
    # whose index is not 0..n-1.
    scaled_numerical_df = pd.DataFrame(
        scaled_num_data, columns=num_colmns.columns, index=df.index
    )
    return pd.concat([seq_colmn, scaled_numerical_df], axis=1)

In [5]:
def dataframe_to_sequence_dict(df):
    """Convert a sequence/feature DataFrame into a sequence -> float32 vector dict.

    Args:
        df: DataFrame with a "Sequence" column; all other columns must be
            convertible to float32.

    Returns:
        Dict mapping each "Sequence" value to a 1-D ``np.float32`` array of that
        row's remaining columns, in column order. If a sequence occurs more
        than once, the last row wins. The arrays are row views of one shared
        2-D array.
    """
    # Select the features by name rather than by position so the result does
    # not depend on "Sequence" being the first column, and convert the whole
    # block at once instead of iterating row by row.
    features = np.ascontiguousarray(
        df.drop(columns=["Sequence"]).to_numpy(dtype=np.float32)
    )
    return dict(zip(df["Sequence"], features))

In [6]:
# base_path is the directory the input .npz archives are read from; precision
# and beta_or_paired are interpolated into the input and output filenames.
# NOTE(review): base_path is an absolute, machine-specific path.
base_path = "/teamspace/studios/this_studio/BA/physico"
precision, beta_or_paired = "gene", "paired"

In [7]:
def _load_physico(split, chain, pairing):
    """Read ``{base_path}/{split}_{pairing}_{chain}_{precision}_physico.npz`` into a dict.

    Returns a plain ``{name: array}`` dict in archive order, so the underlying
    file handle is closed as soon as loading finishes.
    """
    path = f"{base_path}/{split}_{pairing}_{chain}_{precision}_physico.npz"
    # NOTE(review): allow_pickle=True unpickles object arrays, which can execute
    # arbitrary code; keep it only while these archives come from a trusted source.
    with np.load(path, allow_pickle=True) as archive:
        return {name: archive[name] for name in archive.files}


# TRA archives are always read from the "paired" files, independent of beta_or_paired.
train_epitope_physico = _load_physico("train", "epitope", beta_or_paired)
train_tra_physico = _load_physico("train", "TRA", "paired")
train_trb_physico = _load_physico("train", "TRB", beta_or_paired)

test_epitope_physico = _load_physico("test", "epitope", beta_or_paired)
test_tra_physico = _load_physico("test", "TRA", "paired")
test_trb_physico = _load_physico("test", "TRB", beta_or_paired)

validation_epitope_physico = _load_physico("validation", "epitope", beta_or_paired)
validation_tra_physico = _load_physico("validation", "TRA", "paired")
validation_trb_physico = _load_physico("validation", "TRB", beta_or_paired)

In [8]:
# Turn each loaded archive into a DataFrame, one split at a time
# (order within each split: epitope, TRA, TRB).
train_epitope_physico_df, train_tra_physico_df, train_trb_physico_df = map(
    create_dataframe,
    (train_epitope_physico, train_tra_physico, train_trb_physico),
)

test_epitope_physico_df, test_tra_physico_df, test_trb_physico_df = map(
    create_dataframe,
    (test_epitope_physico, test_tra_physico, test_trb_physico),
)

validation_epitope_physico_df, validation_tra_physico_df, validation_trb_physico_df = map(
    create_dataframe,
    (validation_epitope_physico, validation_tra_physico, validation_trb_physico),
)


In [9]:
# Display the unscaled training epitope frame as the cell output.
train_epitope_physico_df

Unnamed: 0,Sequence,Value_1,Value_2,Value_3,Value_4,Value_5,Value_6,Value_7,Value_8,Value_9,...,Value_92,Value_93,Value_94,Value_95,Value_96,Value_97,Value_98,Value_99,Value_100,Value_101
0,ELAGIGILTA,-0.221000,-0.598000,0.075000,0.023000,-0.202000,0.430000,0.239000,0.074000,0.234000,...,-1.580000,-1.000241,0.089986,0.107611,1.520000,11.280000,3.849983,0.000000,957.134827,479.284393
1,LLWNGPMAV,-0.352222,-0.274444,0.044444,-0.243333,-0.178889,0.182222,0.271111,0.003333,-0.198889,...,-1.630000,-0.002016,0.241398,0.193627,1.011111,37.544445,6.100000,0.000000,1000.223633,500.767853
2,MEVTPSGTWL,-0.065000,-0.259000,0.091000,-0.309000,-0.204000,0.150000,0.010000,-0.386000,0.018000,...,0.077000,-1.000241,0.142803,0.297897,0.130000,8.590000,3.849983,0.000000,1120.285522,560.770813
3,NNILIATCV,-0.361111,-0.741111,-0.095556,0.385556,0.062222,0.248889,-0.064444,-0.244444,-0.182222,...,-0.671111,-0.063990,0.041075,0.125061,1.511111,70.733330,5.922540,0.000000,960.156311,509.273499
4,YLEPGPVTV,-0.058889,-0.363333,-0.080000,-0.146667,-0.508889,-0.263333,0.218889,-0.165556,0.095556,...,-0.491111,-1.001091,0.207077,0.327634,0.344444,42.255554,3.849983,0.000000,974.121765,487.763275
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,YLQDSLATT,-0.002222,-0.277778,-0.175556,0.053333,-0.007778,0.180000,-0.170000,-0.443333,0.260000,...,1.254444,-1.002420,0.254548,0.149760,-0.122222,87.911110,3.749987,0.000000,1011.096558,506.253296
689,GSLSPELRPIF,0.080000,-0.319091,-0.093636,-0.120000,-0.247273,-0.036364,0.240909,-0.026364,0.026364,...,0.895455,-0.000244,0.400166,0.365502,0.154545,99.854546,6.410150,6.020129,1215.414795,608.340210
690,QKRPIPIKYKAM,0.195000,-0.251667,-0.492500,-0.386667,0.076667,-0.248333,0.270833,0.031667,0.077500,...,1.937500,3.996245,0.350910,0.520666,-0.958333,42.908333,11.131928,24.080517,1472.855347,736.942139
691,RLLPLLALL,-0.721111,-0.730000,-0.581111,-0.277778,0.051111,0.045556,0.314444,0.075556,-0.213333,...,-1.823333,0.997981,0.386082,0.325794,2.055556,30.288889,10.550002,6.020129,1021.354614,511.360229


In [11]:
# Display the unscaled training TRB frame as the cell output.
train_trb_physico_df

Unnamed: 0,Sequence,Value_1,Value_2,Value_3,Value_4,Value_5,Value_6,Value_7,Value_8,Value_9,...,Value_92,Value_93,Value_94,Value_95,Value_96,Value_97,Value_98,Value_99,Value_100,Value_101
0,CASSHLSGVRSYNEQFF,0.123529,0.029412,-0.027059,0.290000,0.147059,0.186471,0.078235,-0.105294,0.273529,...,1.962353,0.027840,0.446293,0.323236,-0.305882,53.482353,7.356563,6.020129,1932.097290,994.949585
1,CASSQVIEGRAYEQYF,0.079375,-0.018750,-0.100625,0.083125,0.118125,0.078125,-0.005625,0.041875,0.488125,...,1.807500,-1.062144,0.237363,0.474507,-0.343750,58.812500,4.258106,6.020129,1851.020386,954.433228
2,CASSSAGAKNIQYF,0.146429,-0.210714,0.197857,0.217857,0.197857,0.233571,0.165000,-0.106429,0.436429,...,0.894286,0.934864,0.374214,0.269708,0.014286,56.735714,8.522939,6.020129,1446.597534,752.348450
3,CASSDWGQGGYEQYF,0.197333,0.346000,0.382000,0.061333,-0.008000,0.208667,-0.001333,0.122000,0.359333,...,1.498000,-2.063470,0.316160,0.241469,-0.880000,20.613333,3.550066,0.000000,1697.752197,877.841187
4,CAISEENIDTQYF,0.112308,-0.133077,-0.127692,0.254615,-0.039231,0.079231,-0.347692,-0.100000,0.252308,...,1.901538,-3.060844,0.050188,0.345731,-0.323077,59.761539,3.424707,0.000000,1532.641602,795.343018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18215,CASSLGVAGTDTQYF,0.012000,-0.266667,0.288667,0.176000,-0.005333,0.252667,0.046667,-0.183333,0.356000,...,0.508000,-1.064394,0.195942,0.263640,0.320000,-3.560000,3.749971,0.000000,1519.645996,788.851013
18216,CSVGTGTYEQYF,-0.051667,0.092500,0.180000,0.199167,-0.016667,0.163333,0.009167,-0.201667,0.339167,...,0.915833,-1.063916,0.156855,0.142569,-0.258333,-15.650000,3.849961,0.000000,1354.453613,706.295349
18217,CASSYPGQQETQYF,0.246429,0.103571,0.033571,0.018571,0.080714,-0.087143,-0.012143,-0.153571,0.427143,...,1.862143,-1.063916,0.240417,0.182180,-0.985714,48.599998,3.849961,0.000000,1608.699097,833.346130
18218,CASSLRGGRNPYEQYF,0.253125,0.093125,0.069375,0.088750,0.141875,0.082500,0.226250,0.013750,0.187500,...,2.690000,0.936077,0.585429,0.367889,-0.950000,44.381248,8.506496,12.040258,1848.022705,952.939026


In [12]:
# Fit one scaler per input type, each on its training-split frame only.
epitope_scaler = get_scaler(train_df=train_epitope_physico_df)
tra_scaler = get_scaler(train_df=train_tra_physico_df)
trb_scaler = get_scaler(train_df=train_trb_physico_df)


In [13]:
# Sanity check: every fitted scaler must have seen exactly this many feature columns.
number_of_pyhsico_features = 101
assert all(
    fitted.n_features_in_ == number_of_pyhsico_features
    for fitted in (epitope_scaler, tra_scaler, trb_scaler)
)

In [14]:
# Scale the training epitopes with the epitope scaler, then display the result.
scaled_train_epitope = scale(df=train_epitope_physico_df, scaler=epitope_scaler)
scaled_train_epitope

Unnamed: 0,Sequence,Value_1,Value_2,Value_3,Value_4,Value_5,Value_6,Value_7,Value_8,Value_9,...,Value_92,Value_93,Value_94,Value_95,Value_96,Value_97,Value_98,Value_99,Value_100,Value_101
0,ELAGIGILTA,-0.052214,-0.658957,0.217295,0.084029,-0.293972,0.594960,0.225608,0.459629,0.323305,...,-0.556341,-0.339984,-0.766076,-0.761186,0.389033,-0.528305,-0.876420,-1.0,-0.843550,-0.843538
1,LLWNGPMAV,-0.188039,-0.328798,0.176645,-0.316405,-0.257092,0.193519,0.268693,0.335288,-0.502119,...,-0.565373,-0.120272,-0.370341,-0.549861,0.206381,-0.317937,-0.404748,-1.0,-0.804389,-0.804463
2,MEVTPSGTWL,0.109258,-0.313039,0.238581,-0.415135,-0.297163,0.141314,-0.081651,-0.349756,-0.088559,...,-0.257015,-0.339984,-0.628031,-0.293692,-0.109870,-0.549851,-0.876420,-1.0,-0.695270,-0.695328
3,NNILIATCV,-0.197240,-0.804989,-0.009608,0.629135,0.127660,0.301530,-0.181537,-0.100684,-0.470339,...,-0.392156,-0.133913,-0.893910,-0.718315,0.385842,-0.052107,-0.441949,-1.0,-0.840804,-0.788993
4,YLEPGPVTV,0.115584,-0.419501,0.011086,-0.171066,-0.783688,-0.528353,0.198624,0.038123,0.059322,...,-0.359640,-0.340172,-0.460042,-0.220634,-0.032901,-0.280203,-0.876420,-1.0,-0.828111,-0.828116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,YLQDSLATT,0.174238,-0.332200,-0.116038,0.129636,0.015957,0.189919,-0.323165,-0.450635,0.372881,...,-0.044318,-0.340464,-0.335971,-0.657634,-0.200399,0.085480,-0.897382,-1.0,-0.794507,-0.794486
689,GSLSPELRPIF,0.259345,-0.374356,-0.007055,-0.130972,-0.366215,-0.160625,0.228169,0.283036,-0.072612,...,-0.109167,-0.119882,0.044621,-0.127600,-0.101060,0.181143,-0.339731,-0.6,-0.608812,-0.608808
690,QKRPIPIKYKAM,0.378379,-0.305556,-0.537694,-0.531908,0.150709,-0.504050,0.268320,0.385142,0.024894,...,0.079071,0.759759,-0.084117,0.253608,-0.500499,-0.274974,0.650097,0.6,-0.374837,-0.374905
691,RLLPLLALL,-0.569868,-0.793651,-0.655580,-0.368192,0.109929,-0.027903,0.326835,0.462366,-0.529661,...,-0.600297,0.099831,0.007809,-0.225153,0.581256,-0.376051,0.528108,-0.6,-0.785184,-0.785197


In [16]:
# Scale the remaining frames; test and validation reuse the scalers obtained
# from the training frames of the same input type.
scaled_train_tra = scale(df=train_tra_physico_df, scaler=tra_scaler)
scaled_train_trb = scale(df=train_trb_physico_df, scaler=trb_scaler)

scaled_test_epitope = scale(df=test_epitope_physico_df, scaler=epitope_scaler)
scaled_test_tra = scale(df=test_tra_physico_df, scaler=tra_scaler)
scaled_test_trb = scale(df=test_trb_physico_df, scaler=trb_scaler)

scaled_validation_epitope = scale(df=validation_epitope_physico_df, scaler=epitope_scaler)
scaled_validation_tra = scale(df=validation_tra_physico_df, scaler=tra_scaler)
scaled_validation_trb = scale(df=validation_trb_physico_df, scaler=trb_scaler)

In [17]:
# Display the scaled validation TRB frame as the cell output.
scaled_validation_trb

Unnamed: 0,Sequence,Value_1,Value_2,Value_3,Value_4,Value_5,Value_6,Value_7,Value_8,Value_9,...,Value_92,Value_93,Value_94,Value_95,Value_96,Value_97,Value_98,Value_99,Value_100,Value_101
0,CAEFRGNNNDMRF,0.714964,0.207697,-0.263332,0.223528,-0.051722,0.443723,0.005467,0.247498,-0.377482,...,0.659765,-0.000185,-0.109570,0.769011,-0.829860,-0.649607,-0.343182,-0.2,-0.247027,-0.247035
1,CAVRNNDMRF,0.436492,-0.186416,-0.447954,0.122788,0.145831,0.238938,-0.030191,0.157693,-0.410046,...,0.625713,0.249464,-0.369824,0.266655,-0.578573,-0.422672,0.141726,-0.2,-0.541734,-0.541759
2,CAAYGGTGNQFYF,0.082254,0.435311,0.301443,0.312180,-0.351044,0.380648,0.578501,0.117957,0.421591,...,-0.340843,-0.001164,-0.649794,-0.665166,-0.273464,-0.497473,-0.430417,-1.0,-0.395270,-0.395270
3,CAFMRDYGGATNKLIF,0.027342,-0.053631,-0.172279,0.003611,-0.251774,0.361024,0.283659,0.179453,0.050908,...,-0.152587,0.249178,0.138078,-0.539374,-0.171412,-0.538291,0.135868,-0.2,-0.049554,-0.049595
4,CADQAINTDKLIF,0.208196,-0.491076,-0.370297,0.043001,-0.337033,0.117133,-0.430204,0.049548,0.044941,...,-0.074208,-0.250683,-0.233903,-0.522661,-0.168003,-0.719938,-0.823086,-0.6,-0.350304,-0.350278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5791,CAEINARLMF,-0.115755,-0.463080,-0.525824,-0.107707,0.076286,0.153296,0.079837,0.252302,0.064297,...,-0.157786,-0.000296,-0.391902,0.255232,0.078557,-0.282622,-0.361776,-0.6,-0.590807,-0.590857
5792,CAAAAYNQGGKLIF,0.104030,-0.228264,0.022646,-0.047838,-0.122413,0.307660,0.589006,0.334205,0.496136,...,-0.463357,0.249068,-0.075500,-0.615400,-0.017345,-0.550657,0.135882,-0.6,-0.371460,-0.371449
5793,CAVVFSGGYNKLIF,-0.301570,-0.169084,-0.163091,0.218577,-0.450030,0.393563,0.391883,0.238245,0.142747,...,-0.603267,0.249068,-0.437973,-0.525469,0.269685,-0.334673,0.135882,-0.6,-0.294337,-0.294347
5794,CAGAGTNAGKSTF,0.588022,-0.353370,0.587039,-0.000519,-0.151071,0.655375,0.656200,-0.243010,0.432053,...,-0.207623,0.249280,-0.315960,-0.609454,-0.284373,-0.676726,0.140717,-0.6,-0.576523,-0.576488


In [18]:
# Convert every scaled frame into a sequence -> float32 vector dict
# (order within each split: epitope, TRA, TRB).
scaled_train_epitope_dict, scaled_train_tra_dict, scaled_train_trb_dict = map(
    dataframe_to_sequence_dict,
    (scaled_train_epitope, scaled_train_tra, scaled_train_trb),
)

scaled_test_epitope_dict, scaled_test_tra_dict, scaled_test_trb_dict = map(
    dataframe_to_sequence_dict,
    (scaled_test_epitope, scaled_test_tra, scaled_test_trb),
)

scaled_validation_epitope_dict, scaled_validation_tra_dict, scaled_validation_trb_dict = map(
    dataframe_to_sequence_dict,
    (scaled_validation_epitope, scaled_validation_tra, scaled_validation_trb),
)

In [19]:
# Display the sequence -> vector mapping for the scaled training epitopes.
# NOTE(review): this renders every entry of the dict; consider previewing a single item instead.
scaled_train_epitope_dict

{'ELAGIGILTA': array([-0.052213817834854126, -0.6589568853378296, 0.21729491651058197,
        0.084029421210289, -0.29397162795066833, 0.594959557056427,
        0.22560787200927734, 0.4596285820007324, 0.32330507040023804,
        0.3499423861503601, -0.5365776419639587, -0.5736470818519592,
        0.1114286258816719, 0.44034695625305176, 0.07908855378627777,
        -0.3987515866756439, 0.6754407286643982, -0.5380889177322388,
        -0.4201805889606476, -0.3151029348373413, -0.6333836913108826,
        -0.04006926715373993, -0.17486658692359924, -0.4697168469429016,
        -0.17241713404655457, 0.2906155586242676, 0.3602791726589203,
        -0.14175115525722504, -0.045300647616386414, -0.6603907942771912,
        -0.09999990463256836, -0.30257028341293335, -0.01928737759590149,
        0.45195794105529785, -0.7706934213638306, -0.39202815294265747,
        0.3171977698802948, -0.48504459857940674, -0.31504422426223755,
        -0.009014055132865906, -0.6220458745956421, -0.2857

In [20]:
# Write the scaled training dicts to the current working directory,
# one array per sequence key.
for out_path, sequence_vectors in (
    (f"./scaled_train_{beta_or_paired}_epitope_{precision}_physico.npz", scaled_train_epitope_dict),
    (f"./scaled_train_paired_TRA_{precision}_physico.npz", scaled_train_tra_dict),
    (f"./scaled_train_{beta_or_paired}_TRB_{precision}_physico.npz", scaled_train_trb_dict),
):
    np.savez(out_path, **sequence_vectors)

In [21]:
# Write the scaled test dicts to the current working directory,
# one array per sequence key.
for out_path, sequence_vectors in (
    (f"./scaled_test_{beta_or_paired}_epitope_{precision}_physico.npz", scaled_test_epitope_dict),
    (f"./scaled_test_paired_TRA_{precision}_physico.npz", scaled_test_tra_dict),
    (f"./scaled_test_{beta_or_paired}_TRB_{precision}_physico.npz", scaled_test_trb_dict),
):
    np.savez(out_path, **sequence_vectors)

In [22]:
# Write the scaled validation dicts to the current working directory,
# one array per sequence key.
for out_path, sequence_vectors in (
    (f"./scaled_validation_{beta_or_paired}_epitope_{precision}_physico.npz", scaled_validation_epitope_dict),
    (f"./scaled_validation_paired_TRA_{precision}_physico.npz", scaled_validation_tra_dict),
    (f"./scaled_validation_{beta_or_paired}_TRB_{precision}_physico.npz", scaled_validation_trb_dict),
):
    np.savez(out_path, **sequence_vectors)