## Data Is available at 

<a href="https://zenodo.org/records/6972738#.YvDMenZBxaQ">Drug Response Prediction, Additional data</a>


## Imports

In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [56]:
x_train = pd.read_csv('data/train.csv', index_col=0)
y_train = pd.read_csv('data/train_targets.csv', index_col=0)
x_augment_rna = pd.read_csv('data/additional/CCLE_RNAseq.csv', index_col=0)
x_augment_cell_drug = pd.read_csv('data/additional/CCLE_cell_drug_labels.csv', index_col=0)

## Partition the train set and labels to have a test set without augmentation

In [57]:
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=42)
x_train.to_csv("data/x_train_partition.csv", index=True)
y_train.to_csv("data/y_train_partition.csv", index=True)
x_test.to_csv("data/x_test_partition.csv", index=True)
y_test.to_csv("data/y_test_partition.csv", index=True)

## Let's analyse how we can merge these together 

In [58]:
x_augment_rna.index.name = 'cell_line_id'
x_augment_rna.head(5)

Unnamed: 0_level_0,A1CF,ABCC5,ABCF1,ABHD4,ABHD6,ABI1,ABL1,ABL2,ACAA1,ACAT2,...,ZNF384,ZNF395,ZNF429,ZNF451,ZNF479,ZNF521,ZNF586,ZNF589,ZNRF3,ZRSR2
cell_line_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22RV1_PROSTATE,-0.078135,-0.047979,-0.007551,-0.078826,-0.093184,-0.008702,-0.035462,-0.082019,0.020303,0.161903,...,0.065796,-0.044987,-0.057619,-0.051806,-0.102852,-0.102824,-0.072581,-0.06956,-0.084609,-0.08317
42MGBA_CENTRAL_NERVOUS_SYSTEM,-0.141742,-0.106508,0.217639,-0.040339,-0.098107,-0.041462,-0.009984,-0.086607,-0.000808,0.077247,...,-0.001544,-0.086878,-0.126022,-0.0983,-0.14178,-0.093731,-0.129042,-0.130126,-0.130707,-0.11309
5637_URINARY_TRACT,-0.136656,-0.10354,0.116922,-0.114223,-0.130835,-0.054733,0.02498,-0.065783,-0.001689,0.057101,...,-0.038451,-0.053665,-0.127741,-0.10866,-0.136656,-0.136656,-0.116838,-0.127115,-0.116212,-0.100483
639V_URINARY_TRACT,-0.139272,-0.038257,0.18514,-0.107543,-0.111542,-0.057074,-0.064311,-0.102287,0.069347,0.154516,...,0.070452,-0.123426,-0.117675,-0.06911,-0.139272,-0.139081,-0.122512,-0.076195,-0.1048,-0.099773
697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,-0.114391,-0.04716,0.12713,-0.111325,-0.104127,-0.014949,-0.008429,-0.092379,0.023783,0.01552,...,0.053509,-0.039253,-0.090507,-0.0775,-0.114197,-0.114036,-0.085827,-0.090862,-0.106161,-0.083826


In [59]:
cols_set = set(x_augment_rna.columns)
common_genes = cols_set.intersection(set(x_train.columns))
print(f'Number of genes in train: {len(x_train.columns)}\nNumber of genes in augment: {len(x_augment_rna.columns)}\nNumber of common genes: {len(common_genes)}')

Number of genes in train: 19920
Number of genes in augment: 1478
Number of common genes: 1460


What we can do is only keep the common genes to do our training and see what we get 

In [60]:
x_augment_cell_drug.head(5)
print("Shape of x_augment_cell_drug: ", x_augment_cell_drug.shape, "Shape of x_augment_rna: ", x_augment_rna.shape)

Shape of x_augment_cell_drug:  (10853, 3) Shape of x_augment_rna:  (469, 1478)


### Get only the response of Erlotinib

In [61]:
print(f"Erlotinib in drug response data ? {'Erlotinib' in set(x_augment_cell_drug['drug_id'])}")

Erlotinib in drug response data ? True


In [62]:
x_augment_erlotinib = x_augment_cell_drug[x_augment_cell_drug['drug_id'] == 'Erlotinib']
x_augment_erlotinib

Unnamed: 0,cell_line_id,drug_id,labels
2014,22RV1_PROSTATE,Erlotinib,2.079442
2015,42MGBA_CENTRAL_NERVOUS_SYSTEM,Erlotinib,2.079442
2016,5637_URINARY_TRACT,Erlotinib,0.891798
2017,639V_URINARY_TRACT,Erlotinib,2.079442
2018,697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Erlotinib,1.442903
...,...,...,...
2511,WM88_SKIN,Erlotinib,2.079442
2512,WM983B_SKIN,Erlotinib,2.079442
2513,YKG1_CENTRAL_NERVOUS_SYSTEM,Erlotinib,2.079442
2514,ZR751_BREAST,Erlotinib,2.079442


Now just have to match the cell_line_ids to get appropriate targets 

In [63]:
x_augment_with_targets = pd.merge(x_augment_rna, x_augment_erlotinib, left_index=True, right_on='cell_line_id')

In [64]:
x_augment_with_targets

Unnamed: 0,A1CF,ABCC5,ABCF1,ABHD4,ABHD6,ABI1,ABL1,ABL2,ACAA1,ACAT2,...,ZNF451,ZNF479,ZNF521,ZNF586,ZNF589,ZNRF3,ZRSR2,cell_line_id,drug_id,labels
2014,-0.078135,-0.047979,-0.007551,-0.078826,-0.093184,-0.008702,-0.035462,-0.082019,0.020303,0.161903,...,-0.051806,-0.102852,-0.102824,-0.072581,-0.069560,-0.084609,-0.083170,22RV1_PROSTATE,Erlotinib,2.079442
2015,-0.141742,-0.106508,0.217639,-0.040339,-0.098107,-0.041462,-0.009984,-0.086607,-0.000808,0.077247,...,-0.098300,-0.141780,-0.093731,-0.129042,-0.130126,-0.130707,-0.113090,42MGBA_CENTRAL_NERVOUS_SYSTEM,Erlotinib,2.079442
2016,-0.136656,-0.103540,0.116922,-0.114223,-0.130835,-0.054733,0.024980,-0.065783,-0.001689,0.057101,...,-0.108660,-0.136656,-0.136656,-0.116838,-0.127115,-0.116212,-0.100483,5637_URINARY_TRACT,Erlotinib,0.891798
2017,-0.139272,-0.038257,0.185140,-0.107543,-0.111542,-0.057074,-0.064311,-0.102287,0.069347,0.154516,...,-0.069110,-0.139272,-0.139081,-0.122512,-0.076195,-0.104800,-0.099773,639V_URINARY_TRACT,Erlotinib,2.079442
2018,-0.114391,-0.047160,0.127130,-0.111325,-0.104127,-0.014949,-0.008429,-0.092379,0.023783,0.015520,...,-0.077500,-0.114197,-0.114036,-0.085827,-0.090862,-0.106161,-0.083826,697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Erlotinib,1.442903
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2511,-0.147884,-0.014286,0.207441,-0.036878,-0.106288,-0.043199,-0.023951,0.154915,0.060344,0.004963,...,-0.095481,-0.147884,-0.092341,-0.124191,-0.102169,-0.128146,-0.085326,WM88_SKIN,Erlotinib,2.079442
2512,-0.141831,-0.024712,0.297954,-0.109206,-0.077352,-0.064009,0.020370,0.021758,0.040346,0.046015,...,-0.088189,-0.141831,-0.141446,-0.118693,-0.109746,-0.111828,-0.108126,WM983B_SKIN,Erlotinib,2.079442
2513,-0.151082,-0.044802,0.172368,0.022808,-0.097515,-0.055046,-0.000070,-0.107161,0.141679,-0.016247,...,-0.089064,-0.151082,-0.103448,-0.119326,-0.110491,-0.129186,-0.063710,YKG1_CENTRAL_NERVOUS_SYSTEM,Erlotinib,2.079442
2514,-0.134768,0.098857,-0.051489,-0.104381,-0.123886,-0.012330,-0.088465,-0.105565,0.028237,0.036305,...,-0.036536,-0.134990,-0.134546,-0.109858,-0.114004,-0.118890,-0.058929,ZR751_BREAST,Erlotinib,2.079442


We have the full data, let's split it in train and train_targets and append them to our original dataframes

In [65]:
x_train = pd.read_csv('data/x_train_partition.csv', index_col=0)
x_train_targets = pd.read_csv('data/y_train_partition.csv', index_col=0)

### Augment Train Targets

In [66]:
x_augmented_targets = x_augment_with_targets[["cell_line_id", "drug_id", "labels"]]
x_augmented_targets = x_augmented_targets.rename(columns={"labels": "AUC"})
x_augmented_targets["tissue"] = x_augmented_targets["cell_line_id"].str.split("_").str[-1]

# start from last index in sample name
start_index = int(x_train_targets.index[-1].replace("CL", "")) + 1
x_augmented_targets["sample"] = ["CL" + str(start_index + i) for i in range(len(x_augmented_targets))]
x_augmented_targets = x_augmented_targets[['sample', 'AUC', 'tissue']]
x_augmented_targets.set_index('sample', inplace=True)
x_augmented_targets.head(5)

Unnamed: 0_level_0,AUC,tissue
sample,Unnamed: 1_level_1,Unnamed: 2_level_1
CL104,2.079442,PROSTATE
CL105,2.079442,SYSTEM
CL106,0.891798,TRACT
CL107,2.079442,TRACT
CL108,1.442903,TISSUE


The target data is calculated with area under the dose-response curve (AUC)

To have an AAC value let's define $AAC = 1 - AUC$

We just need to make small adjustments:
1. **Handle Negative AUCs** <br/>
If AUC contains negative values, add a constant to shift all AUC values into a non-negative range:
$$AUC_{shifted} =AUC+∣min(AUC)∣$$

2. **Normalize to [0,1]** <br/>
If the values of 1 - AUC do not naturally fall into [0, 1], we apply normalization:
$$AAC_{normalised} = \frac{AAC - min(AAC)}{max(AAC) - min(AAC)}

In [67]:
min_AUC = min(x_augmented_targets['AUC'])
# shift AUC values to be positive
x_augmented_targets['AUC'] = x_augmented_targets['AUC'].apply(lambda x: (x + abs(min_AUC)))

In [68]:
# compute AAC as 1 - AUC
x_augmented_targets['AAC'] = 1 - x_augmented_targets['AUC'] 

In [69]:
# normalize AAC values
x_augmented_targets['AAC'] = x_augmented_targets['AAC'].apply(lambda x: (x - min(x_augmented_targets['AAC'])) / (max(x_augmented_targets['AAC']) - min(x_augmented_targets['AAC'])))
x_augmented_targets.drop(columns=['AUC'], inplace=True)
x_augmented_targets.head(5)

Unnamed: 0_level_0,tissue,AAC
sample,Unnamed: 1_level_1,Unnamed: 2_level_1
CL104,PROSTATE,0.0
CL105,SYSTEM,0.0
CL106,TRACT,0.223001
CL107,TRACT,0.0
CL108,TISSUE,0.119521


In [70]:
augmented_targets_data = pd.concat([x_train_targets, x_augmented_targets], ignore_index=False)
augmented_targets_data.to_csv("data/train_targets_augmented.csv", index=True)

### Augment train data

In [71]:
x_train_augmented = x_augment_with_targets.drop(columns=["cell_line_id", "drug_id", "labels"])
start_index = int(x_train.index[-1].replace("CL", "")) + 1
x_train_augmented["sample"] = ["CL" + str(start_index + i) for i in range(len(x_train_augmented))]
x_train_augmented.set_index('sample', inplace=True)

In [72]:
x_train = x_train.loc[:, list(common_genes)]
x_train_augmented = x_train_augmented.loc[:, list(common_genes)]

In [73]:
augmented_train_data = pd.concat([x_train, x_train_augmented], ignore_index=False)
augmented_train_data.to_csv("data/train_augmented.csv", index=True)

In [74]:
augmented_train_data

Unnamed: 0,IDE,YWHAE,SS18L1,SOCS2,SETD1B,NUDCD3,PDGFA,CNDP2,CRELD2,EIF4EBP1,...,XPA,GRM3,EGR1,GNA11,CISD1,PAK6,RAC1,CDH17,MRPS16,ERBB3
CL482,84.714771,403.597046,28.838380,115.887539,43.022670,66.724138,0.676018,149.852804,6.428305,120.307697,...,11.995032,1.212986,93.696869,26.398954,28.929113,0.896174,89.630366,1.066946,96.031151,0.953978
CL293,81.520482,557.608810,40.980639,39.786449,69.207976,97.802864,0.710556,129.169512,27.514433,72.990367,...,19.172810,0.298354,3.048738,61.269347,30.829748,17.602266,126.140289,0.313831,60.403484,386.930320
CL350,65.878195,639.182763,22.544726,5.242902,36.136658,126.495748,8.325902,288.411688,30.478781,29.698442,...,21.713986,0.000000,166.887113,65.981006,23.953366,8.789732,272.020890,670.180279,97.520213,320.044470
CL175,24.676206,1009.756740,38.343093,23.912459,50.204933,102.036359,9.627393,107.220276,25.582140,75.134790,...,9.962079,0.049746,64.889136,68.626745,23.906524,8.872781,295.358597,1.052809,59.558009,59.376839
CL136,28.576144,312.805823,23.278108,9.975429,33.510426,45.956612,12.957197,82.807917,37.281612,314.387806,...,10.433557,0.013283,56.800736,96.621506,24.997848,0.100443,119.184569,0.235648,71.723386,0.679204
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CL567,-0.096174,2.374586,-0.072154,-0.109999,-0.123579,0.160461,-0.109387,0.619655,0.218085,0.485608,...,-0.110814,-0.147884,-0.007312,-0.046421,-0.001113,-0.147436,0.768465,-0.146987,0.229585,0.550287
CL568,-0.088304,4.281093,-0.041603,-0.129799,-0.118731,0.000123,0.022761,0.215851,0.118014,0.588150,...,-0.098987,-0.141831,0.110803,-0.035896,-0.019506,-0.141677,0.719962,-0.140327,0.136486,0.270227
CL569,-0.070966,7.915109,-0.073783,-0.081552,-0.105667,0.244972,-0.148777,0.466111,0.100789,0.142063,...,-0.090174,-0.151082,0.464446,0.077442,-0.023930,-0.143740,1.036737,-0.151039,0.065106,-0.149588
CL570,-0.070699,1.557608,0.011285,-0.121369,-0.108526,0.085940,-0.113264,0.536164,-0.006111,0.589351,...,-0.079545,-0.134990,0.420980,-0.046566,0.069062,-0.117631,1.094688,-0.134139,0.262713,0.224516


### We're done!

In [75]:
augmented_train_data.shape

(1061, 1460)