## Imports

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Filtering Data

In [13]:
# Load the raw table, then narrow it to the four columns used by the cells below.
columns_to_keep = ["Un1_P", "Un2_P", "DNA_P", "seq"]
df = pd.read_csv("data.csv")
cleaned_df = df[columns_to_keep]
cleaned_df

Unnamed: 0,Un1_P,Un2_P,DNA_P,seq
0,3,1,7,AAGGGCCGTCGGTGATTTTAGTCTTCCTCAGTGGTTACCAGGTCTC...
1,0,1,0,TTTATGCAAATGTTCCGTGAATGGAAGGTAACGAACAGATGTGACT...
2,3,6,14,AATCCGCAATTCTCACAGTGACGTGGGCGTGGCTCCCCACCAATGA...
3,9,25,23,CGAAGCGAGAGCACGGGAGACCAGAGTGGCCCCTAGGAGGGCCGTT...
4,12,11,21,ATAGCTTAAGTGTGATTCACTCTGAGTCATTTACTGCTGCTGCTGC...
...,...,...,...,...
65530,44,51,60,ATTCATTCATTTCCCGGAGCTCGGCTAGCGCACGCCCCTCTAGCCG...
65531,21,21,32,TCATTAACAATCTTATTAGAAATGACTGTTTAGTTTTGGGAAGGGC...
65532,9,10,24,ACCCTCAAACAATGTTTCTCCTAGACGTCATATCCGGTCTCCTGAC...
65533,39,51,36,GCATGGGGTGCCGGTTATGAGGTAGTAAGCCCTTTCGTTCAAGTTT...


In [14]:
# Drop rows where any of DNA_P, Un1_P or Un2_P is zero, so the fold-change
# ratio computed later never divides by zero.
count_columns = ["DNA_P", "Un1_P", "Un2_P"]
nonzero_mask = (cleaned_df[count_columns] != 0).all(axis=1)

# .copy() makes filtered_df an independent frame, so adding columns to it later
# cannot trigger pandas' chained-assignment (SettingWithCopy) machinery.
filtered_df = cleaned_df[nonzero_mask].copy()
filtered_df

Unnamed: 0,Un1_P,Un2_P,DNA_P,seq
0,3,1,7,AAGGGCCGTCGGTGATTTTAGTCTTCCTCAGTGGTTACCAGGTCTC...
2,3,6,14,AATCCGCAATTCTCACAGTGACGTGGGCGTGGCTCCCCACCAATGA...
3,9,25,23,CGAAGCGAGAGCACGGGAGACCAGAGTGGCCCCTAGGAGGGCCGTT...
4,12,11,21,ATAGCTTAAGTGTGATTCACTCTGAGTCATTTACTGCTGCTGCTGC...
5,15,26,14,AATCAAACCCTCTGTCGTCAAGACTCCCCGCCCCACCCACCTTTCC...
...,...,...,...,...
65529,13,7,30,GGTGAGGGAAGAGTGTCGAGTAAGTTGTTAGGATTACCGTCGCCCA...
65530,44,51,60,ATTCATTCATTTCCCGGAGCTCGGCTAGCGCACGCCCCTCTAGCCG...
65531,21,21,32,TCATTAACAATCTTATTAGAAATGACTGTTTAGTTTTGGGAAGGGC...
65532,9,10,24,ACCCTCAAACAATGTTTCTCCTAGACGTCATATCCGGTCTCCTGAC...


In [15]:
# log2 fold change = log2( ((Un1_P + Un2_P) / 2) / DNA_P )
mean_un_p = (filtered_df["Un1_P"] + filtered_df["Un2_P"]) / 2

# .assign returns a new frame instead of writing a column into a frame that was
# produced by boolean filtering, which avoids SettingWithCopy hazards and keeps
# the cell safe to re-run.
filtered_df = filtered_df.assign(
    log2_foldchange=np.log2(mean_un_p / filtered_df["DNA_P"])
)
filtered_df

Unnamed: 0,Un1_P,Un2_P,DNA_P,seq,log2_foldchange
0,3,1,7,AAGGGCCGTCGGTGATTTTAGTCTTCCTCAGTGGTTACCAGGTCTC...,-1.807355
2,3,6,14,AATCCGCAATTCTCACAGTGACGTGGGCGTGGCTCCCCACCAATGA...,-1.637430
3,9,25,23,CGAAGCGAGAGCACGGGAGACCAGAGTGGCCCCTAGGAGGGCCGTT...,-0.436099
4,12,11,21,ATAGCTTAAGTGTGATTCACTCTGAGTCATTTACTGCTGCTGCTGC...,-0.868755
5,15,26,14,AATCAAACCCTCTGTCGTCAAGACTCCCCGCCCCACCCACCTTTCC...,0.550197
...,...,...,...,...,...
65529,13,7,30,GGTGAGGGAAGAGTGTCGAGTAAGTTGTTAGGATTACCGTCGCCCA...,-1.584963
65530,44,51,60,ATTCATTCATTTCCCGGAGCTCGGCTAGCGCACGCCCCTCTAGCCG...,-0.337035
65531,21,21,32,TCATTAACAATCTTATTAGAAATGACTGTTTAGTTTTGGGAAGGGC...,-0.607683
65532,9,10,24,ACCCTCAAACAATGTTTCTCCTAGACGTCATATCCGGTCTCCTGAC...,-1.337035


In [16]:
# Write only the sequence and its log2 fold change to disk; the next section
# reads this file back in.
output_columns = ["seq", "log2_foldchange"]
df = filtered_df[output_columns]
df.to_csv("filtered_data.csv", index=False)

## Creating Train and Test Datasets

In [17]:
# Attach the embedding features as extra columns next to the rows of
# filtered_data.csv.
y = pd.read_csv('filtered_data.csv')
embeddings = np.load('all_embeddings.npy')

# pd.concat(axis=1) aligns on the index and silently pads mismatched rows with
# NaN, so fail loudly if the two inputs do not have the same number of rows.
if embeddings.ndim != 2:
    raise ValueError(
        f"expected a 2-D embeddings array, got shape {embeddings.shape}"
    )
if embeddings.shape[0] != len(y):
    raise ValueError(
        f"row count mismatch: filtered_data.csv has {len(y)} rows but "
        f"all_embeddings.npy has {embeddings.shape[0]}"
    )

# NOTE(review): assumes row i of all_embeddings.npy was computed from row i of
# filtered_data.csv -- confirm against whatever produced the embeddings.
num_features = embeddings.shape[1]
embeddings_df = pd.DataFrame(data=embeddings, columns=[f'feature_{i}' for i in range(num_features)])
combined_df = pd.concat([y, embeddings_df], axis=1)
combined_df

Unnamed: 0,seq,log2_foldchange,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_502,feature_503,feature_504,feature_505,feature_506,feature_507,feature_508,feature_509,feature_510,feature_511
0,AAGGGCCGTCGGTGATTTTAGTCTTCCTCAGTGGTTACCAGGTCTC...,-1.807355,-0.221479,0.375265,-0.070072,0.018169,0.122353,-0.062364,-0.204157,0.204243,...,0.182669,-0.184257,0.012784,-0.079647,-0.187456,-0.028130,-0.149950,-0.095397,0.045924,-0.038616
1,AATCCGCAATTCTCACAGTGACGTGGGCGTGGCTCCCCACCAATGA...,-1.637430,-0.082955,0.445444,0.009490,0.060718,0.130965,0.186992,-0.114871,0.279662,...,0.191590,-0.124149,0.030825,0.140356,-0.127855,0.050343,0.071205,-0.172399,0.028063,-0.249046
2,CGAAGCGAGAGCACGGGAGACCAGAGTGGCCCCTAGGAGGGCCGTT...,-0.436099,-0.063608,0.028950,-0.236808,-0.018406,0.178925,-0.033419,-0.085391,0.233167,...,0.129316,-0.129225,0.051348,0.065044,-0.272028,0.210748,-0.125177,-0.108903,0.220304,0.092894
3,ATAGCTTAAGTGTGATTCACTCTGAGTCATTTACTGCTGCTGCTGC...,-0.868755,-0.151784,0.088810,-0.022082,0.030434,0.073392,0.107212,-0.074913,0.311212,...,0.132638,-0.044720,-0.015729,-0.103420,-0.128833,-0.060064,-0.110427,-0.061071,0.006627,-0.213193
4,AATCAAACCCTCTGTCGTCAAGACTCCCCGCCCCACCCACCTTTCC...,0.550197,-0.152393,0.410881,-0.112441,-0.013755,-0.037293,-0.051986,-0.072344,0.138729,...,0.248422,-0.261388,0.021608,0.140008,-0.147731,0.088916,0.001743,-0.108572,0.181858,0.038152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54378,GGTGAGGGAAGAGTGTCGAGTAAGTTGTTAGGATTACCGTCGCCCA...,-1.584963,-0.150751,0.204058,-0.026122,0.131232,0.032702,0.014841,-0.086881,0.199306,...,0.109036,-0.207033,-0.005270,-0.187050,-0.230642,0.104303,-0.119739,0.083344,0.104305,0.045247
54379,ATTCATTCATTTCCCGGAGCTCGGCTAGCGCACGCCCCTCTAGCCG...,-0.337035,-0.110486,0.132200,-0.029922,0.084088,0.160334,-0.037929,-0.130806,0.229146,...,0.105075,-0.157981,0.008488,-0.059236,-0.143570,0.179987,-0.073518,-0.085842,0.087208,-0.151903
54380,TCATTAACAATCTTATTAGAAATGACTGTTTAGTTTTGGGAAGGGC...,-0.607683,-0.129157,0.359411,0.043890,0.058684,-0.233653,-0.012922,-0.020539,0.373452,...,0.039749,0.004676,0.092948,-0.134254,0.055820,-0.017262,-0.065442,-0.084155,0.049110,-0.332159
54381,ACCCTCAAACAATGTTTCTCCTAGACGTCATATCCGGTCTCCTGAC...,-1.337035,-0.163624,0.317240,-0.037378,0.110128,0.108444,0.088166,-0.219039,0.141461,...,0.053094,-0.319559,-0.098971,0.000889,-0.105216,-0.015243,-0.030015,-0.187441,0.113340,-0.106090


In [18]:
target_column = 'log2_foldchange'

# Shuffle the DataFrame
df_shuffled = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split ratios.
# NOTE: only test_ratio is applied below. No validation set is carved out, so
# the training set receives the remaining 1 - test_ratio (80%) of the rows;
# train_ratio and validation_ratio are kept for reference but are not used here.
train_ratio = 0.7
validation_ratio = 0.1
test_ratio = 0.2

# Split the row labels into train and test sets (seeded for reproducibility)
train_idx, test_idx = train_test_split(df_shuffled.index, test_size=test_ratio, random_state=42)

# Create the train and test DataFrames, each with a fresh 0..n-1 index
df_train = df_shuffled.loc[train_idx].reset_index(drop=True)
df_test = df_shuffled.loc[test_idx].reset_index(drop=True)

# Every row must land in exactly one of the two splits
assert len(df_train) + len(df_test) == len(df_shuffled)

# Display the sizes of the resulting DataFrames
print(f"Train set size: {len(df_train)}")
print(f"Test set size: {len(df_test)}")

Train set size: 43506
Test set size: 10877


In [19]:
# Write each split to its own CSV file, omitting the index column.
for split_df, csv_path in ((df_train, 'train_data.csv'), (df_test, 'test_data.csv')):
    split_df.to_csv(csv_path, index=False)