# nn.py

In [None]:
"""
This script trains a deep learning model to predict a target variable based on input features.

- The input features and target variable are read from CSV files.
- The model is a feed-forward neural network with batch normalization and dropout layers.
- The model is trained using Stochastic Gradient Descent (SGD) with a learning rate of 0.001.
- The loss function is Mean Squared Error (MSE).
- The model's performance is evaluated on a separate test dataset after each training epoch.
- Training and test losses are printed for each epoch.
- The trained model parameters are saved after each epoch.
- The script supports training on multiple GPUs if they are available.
"""
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import os
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)
from functions import FullConnectedBlock, NeuralNetwork2



############ Load Training Data ############################

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Device:', device)
print(f'Number of available GPUs: {torch.cuda.device_count()}')

data1 = pd.read_csv("/mnt/data/macaulay/datas/training_omicExpression_Embeddings.csv")
data2 = pd.read_csv("/mnt/data/macaulay/datas/training_gene_embeddings.csv")
df_Y = pd.read_csv('/mnt/data/macaulay/datas/training_crispr.csv')
print('Training data loaded successfully')

batch_size1 = 30
avg_train_losses = []
combined_batches = []
input_dim = data1.shape[1] + data2.shape[1] - 1
loss_fn = nn.MSELoss()



model = NeuralNetwork2(input_dim)

optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

if torch.cuda.device_count() >= 4:
    print("Using 4 GPUs!")
    model = nn.DataParallel(model, device_ids=list(range(4)))

print('Model initialized successfully')
saved_models = os.listdir('/mnt/data/macaulay/model_state2')


epochs = [int(file.split('_')[-1].split('.')[0]) for file in saved_models if 'crispr_fc1_model_state_epoch_' in file]


last_epoch = max(epochs) if epochs else 0
start_epoch = last_epoch + 1
end_epoch = start_epoch + 20






for epoch in range(start_epoch, end_epoch):
    # ... training loop ...

    print(f"Starting New Epoch")

    Y_data_loader = 0
    epoch_loss = 0
    model_path = f'/mnt/data/macaulay/model_state2/crispr_fc1_model_state_epoch_{epoch-1}.pth'
    model.to(device)
    if os.path.exists(model_path):

        model.load_state_dict(torch.load(model_path))
        print(f'Model {epoch - 1} loaded successfully for epoch {epoch}')

    else:
        print('No saved model found. Training from scratch.')

    for i in range(0, len(data1), batch_size1):
        batch_data1 = data1.iloc[i:i + batch_size1]
        batch_data2 = data2.iloc[0:len(data2)]


        batch_data1['key'] = 1
        batch_data2['key'] = 1
        batch_data1 = batch_data1[list(data1.columns[0:]) + ['key']]
        batch_data2 = batch_data2[list(data2.columns[1:]) + ['key']]
        combined_batch = pd.merge(batch_data1, batch_data2, on='key').drop(columns=['key'])
        combined_batches.append(combined_batch)


        X_train = pd.concat(combined_batches)
        Y_train = df_Y.iloc[Y_data_loader:(Y_data_loader + (len(data2) * batch_size1))]
        Y_data_loader += (len(data2) * batch_size1)

        combined_batches = []


        X_train1 = torch.tensor(X_train.values, dtype=torch.float32)
        Y_train1 = torch.tensor(Y_train.values.reshape(-1, 1), dtype=torch.float32)  # Reshape Y_train to (num_samples, 1)

        train_data = TensorDataset(X_train1, Y_train1)
        train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True, num_workers=4, pin_memory=True)

        train_loss = 0.0
        model.train()

        for batch, (inputs, targets) in enumerate(train_dataloader):
            inputs = inputs.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_dataloader)

        print(f"Batch {i//batch_size1 + 1}.{epoch}: Avg. training loss = {avg_train_loss:.4f}")
        epoch_loss += avg_train_loss

        print(f"{i+batch_size1} of cell line is done")

    avg_epoch_loss = epoch_loss / ((len(data1) // batch_size1) * (len(data2) // len(data2)))
    avg_train_losses.append(avg_epoch_loss)

    torch.save(model.state_dict(), f'/mnt/data/macaulay/model_state2/crispr_fc1_model_state_epoch_{epoch}.pth')
    print('model saved successfully')
    print(f"Epoch {epoch}/{end_epoch-1}: Avg. epoch loss = {avg_epoch_loss:.4f}")


loss_df = pd.DataFrame(avg_train_losses, columns=["Avg_Train_Loss"])
loss_filepath = "datas/crispr_training_loss.csv"
loss_df.to_csv(loss_filepath, index=False)
print(f"Training loss saved to {loss_filepath}")

In [55]:

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import os
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)
from functions import FullConnectedBlock, NeuralNetwork2



def initialize_environment(data1_path, data2_path, df_Y_path, learning_rate):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('Device:', device)
    print(f'Number of available GPUs: {torch.cuda.device_count()}')

    data1 = pd.read_csv(data1_path)
    data2 = pd.read_csv(data2_path)
    df_Y = pd.read_csv(df_Y_path)
    print('Training data loaded successfully')

    loss_fn = nn.MSELoss()
    input_dim = data1.shape[1] + data2.shape[1] - 1
    model = NeuralNetwork2(input_dim)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    if torch.cuda.device_count() >= 4:
        print("Using 4 GPUs!")
        model = nn.DataParallel(model, device_ids=list(range(4)))

    print('Model initialized successfully')
    
    return device, data1, data2, df_Y, model, optimizer, loss_fn

def cartesian_product(data1, data2):
    data1 = data1.iloc[:, 1:]
    data1['key'] = 1
    data2['key'] = 1
    combined_data = pd.merge(data1, data2, on='key').drop(columns=['key'])
    return combined_data

def cartesian_product_generator(data1, data2, df_Y, batch_size1):
    for i in range(0, len(data1), batch_size1):
        start_idx = i * len(data2)
        end_idx = (i + batch_size1) * len(data2)
        
        batch_data1 = data1.iloc[i:i + batch_size1]
        combined_data = cartesian_product(batch_data1, data2)
        
        batch_Y = df_Y.iloc[start_idx:end_idx]
        
        yield combined_data, batch_Y

def load_model(model, epoch, model_save_path):
    model_path = os.path.join(model_save_path, f'crispr_fc1_model_state_epoch_{epoch-1}.pth')
    if os.path.exists(model_path):
        model.load_state_dict(torch.load(model_path))
        print(f'Model {epoch - 1} loaded successfully for epoch {epoch}')
    else:
        print('No saved model found. Training from scratch.')
    return model

# Main Training Loop

def main_training_loop(data1_path, data2_path, df_Y_path, batch_size1, learning_rate, num_epochs, model_save_path):
    
    
    device, data1, data2, df_Y, model, optimizer, loss_fn = initialize_environment(data1_path, data2_path, df_Y_path, learning_rate)
    
    saved_models = os.listdir(model_save_path)
    epochs = [int(file.split('_')[-1].split('.')[0]) for file in saved_models if 'crispr_fc1_model_state_epoch_' in file]
    last_epoch = max(epochs) if epochs else 0
    start_epoch = last_epoch + 1
    end_epoch = start_epoch + num_epochs
    
    all_train_losses = []

    for epoch in range(start_epoch, end_epoch):
        print(f"Starting New Epoch {epoch}")
        model = load_model(model, epoch, model_save_path)
        
        for batch_X, batch_Y in cartesian_product_generator(data1, data2, df_Y, batch_size1):
            X_train = torch.tensor(batch_X.values, dtype=torch.float32)
            Y_train = torch.tensor(batch_Y.values.reshape(-1, 1), dtype=torch.float32)
            
            train_data = TensorDataset(X_train, Y_train)
            train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True, num_workers=4, pin_memory=True)

            train_loss = 0.0
            model.train()
            print('training')
            for inputs, targets in train_dataloader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = loss_fn(outputs, targets)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()

            avg_train_loss = train_loss / len(train_dataloader)
            all_train_losses.append(avg_train_loss)

            print(f"Epoch {epoch}.{batch_X.index[0]//batch_size1 + 1}: Avg. training loss = {avg_train_loss:.4f}")

        torch.save(model.state_dict(), f'{model_save_path}crispr_fc1_model_state_epoch_{epoch}.pth')
        print('Model saved successfully')

    loss_df = pd.DataFrame(all_train_losses, columns=["Avg_Train_Loss"])
    loss_filepath = os.path.join(model_save_path, "crispr_training_loss.csv")
    loss_df.to_csv(loss_filepath, index=False)
    print(f"Training loss saved to {loss_filepath}")




In [56]:
main_training_loop(
    data1_path="datas/training_gene_embeddings.csv", #gene embeddings
    data2_path="datas/training_omicExpression_Embeddings.csv", #omic expression embeddings
    df_Y_path="datas/training_crispr.csv", 
    batch_size1 = 1, 
    learning_rate=0.001, 
    num_epochs=20, 
    model_save_path='datas/'
)


Device: cpu
Number of available GPUs: 0
Training data loaded successfully
Model initialized successfully
Starting New Epoch 1
No saved model found. Training from scratch.
training
Epoch 1.1: Avg. training loss = 0.5493
training
Epoch 1.1: Avg. training loss = 0.6046
training


KeyboardInterrupt: 

In [59]:
def cartesian_product_generator(data1, data2, df_Y, batch_size1):
    for i in range(0, len(data1), batch_size1):
        start_idx = i * len(data2)
        end_idx = (i + batch_size1) * len(data2)
        
        batch_data1 = data1.iloc[i:i + batch_size1]
        combined_data = cartesian_product(batch_data1, data2)
        
        batch_Y = df_Y.iloc[start_idx:end_idx]
        
        yield combined_data, batch_Y

def cartesian_product(data1, data2):
    data1 = data1.iloc[:, 1:]
    data1['key'] = 1
    data2['key'] = 1
    combined_data = pd.merge(data1, data2, on='key').drop(columns=['key'])
    return combined_data

In [58]:
import pandas as pd
data2 = pd.read_csv("datas/test_omicExpression_Embeddings.csv")
data1 = pd.read_csv("datas/training_gene_embeddings.csv")
df_Y = pd.read_csv('datas/A_test_gene__Y_crispr.csv')
print('Training data loaded successfully')
batch_size1 = 1




Training data loaded successfully


In [65]:
for batch_X, batch_Y in cartesian_product_generator(data1, data2, df_Y, batch_size1):
    display(batch_X)
    display(batch_Y)
    break

Unnamed: 0,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,...,502_y,503_y,504_y,505_y,506_y,507_y,508_y,509_y,510_y,511_y
0,0.013528,-0.073574,0.050425,-0.007168,-0.083504,0.009434,-0.243107,-0.036453,-0.003885,0.214968,...,-0.002187,0.000594,0.000299,-0.000457,-6e-06,-1.2e-05,-0.000443,0.000373,0.000129,-0.000948
1,0.013528,-0.073574,0.050425,-0.007168,-0.083504,0.009434,-0.243107,-0.036453,-0.003885,0.214968,...,0.000141,-0.000154,0.000114,0.000162,0.000534,0.000236,-0.000229,0.000265,-4.7e-05,0.00022
2,0.013528,-0.073574,0.050425,-0.007168,-0.083504,0.009434,-0.243107,-0.036453,-0.003885,0.214968,...,0.000143,-0.00016,0.00011,0.000163,0.000534,0.00024,-0.00023,0.000263,-4.1e-05,0.000224
3,0.013528,-0.073574,0.050425,-0.007168,-0.083504,0.009434,-0.243107,-0.036453,-0.003885,0.214968,...,0.000141,-0.000159,0.000111,0.00016,0.000535,0.00024,-0.000233,0.000262,-4.6e-05,0.000224
4,0.013528,-0.073574,0.050425,-0.007168,-0.083504,0.009434,-0.243107,-0.036453,-0.003885,0.214968,...,0.000141,-0.000161,0.000113,0.000163,0.000538,0.00024,-0.000231,0.000263,-5e-05,0.000219
5,0.013528,-0.073574,0.050425,-0.007168,-0.083504,0.009434,-0.243107,-0.036453,-0.003885,0.214968,...,0.000143,-0.000161,0.000111,0.000164,0.000532,0.000239,-0.000229,0.000267,-4.4e-05,0.000221
6,0.013528,-0.073574,0.050425,-0.007168,-0.083504,0.009434,-0.243107,-0.036453,-0.003885,0.214968,...,0.000142,-0.000159,0.000112,0.000163,0.000536,0.000239,-0.000232,0.000264,-4.6e-05,0.000224
7,0.013528,-0.073574,0.050425,-0.007168,-0.083504,0.009434,-0.243107,-0.036453,-0.003885,0.214968,...,0.000143,-0.000159,0.000111,0.000163,0.000534,0.000235,-0.000234,0.000264,-4.6e-05,0.000222
8,0.013528,-0.073574,0.050425,-0.007168,-0.083504,0.009434,-0.243107,-0.036453,-0.003885,0.214968,...,0.00014,-0.000158,0.000113,0.000163,0.000537,0.000238,-0.000232,0.000265,-4.6e-05,0.000225
9,0.013528,-0.073574,0.050425,-0.007168,-0.083504,0.009434,-0.243107,-0.036453,-0.003885,0.214968,...,0.000142,-0.000159,0.000112,0.000164,0.000536,0.000239,-0.000237,0.000266,-4.7e-05,0.000224


Unnamed: 0,Y
0,0.271543
1,-0.567019
2,-0.504965
3,-0.590148
4,-0.281329
5,-0.421348
6,-0.107869
7,-0.586895
8,-0.442476
9,-0.211472


In [66]:
data1

Unnamed: 0,Gene,0,1,2,3,4,5,6,7,8,...,1890,1891,1892,1893,1894,1895,1896,1897,1898,1899
0,ZFX,0.013528,-0.073574,0.050425,-0.007168,-0.083504,0.009434,-0.243107,-0.036453,-0.003885,...,0.005840,0.054215,-0.018552,0.010348,0.048311,0.011924,0.011584,0.030220,0.333860,0.010832
1,LAMP2,0.004373,-0.052026,0.115090,-0.018382,-0.366196,0.019519,-0.149446,-0.032409,-0.004365,...,0.053146,0.022796,0.041019,0.017856,-0.112012,0.043097,0.028423,0.056248,0.138608,0.031280
2,ITGA2B,0.005604,-0.024311,0.063571,-0.014624,-0.464599,0.016603,-0.144890,-0.017510,-0.003596,...,0.075977,-0.017172,0.020559,0.034451,-0.044476,-0.010408,0.003882,0.095413,0.050243,0.038366
3,ASB4,0.003966,-0.157932,0.034745,-0.003395,0.023854,-0.018922,0.011001,-0.031456,-0.005197,...,0.040247,0.077687,-0.035935,0.004659,-0.122684,-0.008640,0.038181,0.139848,-0.008251,-0.013466
4,GDE1,0.013251,0.258344,0.148767,-0.011984,-0.054923,0.024582,-0.055574,-0.057606,-0.006614,...,-0.049405,-0.035010,-0.057596,0.064666,-0.099617,-0.003899,0.068397,-0.026983,-0.023365,-0.025853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17538,TMEM247,0.007376,-0.055579,0.015968,0.000652,-0.090398,0.004820,-0.177906,-0.015425,-0.006904,...,0.040532,0.027138,-0.139576,0.042335,-0.019513,-0.029749,0.099430,-0.019195,0.082027,0.096192
17539,EEF1AKMT4,0.006579,0.054662,0.058391,-0.011576,-0.136212,-0.006775,-0.078994,-0.114747,-0.006499,...,0.010785,0.021735,-0.020145,-0.087090,-0.070418,0.054884,0.017232,0.025832,0.109838,0.177065
17540,TBCE,0.004036,-0.087206,0.031435,-0.010328,0.106398,0.000606,-0.061067,-0.038477,-0.006378,...,0.005424,0.037653,0.015974,0.058448,-0.102026,-0.009397,-0.068728,-0.000311,-0.100703,0.022476
17541,CCDC39,0.006182,-0.038689,0.041625,-0.007740,-0.266323,-0.044024,0.244122,-0.006107,-0.002780,...,0.068222,0.006219,0.012965,0.039683,-0.014748,-0.024827,0.035910,-0.009677,0.042979,0.058219


In [44]:
batch_X.iloc[:, 1898:]

Unnamed: 0,1898,1899,0_y,1_y,2_y,3_y,4_y,5_y,6_y,7_y,...,502_y,503_y,504_y,505_y,506_y,507_y,508_y,509_y,510_y,511_y
0,0.33386,0.010832,0.000024,0.000033,-0.000010,0.000005,0.000176,-0.000150,-0.000153,0.000060,...,-0.000140,0.000144,-0.000168,-0.000127,0.000486,-0.000168,0.000188,-0.000161,0.000007,-0.000165
1,0.33386,0.010832,0.000024,0.000033,-0.000011,0.000006,0.000174,-0.000148,-0.000154,0.000062,...,-0.000141,0.000146,-0.000168,-0.000130,0.000486,-0.000167,0.000185,-0.000161,0.000005,-0.000164
2,0.33386,0.010832,-0.000017,-0.000032,0.000128,-0.000038,0.000098,-0.000182,0.000033,0.000178,...,-0.000005,0.000037,0.000053,-0.000043,-0.000504,-0.000078,0.000085,-0.000107,0.000034,-0.000070
3,0.33386,0.010832,-0.000016,-0.000033,0.000127,-0.000041,0.000099,-0.000183,0.000032,0.000179,...,-0.000006,0.000036,0.000050,-0.000044,-0.000501,-0.000076,0.000087,-0.000109,0.000034,-0.000071
4,0.33386,0.010832,-0.000018,-0.000031,0.000129,-0.000042,0.000096,-0.000185,0.000032,0.000180,...,-0.000004,0.000034,0.000049,-0.000044,-0.000501,-0.000077,0.000084,-0.000108,0.000033,-0.000071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
966,0.33386,0.010832,-0.000161,0.000083,0.000215,0.000011,0.000161,0.000663,-0.000133,0.000253,...,-0.000511,0.000225,-0.000158,-0.000136,0.000217,-0.000201,-0.000028,-0.000254,0.000039,-0.000173
967,0.33386,0.010832,-0.000167,0.000080,0.000215,0.000015,0.000158,0.000668,-0.000137,0.000259,...,-0.000511,0.000229,-0.000156,-0.000143,0.000218,-0.000203,-0.000038,-0.000258,0.000039,-0.000176
968,0.33386,0.010832,-0.000164,0.000083,0.000214,0.000011,0.000156,0.000669,-0.000138,0.000253,...,-0.000508,0.000228,-0.000158,-0.000139,0.000219,-0.000204,-0.000039,-0.000259,0.000039,-0.000175
969,0.33386,0.010832,-0.000166,0.000082,0.000214,0.000013,0.000159,0.000664,-0.000139,0.000255,...,-0.000507,0.000231,-0.000155,-0.000143,0.000217,-0.000202,-0.000039,-0.000259,0.000038,-0.000174


In [62]:
batch_Y

Unnamed: 0,Y
0,0.271543
1,-0.567019
2,-0.504965
3,-0.590148
4,-0.281329
5,-0.421348
6,-0.107869
7,-0.586895
8,-0.442476
9,-0.211472


In [67]:
data2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,key
0,0.000364,0.000712,0.000426,-0.000169,0.00031,-0.004777,2.9e-05,-0.000258,0.00011,0.000228,...,0.000594,0.000299,-0.000457,-6e-06,-1.2e-05,-0.000443,0.000373,0.000129,-0.000948,1
1,3e-06,9e-06,-0.000139,4e-05,-0.000273,0.000309,0.000109,-0.000223,-0.000303,-0.000253,...,-0.000154,0.000114,0.000162,0.000534,0.000236,-0.000229,0.000265,-4.7e-05,0.00022,1
2,-1e-06,1.7e-05,-0.000145,4e-05,-0.000268,0.0003,0.00011,-0.000233,-0.000301,-0.000253,...,-0.00016,0.00011,0.000163,0.000534,0.00024,-0.00023,0.000263,-4.1e-05,0.000224,1
3,-3e-06,1.3e-05,-0.000141,3.5e-05,-0.000269,0.000302,0.000113,-0.000228,-0.0003,-0.000249,...,-0.000159,0.000111,0.00016,0.000535,0.00024,-0.000233,0.000262,-4.6e-05,0.000224,1
4,-8e-06,1.8e-05,-0.000138,3.7e-05,-0.000269,0.000305,0.000115,-0.00023,-0.000298,-0.000252,...,-0.000161,0.000113,0.000163,0.000538,0.00024,-0.000231,0.000263,-5e-05,0.000219,1
5,-4e-06,1.7e-05,-0.000137,3.6e-05,-0.00027,0.000302,0.000113,-0.000231,-0.000302,-0.000251,...,-0.000161,0.000111,0.000164,0.000532,0.000239,-0.000229,0.000267,-4.4e-05,0.000221,1
6,-4e-06,1.4e-05,-0.000141,3.9e-05,-0.000271,0.000303,0.000114,-0.00023,-0.000299,-0.000252,...,-0.000159,0.000112,0.000163,0.000536,0.000239,-0.000232,0.000264,-4.6e-05,0.000224,1
7,-4e-06,9e-06,-0.000139,3.9e-05,-0.00027,0.000302,0.000112,-0.000228,-0.0003,-0.000249,...,-0.000159,0.000111,0.000163,0.000534,0.000235,-0.000234,0.000264,-4.6e-05,0.000222,1
8,-2e-06,1.1e-05,-0.000141,3.7e-05,-0.000274,0.000304,0.000115,-0.00023,-0.000304,-0.000255,...,-0.000158,0.000113,0.000163,0.000537,0.000238,-0.000232,0.000265,-4.6e-05,0.000225,1
9,-4e-06,1.4e-05,-0.00014,4e-05,-0.000277,0.000304,0.000118,-0.00023,-0.000297,-0.000251,...,-0.000159,0.000112,0.000164,0.000536,0.000239,-0.000237,0.000266,-4.7e-05,0.000224,1


In [82]:
epochs = []
epoch = 1

for minor in range(1, len(data1) + 1):
    epoched = f'{epoch}.{minor:02}'  
    epochs.append(float(epoched)) 

In [84]:
len(epochs)

17543

In [None]:
#a.py
"""

Goal
Test the model on the saved models, and calculate the metrics for each gene concatinated with the cell line embeddings

it calculate the correlation by each seen gene
"""
import torch 
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import os
import torch.nn.functional as F
from functions import FullConnectedBlock, NeuralNetwork2
pd.options.mode.chained_assignment = None





# Initialize lists to store metrics


avg_test_losses = []
mae_values = []
rmse_values = []
r2_values = []
correlation_coefficients = []
p_values = []



# Loop through the saved models
# for epoch in range(num_epochs):

target_epochs = [8]

# Loop through the target epochs
for epoch in target_epochs:
    #model_epoch = epoch
    Y_data_count = 0

    model_path = f'/mnt/data/macaulay/model_state2/crispr_fc1_model_state_epoch_{epoch}.pth'
    if os.path.exists(model_path):
        

        data1_test = pd.read_csv("/mnt/data/macaulay/datas/test_omicExpression_Embeddings.csv") #30 >>
        data2_test = pd.read_csv("/mnt/data/macaulay/datas/training_gene_embeddings.csv") #17000 >>
        df_Y_test = pd.read_csv('/mnt/data/macaulay/datas/A_test_gene__Y_crispr.csv') #>>
        print('Test data loaded successfully')




        batch_size1 = len(data1_test)
        batch_size_Y = len(data1_test)
        batch_size2 = 1
        combined_batches_test = []

        input_dim = data1_test.shape[1] + data2_test.shape[1] - 1
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print('Device:', device)
        model = NeuralNetwork2(input_dim)

        # If multiple GPUs
        if torch.cuda.device_count() >= 4:
            print("Using 4 GPUs!")
            model = nn.DataParallel(model, device_ids=list(range(4)))

        model.to(device)
        model.load_state_dict(torch.load(model_path))
        model.eval()  # Set the model to evaluation mode

        # Define the loss function as Mean Squared Error
        loss_fn = nn.MSELoss()

        

        for i in range(0, len(data1_test), batch_size1):
            batch_data1_test = data1_test.iloc[i:i + batch_size1]
            batch_data2_test = data2_test.iloc[0:len(data2_test)]


            batch_data1_test['key'] = 1
            batch_data2_test['key'] = 1
            batch_data1_test = batch_data1_test[list(data1_test.columns[0:]) + ['key']]
            batch_data2_test = batch_data2_test[list(data2_test.columns[1:]) + ['key']]
            combined_batch = pd.merge(batch_data1_test, batch_data2_test, on='key').drop(columns=['key'])
            combined_batches.append(combined_batch)


            X_test = pd.concat(combined_batches)
            Y_test = df_Y.iloc[Y_data_loader:(Y_data_loader + (len(data2_test) * batch_size1))]
            Y_data_loader += (len(data2_test) * batch_size1)

            combined_batches = []

            
            X_test1 = torch.tensor(X_test.values, dtype=torch.float32)
            Y_test1 = torch.tensor(Y_test.values.reshape(-1, 1), dtype=torch.float32)
            test_data = TensorDataset(X_test1, Y_test1)
            test_dataloader = DataLoader(test_data, batch_size=128, shuffle=True)

            #print('Test data preprocessed successfully')
            ##################### The first 1 Batch which forms 100 rows after concat, are preprocessed and fed to the neural network under the same loop #####################

            # Evaluate the model on the test data
            test_loss = 0.0
            actual_outputs = []
            predicted_outputs = []
            with torch.no_grad():  # Disable gradient calculation
                for inputs, targets in test_dataloader:
                    inputs = inputs.to(device)
                    targets = targets.to(device)
                    outputs = model(inputs)
                    loss = loss_fn(outputs, targets)
                    test_loss += loss.item()
                    actual_outputs.extend(targets.cpu().numpy().flatten().tolist())
                    predicted_outputs.extend(outputs.cpu().numpy().flatten().tolist())

            avg_test_loss = test_loss / len(test_dataloader)
            avg_test_losses.append(avg_test_loss)

            # Calculate Pearson correlation coefficient
            correlation_coefficient, p_value = pearsonr(actual_outputs, predicted_outputs)
            correlation_coefficients.append(correlation_coefficient)
            p_values.append(p_value)  # Append the p-value

            mae = mean_absolute_error(actual_outputs, predicted_outputs)
            rmse = np.sqrt(mean_squared_error(actual_outputs, predicted_outputs))
            r2 = r2_score(actual_outputs, predicted_outputs)

            # Append metrics to their respective lists
            mae_values.append(mae)
            rmse_values.append(rmse)
            r2_values.append(r2)

            print(f'Epoch {epoch} : gene {i+1}/{target_epochs}: Avg. test loss = {avg_test_loss:.4f}')
            print(f'Epoch {epoch} : gene {i+1}/{target_epochs}: Correlation Coefficient = {correlation_coefficient:.4f}')
            print(f'Epoch {epoch} : gene {i+1}/{target_epochs}: Mean Absolute Error = {mae:.4f}')
            print(f'Epoch {epoch} : gene {i+1}/{target_epochs}: Root Mean Square Error = {rmse:.4f}')
            print(f'Epoch {epoch} : gene {i+1}/{target_epochs}: R2 Score = {r2:.4f}')


    else:
        print(f'Model for epoch {epoch} not found!')


# Create a list to store the full epoch labels
epochs = []

# Loop through the target epochs
for major in target_epochs:
    for minor in range(1, len(data2_test) + 1):
        epoch = f'{major}.{minor:02}'  # Format as a string with two decimal places
        epochs.append(float(epoch))

# Create a DataFrame to hold the metrics
metrics_df = pd.DataFrame({
    'Epoch': epochs,
    'Correlation_Coefficient': correlation_coefficients,
    'P_Value': p_values,  # Include the p-values
    'Test_Loss': avg_test_losses,
    'MAE': mae_values,
    'RMSE': rmse_values,
    'R2_Score': r2_values
})

# Write the DataFrame to a CSV file
metrics_path = f'datas/metrics/A_seen_genewise_correlation_metrics_summary_crispr_{target_epochs[0]}.csv'
metrics_df.to_csv(metrics_path, index=False)
print(f'Metrics saved to {metrics_path}')




In [None]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import os
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)
from functions import FullConnectedBlock, NeuralNetwork2
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
pd.options.mode.chained_assignment = None

# Initialization and Environment Setup
def initialize_environment(data1_path, data2_path, df_Y_path, learning_rate):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('Device:', device)
    print(f'Number of available GPUs: {torch.cuda.device_count()}')

    data1 = pd.read_csv(data1_path)
    data2 = pd.read_csv(data2_path)
    df_Y = pd.read_csv(df_Y_path)
    print('Training data loaded successfully')

    loss_fn = nn.MSELoss()
    input_dim = data1.shape[1] + data2.shape[1] - 1
    model = NeuralNetwork2(input_dim)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    if torch.cuda.device_count() >= 4:
        print("Using 4 GPUs!")
        model = nn.DataParallel(model, device_ids=list(range(4)))
    print('Model initialized successfully')
    
    return device, data1, data2, df_Y, model, optimizer, loss_fn

def load_test_data(data1_test_path_A, data2_test_path_A, df_Y_test_path_A,
                    data1_test_path_B, data2_test_path_B, df_Y_test_path_B,
                      data1_test_path_C, data2_test_path_C, df_Y_test_path_C):
    
    data1_test_A = pd.read_csv(data1_test_path_A)
    data2_test_A = pd.read_csv(data2_test_path_A)
    df_Y_test_A = pd.read_csv(df_Y_test_path_A)
    data1_test_B = pd.read_csv(data1_test_path_B)
    data2_test_B = pd.read_csv(data2_test_path_B)
    df_Y_test_B = pd.read_csv(df_Y_test_path_B)
    data1_test_C = pd.read_csv(data1_test_path_C)
    data2_test_C = pd.read_csv(data2_test_path_C)
    df_Y_test_C = pd.read_csv(df_Y_test_path_C)
    print('Test data loaded successfully')
    return data1_test_A, data2_test_A, df_Y_test_A, data1_test_B, data2_test_B, df_Y_test_B, data1_test_C, data2_test_C, df_Y_test_C



def cartesian_product(data1, data2):
    data1 = data1.iloc[:, 1:]
    data1['key'] = 1
    data2['key'] = 1
    combined_data = pd.merge(data1, data2, on='key').drop(columns=['key'])
    return combined_data

def cartesian_product_generator(data1, data2, df_Y, batch_size1):
    for i in range(0, len(data1), batch_size1):
        start_idx = i * len(data2)
        end_idx = (i + batch_size1) * len(data2)
        batch_data1 = data1.iloc[i:i + batch_size1]
        combined_data = cartesian_product(batch_data1, data2)
        batch_Y = df_Y.iloc[start_idx:end_idx]
        yield combined_data, batch_Y

def load_model(model, epoch, model_save_path):
    model_path = os.path.join(model_save_path, f'crispr_fc1_model_state_epoch_{epoch-1}.pth')
    if os.path.exists(model_path):
        model.load_state_dict(torch.load(model_path))
        print(f'Model {epoch - 1} loaded successfully for epoch {epoch}')
    else:
        print('No saved model found. Training from scratch.')
    return model

# Evaluation Function
def evaluate_model_on_test_data(model, data1_test, data2_test, df_Y_test, epoch, loss_fn, device, test_batch_size=128):
    model.eval()
    avg_test_losses = []
    mae_values = []
    rmse_values = []
    r2_values = []
    correlation_coefficients = []
    p_values = []
    gene_batches = 1
    for batch_X, batch_Y in cartesian_product_generator(data1_test, data2_test, df_Y_test, gene_batches):
        X_test = torch.tensor(batch_X.values, dtype=torch.float32)
        Y_test = torch.tensor(batch_Y.values.reshape(-1, 1), dtype=torch.float32)
        test_data = TensorDataset(X_test, Y_test)
        test_dataloader = DataLoader(test_data, batch_size=test_batch_size, shuffle=True)

        test_loss = 0.0
        actual_outputs = []
        predicted_outputs = []
        with torch.no_grad():
            for inputs, targets in test_dataloader:
                inputs = inputs.to(device)
                targets = targets.to(device)
                outputs = model(inputs)
                loss = loss_fn(outputs, targets)
                test_loss += loss.item()
                actual_outputs.extend(targets.cpu().numpy().flatten().tolist())
                predicted_outputs.extend(outputs.cpu().numpy().flatten().tolist())

        avg_test_loss = test_loss / len(test_dataloader)
        avg_test_losses.append(avg_test_loss)
        correlation_coefficient, p_value = pearsonr(actual_outputs, predicted_outputs)
        correlation_coefficients.append(correlation_coefficient)
        p_values.append(p_value)
        mae = mean_absolute_error(actual_outputs, predicted_outputs)
        rmse = np.sqrt(mean_squared_error(actual_outputs, predicted_outputs))
        r2 = r2_score(actual_outputs, predicted_outputs)
        mae_values.append(mae)
        rmse_values.append(rmse)
        r2_values.append(r2)

    epochs = []
    for minor in range(1, len(data1_test) + 1):
        epoched = f'{epoch}.{minor:02}'  
        epochs.append(float(epoched))

    metrics_df = pd.DataFrame({
        'Epoch': epochs,
        'Correlation_Coefficient': correlation_coefficients,
        'P_Value': p_values,
        'Test_Loss': avg_test_losses,
        'MAE': mae_values,
        'RMSE': rmse_values,
        'R2_Score': r2_values
    })
    return metrics_df

# Main Training Loop
def updated_main_training_loop(data1_path, data2_path, df_Y_path, 
                               data1_test_path_A, data2_test_path_A, df_Y_test_path_A, 
                               data1_test_path_B, data2_test_path_B, df_Y_test_path_B,
                               data1_test_path_C, data2_test_path_C, df_Y_test_path_C,
                               batch_size1, learning_rate, num_epochs, model_save_path, test_batch_size=128):
    

    device, data1, data2, df_Y, model, optimizer, loss_fn = initialize_environment(data1_path, data2_path, df_Y_path, learning_rate)

        
    saved_models = os.listdir(model_save_path)
    epochs = [int(file.split('_')[-1].split('.')[0]) for file in saved_models if 'crispr_fc1_model_state_epoch_' in file]
    last_epoch = max(epochs) if epochs else 0
    start_epoch = last_epoch + 1
    end_epoch = start_epoch + num_epochs
    
    for epoch in range(start_epoch, end_epoch):
        model.train()
        model = load_model(model, epoch, model_save_path)
        for batch_X, batch_Y in cartesian_product_generator(data1, data2, df_Y, batch_size1):
            X_train = torch.tensor(batch_X.values, dtype=torch.float32)
            Y_train = torch.tensor(batch_Y.values.reshape(-1, 1), dtype=torch.float32)
            train_data = TensorDataset(X_train, Y_train)
            train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True, num_workers=4, pin_memory=True)

            train_loss = 0.0
            for inputs, targets in train_dataloader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = loss_fn(outputs, targets)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()

        torch.save(model.state_dict(), os.path.join(model_save_path, f'crispr_fc1_model_state_epoch_{epoch}.pth'))
        
        # Evaluate the model on test data after each epoch

        data1_test_A, data2_test_A, df_Y_test_A, data1_test_B, data2_test_B, df_Y_test_B, data1_test_C, data2_test_C, df_Y_test_C = load_test_data(data1_test_path_A, data2_test_path_A, df_Y_test_path_A,
                                                                                                                                                data1_test_path_B, data2_test_path_B, df_Y_test_path_B,
                                                                                                                                                  data1_test_path_C, data2_test_path_C, df_Y_test_path_C)



        metrics_df_A = evaluate_model_on_test_data(model, data1_test_A, data2_test_A, df_Y_test_A, epoch, loss_fn, device, test_batch_size)
        metrics_path_A = os.path.join(model_save_path, f"A_metrics_epoch_{epoch}.csv")
        metrics_df_A.to_csv(metrics_path_A, index=False)
        print(f'Metrics for epoch {epoch} saved to {metrics_path_A}')

        metrics_df_B = evaluate_model_on_test_data(model, data1_test_B, data2_test_B, df_Y_test_B, epoch, loss_fn, device, test_batch_size)
        metrics_path_B = os.path.join(model_save_path, f"B_metrics_epoch_{epoch}.csv")
        metrics_df_B.to_csv(metrics_path_B, index=False)
        print(f'Metrics for epoch {epoch} saved to {metrics_path_B}')

        metrics_df_C = evaluate_model_on_test_data(model, data1_test_C, data2_test_C, df_Y_test_C, epoch, loss_fn, device, test_batch_size)
        metrics_path_C = os.path.join(model_save_path, f"C_metrics_epoch_{epoch}.csv")
        metrics_df_C.to_csv(metrics_path_C, index=False)
        print(f'Metrics for epoch {epoch} saved to {metrics_path_C}')




In [None]:
updated_main_training_loop(
    data1_path, data2_path, df_Y_path,
    data1_test_path, data2_test_path, df_Y_test_path,
    batch_size1, learning_rate, num_epochs, model_save_path
)

In [85]:
import pandas as pd
import qnorm

# Sample dataframe
df = pd.DataFrame({
    'A': [5, 2, 3, 4],
    'B': [4, 1, 4, 2],
    'C': [3, 4, 6, 8]
})

# Normalize column-wise (default behavior, axis=1)
df_normalized_columns = qnorm.quantile_normalize(df)

# Normalize row-wise (set axis=0)
df_normalized_rows = qnorm.quantile_normalize(df_normalized_columns.T).T

print("Original Dataframe:")
print(df)
print("\nAfter Column-wise Normalization:")
print(df_normalized_columns)
print("\nAfter Row-wise Normalization:")
print(df_normalized_rows)


Original Dataframe:
   A  B  C
0  5  4  3
1  2  1  4
2  3  4  6
3  4  2  8

After Column-wise Normalization:
          A         B         C
0  5.666667  5.166667  2.000000
1  2.000000  2.000000  3.000000
2  3.000000  5.166667  4.666667
3  4.666667  3.000000  5.666667

After Row-wise Normalization:
        A       B      C
0  4.8750  4.1250  2.500
1  3.3125  3.3125  4.875
2  2.5000  4.8750  4.125
3  4.1250  2.5000  4.875
