In [1]:
import pandas as pd
from model import ContrastiveLoss, MultiPosConLoss, SequenceEncoder
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch
import torch.nn as nn
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, matthews_corrcoef
import wandb
from collections import namedtuple

In [2]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mzongl[0m ([33mcsc240_lztp[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

# Load Data

In [3]:
# rename columns
def remove_rename(df: pd.DataFrame) -> pd.DataFrame:
    df = df.drop(["peak.1", "gene.1"], axis=1)

    num_cells = (len(df.columns)-4) // 2
    new_cols = ['peak', 'gene', 'Pair', 'is_pair'] + ["atac." + str(x) for x in range(num_cells)] + ["rna." + str(x) for x in range(num_cells)]
    df = df.rename(columns=dict(zip(df.columns, new_cols)))
    return df

def set_labels(df: pd.DataFrame) -> pd.DataFrame:
    df["is_pair"] = df["is_pair"].apply(lambda x: 1 if x else 0)
    return df
    
def normalize(df: pd.DataFrame) -> pd.DataFrame:
    # only counts
    num_df = df[df.columns[4:]]
    
    # Summing each column (sample) to get library sizes
    library_sizes = num_df.sum(axis=0)
    
    # Normalizing to CPM
    cpm_df = num_df.div(library_sizes, axis=1) * 10**6
    
    # If you want to log-transform the CPM data
    log_cpm_df = cpm_df.apply(lambda x: np.log2(x + 1))

    return pd.concat([df[df.columns[:4]], log_cpm_df], axis=1)
    
DATA_PATH = "../data/Tab_delimited_text/"
train_df = pd.read_csv(DATA_PATH + "train.csv")
test_df = pd.read_csv(DATA_PATH + "test.csv")

train_df = remove_rename(train_df)
train_df = set_labels(train_df)
train_df = normalize(train_df)

test_df = remove_rename(test_df)
test_df = set_labels(test_df)
test_df = normalize(test_df)

train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Train Embedding Model

In [4]:
# dataset
class SequencingDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe
        self.features_atac = dataframe.drop(["peak", "gene", "Pair", "is_pair"] + [x for x in dataframe.columns if "rna" in x], axis=1).values
        self.features_rna = dataframe.drop(["peak", "gene", "Pair", "is_pair"] + [x for x in dataframe.columns if "atac" in x], axis=1).values
        self.labels = dataframe["is_pair"].values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x1 = torch.tensor(self.features_atac[idx], dtype=torch.float32)
        x2 = torch.tensor(self.features_rna[idx], dtype=torch.float32)
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x1, x2, y

In [5]:
def relationship_score(model, sequence1, sequence2):
    model.eval()  # Set model to evaluation mode
    
    with torch.no_grad():  # No need to track gradients during inference
        embedding1 = model(sequence1)
        embedding2 = model(sequence2)
        
        # Compute distance between embeddings
        distance = nn.functional.pairwise_distance(embedding1, embedding2)

        relation = torch.sigmoid(-distance)
        
    return relation.item()

def make_binary_task(model, dataloader):
    X = []
    y = []
    for point in dataloader:
        relation = relationship_score(model, point[0], point[1])
        X.append(relation)
        y.append(point[2].item())
    return np.array(X).reshape(-1, 1), y

In [6]:
# data loading
train_dataset = SequencingDataset(train_df)
val_dataset = SequencingDataset(val_df)
test_dataset = SequencingDataset(test_df)

# Create DataLoader
train2_dataloader = DataLoader(train_dataset)
val_dataloader = DataLoader(val_dataset)
test_dataloader = DataLoader(test_dataset)

# define metrics
metrics = [
    ("f1", f1_score),
    ("acc", accuracy_score),
    ("mcc", matthews_corrcoef),
]

def train(use_wandb=True):
    if use_wandb:
        wandb.init()
        config = wandb.config
    else:
        config = {
            "batch_size": 8,
            "embedding_dim": 64,
            "epochs": 10,
            "lr": 1e-3,
            "margin": 1.0,
        }
        Config = namedtuple("configuration", list(config.keys()))
        config = Config(**config)
    train_dataloader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    
    # Assuming you have defined your model `SequenceEncoder` and loaded it
    model = SequenceEncoder(embedding_dim=config.embedding_dim)
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    
    # Training loop
    criterion = ContrastiveLoss(margin=config.margin)
    
    num_epochs = config.epochs
    for epoch in range(num_epochs):
        for batch_idx, (atac, rna, labels) in enumerate(train_dataloader):
            optimizer.zero_grad()
            
            atac_features = model(atac)
            rna_features = model(rna)
            
            loss = criterion(atac_features, rna_features, labels)
            loss.backward()
            optimizer.step()

        if use_wandb:
            wandb.log({"epoch": epoch+1, "loss": loss.item()})
        # print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")
        
    X, y = make_binary_task(model, train2_dataloader)
    clf = LogisticRegression(random_state=42).fit(X, y)

    X, y_true = make_binary_task(model, val_dataloader)
    y_pred = clf.predict(X)

    if use_wandb:
        wandb.log({"val_" + metricname: metric(y_true, y_pred) for metricname, metric in metrics})
    else:
        print({"val_" + metricname: metric(y_true, y_pred) for metricname, metric in metrics})
    return model, clf

In [9]:
# parameter sweep
sweep_configuration = {
    "method": "grid",
    "name": "contrastive",
    "metric": {"goal": "maximize", "name": "val_f1"},
    "parameters": {
        "batch_size": {"values": [16, 32, 64]},
        "embedding_dim": {"values": [64, 128, 256]},
        "epochs": {"min": 3, "max": 10, "distribution": "int_uniform"},
        "lr": {"max": 1e-1, "min": 1e-5, "distribution": "log_uniform_values"},
        "margin": {"min": 0.5, "max": 1.5},
    },
}

sweep_id = wandb.sweep(sweep=sweep_configuration, project="hackathon-2024")

Create sweep with ID: 8lf1jk1n
Sweep URL: https://wandb.ai/csc240_lztp/hackathon-2024/sweeps/8lf1jk1n


In [10]:
wandb.agent(sweep_id, function=train, count=30)

[34m[1mwandb[0m: Agent Starting Run: 108cywli with config:
[34m[1mwandb[0m: 	batch_size: 4
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	lr: 0.0001154506272506112
[34m[1mwandb[0m: 	margin: 1.0867454096026874


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011163564144468788, max=1.0…

VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▅█
loss,█▁▁
val_acc,▁
val_f1,▁
val_mcc,▁

0,1
epoch,3.0
loss,0.86835
val_acc,0.46667
val_f1,0.61905
val_mcc,-0.07039


[34m[1mwandb[0m: Agent Starting Run: kukk5u4u with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	epochs: 6
[34m[1mwandb[0m: 	lr: 0.0004319362458375587
[34m[1mwandb[0m: 	margin: 0.9485007517515488


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▄▅▇█
loss,█▃▁▁▃▁
val_acc,▁
val_f1,▁
val_mcc,▁

0,1
epoch,6.0
loss,0.18591
val_acc,0.48333
val_f1,0.65169
val_mcc,0.0


[34m[1mwandb[0m: Agent Starting Run: myzzomug with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	epochs: 7
[34m[1mwandb[0m: 	lr: 0.001045790969981246
[34m[1mwandb[0m: 	margin: 0.9303192539427846


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011163545833308792, max=1.0…

VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▅▆▇█
loss,█▁▂▁▁▁▁
val_acc,▁
val_f1,▁
val_mcc,▁

0,1
epoch,7.0
loss,0.5413
val_acc,0.48333
val_f1,0.65169
val_mcc,0.0


[34m[1mwandb[0m: Agent Starting Run: u5uozrm0 with config:
[34m[1mwandb[0m: 	batch_size: 4
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	lr: 4.048996795118957e-05
[34m[1mwandb[0m: 	margin: 1.3321669566330954


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▃▆█
loss,▂█▂▁
val_acc,▁
val_f1,▁
val_mcc,▁

0,1
epoch,4.0
loss,1.45365
val_acc,0.45
val_f1,0.61176
val_mcc,-0.14262


[34m[1mwandb[0m: Agent Starting Run: e9am54mc with config:
[34m[1mwandb[0m: 	batch_size: 4
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	epochs: 7
[34m[1mwandb[0m: 	lr: 0.018996069317532137
[34m[1mwandb[0m: 	margin: 0.8969712214307501


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▅▆▇█
loss,▁▁▁█▁▁▁
val_acc,▁
val_f1,▁
val_mcc,▁

0,1
epoch,7.0
loss,14.92507
val_acc,0.48333
val_f1,0.64368
val_mcc,-0.00619


[34m[1mwandb[0m: Agent Starting Run: gsv56hcc with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	epochs: 7
[34m[1mwandb[0m: 	lr: 0.030765652837904757
[34m[1mwandb[0m: 	margin: 0.7055300949974258


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▅▆▇█
loss,▁█▁▁▁▁▁
val_acc,▁
val_f1,▁
val_mcc,▁

0,1
epoch,7.0
loss,0.27999
val_acc,0.48333
val_f1,0.65169
val_mcc,0.0


[34m[1mwandb[0m: Agent Starting Run: 0ep65oh3 with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	lr: 0.03639569638060656
[34m[1mwandb[0m: 	margin: 0.994303159095732


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▁▁█▁▁▁▁▁▁
val_acc,▁
val_f1,▁
val_mcc,▁

0,1
epoch,10.0
loss,21.81164
val_acc,0.48333
val_f1,0.65169
val_mcc,0.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 25wythkm with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	lr: 1.7559388845380295e-05
[34m[1mwandb[0m: 	margin: 1.459367972764954


VBox(children=(Label(value='0.001 MB of 0.006 MB uploaded\r'), FloatProgress(value=0.18373680943513346, max=1.…

0,1
epoch,▁▃▆█
loss,▅█▅▁
val_acc,▁
val_f1,▁
val_mcc,▁

0,1
epoch,4.0
loss,10.25145
val_acc,0.48333
val_f1,0.65169
val_mcc,0.0


[34m[1mwandb[0m: Agent Starting Run: gl7b6ujb with config:
[34m[1mwandb[0m: 	batch_size: 4
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	lr: 0.0026900170147286266
[34m[1mwandb[0m: 	margin: 1.1763820184352332


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▅█
loss,▁▃█
val_acc,▁
val_f1,▁
val_mcc,▁

0,1
epoch,3.0
loss,0.69192
val_acc,0.48333
val_f1,0.65169
val_mcc,0.0


[34m[1mwandb[0m: Agent Starting Run: za706r0r with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	epochs: 9
[34m[1mwandb[0m: 	lr: 0.019120252387131106
[34m[1mwandb[0m: 	margin: 0.8096864124483761


VBox(children=(Label(value='0.001 MB of 0.006 MB uploaded\r'), FloatProgress(value=0.18373680943513346, max=1.…

0,1
epoch,▁▂▃▄▅▅▆▇█
loss,▁▅▁▁▁▁▁█▁
val_acc,▁
val_f1,▁
val_mcc,▁

0,1
epoch,9.0
loss,0.36876
val_acc,0.48333
val_f1,0.65169
val_mcc,0.0


[34m[1mwandb[0m: Agent Starting Run: 9o8b5wgo with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	lr: 0.00019792801211801923
[34m[1mwandb[0m: 	margin: 1.0845279667847645


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▅█
loss,█▄▁
val_acc,▁
val_f1,▁
val_mcc,▁

0,1
epoch,3.0
loss,1.11651
val_acc,0.51667
val_f1,0.57971
val_mcc,0.04717


[34m[1mwandb[0m: Agent Starting Run: ia6ohed8 with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	lr: 1.5711592297876668e-05
[34m[1mwandb[0m: 	margin: 1.0719894620976047


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▃▅▆█
loss,█▅▁▂▁
val_acc,▁
val_f1,▁
val_mcc,▁

0,1
epoch,5.0
loss,7.19079
val_acc,0.46667
val_f1,0.6
val_mcc,-0.06071


[34m[1mwandb[0m: Agent Starting Run: jhhxljot with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	lr: 1.206606570833793e-05
[34m[1mwandb[0m: 	margin: 1.4019892187710268
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
[34m[1mwandb[0m: [32m[41mERROR[0m Problem finishing run
Exception in thread Thread-74 (_run_job):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/wandb/agents/pyagent.py", line 307, in _run_job
    self._function()
  File "/tmp/ipykernel_24976/2606322927.py", line 20, in train
  File "/opt/conda/lib/python3.11/site-packages/wandb/sdk/wandb_init.py", line 1191, in init
    wandb._sentry.reraise(e)
  File "/opt/conda/lib/python3.11/site-packages/wandb/analytics/sentry.py", line 155, in reraise
    raise exc.with_traceback(sys.exc_info()[2])
  File "/opt/conda/lib/python3.11/site-packages/wandb/sdk/wandb_init.py", line 1177, in init
   

# Test

In [28]:
model, clf = train(use_wandb=False)
X, y = make_binary_task(model, train2_dataloader)
clf = LogisticRegression(random_state=42).fit(X, y)

{'val_f1': 0.06666666666666667, 'val_acc': 0.5333333333333333, 'val_mcc': 0.13460334176483385}


In [None]:
X, y = make_binary_task(model, val_dataloader)
y_pred = clf.predict(X)
y_true = y

print(classification_report(y_true, y_pred))

In [None]:
X_test, y_test = make_binary_task(model, test_dataloader)

In [None]:
y_pred = clf.predict(X_test)
y_pred