In [2]:
import os

import pandas as pd
import numpy as np
import torch

from sklearn.metrics import (
    accuracy_score,
    mean_absolute_error,
    mean_squared_error,
    root_mean_squared_error,
    r2_score,
    roc_auc_score,
)
from sklearn.model_selection import cross_val_score, train_test_split

# Baseline Imports
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
# Tabpfn
from tabpfn import TabPFNRegressor

# Restrict CUDA to the device with index 7, then report what torch sees.
os.environ['CUDA_VISIBLE_DEVICES'] = '7'
cuda_ready = torch.cuda.is_available()
print("CUDA Available:", cuda_ready)
if cuda_ready:
    # Index 0 refers to the first (and only) device left visible above.
    device_label = torch.cuda.get_device_name(0)
else:
    device_label = "No CUDA device"
print("Device Name:", device_label)

TypeError: 

In [95]:
# Load datasets
# NOTE(review): this is an absolute, user-specific path; consider making it
# configurable so the notebook runs on other machines.
data_dir = "/home/vivian.chu/vivian-sandbox/other/xAI-cancer-competition/.data"
train_data = pd.read_csv(f"{data_dir}/train.csv")  # All columns, including cell line IDs
train_targets = pd.read_csv(f"{data_dir}/train_targets.csv")
test_data = pd.read_csv(f"{data_dir}/test.csv")

# Clean column names of the training features.
# The first CSV column has no header, so pandas names it 'Unnamed: 0';
# it is renamed to 'sampleId' and stored as a string identifier.
train_data = train_data.rename(columns={'Unnamed: 0': 'sampleId'})
train_data['sampleId'] = train_data['sampleId'].astype(str)
# Literal replacement of '-' with '.' in every column name.
train_data.columns = train_data.columns.str.replace("-", ".", regex=False)

## Filter out genes expressed in < 90% of samples

In [96]:
# Minimum fraction of samples in which a gene must be non-zero to be kept
# (0.9 = expressed in at least 90% of samples).
MIN_EXPRESSED_FRACTION = 0.9

# Per-gene fraction of samples with a non-zero value.
# 'sampleId' is an identifier, not a gene, so it is excluded by name.
# Note: NaN compares as != 0, so missing values would count as "expressed".
gene_expression_percentage = (train_data.drop(columns=['sampleId']) != 0).mean()

# Keep the genes that meet the expression threshold
filtered_genes = gene_expression_percentage[
    gene_expression_percentage >= MIN_EXPRESSED_FRACTION
].index

# Filter the train_data to include only the selected genes and 'sampleId'
filtered_train_data = train_data[['sampleId'] + list(filtered_genes)]

# Replace train_data
train_data = filtered_train_data.copy()

## Subset to highly variable genes

In [97]:
# Number of most-variable genes to retain.
N_TOP_VARIABLE_GENES = 500

# Extract gene expression data (excluding sample IDs)
gene_expression_data = train_data.drop(columns=['sampleId'])

# Calculate mean and variance for each gene
gene_means = gene_expression_data.mean(axis=0)
gene_variances = gene_expression_data.var(axis=0)

# Normalized variance = variance / mean (the index of dispersion).
# The name `gene_cv` is kept, but this is not the coefficient of variation,
# which would be std / mean.
# NOTE(review): a gene whose mean is 0 yields inf/NaN here — confirm this
# cannot happen for this data.
gene_cv = gene_variances / gene_means

# Rank genes by normalized variance and keep the top N_TOP_VARIABLE_GENES
highly_variable_genes = gene_cv.sort_values(ascending=False).head(N_TOP_VARIABLE_GENES)

# Filter the dataframe to retain only these genes (plus the sample ID)
filtered_genes_data = train_data[['sampleId'] + list(highly_variable_genes.index)]

train_data = filtered_genes_data.copy()

# Train, Validation and Target

In [100]:
# Features: gene expression columns only. 'sampleId' is a string identifier,
# not a predictor, so it must not be passed to the models.
X = train_data.drop(columns=['sampleId'])
# Target column.
# NOTE(review): this pairs X and y by row position; it assumes train_targets
# is in the same row order as train_data — confirm, or join on the sample ID.
y = train_targets["AAC"]

# Hold out 20% of the samples for validation (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# Train TabPFN on the training split and predict the validation split
reg = TabPFNRegressor(random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_val)

# Root mean squared error on the validation split (lower is better)
score = root_mean_squared_error(y_val, y_pred)
print(f"TabPFN RMSE: {score:.4f}")

In [None]:
# Define models (same seed everywhere so runs are repeatable)
models = [
    ('TabPFN', TabPFNRegressor(random_state=42)),
    ('RandomForest', RandomForestRegressor(random_state=42)),
    ('XGBoost', XGBRegressor(random_state=42)),
    ('CatBoost', CatBoostRegressor(random_state=42, verbose=0))
]

# Mean 5-fold cross-validated score per model.
# The scorer returns the *negated* RMSE (so that higher is better); the sign
# is flipped back before plotting.
# NOTE(review): n_jobs=-1 runs folds in parallel worker processes — confirm
# this is acceptable for TabPFN when it shares a single GPU.
scoring = 'neg_root_mean_squared_error'
scores = {name: cross_val_score(model, X, y, cv=5, scoring=scoring, n_jobs=-1).mean()
          for name, model in models}


# Plot results
df = pd.DataFrame(list(scores.items()), columns=['Model', 'RMSE'])
df['RMSE'] = -df['RMSE']  # Undo the scorer's negation: smaller is better
ax = df.plot(x='Model', y='RMSE', kind='bar', figsize=(10, 6))
# Zoom the y-axis to just around the observed range so differences are visible
ax.set_ylim(df['RMSE'].min() * 0.99, df['RMSE'].max() * 1.01)
ax.set_ylabel('RMSE')
ax.set_title('Model Comparison - 5-fold Cross-validation \n (Root Mean Squared Error - Smaller is better)')