In [16]:
import numpy as np
import matplotlib as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

In [13]:
ccle = np.load("ccle/data.npy")

In [5]:
drugbank = np.load("drugbank/data.npy")

In [14]:
gdsc = np.load("gdsc/data.npy")

In [9]:
offside = np.load("offside/data.npy")

In [15]:
pdx = np.load("pdx/data.npy")

In [7]:
repurposing_hub = np.load("repurposing_hub/data.npy")

In [3]:
sider = np.load("sider/data.npy")

In [2]:
stitch = np.load("stitch/data_900.npy").astype(np.uint16)

In [8]:
#drugbankY = drugbank[:, -1] #classification
#offsideY = offside[:, -1] #classification
repurposing_hubY = repurposing_hub[:, -1] #classification
#siderY = sider[:, -1] #classification
#stitchY = stitch[:, -1] #classification

In [9]:
#drugbank = drugbank[:, :-1]
#offside = offside[:, :-1]
repurposing_hub = repurposing_hub[:, :-1]
#sider = sider[:, :-1]
#stitch = stitch[:, :-1]

In [11]:
print(repurposing_hubY)

[ 1.  1.  1. ... -1. -1. -1.]


In [21]:
def analysis(X, y):
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    nb_class = GaussianNB()
    accuracies = []
    auroc_scores = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        n_components = min(X_train.shape[1], 693)
        pca = PCA(n_components = n_components)
        
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.fit_transform(X_test)
        
        
        nb_class.fit(X_train_pca, y_train)
        
        y_pred = nb_class.predict(X_test_pca)
        
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        y_proba = nb_class.predict_proba(X_test_pca)[:, 1]
        auroc = roc_auc_score(y_test, y_proba)
        auroc_scores.append(auroc)
        
    for fold, (accuracy, auroc) in enumerate(zip(accuracies, auroc_scores), start=1):
        print(f"Fold {fold}: Accuracy = {accuracy: .4f}, AUROC = {auroc: .4f}")
        
    average_accuracy = np.mean(accuracies)
    average_auroc = np.mean(auroc_scores)
    print("\nAverage Accuracy:", average_accuracy)
    print("Average AUROC Score:", average_auroc)
        

In [16]:
analysis(drugbank, drugbankY)

Fold 1: Accuracy =  0.6511, AUROC =  0.6902
Fold 2: Accuracy =  0.6463, AUROC =  0.6821
Fold 3: Accuracy =  0.6468, AUROC =  0.6796

Average Accuracy: 0.6480420248328559
Average AUROC Score: 0.6839863557728231


In [8]:
analysis(offside, offsideY)

Fold 1: Accuracy =  0.7048, AUROC =  0.6452
Fold 2: Accuracy =  0.6940, AUROC =  0.6456
Fold 3: Accuracy =  0.6969, AUROC =  0.6384

Average Accuracy: 0.6985530827593273
Average AUROC Score: 0.64303661139092


In [7]:
analysis(sider, siderY)

Fold 1: Accuracy =  0.8257, AUROC =  0.6653
Fold 2: Accuracy =  0.8309, AUROC =  0.6690
Fold 3: Accuracy =  0.8452, AUROC =  0.6337

Average Accuracy: 0.8339231593909778
Average AUROC Score: 0.6559913959687306


In [6]:
analysis(stitch, stitchY)

Fold 1: Accuracy =  0.1031, AUROC =  0.5215
Fold 2: Accuracy =  0.1229, AUROC =  0.5224
Fold 3: Accuracy =  0.1246, AUROC =  0.5210

Average Accuracy: 0.11686472915740354
Average AUROC Score: 0.5216583301832992


In [12]:
analysis(repurposing_hub, repurposing_hubY)

Fold 1: Accuracy =  0.6150, AUROC =  0.6350
Fold 2: Accuracy =  0.6114, AUROC =  0.6309
Fold 3: Accuracy =  0.6124, AUROC =  0.6369

Average Accuracy: 0.6129231214914874
Average AUROC Score: 0.6342812642101564


In [24]:
def linAnalysis(X):
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    rmse_scores = []
    accuracy_scores = []
    
    for i, (train_index, test_index) in enumerate(kf.split(X), 1):
        X_train, X_test = X[train_index], X[test_index]

        pca = PCA(n_components=2)
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)

        model = LinearRegression()
        model.fit(X_train_pca, X_train)

        y_pred = model.predict(X_test_pca)
        rmse = np.sqrt(mean_squared_error(X_test, y_pred))
        rmse_scores.append(rmse)

        print(f"Fold {i}: RMSE = {rmse}")
        
    average_rmse = np.mean(rmse_scores)
    print("Average RMSE Score:", average_rmse)

In [25]:
linAnalysis(ccle)

Fold 1: RMSE = 0.707691653388026
Fold 2: RMSE = 0.7085340525038896
Fold 3: RMSE = 0.7080955176375918
Average RMSE Score: 0.7081070745098357


In [26]:
linAnalysis(pdx)

Fold 1: RMSE = 0.6920392372124256
Fold 2: RMSE = 0.6888561005385097
Fold 3: RMSE = 0.6974736223839395
Average RMSE Score: 0.6927896533782917


In [27]:
linAnalysis(gdsc)

Fold 1: RMSE = 0.7108343977841044
Fold 2: RMSE = 0.710914804517822
Fold 3: RMSE = 0.7111724840323289
Average RMSE Score: 0.7109738954447518
