In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

In [2]:
stitch = np.load("stitch/data_900.npy").astype(np.int32)

In [3]:
drugbank = np.load("drugbank/data.npy").astype(np.int32)

In [4]:
repurposing_hub = np.load("repurposing_hub/data.npy").astype(np.int32)

In [5]:
pdx = np.load("pdx/data.npy").astype(np.int32)

In [6]:
gdsc = np.load("gdsc/data.npy").astype(np.int32)

In [7]:
ccle = np.load("ccle/data.npy").astype(np.int32)

In [8]:
sider = np.load("sider/data.npy").astype(np.int32)

In [9]:
offside = np.load("offside/data.npy").astype(np.int32)

In [10]:
stitchY = stitch[:, -1] #classification

In [11]:
drugbankY = drugbank[:, -1] #classification

In [12]:
repurposing_hubY = repurposing_hub[:, -1] #classification

In [13]:
siderY = sider[:, -1] #classification

In [14]:
offsideY = offside[:, -1] #classification

In [15]:
stitch = stitch[:, :-1]

In [16]:
drugbank = drugbank[:, :-1]

In [17]:
repurposing_hub = repurposing_hub[:, :-1]

In [18]:
sider = sider[:, :-1]

In [19]:
offside = offside[:, :-1]

In [20]:
pca1 = PCA(n_components = 50)
stitchPCA = pca1.fit_transform(stitch)
drugbankPCA = pca1.transform(drugbank)
repurposing_hubPCA = pca1.transform(repurposing_hub)

In [21]:
drugTargetsY = np.concatenate((stitchY, drugbankY, repurposing_hubY), axis = 0)

In [22]:
drugTargets = np.concatenate((np.array(stitchPCA), np.array(drugbankPCA), np.array(repurposing_hubPCA)), axis = 0)

In [23]:
pca2 = PCA(n_components = 50)
pdxPCA = pca2.fit_transform(pdx)
cclePCA = pca2.transform(ccle)
gdscPCA = pca2.transform(gdsc)

In [24]:
drugResponse = np.concatenate((np.array(pdxPCA), np.array(cclePCA), np.array(gdscPCA)), axis = 0)

In [25]:
pca3 = PCA(n_components = 50)
offsidePCA = pca3.fit_transform(offside)
siderPCA = pca3.transform(sider)

In [26]:
drugSideEffectsY = np.concatenate((siderY, offsideY), axis = 0)

In [27]:
drugSideEffects = np.concatenate((np.array(siderPCA), np.array(offsidePCA)), axis = 0)

In [35]:
def linearRegression(X, Z, oldMetric, n_components=2):
    # Apply PCA to reduce dimensionality of X
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)  # Fit and transform PCA on X
    
    # Train linear regression model on PCA-reduced X
    model = LinearRegression()
    model.fit(X_pca, X)  # Note: We're assuming the goal is to reconstruct X from X_pca

    # Apply the same PCA transformation to Z
    pcaZ = PCA(n_components = n_components)
    Z_pca = pcaZ.fit_transform(Z)  # Transform Z with the fitted PCA

    # Predict the original higher-dimensional data of Z from its PCA-reduced version
    Z_pred = model.predict(Z_pca)
    
    Z_predA = adjust_dimensions(Z, Z_pred)
    
    # Calculate and print the RMSE between Z and its prediction for evaluation
    mse = mean_squared_error(Z, Z_predA)
    gains = gain(oldMetric, mse)
    print(f"gain: {gains}")

In [28]:
def linAnalysisClass2(X, y, Z, w, oldMetric):
    # Train the model on the entire training set
    pca = PCA(n_components=50)
    X_pca = pca.fit_transform(X)  # Transform the training data
    pcaZ = PCA(n_components=50)
    Z_pca = pcaZ.fit_transform(Z)
    
    model = LinearRegression()
    model.fit(X_pca, y)
    
    # Predict on the test set
    y_pred_continuous = model.predict(Z_pca)
    
    # Calculate AUC for the continuous predictions
    auc = roc_auc_score(w, y_pred_continuous)
    
    # Convert continuous predictions to class predictions based on the mean of the predictions
    y_pred_class = [1 if pred > np.mean(y_pred_continuous) else -1 for pred in y_pred_continuous]
    
    # Calculate accuracy for the class predictions
    accuracy = accuracy_score(w, y_pred_class)
    
    gains = gain(oldMetric, auc)
    
    # Print the results
    print(f"Gain = {gains}")
    
    # Calculate and display the confusion matrix
    #confusion_matrix_result = confusion_matrix(w, y_pred_class)
    #cm_display = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix_result, display_labels=[False, True])
    #cm_display.plot(cmap='viridis')
    
    

In [29]:
def adjust_dimensions(Z, Z_pred):
    if Z.shape[1] != Z_pred.shape[1]:
        # If Z_pred has fewer features, add zeros to match Z
        if Z_pred.shape[1] < Z.shape[1]:
            padding = np.zeros((Z_pred.shape[0], Z.shape[1] - Z_pred.shape[1]))
            Z_pred_adjusted = np.hstack((Z_pred, padding))
        # If Z_pred has more features, truncate to match Z
        else:
            Z_pred_adjusted = Z_pred[:, :Z.shape[1]]
    else:
        Z_pred_adjusted = Z_pred
    
    return Z_pred_adjusted


In [30]:
def gain(oldMetric, newMetric):
    gains = (oldMetric - newMetric)/oldMetric
    return gains

In [45]:
def linRegNormal(X, n_components=2, n_folds=3):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    mse_scores = []

    for i, (train_index, test_index) in enumerate(kf.split(X), 1):
        X_train, X_test = X[train_index], X[test_index]

        pca = PCA(n_components=n_components)
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)

        model = LinearRegression()
        model.fit(X_train_pca, X_train)  # Assuming X_train is used as y for unsupervised learning

        y_pred = model.predict(X_test_pca)
        mse = mean_squared_error(X_test, y_pred)  # MSE between X_test and predicted values
        mse_scores.append(mse)

        print(f"Fold {i}: MSE = {mse}")
        
    avg_mse = np.mean(mse_scores)
    print(f"Average MSE across {n_folds} folds: {avg_mse}")


    return mse_scores

In [39]:
def linRegClass(X, y, n_components=2, n_folds=3):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    auroc_scores = []

    for i, (train_index, test_index) in enumerate(kf.split(X), 1):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        pca = PCA(n_components=n_components)
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)

        model = LinearRegression()
        model.fit(X_train_pca, y_train)

        y_pred = model.predict(X_test_pca)

        auroc = roc_auc_score(y_test, y_pred)
        auroc_scores.append(auroc)

        print(f"Fold {i}: AUROC = {auroc}")

    avg_auroc = np.mean(auroc_scores)
    print(f"Average AUROC across {n_folds} folds: {avg_auroc}")

    return avg_auroc


In [40]:
linRegClass(drugTargets, drugTargetsY)

Fold 1: AUROC = 0.528551954775859
Fold 2: AUROC = 0.5329402833244491
Fold 3: AUROC = 0.5297121000981451
Average AUROC across 3 folds: 0.5304014460661511


0.5304014460661511

In [41]:
linRegClass(drugSideEffects, drugSideEffectsY)

Fold 1: AUROC = 0.5271571516090339
Fold 2: AUROC = 0.5249550183122819
Fold 3: AUROC = 0.510320120732555
Average AUROC across 3 folds: 0.5208107635512903


0.5208107635512903

In [46]:
linRegNormal(drugResponse)

Fold 1: MSE = 0.46290900980294963
Fold 2: MSE = 0.462357227011842
Fold 3: MSE = 0.46248437864066455
Average MSE across 3 folds: 0.46258353848515205


[0.46290900980294963, 0.462357227011842, 0.46248437864066455]

In [32]:
AStitch = 0.541
ADrug = 0.651
ARep = 0.653
ASider = 0.5401
AOff = 0.923

MPDX = 0.6928
MGDSC = 0.711
MCCLE = 0.7081

ADT = 0.5304
ADSE = 0.5208
MDR = 0.4626

In [33]:
linAnalysisClass2(drugTargets, drugTargetsY, drugSideEffects, drugSideEffectsY, ADSE)

Gain = 0.0864771307460889


In [34]:
linAnalysisClass2(drugSideEffects, drugSideEffectsY, drugTargets, drugTargetsY, ADT)

Gain = 0.0948214951545776


In [36]:
linearRegression(drugTargets, drugResponse, MDR)

gain: -0.09574006814426135


In [37]:
linearRegression(drugSideEffects, drugResponse, MDR)

gain: -0.11966048205567552


In [38]:
linearRegression(drugResponse, drugSideEffects, ADSE)

gain: -1.0374886626404949


In [39]:
linearRegression(drugResponse, drugTargets, ADT)

gain: 0.989898228345489


In [None]:
linAnalysisClass2(stitch, stitchY, drugbank, drugbankY, ADrug)

In [25]:
linAnalysisClass2(stitch, stitchY, repurposing_hub, repurposing_hubY, ARep)

Gain = 0.2411248627749035


In [46]:
linearRegression(stitch, pdx, MPDX)

gain: 0.6126867359251321


In [47]:
linearRegression(stitch, gdsc, MGDSC)

gain: 0.6195811343893053


In [48]:
linearRegression(stitch, ccle, MCCLE)

gain: 0.6189910106443999


In [24]:
linAnalysisClass2(stitch, stitchY, sider, siderY, ASider)

Gain = 0.029596435641812874


In [25]:
linAnalysisClass2(stitch, stitchY, offside, offsideY, AOff)

Gain = 0.45982710377955127


In [26]:
linAnalysisClass2(drugbank, drugbankY, stitch, stitchY, AStitch)

Gain = 0.111102834906391


In [27]:
linAnalysisClass2(drugbank, drugbankY, repurposing_hub, repurposing_hubY, ARep)

Gain = 0.2537334190997533


In [49]:
linearRegression(drugbank, pdx, MPDX)

gain: 0.6129736986758029


In [50]:
linearRegression(drugbank, gdsc, MGDSC)

gain: 0.6197703212878195


In [51]:
linearRegression(drugbank, ccle, MCCLE)

gain: 0.6185341356119249


In [28]:
linAnalysisClass2(drugbank, drugbankY, sider, siderY, ASider)

Gain = 0.05776429657161896


In [29]:
linAnalysisClass2(drugbank, drugbankY, offside, offsideY, AOff)

Gain = 0.4627884192593452


In [30]:
linAnalysisClass2(repurposing_hub, repurposing_hubY, stitch, stitchY, AStitch)

Gain = 0.07751547746990523


In [31]:
linAnalysisClass2(repurposing_hub, repurposing_hubY, drugbank, drugbankY, ADrug)

Gain = 0.2969895955329035


In [52]:
linearRegression(repurposing_hub, pdx, MPDX)

gain: 0.6145882249586487


In [53]:
linearRegression(repurposing_hub, gdsc, MGDSC)

gain: 0.6198873825996124


In [54]:
linearRegression(repurposing_hub, ccle, MCCLE)

gain: 0.6189845277360111


In [32]:
linAnalysisClass2(repurposing_hub, repurposing_hubY, sider, siderY, ASider)

Gain = 0.1331362982532458


In [33]:
linAnalysisClass2(repurposing_hub, repurposing_hubY, offside, offsideY, AOff)

Gain = 0.4566464145527701


In [55]:
linearRegression(pdx, stitch, AStitch)

gain: 0.9745582999583898


In [56]:
linearRegression(pdx, drugbank, ADrug)

gain: 0.9788235917738247


In [29]:
linearRegression(pdx, repurposing_hub, ARep)

gain: 0.9789023722099702


In [25]:
linearRegression(pdx, gdsc, MGDSC)

gain: 0.6593917076247345


In [18]:
linearRegression(pdx, ccle, MCCLE)

gain: 0.6512200665441348


In [30]:
linearRegression(pdx, sider, ASider)

gain: 0.3729041533296005


In [31]:
linearRegression(pdx, offside, AOff)

gain: 0.6322559313783346


In [32]:
linearRegression(gdsc, stitch, AStitch)

gain: 0.9747383276826829


In [33]:
linearRegression(gdsc, drugbank, ADrug)

gain: 0.9789837689562725


In [34]:
linearRegression(gdsc, repurposing_hub, ARep)

gain: 0.9790575737004116


In [15]:
linearRegression(gdsc, pdx, MPDX)

gain: 0.6455089433722532


In [35]:
linearRegression(gdsc, ccle, MCCLE)

gain: 0.6532040979857425


In [36]:
linearRegression(gdsc, sider, ASider)

gain: 0.3742610587159729


In [37]:
linearRegression(gdsc, offside, AOff)

gain: 0.632955516762619


In [38]:
linearRegression(ccle, stitch, AStitch)

gain: 0.9752717116189489


In [39]:
linearRegression(ccle, drugbank, ADrug)

gain: 0.9794167369220912


In [40]:
linearRegression(ccle, repurposing_hub, ARep)

gain: 0.9794860131685212


In [41]:
linearRegression(ccle, pdx, MPDX)

gain: 0.64668695525931


In [42]:
linearRegression(ccle, gdsc, MGDSC)

gain: 0.6545268299078072


In [43]:
linearRegression(ccle, sider, ASider)

gain: 0.37303519831226173


In [44]:
linearRegression(gdsc, offside, AOff)

gain: 0.6326848071294903


In [45]:
linAnalysisClass2(sider, siderY, stitch, stitchY, AStitch)

Gain = 0.061859830468028815


In [35]:
linAnalysisClass2(sider, siderY, drugbank, drugbankY, ADrug)

Gain = 0.2772035790315484


In [36]:
linAnalysisClass2(sider, siderY, repurposing_hub, repurposing_hubY, ARep)

Gain = 0.2157202335122307


In [57]:
linearRegression(sider, pdx, MPDX)

gain: 0.6462446926222434


In [58]:
linearRegression(sider, gdsc, MGDSC)

gain: 0.6541130347842451


In [59]:
linearRegression(sider, ccle, MCCLE)

gain: 0.6531364577463494


In [37]:
linAnalysisClass2(sider, siderY, offside, offsideY, AOff)

Gain = 0.45125755496434766


In [38]:
linAnalysisClass2(offside, offsideY, stitch, stitchY, AStitch)

Gain = 0.10084942317372886


In [39]:
linAnalysisClass2(offside, offsideY, drugbank, drugbankY, ADrug)

Gain = 0.2797650844810367


In [40]:
linAnalysisClass2(offside, offsideY, repurposing_hub, repurposing_hubY, ARep)

Gain = 0.21174666200132236


In [60]:
linearRegression(offside, pdx, MPDX)

gain: 0.6467661979899578


In [61]:
linearRegression(offside, gdsc, MGDSC)

gain: 0.6534859867494103


In [62]:
linearRegression(offside, ccle, MCCLE)

gain: 0.6529373318023147


In [41]:
linAnalysisClass2(offside, offsideY, sider, siderY, ASider)

Gain = 0.053309426667882576
