In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, mean_squared_error
from sklearn.svm import NuSVR
from scipy.stats import pearsonr
import gc
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, mean_squared_error, mean_absolute_error
from math import sqrt

def analyze_embeddings(embeddings_df, test_gene_df=None):

    task_names = ['solubility',
# 'BivalentVsLys4', 'BivalentVsNonMethylated', 'Tf_range', 'TF_target_type', 'solubility', 'subcellular_location', 'phastcons'
                  ]
    task_types = ['classification',
    # 'classification', 'classification', 'classification', 'classification', 'classification', 'classification', 'regression'
                  ]

    for task_name, task_type in zip(task_names, task_types):

        try:
            df_task = pd.read_csv(f'GNN/solubility.csv')
        except FileNotFoundError:
            print(f"Error: The file for {task_name} does not exist.")
            continue
        except pd.errors.ParserError:
            print(f"Error: File format issue with {task_name}.csv.")
            continue

        task_name2 = df_task.columns[1]
        print(f'Currently analysing {task_name2}')


        df_task = df_task.rename(columns={df_task.columns[1]: f'{task_name2}'})
        merged_df = embeddings_df.merge(df_task, on='Gene name', how='inner')
        merged_df = merged_df.dropna()

        if merged_df.empty:
            print(f"Warning: No common genes found for {task_name}.")
            continue

        if test_gene_df is not None:

            if task_type == 'classification':
                label_encoder = LabelEncoder()
                merged_df[f'{task_name2}'] = label_encoder.fit_transform(merged_df[f'{task_name2}'])
            test_gene_df = test_gene_df.rename(columns={test_gene_df.columns[0]: 'Gene name'})
            test_data = merged_df[merged_df['Gene name'].isin(test_gene_df['Gene name'])]
            train_data = merged_df[~merged_df['Gene name'].isin(test_gene_df['Gene name'])]
            X_train = train_data.drop(['Gene name', f'{task_name2}'], axis=1)
            y_train = train_data[f'{task_name2}']
            X_test = test_data.drop(['Gene name', f'{task_name2}'], axis=1)
            y_test = test_data[f'{task_name2}']
        else:
            if task_type == 'classification':
                label_encoder = LabelEncoder()
                merged_df[f'{task_name2}'] = label_encoder.fit_transform(merged_df[f'{task_name2}'])
            X = merged_df.drop(['Gene name', f'{task_name2}'], axis=1)
            y = merged_df[f'{task_name2}']
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        if task_type == 'classification':

            num_classes = len(np.unique(y_train))

            def train_and_evaluate(X_train_fold, y_train_fold, X_test_fold, y_test_fold):
                model = LogisticRegression(C=1.0, max_iter=1000)
                model.fit(X_train_fold, y_train_fold)
                test_probabilities = model.predict_proba(X_test_fold)

                test_predictions = np.argmax(test_probabilities, axis=1)

                accuracy = accuracy_score(y_test_fold, test_predictions)
                f1 = f1_score(y_test_fold, test_predictions, average='weighted')
                precision = precision_score(y_test_fold, test_predictions, average='weighted')
                recall = recall_score(y_test_fold, test_predictions, average='weighted')

                if num_classes == 2:
                    auc = roc_auc_score(y_test_fold, test_probabilities[:, 1])
                else:  # Multi-class classification
                    auc = roc_auc_score(y_test_fold, test_probabilities, multi_class='ovr')

                return auc, accuracy, f1, precision, recall

            # Parallelizing the cross-validation
            results = Parallel(n_jobs=-1)(delayed(train_and_evaluate)(X_train.iloc[train_index], y_train.iloc[train_index], X_train.iloc[test_index], y_train.iloc[test_index]) for train_index, test_index in kfold.split(X_train, y_train))

            auc_scores, accuracies, f1s, precisions, recalls = zip(*results)

            mean_auc = np.mean(auc_scores)
            std_auc = np.std(auc_scores)
            mean_accuracy = np.mean(accuracies)
            std_accuracy = np.std(accuracies)
            mean_f1 = np.mean(f1s)
            std_f1 = np.std(f1s)
            mean_precision = np.mean(precisions)
            std_precision = np.std(precisions)
            mean_recall = np.mean(recalls)
            std_recall = np.std(recalls)
            print(f"AUC for {task_name}: {mean_auc:.2f} ± {std_auc:.2f}")
            print(f"Accuracy for {task_name}: {mean_accuracy:.2f} ± {std_accuracy:.2f}")
            print(f"F1 for {task_name}: {mean_f1:.2f} ± {std_f1:.2f}")
            print(f"Precision for {task_name}: {mean_precision:.2f} ± {std_precision:.2f}")
            print(f"Recall for {task_name}: {mean_recall:.2f} ± {std_recall:.2f}")


            metrics = ['AUC', 'Accuracy']
            mean_values = [mean_auc, mean_accuracy]

            x = np.arange(len(metrics))
            width = 0.35

            fig, ax = plt.subplots()
            bars = ax.bar(x, mean_values, width, color=['blue', 'orange'])

            ax.set_ylabel('Scores')
            ax.set_title(f'5 fold AUC and Accuracy of {task_name}')
            ax.set_xticks(x)
            ax.set_xticklabels(metrics)
            ax.set_ylim(0, 1)

            for bar in bars:
                height = bar.get_height()
                ax.annotate(f'{height:.2f}',
                            xy=(bar.get_x() + bar.get_width() / 2, height),
                            xytext=(0, 3),
                            textcoords="offset points",
                            ha='center', va='bottom')

            plt.show()

analyze_embeddings(embeddings_df)

Currently analysing Solubility


ValueError: could not convert string to float: 'High Count'

In [2]:
import pandas as pd
embeddings_df = pd.read_csv('GNN/gcn_triplets_features.csv')
embeddings_df.rename(columns={'node': 'Gene name'}, inplace=True)
embeddings_df

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,Gene name,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature55,feature56,feature57,feature58,feature59,feature60,feature61,feature62,feature63,feature64
0,FES,0.743222,0.235243,0.352072,1.756445,0.743565,-1.508854,-3.176088,-0.940406,1.629964,...,-0.955505,0.330420,0.224715,0.982453,0.020559,-1.924390,-0.973003,1.812171,-0.198376,0.745180
1,SLC7A7,0.532286,0.560859,0.246730,1.508120,0.478819,-0.854478,-2.329918,-0.624550,1.022054,...,-0.598676,-0.016088,-0.208056,0.609815,0.169559,-1.695399,-0.262267,1.149558,-0.115290,0.701267
2,HSPA2,1.819809,1.217307,1.204578,3.261093,1.266466,-1.694946,-4.142288,-1.602228,2.270458,...,-2.058925,0.748306,0.380028,2.251389,-0.729229,-3.374894,-1.932471,3.118355,-0.053532,0.580218
3,ZSCAN9,-0.049771,0.297367,0.202798,0.406264,-0.392944,0.019731,-1.533474,-0.134429,1.112097,...,-0.357202,-0.237703,-0.097645,0.017778,0.036101,-1.237287,0.132048,0.553661,0.307169,0.202550
4,SNX10,0.053182,0.195924,0.207336,0.596792,-0.283178,-0.264548,-1.961040,-0.234913,1.344523,...,-0.467625,-0.103853,0.009128,0.161284,0.019535,-1.408144,-0.086195,0.848493,0.285213,0.317987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14083,IQCJ-SCHIP1,-0.307897,0.222599,0.088348,0.243157,-0.603942,0.076903,-1.794751,-0.046529,1.266210,...,-0.224223,-0.410543,-0.182258,-0.240350,0.229533,-1.328864,0.398958,0.433612,0.348908,0.304247
14084,CLPS,-0.030397,0.359858,0.022437,0.636999,-0.053850,-0.220293,-1.487177,-0.180808,0.746018,...,-0.182334,-0.323413,-0.327768,-0.023477,0.295560,-1.135074,0.322596,0.416174,0.100651,0.514219
14085,DNER,0.372892,0.662111,0.574072,1.532861,0.600428,-1.320759,-3.449981,-1.180403,1.757178,...,-0.808224,-0.358834,0.296531,0.802689,0.493470,-2.160690,-0.696728,1.541111,-0.717573,0.579141
14086,SOX7,0.385752,0.238906,0.213002,1.374729,0.338651,-1.215234,-3.103146,-0.720489,1.641950,...,-0.659299,-0.048657,0.015113,0.496584,0.274928,-1.819720,-0.460867,1.379758,-0.131238,0.808782


In [3]:
import pandas as pd
col_name = ['protein']
for i in range(1,64):
  col_name.append('feature'+str(i))
features_df = pd.read_csv('GNN/gcn_triplets_features.csv')
print(features_df)
labels_df = pd.read_csv('GNN/solubility.csv')
labels_df.rename(columns={'Gene name': 'node'}, inplace=True)
result = pd.merge(features_df, labels_df, on='node')
result

              node  feature1  feature2  feature3  feature4  feature5  \
0              FES  0.743222  0.235243  0.352072  1.756445  0.743565   
1           SLC7A7  0.532286  0.560859  0.246730  1.508120  0.478819   
2            HSPA2  1.819809  1.217307  1.204578  3.261093  1.266466   
3           ZSCAN9 -0.049771  0.297367  0.202798  0.406264 -0.392944   
4            SNX10  0.053182  0.195924  0.207336  0.596792 -0.283178   
...            ...       ...       ...       ...       ...       ...   
14083  IQCJ-SCHIP1 -0.307897  0.222599  0.088348  0.243157 -0.603942   
14084         CLPS -0.030397  0.359858  0.022437  0.636999 -0.053850   
14085         DNER  0.372892  0.662111  0.574072  1.532861  0.600428   
14086         SOX7  0.385752  0.238906  0.213002  1.374729  0.338651   
14087       CXCL14  0.593781  0.327969  0.123732  1.834183  0.862133   

       feature6  feature7  feature8  feature9  ...  feature55  feature56  \
0     -1.508854 -3.176088 -0.940406  1.629964  ...  -0.9555

Unnamed: 0,node,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature59,feature60,feature61,feature62,feature63,feature64,Solubility,Label,Word_Count,Count_Category
0,ERAP2,0.753897,0.631133,0.427581,1.782455,0.481419,-0.884905,-2.686477,-0.722017,1.401010,...,-0.135192,-2.098118,-0.548589,1.556824,0.126840,0.639594,Membrane,0,117,High Count
1,ADAMTSL5,-0.186542,0.207906,0.018607,0.335001,-0.197333,-0.169863,-1.384160,-0.145617,0.778067,...,0.323807,-0.923131,0.282468,0.295773,0.050299,0.376505,Soluble,1,28,Low Count
2,TBC1D30,0.249541,0.322157,0.501383,0.842090,-0.010922,-0.601853,-2.427240,-0.617756,1.647056,...,-0.011332,-1.655825,-0.636937,1.287607,-0.059919,0.183111,Membrane,0,55,High Count
3,KCNK18,0.015749,0.352334,0.121614,0.688152,0.102661,-0.496016,-1.731121,-0.396495,0.824390,...,0.404570,-1.118492,0.070874,0.545471,-0.211385,0.462640,Membrane,0,184,High Count
4,NDNF,-0.076272,0.322188,0.177611,0.529901,-0.139105,-0.344516,-1.834110,-0.341659,1.055206,...,0.340516,-1.207081,0.073859,0.566437,-0.096292,0.356826,Soluble,1,129,High Count
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1350,TRABD2B,-0.241699,0.176269,0.029612,0.153664,-0.386701,0.060677,-1.153191,-0.023378,0.766541,...,0.217727,-0.852572,0.346148,0.210977,0.195861,0.236231,Membrane,0,96,High Count
1351,RPS9,2.142154,1.474390,1.369221,3.583318,1.168501,-1.464075,-4.080817,-1.525270,2.339054,...,-1.100922,-3.659000,-2.045669,3.350450,0.306895,0.509614,Soluble,1,205,High Count
1352,SLC22A16,0.266339,0.551295,0.178985,0.978763,0.127065,-0.304876,-1.527941,-0.312292,0.690271,...,0.152300,-1.285455,0.078404,0.661898,0.032808,0.470590,Membrane,0,93,High Count
1353,FBN3,0.306774,0.515952,0.406222,1.302955,0.274931,-0.933556,-2.978215,-0.794996,1.637023,...,0.282289,-1.983814,-0.414793,1.326478,-0.231975,0.581117,Soluble,1,90,High Count


In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

X = result.drop(columns=['node', 'Solubility', 'Label', 'Word_Count', 'Count_Category']).values
y = result['Label'].values

# Data normalization
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Divide training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensor
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
#y_test = torch.tensor(y_test, dtype=torch.float32)


model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
threshold = 0.5

y_pred_classes = (y_pred > threshold).astype(int)
accuracy = (y_pred_classes == y_test).mean()
print(f"Accuracy: {accuracy}")

sk_acc = accuracy_score(y_test ,y_pred)
print(f"sk: {sk_acc}")

Mean Squared Error: 0.14907588257449092
Accuracy: 0.8007380073800738


ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import torch
from torch.utils.data import TensorDataset
import numpy as np

# Assuming `result` contains your dataset

# Prepare features and target variable
X = result.drop(columns=['node', 'Solubility', 'Label', 'Word_Count', 'Count_Category']).values
y = result['Label'].values

# Data normalization
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Divide training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensor
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)

model = LogisticRegression(C=1.0, max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6531365313653137


In [20]:
# 创建并训练逻辑回归模型
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.metrics import mean_squared_error, accuracy_score

model = LogisticRegression(C=1.0, max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# 计算准确率
accuracy = roc_auc_score(y_test, y_pred)
#auc = roc_auc_score(y_test, y_pred, multi_class='ovr')
print(f"Accuracy: {accuracy}")
sk_acc = accuracy_score(y_test ,y_pred)
print(y_test)
print(y_pred)
print(f"sk: {sk_acc}")

Accuracy: 0.6498631636562672
[1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0 0 1 1 1 0 0 1 0 0 1 0 1 0 0 0
 1 1 0 0 1 0 1 0 1 1 0 1 1 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0
 1 1 1 1 1 1 1 1 0 1 0 0 0 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 1 0 0 0 0 1
 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 0 1 0 1 0 0 0 1 0 0 1 1 1 0 0 0 1 1 1 0
 0 0 0 1 0 1 0 0 0 1 0 1 1 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 0 1 0 1 0 0 0 0 0
 0 0 0 1 1 0 0 0 0 0 0 1 0 0 1 1 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 1 0 1 0 0 0
 0 0 0 1 1 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1 1 1
 0 1 1 0 1 1 0 0 0 0 0 1]
[1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1.
 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 0. 0.
 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1.
 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1.
 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0.
 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 

In [42]:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

kf = KFold(n_splits=5, random_state=42, shuffle=True)

mse_scores = []
accuracy_scores = []

for train_index, test_index in kf.split(X):
    # 分割数据
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # 训练模型
    model = LinearRegression()
    model.fit(X_train, y_train)

    # 预测
    y_pred = model.predict(X_test)
    
    # 计算 MSE
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)
    
    # 计算准确率（对于回归任务，这可能不适用，除非你有特定的阈值来决定分类）
    threshold = 0.5
    y_pred_classes = (y_pred > threshold).astype(int)
    accuracy = (y_pred_classes == y_test).mean()
    accuracy_scores.append(accuracy)

# 输出平均 MSE 和准确率
print(f"Average Mean Squared Error: {np.mean(mse_scores)}")
print(f"Average Accuracy: {np.mean(accuracy_scores)}")

Average Mean Squared Error: 0.13987583189899747
Average Accuracy: 0.8199261992619926


In [10]:
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import cross_validate
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 定义模型
model = LogisticRegression(C=1.0, max_iter=1000)

# 定义AUC计算方法为多类别
auc_scorer = make_scorer(roc_auc_score, needs_proba=True, multi_class='ovr')

# 执行交叉验证，计算AUC
cv_results = cross_validate(model, X, y, cv=5, scoring=auc_scorer, return_train_score=True)

# 输出AUC分数
print(f"Train AUC: {cv_results['train_score'].mean():.2f}")
print(f"Test AUC: {cv_results['test_score'].mean():.2f}")

Train AUC: 0.76
Test AUC: 0.75


