In [10]:
import pickle
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve,auc
import warnings
warnings.filterwarnings("ignore")

##### 1. CRISPR-based

In [None]:
# Seeds that key the pre-built positive/negative sample sets; one
# train/validation/test split is produced per seed.
seeds = [0,42,100,1000,10000,100000,1000000,10000000,100000000,1000000000]

# NOTE: unpickling can execute arbitrary code -- only load trusted files.
# The context managers close the file handles that a bare open() would leak.
with open('PCorr_PPI_datasets/ft_posi_seed_key.pkl', 'rb') as fh:
    ft_posi_seed = pickle.load(fh)['CRISPR']
with open('PCorr_PPI_datasets/ft_nega_seed_key.pkl', 'rb') as fh:
    ft_nega_seed = pickle.load(fh)['CRISPR']

# Every dict below is keyed by seed.
X, y, X_train, X_val, X_test, y_train, y_val, y_test = {}, {}, {}, {}, {}, {}, {}, {}
for s in seeds:
    # Tag positives with 1 and negatives with 0; 'label' is then used as the
    # last column of the stacked frame.
    ft_posi = ft_posi_seed[s]
    ft_posi['label'] = 1
    ft_nega = ft_nega_seed[s]
    ft_nega['label'] = 0
    ft = pd.concat([ft_posi, ft_nega], axis=0)

    # Features: every column except the last. Labels: the last column, kept
    # as an (n, 1) float32 column vector.
    X[s] = np.array(ft.iloc[:, :-1], dtype='float32')
    y[s] = np.array(ft.iloc[:, [-1]], dtype='float32')

    # 80% train, then the remaining 20% is halved into 10% validation and
    # 10% test. The split seed is fixed at 42 for every dataset seed.
    X_train[s], X_temp, y_train[s], y_temp = train_test_split(X[s], y[s], test_size=0.2, random_state=42)
    X_val[s], X_test[s], y_val[s], y_test[s] = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 1. n_estimators
# Sweep the number of trees. Each candidate is scored by its validation
# ROC AUC averaged over all seed-specific splits.
para = [100, 200, 300, 400, 500]

auc_r = {}  # candidate value -> mean validation AUC across seeds
for i in tqdm(para, desc='n_estimators'):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=i)
    auc_list = []
    for s in seeds:
        # Flatten the labels to 1-D, the shape scikit-learn expects, rather
        # than relying on the globally silenced DataConversionWarning.
        rf.fit(X_train[s], y_train[s].ravel())
        # ROC analysis only needs the probability of the second class
        # (label 1), so no hard-label prediction pass is run.
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        # False positive rate, true positive rate and decision thresholds.
        fpr, tpr, threshold = roc_curve(y_val[s].ravel(), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve
    auc_r[i] = np.mean(auc_list)
# n_estimators = 300, Avg AUC = 0.8125981420554982

# 2. max_features
# Sweep the number of features considered per split, with n_estimators
# fixed at 300. Score: validation ROC AUC averaged over all seeds.
para = [0.1,'log2','sqrt',0.3,0.5,1.0]

auc_r = {}  # candidate value -> mean validation AUC across seeds
for i in tqdm(para, desc='max_features'):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=i)
    auc_list = []
    for s in tqdm(seeds, desc='seeds', leave=False):
        # Flatten the labels to 1-D, the shape scikit-learn expects, rather
        # than relying on the globally silenced DataConversionWarning.
        rf.fit(X_train[s], y_train[s].ravel())
        # ROC analysis only needs the probability of the second class
        # (label 1), so no hard-label prediction pass is run.
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        # False positive rate, true positive rate and decision thresholds.
        fpr, tpr, threshold = roc_curve(y_val[s].ravel(), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve
    auc_r[i] = np.mean(auc_list)
# max_features = sqrt, Avg AUC = 0.8125981420554982

# 3. max_depth
# Sweep the maximum tree depth (None = unlimited), with n_estimators=300
# and max_features='sqrt' fixed. Score: validation ROC AUC averaged over
# all seeds.
para = [1,3,5,10,15,20,30,None]

auc_r = {}  # candidate value -> mean validation AUC across seeds
for i in tqdm(para, desc='max_depth'):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features='sqrt',
                                max_depth=i)
    auc_list = []
    for s in tqdm(seeds, desc='seeds', leave=False):
        # Flatten the labels to 1-D, the shape scikit-learn expects, rather
        # than relying on the globally silenced DataConversionWarning.
        rf.fit(X_train[s], y_train[s].ravel())
        # ROC analysis only needs the probability of the second class
        # (label 1), so no hard-label prediction pass is run.
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        # False positive rate, true positive rate and decision thresholds.
        fpr, tpr, threshold = roc_curve(y_val[s].ravel(), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve
    auc_r[i] = np.mean(auc_list)
# max_depth = 15, Avg AUC = 0.8060971756286707

# 4. min_samples_split
# Sweep the minimum number of samples needed to split a node, with
# n_estimators=300, max_features='sqrt' and max_depth=15 fixed.
# Score: validation ROC AUC averaged over all seeds.
para = [2,3,4,5,6,7,8,9,10,15,20]

auc_r = {}  # candidate value -> mean validation AUC across seeds
for i in tqdm(para, desc='min_samples_split'):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features='sqrt',max_depth=15,
                                min_samples_split=i)
    auc_list = []
    for s in tqdm(seeds, desc='seeds', leave=False):
        # Flatten the labels to 1-D, the shape scikit-learn expects, rather
        # than relying on the globally silenced DataConversionWarning.
        rf.fit(X_train[s], y_train[s].ravel())
        # ROC analysis only needs the probability of the second class
        # (label 1), so no hard-label prediction pass is run.
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        # False positive rate, true positive rate and decision thresholds.
        fpr, tpr, threshold = roc_curve(y_val[s].ravel(), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve
    auc_r[i] = np.mean(auc_list)
# min_samples_split = 9, Avg AUC = 0.8041859399581581

# 5. min_samples_leaf
# Sweep the minimum number of samples per leaf, with n_estimators=300,
# max_features='sqrt', max_depth=15 and min_samples_split=9 fixed.
# Score: validation ROC AUC averaged over all seeds.
para = [1,2,3,4,5,6,7,8,9,10,20]

auc_r = {}  # candidate value -> mean validation AUC across seeds
for i in tqdm(para, desc='min_samples_leaf'):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features='sqrt', max_depth=15,
                                min_samples_split=9, min_samples_leaf=i)
    auc_list = []
    for s in tqdm(seeds, desc='seeds', leave=False):
        # Flatten the labels to 1-D, the shape scikit-learn expects, rather
        # than relying on the globally silenced DataConversionWarning.
        rf.fit(X_train[s], y_train[s].ravel())
        # ROC analysis only needs the probability of the second class
        # (label 1), so no hard-label prediction pass is run.
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        # False positive rate, true positive rate and decision thresholds.
        fpr, tpr, threshold = roc_curve(y_val[s].ravel(), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve
    auc_r[i] = np.mean(auc_list)
# min_samples_leaf = 4, Avg AUC = 0.8024833382196626

##### 2. RNAi-based

In [None]:
# Seeds that key the pre-built positive/negative sample sets; one
# train/validation/test split is produced per seed.
seeds = [0,42,100,1000,10000,100000,1000000,10000000,100000000,1000000000]

# NOTE: unpickling can execute arbitrary code -- only load trusted files.
# The context managers close the file handles that a bare open() would leak.
with open('PCorr_PPI_datasets/ft_posi_seed_key.pkl', 'rb') as fh:
    ft_posi_seed = pickle.load(fh)['RNAi']
with open('PCorr_PPI_datasets/ft_nega_seed_key.pkl', 'rb') as fh:
    ft_nega_seed = pickle.load(fh)['RNAi']

# Every dict below is keyed by seed.
X, y, X_train, X_val, X_test, y_train, y_val, y_test = {}, {}, {}, {}, {}, {}, {}, {}
for s in tqdm(seeds, desc='building splits'):
    # Tag positives with 1 and negatives with 0; 'label' is then used as the
    # last column of the stacked frame.
    ft_posi = ft_posi_seed[s]
    ft_posi['label'] = 1
    ft_nega = ft_nega_seed[s]
    ft_nega['label'] = 0
    ft = pd.concat([ft_posi, ft_nega], axis=0)

    # Features: every column except the last. Labels: the last column, kept
    # as an (n, 1) float32 column vector.
    X[s] = np.array(ft.iloc[:, :-1], dtype='float32')
    y[s] = np.array(ft.iloc[:, [-1]], dtype='float32')

    # 80% train, then the remaining 20% is halved into 10% validation and
    # 10% test. The split seed is fixed at 42 for every dataset seed.
    X_train[s], X_temp, y_train[s], y_temp = train_test_split(X[s], y[s], test_size=0.2, random_state=42)
    X_val[s], X_test[s], y_val[s], y_test[s] = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 1. n_estimators
# Sweep the number of trees. Each candidate is scored by its validation
# ROC AUC averaged over all seed-specific splits.
para = [100, 200, 300, 400, 500]

auc_r = {}  # candidate value -> mean validation AUC across seeds
for i in tqdm(para, desc='n_estimators'):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=i)
    auc_list = []
    for s in tqdm(seeds, desc='seeds', leave=False):
        # Flatten the labels to 1-D, the shape scikit-learn expects, rather
        # than relying on the globally silenced DataConversionWarning.
        rf.fit(X_train[s], y_train[s].ravel())
        # ROC analysis only needs the probability of the second class
        # (label 1), so no hard-label prediction pass is run.
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        # False positive rate, true positive rate and decision thresholds.
        fpr, tpr, threshold = roc_curve(y_val[s].ravel(), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve
    auc_r[i] = np.mean(auc_list)
# n_estimators = 300, Avg AUC = 0.7768196921061578

# 2. max_features
# Sweep the number of features considered per split, with n_estimators
# fixed at 300. Score: validation ROC AUC averaged over all seeds.
para = [0.1,'log2','sqrt',0.3,0.5,1.0]

auc_r = {}  # candidate value -> mean validation AUC across seeds
for i in tqdm(para, desc='max_features'):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=i)
    auc_list = []
    for s in tqdm(seeds, desc='seeds', leave=False):
        # Flatten the labels to 1-D, the shape scikit-learn expects, rather
        # than relying on the globally silenced DataConversionWarning.
        rf.fit(X_train[s], y_train[s].ravel())
        # ROC analysis only needs the probability of the second class
        # (label 1), so no hard-label prediction pass is run.
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        # False positive rate, true positive rate and decision thresholds.
        fpr, tpr, threshold = roc_curve(y_val[s].ravel(), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve
    auc_r[i] = np.mean(auc_list)
# max_features = sqrt, Avg AUC = 0.7768196921061578

# 3. max_depth
# Sweep the maximum tree depth (None = unlimited), with n_estimators=300
# and max_features='sqrt' fixed. Score: validation ROC AUC averaged over
# all seeds.
para = [1,3,5,10,15,20,30,None]

auc_r = {}  # candidate value -> mean validation AUC across seeds
for i in tqdm(para, desc='max_depth'):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features='sqrt',
                                max_depth=i)
    auc_list = []
    for s in tqdm(seeds, desc='seeds', leave=False):
        # Flatten the labels to 1-D, the shape scikit-learn expects, rather
        # than relying on the globally silenced DataConversionWarning.
        rf.fit(X_train[s], y_train[s].ravel())
        # ROC analysis only needs the probability of the second class
        # (label 1), so no hard-label prediction pass is run.
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        # False positive rate, true positive rate and decision thresholds.
        fpr, tpr, threshold = roc_curve(y_val[s].ravel(), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve
    auc_r[i] = np.mean(auc_list)
# max_depth = 15, Avg AUC = 0.7712516266491364

# 4. min_samples_split
# Sweep the minimum number of samples needed to split a node, with
# n_estimators=300, max_features='sqrt' and max_depth=15 fixed.
# Score: validation ROC AUC averaged over all seeds.
para = [2,3,4,5,6,7,8,9,10,15,20]

auc_r = {}  # candidate value -> mean validation AUC across seeds
for i in tqdm(para, desc='min_samples_split'):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features='sqrt',max_depth=15,
                                min_samples_split=i)
    auc_list = []
    for s in tqdm(seeds, desc='seeds', leave=False):
        # Flatten the labels to 1-D, the shape scikit-learn expects, rather
        # than relying on the globally silenced DataConversionWarning.
        rf.fit(X_train[s], y_train[s].ravel())
        # ROC analysis only needs the probability of the second class
        # (label 1), so no hard-label prediction pass is run.
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        # False positive rate, true positive rate and decision thresholds.
        fpr, tpr, threshold = roc_curve(y_val[s].ravel(), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve
    auc_r[i] = np.mean(auc_list)
# min_samples_split = 8, Avg AUC = 0.7708639364257901

# 5. min_samples_leaf
# Sweep the minimum number of samples per leaf, with n_estimators=300,
# max_features='sqrt', max_depth=15 and min_samples_split=8 fixed.
# Score: validation ROC AUC averaged over all seeds.
para = [1,2,3,4,5,6,7,8,9,10,20]

auc_r = {}  # candidate value -> mean validation AUC across seeds
for i in tqdm(para, desc='min_samples_leaf'):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features='sqrt',max_depth=15,
                                min_samples_split=8,min_samples_leaf=i)
    auc_list = []
    for s in tqdm(seeds, desc='seeds', leave=False):
        # Flatten the labels to 1-D, the shape scikit-learn expects, rather
        # than relying on the globally silenced DataConversionWarning.
        rf.fit(X_train[s], y_train[s].ravel())
        # ROC analysis only needs the probability of the second class
        # (label 1), so no hard-label prediction pass is run.
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        # False positive rate, true positive rate and decision thresholds.
        fpr, tpr, threshold = roc_curve(y_val[s].ravel(), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve
    auc_r[i] = np.mean(auc_list)
# min_samples_leaf = 3, Avg AUC = 0.7706853562555268