In [2]:
import pickle
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve,auc
import warnings
warnings.filterwarnings("ignore")

##### 1. CRISPR-based RanNeg

In [None]:
seeds = [0,42,100,1000,10000,100000,1000000,10000000,100000000,1000000000]

# Load the per-seed feature tables and keep only the 'CRISPR' entry of each pickle.
# NOTE(review): pickle.load can execute arbitrary code -- only use it on trusted dataset files.
# The `with` blocks close the file handles (a bare open() inside pickle.load leaks them).
with open('PCorr_TPI_datasets/ft_posi_seed_key.pkl', 'rb') as fh:
    ft_posi_seed = pickle.load(fh)['CRISPR']
with open('PCorr_TPI_datasets/ft_RanNeg_seed_key.pkl', 'rb') as fh:
    ft_nega_seed = pickle.load(fh)['CRISPR']

# Every dict below is keyed by seed.
X, y, X_train, X_val, X_test, y_train, y_val, y_test = {}, {}, {}, {}, {}, {}, {}, {}
for s in tqdm(seeds):
    # assign() returns labelled copies, so the loaded tables are not mutated in place.
    ft_posi = ft_posi_seed[s].assign(label=1)  # positive pairs
    ft_nega = ft_nega_seed[s].assign(label=0)  # negative pairs
    ft = pd.concat([ft_posi, ft_nega], axis=0)

    # All columns but the last are used as features, the last one as the target
    # (assumes 'label' ends up as the final column of the concatenated frame -- TODO confirm).
    X[s] = np.array(ft.iloc[:, :-1], dtype='float32')
    y[s] = np.array(ft.iloc[:, [-1]], dtype='float32')  # list indexer keeps the (n, 1) column shape
    # 80% train; the remaining 20% is split in half into validation and test.
    X_train[s], X_temp, y_train[s], y_temp = train_test_split(X[s], y[s], test_size=0.2, random_state=42)
    X_val[s], X_test[s], y_val[s], y_test[s] = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 1. n_estimators
# Sweep the number of trees; all other hyper-parameters keep their scikit-learn defaults.
para = [100, 200, 300, 400, 500]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('n_estimators:', {k: float(v) for k, v in auc_r.items()})
# n_estimators = 300, Avg AUC = 0.9678244046901625

# 2. max_features
# Sweep max_features with n_estimators fixed at 300.
para = [0.1,'log2','sqrt',0.3,0.5,1.0]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('max_features:', {k: float(v) for k, v in auc_r.items()})
# max_features = 0.3, Avg AUC = 0.9766888315574569

# 3. max_depth
# Sweep max_depth with n_estimators=300 and max_features=0.3 fixed.
para = [1,3,5,10,15,20,30,None]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=0.3,
                                max_depth=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('max_depth:', {k: float(v) for k, v in auc_r.items()})
# max_depth = 15, Avg AUC = 0.9670488336310772

# 4. min_samples_split
# Sweep min_samples_split with n_estimators=300, max_features=0.3 and max_depth=15 fixed.
para = [2,3,4,5,6,7,8,9,10,15,20]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=0.3,max_depth=15,
                                min_samples_split=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('min_samples_split:', {k: float(v) for k, v in auc_r.items()})
# min_samples_split = 10, Avg AUC = 0.9653312297283859

# 5. min_samples_leaf
# Sweep min_samples_leaf with n_estimators=300, max_features=0.3, max_depth=15
# and min_samples_split=10 fixed.
para = [1,2,3,4,5,6,7,8,9,10,20]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=0.3,max_depth=15,
                                min_samples_split=10,min_samples_leaf=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('min_samples_leaf:', {k: float(v) for k, v in auc_r.items()})

# min_samples_leaf = 5, Avg AUC = 0.9633443793947075

##### 2. CRISPR-based DecoyNeg

In [None]:
seeds = [0,42,100,1000,10000,100000,1000000,10000000,100000000,1000000000]

# Load the per-seed feature tables and keep only the 'CRISPR' entry of each pickle.
# NOTE(review): pickle.load can execute arbitrary code -- only use it on trusted dataset files.
# The `with` blocks close the file handles (a bare open() inside pickle.load leaks them).
with open('PCorr_TPI_datasets/ft_posi_seed_key.pkl', 'rb') as fh:
    ft_posi_seed = pickle.load(fh)['CRISPR']
with open('PCorr_TPI_datasets/ft_DecoyNeg_seed_key.pkl', 'rb') as fh:
    ft_nega_seed = pickle.load(fh)['CRISPR']

# Every dict below is keyed by seed.
X, y, X_train, X_val, X_test, y_train, y_val, y_test = {}, {}, {}, {}, {}, {}, {}, {}
for s in tqdm(seeds):
    # assign() returns labelled copies, so the loaded tables are not mutated in place.
    ft_posi = ft_posi_seed[s].assign(label=1)  # positive pairs
    ft_nega = ft_nega_seed[s].assign(label=0)  # negative pairs
    ft = pd.concat([ft_posi, ft_nega], axis=0)

    # All columns but the last are used as features, the last one as the target
    # (assumes 'label' ends up as the final column of the concatenated frame -- TODO confirm).
    X[s] = np.array(ft.iloc[:, :-1], dtype='float32')
    y[s] = np.array(ft.iloc[:, [-1]], dtype='float32')  # list indexer keeps the (n, 1) column shape
    # 80% train; the remaining 20% is split in half into validation and test.
    X_train[s], X_temp, y_train[s], y_temp = train_test_split(X[s], y[s], test_size=0.2, random_state=42)
    X_val[s], X_test[s], y_val[s], y_test[s] = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 1. n_estimators
# Sweep the number of trees; all other hyper-parameters keep their scikit-learn defaults.
para = [100, 200, 300, 400, 500]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('n_estimators:', {k: float(v) for k, v in auc_r.items()})
# n_estimators = 300, Avg AUC = 0.9850188843271503

# 2. max_features
# Sweep max_features with n_estimators fixed at 300.
para = [0.1,'log2','sqrt',0.3,0.5,1.0]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('max_features:', {k: float(v) for k, v in auc_r.items()})
# max_features = 0.3, Avg AUC = 0.987090207841462

# 3. max_depth
# Sweep max_depth with n_estimators=300 and max_features=0.3 fixed.
para = [1,3,5,10,15,20,30,None]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=0.3,
                                max_depth=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('max_depth:', {k: float(v) for k, v in auc_r.items()})
# max_depth = 15, Avg AUC = 0.9853083672910061

# 4. min_samples_split
# Sweep min_samples_split with n_estimators=300, max_features=0.3 and max_depth=15 fixed.
para = [2,3,4,5,6,7,8,9,10,15,20]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=0.3,max_depth=15,
                                min_samples_split=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('min_samples_split:', {k: float(v) for k, v in auc_r.items()})
# min_samples_split = 9, Avg AUC = 0.984836589650625

# 5. min_samples_leaf
# Sweep min_samples_leaf with n_estimators=300, max_features=0.3, max_depth=15
# and min_samples_split=9 fixed.
para = [1,2,3,4,5,6,7,8,9,10,20]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=0.3,max_depth=15,
                                min_samples_split=9,min_samples_leaf=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('min_samples_leaf:', {k: float(v) for k, v in auc_r.items()})
# min_samples_leaf = 4, Avg AUC = 0.984580597879587

##### 3. RNAi-based RanNeg

In [None]:
seeds = [0,42,100,1000,10000,100000,1000000,10000000,100000000,1000000000]

# Load the per-seed feature tables and keep only the 'RNAi' entry of each pickle.
# NOTE(review): pickle.load can execute arbitrary code -- only use it on trusted dataset files.
# The `with` blocks close the file handles (a bare open() inside pickle.load leaks them).
with open('PCorr_TPI_datasets/ft_posi_seed_key.pkl', 'rb') as fh:
    ft_posi_seed = pickle.load(fh)['RNAi']
with open('PCorr_TPI_datasets/ft_RanNeg_seed_key.pkl', 'rb') as fh:
    ft_nega_seed = pickle.load(fh)['RNAi']

# Every dict below is keyed by seed.
X, y, X_train, X_val, X_test, y_train, y_val, y_test = {}, {}, {}, {}, {}, {}, {}, {}
for s in tqdm(seeds):
    # assign() returns labelled copies, so the loaded tables are not mutated in place.
    ft_posi = ft_posi_seed[s].assign(label=1)  # positive pairs
    ft_nega = ft_nega_seed[s].assign(label=0)  # negative pairs
    ft = pd.concat([ft_posi, ft_nega], axis=0)
    # All columns but the last are used as features, the last one as the target
    # (assumes 'label' ends up as the final column of the concatenated frame -- TODO confirm).
    X[s] = np.array(ft.iloc[:, :-1], dtype='float32')
    y[s] = np.array(ft.iloc[:, [-1]], dtype='float32')  # list indexer keeps the (n, 1) column shape
    # 80% train; the remaining 20% is split in half into validation and test.
    X_train[s], X_temp, y_train[s], y_temp = train_test_split(X[s], y[s], test_size=0.2, random_state=42)
    X_val[s], X_test[s], y_val[s], y_test[s] = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 1. n_estimators
# Sweep the number of trees; all other hyper-parameters keep their scikit-learn defaults.
para = [100, 200, 300, 400, 500]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('n_estimators:', {k: float(v) for k, v in auc_r.items()})
# n_estimators = 300, Avg AUC = 0.9701820972563432

# 2. max_features
# Sweep max_features with n_estimators fixed at 300.
para = [0.1,'log2','sqrt',0.3,0.5,1.0]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('max_features:', {k: float(v) for k, v in auc_r.items()})
# max_features = 0.3, Avg AUC = 0.9767384812341584

# 3. max_depth
# Sweep max_depth with n_estimators=300 and max_features=0.3 fixed.
para = [1,3,5,10,15,20,30,None]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=0.3,
                                max_depth=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('max_depth:', {k: float(v) for k, v in auc_r.items()})
# max_depth = 15, Avg AUC = 0.9732541927928484

# 4. min_samples_split
# Sweep min_samples_split with n_estimators=300, max_features=0.3 and max_depth=15 fixed.
para = [2,3,4,5,6,7,8,9,10,15,20]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=0.3,max_depth=15,
                                min_samples_split=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('min_samples_split:', {k: float(v) for k, v in auc_r.items()})
# min_samples_split = 7, Avg AUC = 0.9725296510014729

# 5. min_samples_leaf
# Sweep min_samples_leaf with n_estimators=300, max_features=0.3, max_depth=15
# and min_samples_split=7 fixed.
para = [1,2,3,4,5,6,7,8,9,10,20]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=0.3,max_depth=15,
                                min_samples_split=7,min_samples_leaf=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('min_samples_leaf:', {k: float(v) for k, v in auc_r.items()})
# min_samples_leaf = 4, Avg AUC = 0.970200248680867

##### 4. RNAi-based DecoyNeg

In [None]:
seeds = [0,42,100,1000,10000,100000,1000000,10000000,100000000,1000000000]

# Load the per-seed feature tables and keep only the 'RNAi' entry of each pickle.
# NOTE(review): pickle.load can execute arbitrary code -- only use it on trusted dataset files.
# The `with` blocks close the file handles (a bare open() inside pickle.load leaks them).
with open('PCorr_TPI_datasets/ft_posi_seed_key.pkl', 'rb') as fh:
    ft_posi_seed = pickle.load(fh)['RNAi']
with open('PCorr_TPI_datasets/ft_DecoyNeg_seed_key.pkl', 'rb') as fh:
    ft_nega_seed = pickle.load(fh)['RNAi']

# Every dict below is keyed by seed.
X, y, X_train, X_val, X_test, y_train, y_val, y_test = {}, {}, {}, {}, {}, {}, {}, {}
for s in tqdm(seeds):
    # assign() returns labelled copies, so the loaded tables are not mutated in place.
    ft_posi = ft_posi_seed[s].assign(label=1)  # positive pairs
    ft_nega = ft_nega_seed[s].assign(label=0)  # negative pairs
    ft = pd.concat([ft_posi, ft_nega], axis=0)
    # All columns but the last are used as features, the last one as the target
    # (assumes 'label' ends up as the final column of the concatenated frame -- TODO confirm).
    X[s] = np.array(ft.iloc[:, :-1], dtype='float32')
    y[s] = np.array(ft.iloc[:, [-1]], dtype='float32')  # list indexer keeps the (n, 1) column shape
    # 80% train; the remaining 20% is split in half into validation and test.
    X_train[s], X_temp, y_train[s], y_temp = train_test_split(X[s], y[s], test_size=0.2, random_state=42)
    X_val[s], X_test[s], y_val[s], y_test[s] = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 1. n_estimators
# Sweep the number of trees; all other hyper-parameters keep their scikit-learn defaults.
para = [100, 200, 300, 400, 500]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('n_estimators:', {k: float(v) for k, v in auc_r.items()})
# n_estimators = 300, Avg AUC = 0.9850574473939101

# 2. max_features
# Sweep max_features with n_estimators fixed at 300.
para = [0.1,'log2','sqrt',0.3,0.5,1.0]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('max_features:', {k: float(v) for k, v in auc_r.items()})
# max_features = 0.3, Avg AUC = 0.9885181061920594

# 3. max_depth
# Sweep max_depth with n_estimators=300 and max_features=0.3 fixed.
para = [1,3,5,10,15,20,30,None]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=0.3,
                                max_depth=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('max_depth:', {k: float(v) for k, v in auc_r.items()})
# max_depth = 15, Avg AUC = 0.9877463476728993

# 4. min_samples_split
# Sweep min_samples_split with n_estimators=300, max_features=0.3 and max_depth=15 fixed.
para = [2,3,4,5,6,7,8,9,10,15,20]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=0.3,max_depth=15,
                                min_samples_split=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('min_samples_split:', {k: float(v) for k, v in auc_r.items()})
# min_samples_split = 8, Avg AUC = 0.9874048136561185

# 5. min_samples_leaf
# Sweep min_samples_leaf with n_estimators=300, max_features=0.3, max_depth=15
# and min_samples_split=8 fixed.
para = [1,2,3,4,5,6,7,8,9,10,20]

auc_r = {}  # candidate value -> validation AUC averaged over all seeds
for i in tqdm(para):
    rf = RandomForestClassifier(random_state=42,
                                n_estimators=300, max_features=0.3,max_depth=15,
                                min_samples_split=8,min_samples_leaf=i)
    auc_list = []
    for s in tqdm(seeds):
        # The y_* arrays are (n, 1) columns; flatten them to the 1-D targets scikit-learn expects.
        rf.fit(X_train[s], np.ravel(y_train[s]))
        # Column 1 holds the probability of the positive class (label 1).
        y_probs = rf.predict_proba(X_val[s])[:, 1]

        fpr, tpr, _thresholds = roc_curve(np.ravel(y_val[s]), y_probs, pos_label=1)
        auc_list.append(auc(fpr, tpr))  # area under the ROC curve for this seed
    auc_r[i] = np.mean(auc_list)
# Print the averages so they appear in the cell output (auc_r is reused by each sweep).
print('min_samples_leaf:', {k: float(v) for k, v in auc_r.items()})
# min_samples_leaf = 4, Avg AUC = 0.9870500048885111