In [1]:
import pandas as pd
from sklearn import preprocessing

from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.ensemble import BalancedRandomForestClassifier

from xgboost import XGBClassifier

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import classification_report, average_precision_score, precision_recall_curve, auc, roc_auc_score, confusion_matrix

In [2]:
def smote_tomek_resample(df):
    smt=SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'), random_state=4262)
    X, y = df.drop(columns = ["label"]), df['label']
    X_res, y_res = smt.fit_resample(X,y)
    return X_res, y_res

In [3]:
def prepare_train_test_data(data, train_idx, test_idx, resample_method=False):
    """
    Given indices prepared from test splits, prepare x and y values for train/test from initial read data
    - removal of columns is performed within this function
    """
    # Check overlap
    train_gid, test_gid = set(data.iloc[train_idx, :].gene_id), set(data.iloc[test_idx, :].gene_id)
    print(train_gid.intersection(test_gid))

    # Drop identifiers
    data = data.drop(columns=["gene_id","transcript_id", "position"])

    # Split train and test
    train, test = data.iloc[train_idx, :], data.iloc[test_idx, :]

    if not resample_method:
        # Return X_train, y_train, X_test, y_test
        X_train, y_train = train.drop(columns=["label"]), train.label
        X_test, y_test = test.drop(columns=["label"]), test.label
    else:
        # SMOTETomek
        X_train, y_train = smote_tomek_resample(data)
        X_test, y_test = test.drop(columns=["label"]), test.label

    return X_train, y_train, X_test, y_test

In [4]:
df = pd.read_csv("nuc_encoded_mean_dataset.csv")

Using SmoteTomek

In [5]:
splitter = GroupShuffleSplit(n_splits=5, test_size=0.20, random_state = 4262)
temp = splitter.split(df, groups=df['gene_id'])

roc = []
pr = []
ap = []
counter = 0
for train_index, test_index in temp:
    X_train, y_train, X_test, y_test = prepare_train_test_data(df, train_index, test_index)
    # print(y_train.value_counts())
    # print(y_test.value_counts())

    clf = XGBClassifier(random_state=4262)
    clf.fit(X_train, y_train)

    test_pred = clf.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, test_pred).ravel()

    # print(f"True Negative: {tn}/{tn+fp}")
    # print(f"False Positive: {fp}/{tn+fp}")
    # print(f"False Negative: {fn}/{fn+tp}")
    # print(f"True Positive: {tp}/{fn+tp}")
    roc_auc = roc_auc_score(y_test, test_pred, labels = [0, 1])
    precision_, recall_, _ = precision_recall_curve(y_test, test_pred)
    pr_auc = auc(recall_, precision_)
    aps = average_precision_score(y_test, clf.predict_proba(X_test)[:,1])

    # print(f"ROC AUC: {roc_auc}")
    # print(f"PR AUC: {pr_auc}")
    # print(f"PR AUC #2: {aps}")

    roc.append(roc_auc)
    pr.append(pr_auc)
    ap.append(aps)


print(f"ROC AUC: {sum(roc)/len(roc)}")
print(f"PR AUC: {sum(pr)/len(pr)}")
print(f"PR AUC AVERAGE PRECISION: {sum(ap)/len(ap)}")

set()
set()
set()
set()
set()
ROC AUC: 0.6431855754658441
PR AUC: 0.4551812260266416
PR AUC AVERAGE PRECISION: 0.4038919025771399


Using BalancedRandomForestClassifier

In [None]:
splitter = GroupShuffleSplit(n_splits=5, test_size=0.20, random_state = 4262)
temp = splitter.split(df, groups=df['gene_id'])

roc = []
pr = []
ap = []
counter = 0
for train_index, test_index in temp:
    X_train, y_train, X_test, y_test = prepare_train_test_data(df, train_index, test_index, True)
    # print(y_train.value_counts())
    # print(y_test.value_counts())

    clf = BalancedRandomForestClassifier(random_state=4262)
    clf.fit(X_train, y_train)

    test_pred = clf.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, test_pred).ravel()

    # print(f"True Negative: {tn}/{tn+fp}")
    # print(f"False Positive: {fp}/{tn+fp}")
    # print(f"False Negative: {fn}/{fn+tp}")
    # print(f"True Positive: {tp}/{fn+tp}")
    roc_auc = roc_auc_score(y_test, test_pred, labels = [0, 1])
    precision_, recall_, _ = precision_recall_curve(y_test, test_pred)
    pr_auc = auc(recall_, precision_)
    aps = average_precision_score(y_test, clf.predict_proba(X_test)[:,1])

    # print(f"ROC AUC: {roc_auc}")
    # print(f"PR AUC: {pr_auc}")
    # print(f"PR AUC #2: {aps}")

    roc.append(roc_auc)
    pr.append(pr_auc)
    ap.append(aps)


print(f"ROC AUC: {sum(roc)/len(roc)}")
print(f"PR AUC: {sum(pr)/len(pr)}")
print(f"PR AUC AVERAGE PRECISION: {sum(ap)/len(ap)}")