In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from numpy import nan as Nan
from numpy import inf as inf
from tqdm import notebook as tqdm
from scipy.sparse import csr_matrix
from sklearn.model_selection import cross_val_predict
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.model_selection import LeaveOneOut
from sklearn.feature_selection import SelectFromModel

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from data_preprocessing import *
from scoring import bootstrap_roc_auc


In [3]:
def load_csv(path):
    """Load a pickled object from ``path`` and return it.

    Despite the name, the file is read with ``pickle`` and is not parsed
    as CSV; the name is kept so existing callers keep working.

    Parameters
    ----------
    path : str
        File location. It is joined onto the current working directory,
        so a relative path is resolved against ``os.getcwd()`` and an
        absolute path is used unchanged.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while deserialising, so this
    must only be pointed at files from a trusted source.
    """
    full_path = os.path.join(os.getcwd(), path)
    # The context manager closes the handle even when unpickling raises;
    # the previous explicit open()/close() pair leaked it in that case.
    with open(full_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Read the five per-step feature tables (steps 0 through 4) into a list,
# keeping them in step order.
dfs = [
    load_csv(f"groups_and_oxi_states_5_frames/df_features_with_barrier_step_{step}.pkl")
    for step in range(5)
]

In [4]:
# Only the first loaded frame is used from here on.
df = dfs[0]

# Feature matrix: every column except the target `is_good` and the
# `stru_label`, `stru_id` and `barrier` columns.
X = df.drop(columns=['is_good', 'stru_label', 'stru_id', 'barrier']).to_numpy()
# Target vector: `is_good` cast to integers.
y = df['is_good'].astype(int).to_numpy()

# Standardise the features. The scaler is fitted on the complete matrix X.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

LogisticRegression

In [51]:
# Leave-one-out evaluation of LogisticRegression: every sample is scored by a
# model that was trained on all the other samples.
# NOTE(review): X_scaled was standardised on the full dataset before
# splitting, so each held-out row contributed to the scaling statistics.
loo = LeaveOneOut()
preds = np.zeros(len(y))
# Split on X_scaled, the matrix that is actually indexed below; the splitter
# only looks at the number of rows.
for train_index, test_index in loo.split(X_scaled):
    model = LogisticRegression(random_state=42, max_iter=300)
    model.fit(X_scaled[train_index, :], y[train_index])
    # Keep the probability of the second class in model.classes_
    # (label 1 for a 0/1 target).
    preds[test_index] = model.predict_proba(X_scaled[test_index])[:, 1]

# bootstrap_roc_auc is imported from the project's scoring module.
# Presumably 1000 is the number of bootstrap resamples - TODO confirm.
estimated_mean, estimated_std = bootstrap_roc_auc(1000, y, preds)
print(estimated_mean, estimated_std)

0.7169641516566989 0.048631328349811355


GradientBoostingClassifier

In [54]:
# Leave-one-out evaluation of GradientBoostingClassifier: every sample is
# scored by a model that was trained on all the other samples.
loo = LeaveOneOut()
preds = np.zeros(len(y))
# Split on X_scaled, the matrix that is actually indexed below; the splitter
# only looks at the number of rows.
for train_index, test_index in loo.split(X_scaled):
    model = GradientBoostingClassifier(random_state=42)
    model.fit(X_scaled[train_index, :], y[train_index])
    # Keep the probability of the second class in model.classes_
    # (label 1 for a 0/1 target).
    preds[test_index] = model.predict_proba(X_scaled[test_index])[:, 1]

# bootstrap_roc_auc is imported from the project's scoring module.
# Presumably 1000 is the number of bootstrap resamples - TODO confirm.
estimated_mean, estimated_std = bootstrap_roc_auc(1000, y, preds)
print(estimated_mean, estimated_std)

0.7157661007906527 0.04993538947895872


KNeighborsClassifier

In [55]:
# Leave-one-out evaluation of KNeighborsClassifier (library default settings):
# every sample is scored by a model fitted on all the other samples.
# NOTE(review): X_scaled was standardised on the full dataset before
# splitting, so each held-out row contributed to the scaling statistics.
loo = LeaveOneOut()
preds = np.zeros(len(y))
# Split on X_scaled, the matrix that is actually indexed below; the splitter
# only looks at the number of rows.
for train_index, test_index in loo.split(X_scaled):
    model = KNeighborsClassifier()
    model.fit(X_scaled[train_index, :], y[train_index])
    # Keep the probability of the second class in model.classes_
    # (label 1 for a 0/1 target).
    preds[test_index] = model.predict_proba(X_scaled[test_index])[:, 1]

# bootstrap_roc_auc is imported from the project's scoring module.
# Presumably 1000 is the number of bootstrap resamples - TODO confirm.
estimated_mean, estimated_std = bootstrap_roc_auc(1000, y, preds)
print(estimated_mean, estimated_std)

0.6773745740002095 0.048099549669542656


XGBClassifier

In [7]:
# Leave-one-out evaluation of XGBClassifier (250 trees, logistic objective):
# every sample is scored by a model trained on all the other samples.
loo = LeaveOneOut()
preds = np.zeros(len(y))
# Split on X_scaled, the matrix that is actually indexed below; the splitter
# only looks at the number of rows.
for train_index, test_index in loo.split(X_scaled):
    model = XGBClassifier(objective='binary:logistic', n_estimators=250)
    model.fit(X_scaled[train_index, :], y[train_index])
    # Keep the probability of the second class in model.classes_
    # (label 1 for a 0/1 target).
    preds[test_index] = model.predict_proba(X_scaled[test_index])[:, 1]

# bootstrap_roc_auc is imported from the project's scoring module.
# Presumably 1000 is the number of bootstrap resamples - TODO confirm.
estimated_mean, estimated_std = bootstrap_roc_auc(1000, y, preds)
print(estimated_mean, estimated_std)

0.7150331143299083 0.051139167732529016


CatBoostClassifier

In [10]:
# Leave-one-out evaluation of CatBoostClassifier: every sample is scored by a
# model that was trained on all the other samples.
loo = LeaveOneOut()
preds = np.zeros(len(y))
# Split on X_scaled, the matrix that is actually indexed below; the splitter
# only looks at the number of rows.
for train_index, test_index in loo.split(X_scaled):
    # verbose=False silences CatBoost's per-iteration training log.
    model = CatBoostClassifier(eval_metric='AUC', verbose=False, random_state=42)
    model.fit(X_scaled[train_index, :], y[train_index])
    # Keep the probability of the second class in model.classes_
    # (label 1 for a 0/1 target).
    preds[test_index] = model.predict_proba(X_scaled[test_index])[:, 1]

# bootstrap_roc_auc is imported from the project's scoring module.
# Presumably 1000 is the number of bootstrap resamples - TODO confirm.
estimated_mean, estimated_std = bootstrap_roc_auc(1000, y, preds)
print(estimated_mean, estimated_std)

0.7302289835116726 0.04862945901674476


LogisticRegression with regularization