# Binary classification: one class vs the rest

In [1]:
import pandas as pd
import h2o
from h2o.automl import H2OAutoML

# Initialize H2O with a memory limit (adjust as needed)
h2o.init(max_mem_size="8G")

# ------------------------------
# 1. Load Full Dataset
# -------------------------------
data_path = '/kaggle/input/data-binary-class/mol_3d_descriptors_final.csv'
df = pd.read_csv(data_path)
print(f"Full dataset loaded with shape: {df.shape}")

target = "Potency_Change_Label"
non_feature_cols = [
    "canonical_smiles_1", "canonical_smiles_2",
    "Potency_Change", "Potency_Change_Category", "Potency_Change_Label"
]
# Use all features (or you can restrict further if needed)
features = [col for col in df.columns if col not in non_feature_cols]

# -------------------------------
# 2. Apply Variance Threshold
# -------------------------------
# Convert feature columns to numeric and drop those that are entirely NaN
X_all_numeric = df[features].apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')
variances = X_all_numeric.var()
# Retain only features with variance greater than 0.8
var_thresh_features = variances[variances > 0.8].index.tolist()
print(f"{len(var_thresh_features)} features have variance > 0.8.")

# Use the intersection with your full features list:
final_features = [f for f in var_thresh_features if f in features]
print(f"Using {len(final_features)} features after applying variance threshold.")

# -------------------------------
# 3. One-vs-All Binary Classification with H2O AutoML
# -------------------------------
results = {}
unique_classes = df[target].unique()
print("Unique classes:", unique_classes)

for c in unique_classes:
    print(f"\nProcessing binary classification for class '{c}' vs. rest...")
    
    # Create a binary target: label samples as 0 if they belong to class c, else 1.
    df_binary = df.copy()
    df_binary[target] = (df[target] != c).astype(int)
    
    # Optional: Subsample the data to reduce memory usage (uncomment if needed)
    # df_binary = df_binary.sample(n=30000, random_state=42)
    
    # Subset the data to only the selected features and target.
    df_subset = df_binary[final_features + [target]].copy()
    
    # Convert to H2OFrame and set the target as categorical.
    hf = h2o.H2OFrame(df_subset)
    hf[target] = hf[target].asfactor()
    
    try:
        # Run H2O AutoML on the binary problem.
        aml = H2OAutoML(max_models=20, max_runtime_secs=600, seed=42)
        aml.train(x=final_features, y=target, training_frame=hf)
        lb = aml.leaderboard
        
        # Evaluate performance on the same H2OFrame (or create a separate test split if desired)
        perf = aml.leader.model_performance(test_data=hf)
        # For binary classification, accuracy is available via perf.accuracy()
        accuracy = perf.accuracy()[0][1]
        results[c] = {"aml": aml, "leaderboard": lb, "accuracy": accuracy}
        
        print(f"Leaderboard for class '{c}' vs. rest:")
        print(lb.head(lb.nrows))
        print(f"Accuracy for class '{c}' vs. rest: {accuracy:.4f}")
    except Exception as e:
        print(f"Error processing class '{c}': {e}")
    finally:
        # Remove all H2O frames to free memory before next iteration.
        h2o.remove_all()


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.25" 2024-10-15; OpenJDK Runtime Environment (build 11.0.25+9-post-Ubuntu-1ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.25+9-post-Ubuntu-1ubuntu122.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpke4ajuvo
  JVM stdout: /tmp/tmpke4ajuvo/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpke4ajuvo/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,4 months and 23 days
H2O_cluster_name:,H2O_from_python_unknownUser_15i41p
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,8 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


  df = pd.read_csv(data_path)


Full dataset loaded with shape: (56826, 3659)


  return op(a, b)


1332 features have variance > 0.8.
Using 1332 features after applying variance threshold.
Unique classes: [4 0 2 3 1 5]

Processing binary classification for class '4' vs. rest...
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
Leaderboard for class '4' vs. rest:
model_id                                 auc    logloss     aucpr    mean_per_class_error      rmse       mse
GBM_1_AutoML_1_20250325_181856      0.525308   0.441197  0.851427                0.5       0.366217  0.134115
XGBoost_1_AutoML_1_20250325_181856  0.522628   0.464353  0.85493                 0.5       0.376006  0.14138
GLM_1_AutoML_1_20250325_181856      0.51244    0.436172  0.853584                0.499434  0.364632  0.132956
[3 rows x 7 columns]

Accuracy for class '4' vs. rest: 0.8354

Processing binary classification for class '0' vs. rest...
Parse progress: |████████████████

# Binary classification: positive vs negative classes

In [3]:
import pandas as pd
import h2o
from h2o.automl import H2OAutoML

# -------------------------------
# 1. Load Full Dataset
# -------------------------------
data_path = '/kaggle/input/data-binary-class/mol_3d_descriptors_final.csv'
df = pd.read_csv(data_path)
print(f"Full dataset loaded with shape: {df.shape}")

target = "Potency_Change_Label"
non_feature_cols = [
    "canonical_smiles_1", "canonical_smiles_2",
    "Potency_Change", "Potency_Change_Category", "Potency_Change_Label"
]

# Use all features except the non-feature ones.
features = [col for col in df.columns if col not in non_feature_cols]

# -------------------------------
# 2. Apply Variance Threshold
# -------------------------------
# Convert feature columns to numeric (non-convertible values become NaN) and drop columns that are entirely NaN.
X_all_numeric = df[features].apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')
variances = X_all_numeric.var()
var_thresh_features = variances[variances > 0.8].index.tolist()
print(f"{len(var_thresh_features)} features have variance > 0.8.")

# Use intersection with the original feature list
final_features = [f for f in var_thresh_features if f in features]
print(f"Using {len(final_features)} features after applying variance threshold.")

# -------------------------------
# 3. Create Binary Target (One-vs-All for Potency Change Direction)
# -------------------------------
# Original encoding:
# 0 → Large Negative Change, 1 → Large Positive Change,
# 2 → Moderate Negative Change, 3 → Moderate Positive Change,
# 4 → Small Negative Change, 5 → Small Positive Change
# We'll map negative changes (0,2,4) to 0 (improved potency) and positive changes (1,3,5) to 1 (reduced potency).
binary_mapping = {0: 0, 2: 0, 4: 0, 1: 1, 3: 1, 5: 1}
df['binary_label'] = df[target].map(binary_mapping)
print("Binary target value counts:")
print(df['binary_label'].value_counts())

# -------------------------------
# 4. Subset Data to Selected Features and Binary Target
# -------------------------------
df_subset = df[final_features + ['binary_label']].copy()
print(f"Subset data shape: {df_subset.shape}")

# -------------------------------
# 5. Convert to H2OFrame and Run AutoML (Binary Classification)
# -------------------------------
h2o.init(max_mem_size="8G")
hf = h2o.H2OFrame(df_subset)
hf['binary_label'] = hf['binary_label'].asfactor()  # Make sure target is categorical

x = final_features
y = 'binary_label'

aml = H2OAutoML(max_models=20, max_runtime_secs=600, seed=42)
aml.train(x=x, y=y, training_frame=hf)

lb = aml.leaderboard
print("AutoML Leaderboard (binary classification):")
print(lb.head(lb.nrows))

# -------------------------------
# 6. Evaluate Model Performance on the Same H2OFrame
# -------------------------------
perf = aml.leader.model_performance(test_data=hf)
accuracy = perf.accuracy()[0][1]  # For binary classification, .accuracy() returns a list of (threshold, accuracy)
print(f"Estimated Accuracy for binary classification: {accuracy:.4f}")


  df = pd.read_csv(data_path)


Full dataset loaded with shape: (56826, 3659)


  return op(a, b)


1332 features have variance > 0.8.
Using 1332 features after applying variance threshold.
Binary target value counts:
binary_label
0    28485
1    28341
Name: count, dtype: int64
Subset data shape: (56826, 1333)
Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,25 mins 47 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,4 months and 26 days
H2O_cluster_name:,H2O_from_python_unknownUser_0qtkzc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,5.249 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
AutoML Leaderboard (binary classification):
model_id                                auc    logloss     aucpr    mean_per_class_error      rmse       mse
XGBoost_1_AutoML_2_20250328_91137  0.525343   0.727116  0.531726                0.495088  0.514153  0.264353
GLM_1_AutoML_2_20250328_91137      0.523485   0.692392  0.526643                0.499458  0.499623  0.249623
GBM_1_AutoML_2_20250328_91137      0.51661    0.695902  0.522246                0.5       0.501114  0.251115
[3 rows x 7 columns]

Estimated Accuracy for binary classification: 0.6483
