In [127]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline

# Load and Clean Data

In [52]:
marginal_df = pd.read_csv(r"C:\Users\erict\Documents\projects\gwave_classify\data\gwtc_marginal.csv")
confident_df = pd.read_csv(r"C:\Users\erict\Documents\projects\gwave_classify\data\gwtc_confident.csv")

In [53]:
confident_df["label"] = 1  
marginal_df["label"] = 0  
cleaned_df = pd.concat([confident_df, marginal_df], ignore_index=True)
gwtc_df = cleaned_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [77]:
drop_list = ["id", "commonName", "version", "catalog.shortName", "GPS", "reference", "jsonurl", "far_lower", "far_upper", "p_astro_lower", "p_astro_upper", "chirp_mass_lower", "chirp_mass_upper", "chirp_mass", "mass_1_source_lower", "mass_1_source_upper", "mass_2_source_upper", "mass_2_source_lower", "network_matched_filter_snr_lower", "network_matched_filter_snr_upper", "luminosity_distance_lower", "luminosity_distance_upper", "chi_eff_lower", "chi_eff_upper", "total_mass_source_lower", "total_mass_source_upper", "redshift_lower", "redshift_upper", "final_mass_source_lower", "final_mass_source_upper", "chirp_mass_source_lower", "chirp_mass_source_upper"]
train_df = gwtc_df.drop(columns=drop_list)


In [80]:
imputer = KNNImputer(n_neighbors=5)
train_df[:] = imputer.fit_transform(train_df)

# Create train and test sets

In [108]:
X = train_df.drop(columns=["label"])
y = train_df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Create and run model

In [125]:
rfmodel = RandomForestClassifier(n_estimators=50, random_state=42, max_depth=10)
rfmodel.fit(X_train, y_train)

y_pred = rfmodel.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9130434782608695


In [110]:
scaler = StandardScaler()
X_train_log = scaler.fit_transform(X_train)
X_test_log = scaler.transform(X_test)


In [126]:

lrmodel = LogisticRegression()
lrmodel.fit(X_train_log, y_train)
y_pred_log = lrmodel.predict(X_test_log)

accuracy = accuracy_score(y_test, y_pred_log)
print(f"Test Accuracy: {accuracy}")


Test Accuracy: 0.9130434782608695


# k-fold cross validation

In [122]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf_scores = cross_val_score(rfmodel, X, y, cv=kfold, scoring='accuracy')
lr_scores = cross_val_score(lrmodel, X, y, cv=kfold, scoring='accuracy')

print(f"Random Forest Accuracy: {np.mean(rf_scores):.4f} +/- {np.std(rf_scores):.4f}")
print(f"Logistic Regression Accuracy: {np.mean(lr_scores):.4f} +/- {np.std(lr_scores):.4f}")

Random Forest Accuracy: 0.9735 +/- 0.0216
Logistic Regression Accuracy: 0.7621 +/- 0.0564


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [131]:
# nn_model = MLPClassifier(hidden_layer_sizes=(16, 8), activation='relu', solver='adam', 
#                          alpha=0.01, max_iter=5000, random_state=42)

# # Use a pipeline to scale data for the neural network
# nn_pipeline = make_pipeline(StandardScaler(), nn_model)


In [132]:
# voting_clf = VotingClassifier(estimators=[('rf', rfmodel), ('nn', nn_pipeline)], voting='soft')

# # Train the Voting Classifier
# voting_clf.fit(X_train, y_train)

# # Evaluate the model
# accuracy = voting_clf.score(X_test, y_test)
# print(f'Voting Classifier Accuracy: {accuracy:.4f}')

Voting Classifier Accuracy: 0.9565
