In [1]:
import argparse
from pathlib import Path
from data_processing import DataProcessor
from models import ModelTrainer
from feature_selection import FeatureSelector
from visualization import Visualizer
from causal_inference import CausalInference

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
print("Starting ML Pipeline...")
base_dir = '../../../'
print(f"Base directory set to: {base_dir}")

data_path = base_dir + 'dataset/' + 'data_full.xlsx'
raw_data_path = base_dir + 'dataset/' + 'result_raw.xlsx'
result_dir = base_dir + 'result/'

report_file_path = result_dir + 'report.txt'

print("Loading data...")
data_processor = DataProcessor(data_path=str(data_path))
df = data_processor.load_data_metabolites()
print("Data loaded successfully.")

Starting ML Pipeline...
Base directory set to: ../../../
Loading data...
Data loaded successfully.


In [7]:
print("Preprocessing raw data...")
raw_df = data_processor.preprocess_raw_data(raw_data_path=str(raw_data_path))
print("Raw data preprocessed successfully.")

print("Encoding labels...")
df_encoded, label_encoder = data_processor.encode_labels(df, label_column='Group')
print("Labels encoded successfully.")

X = df_encoded.drop(columns=['Group'])
y = df_encoded['Group']

print("Training Random Forest model...")
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 7],
    'min_samples_leaf': [1, 2, 4]
}
model_trainer = ModelTrainer(X, y)
model, best_params = model_trainer.train_random_forest(param_dist)
print("Model trained successfully.")

Preprocessing raw data...
Raw data preprocessed successfully.
Encoding labels...
Labels encoded successfully.
Training Random Forest model...
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Model trained successfully.


In [10]:
print("Evaluating model...")
accuracy, report = model_trainer.evaluate_model()
print("Model evaluation completed.")

print("Saving trained model...")
model_trainer.save_model(str(result_dir + 'best_random_forest_model.pkl'))
print("Model saved successfully.")



Evaluating model...
Model evaluation completed.
Saving trained model...
Model saved successfully.


In [12]:
print("Performing feature selection using Gini importance...")
feature_selector = FeatureSelector(model, model_trainer.X_train)
selected_features_gini = feature_selector.gini_importance(threshold=0.01)
print("Feature selection (Gini importance) completed.")

print("Performing feature selection using SHAP importance...")
selected_features_shap = feature_selector.shap_importance(model_trainer.X_test, threshold=0.005)
print("Feature selection (SHAP importance) completed.")

print("Generating SHAP summary plot...")
visualizer = Visualizer()
shap_summary_path = result_dir + 'shap_summary.png'
visualizer.plot_shap_summary(model, model_trainer.X_test, str(shap_summary_path))
print(f"SHAP summary plot saved at: {shap_summary_path}")


Performing feature selection using Gini importance...
Feature selection (Gini importance) completed.
Performing feature selection using SHAP importance...
Feature selection (SHAP importance) completed.
Generating SHAP summary plot...
SHAP summary plot saved at: ../../../result/shap_summary.png


In [15]:
print("Performing causal inference...with SHAP selected and Group")
causal_features = selected_features_shap.to_list() + ['Group']
df_causal = df_encoded[causal_features]
causal_inference = CausalInference(df_causal)
causal_graph = causal_inference.run_pc_algorithm()
print("Causal inference completed.")

print("Drawing causal graph...")
causal_graph_path = result_dir + 'causal_graph.png'
causal_inference.draw_graph(str(causal_graph_path))
print(f"Causal graph saved at: {causal_graph_path}")

print("Writing report...")
with open(report_file_path, 'w') as report_file:
    report_file.write("First few rows of the dataset:\n")
    report_file.write(df.head().to_string())
    report_file.write("\n\n")
    report_file.write("Best Parameters:\n")
    report_file.write(str(best_params))
    report_file.write("\n\n")
    report_file.write(f"Accuracy: {accuracy * 100:.2f}%\n\n")
    report_file.write("Classification Report:\n")
    report_file.write(report)
    report_file.write("\n\n")
    report_file.write("Selected Features (Gini Importance):\n")
    report_file.write(', '.join(selected_features_gini))
    report_file.write("\n\n")
    report_file.write("Selected Features (SHAP Importance):\n")
    report_file.write(', '.join(selected_features_shap))
    report_file.write("\n\n")
    report_file.write(f"SHAP summary plot saved as '{shap_summary_path}'.\n\n")
    report_file.write(f"Causal graph saved as '{causal_graph_path}'.\n")
print("Report written successfully.")

print(f"Report generated at '{report_file_path}'.")
print("Process completed successfully.")


Performing causal inference...with SHAP selected and Group


Depth=6, working on node 20: 100%|██████████| 21/21 [00:00<00:00, 1415.01it/s]


Causal inference completed.
Drawing causal graph...
Causal graph saved at: ../../../result/causal_graph.png
Writing report...
Report written successfully.
Report generated at '../../../result/report.txt'.
Process completed successfully.


In [23]:
from causallearn.search import IDA


ImportError: cannot import name 'IDA' from 'causallearn.search' (c:\Users\snorl\Desktop\FYP\venv\lib\site-packages\causallearn\search\__init__.py)