In [31]:
from google.cloud import bigquery
import time
import os

# Get the current working directory in a Jupyter notebook
current_dir = os.getcwd()

# Create the full path
key_path = os.path.join(current_dir, 'projet-integration-au-2024-81640cb2db70.json')

# Set the environment variable to the path of your JSON key file
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_path

print(f"Google Application Credentials set to: {os.environ['GOOGLE_APPLICATION_CREDENTIALS']}")

# Initialiser le client BigQuery
client = bigquery.Client()

Google Application Credentials set to: c:\Users\sacha\FORK\AI_Final\projet-integration-au-2024-81640cb2db70.json


In [32]:
# Define your dataset and table IDs
dataset_id = 'sacha_phishing_url_website'
table_id_stagging = "sacha_table_initial_stagging"
table_id_stagging_na_string = "sacha_table_initial_stagging_NA_string"

In [33]:
# Helper function to execute a query and log timing
def run_query_with_logging(query, model_name):
    start_time = time.time()
    print(f"Training started for {model_name}...")
    
    query_job = client.query(query)
    query_job.result()  # Wait for the query to complete
    
    end_time = time.time()
    print(f"Training completed for {model_name} in {end_time - start_time:.2f} seconds.")

# Fetch and print model information after training
def get_model_info(model_name):
    print(f"Fetching model details for {model_name}...")
    query = f"""
    SELECT
      *
    FROM
      ML.TRAINING_INFO(MODEL `{dataset_id}.{model_name}`)
    """
    query_job = client.query(query)
    results = query_job.result()

    print(f"Training information for {model_name}:")
    for row in results:
        print(row)

# Helper function to evaluate the trained model
def evaluate_model(model_name):
    print(f"Evaluating {model_name}...")
    
    # Use ML.EVALUATE to evaluate the model on the entire dataset
    query = f"""
    SELECT
      *
    FROM
      ML.EVALUATE(
        MODEL `{dataset_id}.{model_name}`,
        (SELECT * FROM `{dataset_id}.{table_id_stagging_na_string}`)
      )
    """
    
    query_job = client.query(query)
    results = query_job.result()

    print(f"Evaluation results for {model_name}:")
    for row in results:
        print(row)

In [34]:
# Train logistic regression model
def train_logistic_regression():
    query = f"""
    CREATE OR REPLACE MODEL `{dataset_id}.logistic_regression_model`
    OPTIONS(
        model_type = 'logistic_reg',
        auto_class_weights = TRUE,
        max_iterations = 10,  -- Minimal number of iterations for testing
        data_split_method = 'RANDOM',  -- Automatically split the data
        data_split_eval_fraction = 0.3  -- Use 30% of the data for evaluation
    ) AS
    SELECT * FROM `{dataset_id}.{table_id_stagging_na_string}`
    """
    run_query_with_logging(query, "logistic_regression_model")
    print("Logistic Regression model trained")


# Train DNN classifier model
def train_dnn_classifier():
    query = f"""
    CREATE OR REPLACE MODEL `{dataset_id}.dnn_classifier_model`
    OPTIONS(
        model_type = 'dnn_classifier',
        hidden_units = [32],  -- Minimal number of hidden units
        max_iterations = 10,  -- Minimal number of iterations for testing
        data_split_method = 'RANDOM',  -- Automatically split the data
        data_split_eval_fraction = 0.3  -- Use 30% of the data for evaluation
    ) AS
    SELECT * FROM `{dataset_id}.{table_id_stagging_na_string}`
    """
    run_query_with_logging(query, "dnn_classifier_model")
    print("DNN Classifier model trained")


# Train Decision Tree model
def train_decision_tree():
    query = f"""
    CREATE OR REPLACE MODEL `{dataset_id}.decision_tree_model`
    OPTIONS(
        model_type = 'boosted_tree_classifier',
        max_iterations = 10,  -- Minimal number of iterations
        data_split_method = 'RANDOM',  -- Automatically split the data
        data_split_eval_fraction = 0.3  -- Use 30% of the data for evaluation
    ) AS
    SELECT * FROM `{dataset_id}.{table_id_stagging_na_string}`
    """
    run_query_with_logging(query, "decision_tree_model")
    print("Decision Tree model trained")


# Train Random Forest model
def train_random_forest():
    query = f"""
    CREATE OR REPLACE MODEL `{dataset_id}.random_forest_model`
    OPTIONS(
        model_type = 'random_forest_classifier',
        data_split_method = 'RANDOM',  -- Use RANDOM split for evaluation
        data_split_eval_fraction = 0.3  -- Use 30% of the data for evaluation
    ) AS
    SELECT * FROM `{dataset_id}.{table_id_stagging_na_string}`
    """
    run_query_with_logging(query, "random_forest_model")
    print("Random Forest model trained")


# Train XGBoost classifier model
def train_xgboost():
    query = f"""
    CREATE OR REPLACE MODEL `{dataset_id}.xgboost_model`
    OPTIONS(
        model_type = 'boosted_tree_classifier',
        max_iterations = 10,  -- Number of boosting rounds
        data_split_method = 'RANDOM',  -- Use RANDOM split for evaluation
        data_split_eval_fraction = 0.3  -- Use 30% of the data for evaluation
    ) AS
    SELECT * FROM `{dataset_id}.{table_id_stagging_na_string}`
    """
    run_query_with_logging(query, "xgboost_model")
    print("XGBoost model trained")

In [35]:
# Train and evaluate all models with minimal configurations
def run_all_models_and_evaluate():
    # Train models with minimal settings
    train_logistic_regression()
    #train_dnn_classifier()
    train_decision_tree()
    train_random_forest()
    train_xgboost()

    # Evaluate models
    evaluate_model('logistic_regression_model')
    #evaluate_model('dnn_classifier_model')
    evaluate_model('decision_tree_model')
    evaluate_model('random_forest_model')
    evaluate_model('xgboost_model')

# Run all models and evaluation
run_all_models_and_evaluate()

Training started for logistic_regression_model...
Training completed for logistic_regression_model in 132.55 seconds.
Logistic Regression model trained
Training started for decision_tree_model...
Training completed for decision_tree_model in 298.16 seconds.
Decision Tree model trained
Training started for random_forest_model...
Training completed for random_forest_model in 310.71 seconds.
Random Forest model trained
Training started for xgboost_model...
Training completed for xgboost_model in 177.62 seconds.
XGBoost model trained
Evaluating logistic_regression_model...
Evaluation results for logistic_regression_model:
Row((0.9119037685996384, 0.8305100845391508, 0.8596906326997145, 0.8693058478466203, 0.3434720829050217, 0.9244285714285714), {'precision': 0, 'recall': 1, 'accuracy': 2, 'f1_score': 3, 'log_loss': 4, 'roc_auc': 5})
Evaluating decision_tree_model...
Evaluation results for decision_tree_model:
Row((0.9429475265925896, 0.9437909634930184, 0.9363342020760876, 0.9433690565201