In [1]:
import json  # For working with JSON data format
import os    # For creating output directories before saving artifacts

import joblib  # For saving and loading machine learning models
import lightgbm as lgb  # LightGBM gradient boosting framework
import numpy as np   # Numerical computing library
import pandas as pd  # Data manipulation and analysis library

from sklearn.ensemble import RandomForestClassifier  # Random Forest model
from sklearn.linear_model import LogisticRegression  # Logistic Regression model
from sklearn.metrics import (
    accuracy_score,    # Accuracy metric
    precision_score,   # Precision metric
    recall_score,      # Recall metric
    f1_score,          # F1-score metric
    roc_auc_score,     # ROC AUC metric
    classification_report  # Detailed classification report
)
from sklearn.model_selection import train_test_split, GridSearchCV  # For splitting data and hyperparameter tuning

In [2]:
# --- Stage 3.0: load the pairwise feature table written by Stage 2 ---
print("--- Stage 3: Model Building and Training ---")
print("\n--- 3.0 Load processed pairwise features data from Stage 2 ---")

try:
    # Read the engineered pairwise features that Stage 2 saved to disk.
    pairwise_features_df = pd.read_csv("../data/pairwise_features_engineered.csv")
    print("Successfully loaded pairwise features data!")
except FileNotFoundError:
    print("File pairwise_features_engineered.csv not found.")
    print("Please ensure that Stage 2 has been run and the file has been saved, or the data exists in memory.")
    # Continue only when a frame from an earlier run is still in the kernel;
    # otherwise stop here instead of failing in a later cell with an
    # unrelated NameError on `pairwise_features_df`.
    if 'pairwise_features_df' not in globals():
        raise

--- Stage 3: Model Building and Training ---

--- 3.0 Load processed pairwise features data from Stage 2 ---
Successfully loaded pairwise features data!


In [3]:
# Sanity check of the loaded table: row count first, then the per-column
# dtype / non-null summary that DataFrame.info() prints.
n_pairs = pairwise_features_df.shape[0]
print(f"Number of samples in pairwise_features_df: {n_pairs}")
print("Information about pairwise_features_df:")
pairwise_features_df.info()

Number of samples in pairwise_features_df: 49758
Information about pairwise_features_df:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49758 entries, 0 to 49757
Data columns (total 22 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   user1                                  49758 non-null  int64  
 1   user2                                  49758 non-null  int64  
 2   target                                 49758 non-null  int64  
 3   age_diff                               49758 non-null  float64
 4   height_diff                            49758 non-null  float64
 5   geo_distance_km                        49758 non-null  float64
 6   user1_within_user2_loc_pref            49758 non-null  float64
 7   user2_within_user1_loc_pref            49758 non-null  float64
 8   orientation_compatible_user1_to_user2  49758 non-null  bool   
 9   orientation_compatible_user2_to_user1  49758 non-

In [4]:
# --- 3.1 Dataset Splitting ---
print("\n--- 3.1 Dataset Splitting ---")

# Columns that must never reach the model: the label plus the two user
# identifier columns (useful for identification, not as features).
non_feature_columns = ['target', 'user1', 'user2']

# errors='ignore' skips names that are absent, so every identifier column is
# removed whenever it is present. The previous if/else only dropped them when
# BOTH existed, leaving a lone 'user1' or 'user2' inside X as a feature.
X = pairwise_features_df.drop(columns=non_feature_columns, errors='ignore')

# Label vector; this raises KeyError if the table has no 'target' column.
y = pairwise_features_df['target']


--- 3.1 Dataset Splitting ---


In [5]:
# First split: hold out 20% of all pairs as the test set, stratified on the
# label so the class ratio is preserved.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Second split: take 20% of the remaining training data as a validation set,
# again stratified (roughly 64% / 16% / 20% train / validation / test overall).
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42,
)

In [6]:
# Report the shape of the feature matrix and label vector of every split.
print(
    f"Training set size (train_final): {X_train_final.shape}, {y_train_final.shape}",
    f"Validation set size: {X_val.shape}, {y_val.shape}",
    f"Test set size: {X_test.shape}, {y_test.shape}",
    sep="\n",
)

# Write the held-out test split to disk (without the index) for later use.
for held_out_part, csv_path in (
    (X_test, "../data/X_test_data.csv"),
    (y_test, "../data/y_test_data.csv"),
):
    held_out_part.to_csv(csv_path, index=False)

Training set size (train_final): (31844, 19), (31844,)
Validation set size: (7962, 19), (7962,)
Test set size: (9952, 19), (9952,)


In [7]:
# --- 3.2 Model Selection and Training ---
print("\n--- 3.2 Model Selection and Training ---")

# Registries filled in by the training cells:
#   models              -> fitted estimator per model name
#   model_predictions   -> predicted labels, keyed by "<model name>_val"
#   model_probabilities -> predicted positive-class probabilities, same keys
models = dict()
model_predictions = dict()
model_probabilities = dict()


--- 3.2 Model Selection and Training ---


In [8]:
# --- 3.2.1 Logistic Regression ---
print("\n--- Training Logistic Regression (default parameters initially) ---")

# Untuned baseline: liblinear solver with balanced class weights, fitted on
# the final training split (fit() returns the estimator itself).
log_reg_default = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
    solver='liblinear',
).fit(X_train_final, y_train_final)
models['Logistic Regression_default'] = log_reg_default

# Validation outputs: predicted labels and the second predict_proba column
# (probability of the positive class).
y_pred_log_reg_val_default = log_reg_default.predict(X_val)
y_proba_log_reg_val_default = log_reg_default.predict_proba(X_val)[:, 1]

model_predictions.update({'Logistic Regression_default_val': y_pred_log_reg_val_default})
model_probabilities.update({'Logistic Regression_default_val': y_proba_log_reg_val_default})

print("Logistic Regression (default) training completed.")


--- Training Logistic Regression (default parameters initially) ---
Logistic Regression (default) training completed.


In [9]:
# --- 3.2.2 Random Forest Classifier ---
print("\n--- Training Random Forest Classifier (default parameters initially) ---")

# Untuned baseline forest: 100 trees, balanced class weights, all CPU cores.
rf_clf_default = RandomForestClassifier(
    n_jobs=-1,
    class_weight='balanced',
    random_state=42,
    n_estimators=100,
).fit(X_train_final, y_train_final)
models['Random Forest_default'] = rf_clf_default

# Validation outputs: predicted labels and positive-class probabilities.
y_pred_rf_val_default = rf_clf_default.predict(X_val)
y_proba_rf_val_default = rf_clf_default.predict_proba(X_val)[:, 1]

model_predictions.update({'Random Forest_default_val': y_pred_rf_val_default})
model_probabilities.update({'Random Forest_default_val': y_proba_rf_val_default})

print("Random Forest (default) training completed.")


--- Training Random Forest Classifier (default parameters initially) ---
Random Forest (default) training completed.


In [10]:
# --- 3.2.3 LightGBM Classifier ---
print("\n--- Training LightGBM Classifier (default parameters initially) ---")

# Untuned baseline booster with balanced class weights.
lgbm_clf_default = lgb.LGBMClassifier(
    n_jobs=-1,
    class_weight='balanced',
    random_state=42,
)

# Early stopping with a patience of 10 rounds, monitored on the validation set.
# NOTE(review): X_val drives early stopping here and is also the set used to
# compare models afterwards, so this model's validation score may be slightly
# optimistic - confirm this is acceptable.
early_stopping_cb = lgb.early_stopping(10, verbose=False)
lgbm_clf_default.fit(
    X_train_final,
    y_train_final,
    eval_metric='auc',
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping_cb],
)
models['LightGBM_default'] = lgbm_clf_default

# Validation outputs: predicted labels and positive-class probabilities.
y_pred_lgbm_val_default = lgbm_clf_default.predict(X_val)
y_proba_lgbm_val_default = lgbm_clf_default.predict_proba(X_val)[:, 1]

model_predictions.update({'LightGBM_default_val': y_pred_lgbm_val_default})
model_probabilities.update({'LightGBM_default_val': y_proba_lgbm_val_default})

print("LightGBM (default) training completed.")


--- Training LightGBM Classifier (default parameters initially) ---
[LightGBM] [Info] Number of positive: 10615, number of negative: 21229
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002474 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 877
[LightGBM] [Info] Number of data points in the train set: 31844, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
LightGBM (default) training completed.


In [11]:
# --- 3.2.4 Hyperparameter Tuning with GridSearchCV ---
print("\n--- 3.2.4 Hyperparameter Tuning with GridSearchCV ---")

# Settings shared by every grid search in this stage.
SCORING_METRIC = 'roc_auc'  # scikit-learn scorer name; 'f1' or 'accuracy' are alternatives
CV_FOLDS = 3                # folds per candidate; more folds are more robust but slower

# --- Hyperparameter Tuning for Logistic Regression ---
print(f"\n--- Tuning Logistic Regression (scoring: {SCORING_METRIC}) ---")

# 5 regularization strengths x 2 penalty norms = 10 candidates.
# The liblinear solver is kept because it handles both l1 and l2 penalties.
param_grid_log_reg = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
)
grid_search_log_reg = GridSearchCV(
    LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42, solver='liblinear'),
    param_grid_log_reg,
    scoring=SCORING_METRIC,
    cv=CV_FOLDS,
    n_jobs=-1,
    verbose=1,
)
grid_search_log_reg.fit(X_train_final, y_train_final)
print(f"Best parameters for Logistic Regression: {grid_search_log_reg.best_params_}")
print(f"Best {SCORING_METRIC} score for Logistic Regression: {grid_search_log_reg.best_score_:.4f}")

# Register the best estimator found by the search.
log_reg_tuned = grid_search_log_reg.best_estimator_
models['Logistic Regression_tuned'] = log_reg_tuned

# Validation outputs of the tuned model: labels and positive-class probabilities.
y_pred_log_reg_tuned_val = log_reg_tuned.predict(X_val)
y_proba_log_reg_tuned_val = log_reg_tuned.predict_proba(X_val)[:, 1]
model_predictions.update({'Logistic Regression_tuned_val': y_pred_log_reg_tuned_val})
model_probabilities.update({'Logistic Regression_tuned_val': y_proba_log_reg_tuned_val})


--- 3.2.4 Hyperparameter Tuning with GridSearchCV ---

--- Tuning Logistic Regression (scoring: roc_auc) ---
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for Logistic Regression: {'C': 10, 'penalty': 'l2'}
Best roc_auc score for Logistic Regression: 0.7431


In [12]:
# --- Hyperparameter Tuning for Random Forest ---
print(f"\n--- Tuning Random Forest Classifier (scoring: {SCORING_METRIC}) ---")

# Deliberately small grid (2 x 3 x 3 x 3 = 54 candidates) to keep the run
# short; widen it for a more thorough search. max_features is not searched
# and stays at the estimator's default.
param_grid_rf = dict(
    n_estimators=[100, 200],         # number of trees
    max_depth=[None, 10, 20],        # maximum tree depth (None = unlimited)
    min_samples_split=[2, 5, 10],    # minimum samples needed to split an internal node
    min_samples_leaf=[1, 2, 4],      # minimum samples required at a leaf
)
grid_search_rf = GridSearchCV(
    RandomForestClassifier(n_jobs=-1, class_weight='balanced', random_state=42),
    param_grid_rf,
    scoring=SCORING_METRIC,
    cv=CV_FOLDS,
    n_jobs=-1,
    verbose=1,
)
grid_search_rf.fit(X_train_final, y_train_final)
print(f"Best parameters for Random Forest: {grid_search_rf.best_params_}")
print(f"Best {SCORING_METRIC} score for Random Forest: {grid_search_rf.best_score_:.4f}")

# Register the best estimator found by the search.
rf_tuned = grid_search_rf.best_estimator_
models['Random Forest_tuned'] = rf_tuned

# Validation outputs of the tuned model: labels and positive-class probabilities.
y_pred_rf_tuned_val = rf_tuned.predict(X_val)
y_proba_rf_tuned_val = rf_tuned.predict_proba(X_val)[:, 1]
model_predictions.update({'Random Forest_tuned_val': y_pred_rf_tuned_val})
model_probabilities.update({'Random Forest_tuned_val': y_proba_rf_tuned_val})


--- Tuning Random Forest Classifier (scoring: roc_auc) ---
Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best parameters for Random Forest: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Best roc_auc score for Random Forest: 0.7394


In [13]:
# --- Hyperparameter Tuning for LightGBM ---
print(f"\n--- Tuning LightGBM Classifier (scoring: {SCORING_METRIC}) ---")

# Deliberately small grid (3 x 3 x 3 x 3 = 81 candidates) to keep the run
# short. Further candidates left out for now: min_child_samples, subsample,
# colsample_bytree.
param_grid_lgbm = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    num_leaves=[20, 31, 40],     # maximum number of leaves in one tree
    max_depth=[-1, 10, 20],      # maximum tree depth, -1 means no limit
)

# No eval_set or early-stopping callback is passed: GridSearchCV runs its own
# cross-validation, so best_estimator_ is trained with the selected
# n_estimators. It could be re-fitted with early stopping afterwards if desired.
grid_search_lgbm = GridSearchCV(
    lgb.LGBMClassifier(n_jobs=-1, class_weight='balanced', random_state=42),
    param_grid_lgbm,
    scoring=SCORING_METRIC,
    cv=CV_FOLDS,
    n_jobs=-1,
    verbose=1,
)
grid_search_lgbm.fit(X_train_final, y_train_final)
print(f"Best parameters for LightGBM: {grid_search_lgbm.best_params_}")
print(f"Best {SCORING_METRIC} score for LightGBM: {grid_search_lgbm.best_score_:.4f}")

# Register the best estimator found by the search.
lgbm_tuned = grid_search_lgbm.best_estimator_
models['LightGBM_tuned'] = lgbm_tuned

# Validation outputs of the tuned model: labels and positive-class probabilities.
y_pred_lgbm_tuned_val = lgbm_tuned.predict(X_val)
y_proba_lgbm_tuned_val = lgbm_tuned.predict_proba(X_val)[:, 1]
model_predictions.update({'LightGBM_tuned_val': y_pred_lgbm_tuned_val})
model_probabilities.update({'LightGBM_tuned_val': y_proba_lgbm_tuned_val})

print("Hyperparameter tuning completed for all models.")


--- Tuning LightGBM Classifier (scoring: roc_auc) ---
Fitting 3 folds for each of 81 candidates, totalling 243 fits
[LightGBM] [Info] Number of positive: 7077, number of negative: 14153
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029509 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 876
[LightGBM] [Info] Number of data points in the train set: 21230, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 7076, number of negative: 14153
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003941 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 875


In [14]:
print("\n--- 3.3 Preliminary Evaluation on Validation Set ---")

# Every registered model (default and tuned variants) is reported in turn.
all_model_names = list(models)

for model_name in all_model_names:
    # Validation outputs are stored under "<model name>_val".
    val_pred_key = model_name + "_val"

    # Guard clause: nothing to evaluate when no predictions were stored.
    if val_pred_key not in model_predictions:
        print(f"Validation predictions for {model_name} not found (key: {val_pred_key}). Skipping evaluation.")
        continue

    y_pred = model_predictions[val_pred_key]
    y_proba = model_probabilities[val_pred_key]

    # Label-based metrics use y_pred; ROC AUC uses the predicted probabilities.
    print(f"\n--- Results for {model_name} on Validation Set ---")
    print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
    print(f"Precision: {precision_score(y_val, y_pred, zero_division=0):.4f}")
    print(f"Recall: {recall_score(y_val, y_pred, zero_division=0):.4f}")
    print(f"F1 Score: {f1_score(y_val, y_pred, zero_division=0):.4f}")
    print(f"ROC AUC: {roc_auc_score(y_val, y_proba):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred, zero_division=0))


--- 3.3 Preliminary Evaluation on Validation Set ---

--- Results for Logistic Regression_default on Validation Set ---
Accuracy: 0.6350
Precision: 0.4773
Recall: 0.9996
F1 Score: 0.6461
ROC AUC: 0.7402

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.45      0.62      5308
           1       0.48      1.00      0.65      2654

    accuracy                           0.64      7962
   macro avg       0.74      0.73      0.63      7962
weighted avg       0.83      0.64      0.63      7962


--- Results for Random Forest_default on Validation Set ---
Accuracy: 0.6623
Precision: 0.4908
Recall: 0.3512
F1 Score: 0.4094
ROC AUC: 0.7370

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.82      0.76      5308
           1       0.49      0.35      0.41      2654

    accuracy                           0.66      7962
   macro avg       0.60      0.58      0.59      7962
weigh

In [15]:
# --- 3.4 Determine and Save Best Model ---
print("\n--- 3.4 Determine and Save Best Model ---")

# 1. Collect the validation ROC AUC of every registered model.
# Probabilities are stored under "<model name>_val"; only that trailing suffix
# is stripped, so a model name that merely contains "_val" somewhere else
# (e.g. "X_validated") is neither mis-detected nor mangled.
VAL_SUFFIX = "_val"
validation_roc_auc_scores = {}
for model_name_key, proba_val in model_probabilities.items():
    if not model_name_key.endswith(VAL_SUFFIX):
        continue  # not a validation entry

    # Base name must match a key of the `models` dictionary.
    model_base_name = model_name_key[:-len(VAL_SUFFIX)]
    if model_base_name in models:
        score = roc_auc_score(y_val, proba_val)
        validation_roc_auc_scores[model_base_name] = score
        print(f"Validation ROC AUC for {model_base_name}: {score:.4f}")
    else:
        print(f"Warning: Model {model_base_name} not found in 'models' dictionary, but has validation probabilities.")


--- 3.4 Determine and Save Best Model ---
Validation ROC AUC for Logistic Regression_default: 0.7402
Validation ROC AUC for Random Forest_default: 0.7370
Validation ROC AUC for LightGBM_default: 0.7422
Validation ROC AUC for Logistic Regression_tuned: 0.7400
Validation ROC AUC for Random Forest_tuned: 0.7437
Validation ROC AUC for LightGBM_tuned: 0.7474


In [16]:
# 2. Pick the model with the highest validation ROC AUC.
if not validation_roc_auc_scores:
    # Nothing was scored: keep the sentinels (-1.0 is below any possible ROC AUC).
    best_overall_model_name, best_overall_roc_auc = None, -1.0
    print("No validation scores collected. Cannot determine the best model.")
else:
    # max() over (name, score) pairs keeps the first entry among ties.
    best_overall_model_name, best_overall_roc_auc = max(
        validation_roc_auc_scores.items(),
        key=lambda name_and_score: name_and_score[1],
    )


if best_overall_model_name and best_overall_model_name in models:
    print(f"\nOverall best model on validation set: {best_overall_model_name} with ROC AUC: {best_overall_roc_auc:.4f}")

    # 3. Save the best model object
    best_model_instance = models[best_overall_model_name]
    model_filename = "../models/best_overall_model.joblib"
    # Create the output directory first so a fresh checkout without ../models
    # does not fail after all the training work has been done.
    os.makedirs(os.path.dirname(model_filename), exist_ok=True)
    joblib.dump(best_model_instance, model_filename)
    print(f"Saved best model ({best_overall_model_name}) to {model_filename}")

    # 4. Save information about the best model
    # Metrics are recomputed from the stored validation predictions.
    best_model_val_pred_key = best_overall_model_name + "_val"
    y_pred_best_val = model_predictions[best_model_val_pred_key]
    y_proba_best_val = model_probabilities[best_model_val_pred_key]

    best_model_metrics = {
        'accuracy': accuracy_score(y_val, y_pred_best_val),
        'precision': precision_score(y_val, y_pred_best_val, zero_division=0),
        'recall': recall_score(y_val, y_pred_best_val, zero_division=0),
        'f1': f1_score(y_val, y_pred_best_val, zero_division=0),
        'roc_auc': roc_auc_score(y_val, y_proba_best_val)  # should match best_overall_roc_auc
    }

    best_model_info = {
        'best_model_name': best_overall_model_name,
        'saved_filename': model_filename,
        'validation_metrics': best_model_metrics,
        'parameters': best_model_instance.get_params()  # all parameters of the best model
    }

    # Add tuning details if the best model was a tuned one.
    if "_tuned" in best_overall_model_name:
        gs_obj = None
        original_model_type = best_overall_model_name.split('_')[0]  # e.g. "Logistic Regression"

        # Only the matching search object is looked up, so a search that was
        # never run for another model family does not matter here.
        if "Logistic Regression" in original_model_type:
            gs_obj = grid_search_log_reg
        elif "Random Forest" in original_model_type:
            gs_obj = grid_search_rf
        elif "LightGBM" in original_model_type:
            gs_obj = grid_search_lgbm

        # Explicit None check: do not rely on the truthiness of an estimator object.
        if gs_obj is not None:
            best_model_info['tuning_details'] = {
                'best_cv_params': gs_obj.best_params_,
                'best_cv_score': gs_obj.best_score_,
                'cv_scoring_metric': SCORING_METRIC
            }
        else:
            best_model_info['tuning_details'] = "Tuning object not found for the best model."

    best_model_summary_filename = "../models/best_model_summary.json"

    def convert_numpy_types(obj):
        """Convert numpy scalars/arrays to native Python types for json.

        Raises:
            TypeError: for any other type. Returning the object unchanged would
                make json report a circular reference (ValueError) and skip the
                default=str fallback below.
        """
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.bool_):
            return bool(obj)
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    # Serialize to a string first so a serialization failure can never leave a
    # truncated, half-written summary file on disk.
    summary_json = None
    used_str_fallback = False
    try:
        summary_json = json.dumps(best_model_info, indent=4, default=convert_numpy_types)
    except TypeError as e:
        print(f"Error serializing best model summary to JSON: {e}")
        print("Attempting to save with default=str as a fallback...")
        used_str_fallback = True
        try:
            summary_json = json.dumps(best_model_info, indent=4, default=str)
        except Exception as e_fallback:
            print(f"Fallback saving also failed: {e_fallback}")
    except Exception as e:
        print(f"An unexpected error occurred while saving best model summary: {e}")

    if summary_json is not None:
        try:
            with open(best_model_summary_filename, 'w') as f:
                f.write(summary_json)
        except OSError as e:
            print(f"An unexpected error occurred while saving best model summary: {e}")
        else:
            if used_str_fallback:
                print(f"Best model summary saved to {best_model_summary_filename} using default=str.")
            else:
                print(f"Best model summary saved to {best_model_summary_filename}")

else:
    if not best_overall_model_name:
        print("Could not determine the best model. No validation scores found or scores were too low.")
    elif best_overall_model_name not in models:
        print(f"Best model '{best_overall_model_name}' determined by score, but not found in 'models' dictionary.")

print("\n--- Stage 3 Completed ---")


Overall best model on validation set: LightGBM_tuned with ROC AUC: 0.7474
Saved best model (LightGBM_tuned) to ../models/best_overall_model.joblib
Best model summary saved to ../models/best_model_summary.json

--- Stage 3 Completed ---
