## Executing a simple sklearn machine learning in HTCondor Executor

### Pre-requisites
- Make sure all those Condor Executors have `pandas` and `scikit-learn` Python library installed via `sudo pip3 installl pandas scikit-learn`.
- Install NFS server on Condor Submit, and mount it to Condor Executors
- For your Submit Condor and Condor Executors, you need to change the ownership and permission for your home directory and NFS directory

### Running `scikit-learn` in HTCondor Executor

In [1]:
# Installing necessary libraries
import os
import htcondor

In [2]:
%%writefile ./data/scripts/train_loan_prediction.py
#!/usr/bin/env python3
# Import necessary libraries
print("------------")
print("05-loading-csv-from-nfs-in-htcondor-executor.ipynb")
print("------------")

# Import necessary libraries
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the loan dataset from NFS filesystem
base_dir = "/home/tanyongsheng_net/data"

CSV_file = os.path.join(base_dir, "loan_data.csv")
data = pd.read_csv(CSV_file)

# Inspect the dataset
print(data.head())

# Preprocess the data: Handle missing values, encoding categorical features, etc.
# Example of filling missing values with the mean or mode
data.fillna(data.mean(), inplace=True)

# Encoding categorical features (example: 'Gender', 'Married' columns)
data = pd.get_dummies(data, drop_first=True)

# Define the target and features
X = data.drop('Loan_Status', axis=1)  # Assuming 'Loan_Status' is the target column
y = data['Loan_Status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (optional but recommended)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

print(f"Accuracy: {accuracy:.2f}")
print("Task completed!")

Overwriting ./data/scripts/train_loan_prediction.py


In [3]:
# make sure the python script is executable
!chmod 764 ./data/scripts/train_loan_prediction.py

- Setting up Configuration file for HTCondor Submit

In [4]:
base_dir = "/home/tanyongsheng_net/data"

train_ml_job = htcondor.Submit({
    "executable": os.path.join(base_dir, "/scripts/train_loan_prediction.py"),  # Use bash to execute shell commands
    "request_cpus": "1",            # Number of CPU cores required
    "request_memory": "128MB",      # Memory required
    "request_disk": "128MB",        # Disk space required
    "output": os.path.join(base_dir, "output/train_loan_prediction.out"),  # Standard output file
    "error": os.path.join(base_dir, ".error/train_loan_prediction.err"),    # Standard error file
    "log": os.path.join(base_dir, "log/train_loan_prediction.log"),        # Log file
})

print(train_ml_job)

executable = /scripts/train_loan_prediction.py
request_cpus = 1
request_memory = 128MB
request_disk = 128MB
output = /home/tanyongsheng_net/data/output/train_loan_prediction.out
error = /home/tanyongsheng_net/data/.error/train_loan_prediction.err
log = /home/tanyongsheng_net/data/log/train_loan_prediction.log



- Submit the task to HTCondor task

In [5]:
schedd = htcondor.Schedd()                   # get the Python representation of the scheduler
submit_result = schedd.submit(train_ml_job)  # submit the job
print(submit_result.cluster())               # print the job's ClusterId

34


In [6]:
schedd.query(
    constraint='ClusterId =?= {}'.format(submit_result.cluster()),
    projection=["ClusterId", "ProcId", "JobStatus", "EnteredCurrentStatus"],
)

[[ ProcId = 0; ClusterId = 34; JobStatus = 1; ServerTime = 1736437982; EnteredCurrentStatus = 1736437982 ]]

## Monitoring condor_status in Jupyter notebook

- Basically, it's same as running `wait -n 1 condor_status` in terminal

In [None]:
import subprocess
import time
from IPython.display import clear_output, display, HTML

# Function to run the condor_status command and display the output
def display_condor_status():
    while True:
        result = subprocess.run(['condor_status'], capture_output=True, text=True)
        
        # Clear the previous output before updating
        clear_output(wait=True)

        # Format the output with proper wrapping and limit cell size
        html_output = f"<pre style='white-space: pre-wrap; word-wrap: break-word; max-height: 600px; overflow: auto;'>{result.stdout}</pre>"
        
        # Display the formatted output
        display(HTML(html_output))
        
        # Wait for 1 second before updating
        time.sleep(1)

# Run the display function
display_condor_status()