In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix, recall_score, f1_score
from sklearn.metrics import roc_curve, auc

In [None]:
def load_and_preprocess_data(filename):
    """Read the dataset from a CSV file and collapse cluster labels 2 and 3 into 0.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read. It must contain a ``Pat`` column (used as the
        index) and a ``clusters_pred`` column.

    Returns
    -------
    pandas.DataFrame
        The file contents indexed by ``Pat``. Cells holding the literal
        string ``NULL`` are read as missing values, and ``clusters_pred``
        values 2 and 3 are replaced by 0; all other values are unchanged.

    Raises
    ------
    KeyError
        If the ``Pat`` or ``clusters_pred`` column is absent.
    """
    data = pd.read_csv(filename, na_values='NULL').set_index('Pat')

    # Labels 2 and 3 are merged into 0; every other label is kept as is
    # (presumably label 1 is the major cluster -- confirm against the data).
    # The result is assigned back to the column instead of calling
    # replace(..., inplace=True) on data["clusters_pred"]: that in-place call
    # acts on a selected column and does not update the frame when pandas
    # Copy-on-Write is active.
    data["clusters_pred"] = data["clusters_pred"].replace([2, 3], 0)

    return data

In [None]:
def train_random_forest(X_train, y_train, random_state=1):
    """Fit a RandomForestClassifier on the given training data.

    Parameters
    ----------
    X_train
        Training features, passed straight to ``fit``.
    y_train
        Training labels, passed straight to ``fit``.
    random_state : int, default 1
        Seed forwarded to the ``RandomForestClassifier`` constructor; all
        other hyper-parameters keep their library defaults.

    Returns
    -------
    RandomForestClassifier
        The fitted estimator.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return RandomForestClassifier(random_state=random_state).fit(X_train, y_train)

In [None]:
def evaluate_classifier(classifier, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    classifier
        Object exposing ``predict(X)``.
    X_test
        Features to predict on.
    y_test
        True labels matching ``X_test``.

    Returns
    -------
    tuple
        ``(accuracy, confusion, recall, f1)`` as produced by
        ``accuracy_score``, ``confusion_matrix``, ``recall_score`` and
        ``f1_score``, each called with its default arguments.
    """
    y_pred = classifier.predict(X_test)

    # The tuple order is the function's return contract: keep it stable.
    metric_functions = (accuracy_score, confusion_matrix, recall_score, f1_score)
    return tuple(metric(y_test, y_pred) for metric in metric_functions)

In [None]:
def plot_roc_curve(classifier, X_test, y_test):
    """Draw the ROC curve of a fitted classifier and display it.

    Parameters
    ----------
    classifier
        Object exposing ``predict_proba(X)``. The second probability column
        (index 1) is used as the score.
    X_test
        Features to score.
    y_test
        True labels matching ``X_test``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; nothing is returned.
    """
    # Score each sample with the probability in column 1 of predict_proba.
    positive_scores = classifier.predict_proba(X_test)[:, 1]

    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)
    area_under_curve = auc(false_pos_rate, true_pos_rate)

    plt.figure()
    ax = plt.gca()
    ax.set_title('Receiver Operating Characteristic')
    ax.plot(false_pos_rate, true_pos_rate, 'b', label='AUC = %0.2f' % area_under_curve)
    ax.legend(loc='lower right')
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    ax.plot([0, 1], [0, 1], 'r--')
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')
    plt.show()

In [None]:
# Define paths and other parameters
input_filename = "path_to_input_file.csv"
random_state = 1

# Load and preprocess data
data = load_and_preprocess_data(input_filename)

# Split the data into features and target variable.
# The target is removed from the features by name rather than by position:
# slicing off "the last column" only works when 'clusters_pred' happens to be
# last, and otherwise leaks the label into X while dropping a real feature.
X = data.drop(columns=['clusters_pred'])
y = data['clusters_pred']
# Hold out 30% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

# Train Random Forest classifier
classifier = train_random_forest(X_train, y_train, random_state=random_state)

# Evaluate classifier
accuracy, confusion, recall, f1 = evaluate_classifier(classifier, X_test, y_test)
print(f"Accuracy of the classifier is: {accuracy}")
print("Confusion Matrix:")
print(confusion)
print(f"Recall Score of the classifier is: {recall}")
print(f"F1 Score of the classifier is: {f1}")

# Plot ROC Curve
plot_roc_curve(classifier, X_test, y_test)