In [None]:
# Copyright © 2025 Yao Yuzhuo (yaoyuzhuo6@gmail.com). For academic research only. Commercial use is strictly prohibited.

import itertools
from collections import Counter

import imblearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydotplus
import seaborn as sns
import statsmodels.formula.api as smf
import torch
import torch.nn as nn
import torch.optim as optim
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from IPython.display import Image
from pytorch_tabnet.tab_model import TabNetClassifier
from scipy.stats import norm
from six import StringIO
from sklearn import metrics
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from termcolor import colored as cl
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Load data (this should be after imports)
# Both CSV files are ';'-separated. DBS.csv is used as the training set and
# DBS_2020.csv as the held-out test set by every model section below.
train_data = pd.read_csv('DBS.csv', sep=';')
test_data = pd.read_csv('DBS_2020.csv', sep=';')

# Shared feature matrix / target vector. Each model section works on its own
# np.copy of these arrays, so they are never modified after this cell.
feature_columns = ['access', 'tests', 'assignments']
target_column = 'graduate'
X_train_common, y_train_common = (
    np.asarray(train_data[feature_columns]),
    np.asarray(train_data[target_column]),
)
X_test_common, y_test_common = (
    np.asarray(test_data[feature_columns]),
    np.asarray(test_data[target_column]),
)

# Columns summarised in the descriptive-statistics tables (Table I and Table II)
descriptive_columns = ['access', 'tests', 'assignments', 'exam', 'project']

# New: Generate Table I - Descriptive Statistics for 2016-2019 Dataset
independent_vars = train_data[descriptive_columns]
desc_stats_2019 = independent_vars.describe()
print("Table I: Descriptive Statistics of Independent Variables (Accesses, Assignments, Tests, Exam, and Project) for the 2016–2019 Dataset")
print(desc_stats_2019)

# New: Generate Figure 2 - Correlation Matrix
# Pairwise correlations between the independent variables and the final result points.
correlation_columns = ['access', 'assignments', 'tests', 'exam', 'project', 'result_points']
corr_matrix = train_data[correlation_columns].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.gca().set_title('Figure 2: Correlation Matrix of Independent Variables and Result Points')
plt.show()

# New: Generate Table II - Descriptive Statistics for 2020 Dataset
independent_vars_2020 = test_data[descriptive_columns]
desc_stats_2020 = independent_vars_2020.describe()
print("Table II: Descriptive Statistics of Independent Variables (Accesses, Assignments, Tests, Exam, and Project) for the 2020 Dataset")
print(desc_stats_2020)

In [None]:
# Section 1: Decision Tree

# Copy of data for this model
X_train = np.copy(X_train_common)
y_train = np.copy(y_train_common)
X_test = np.copy(X_test_common)
y_test = np.copy(y_test_common)

# NOTE(review): this scaler is created but never fitted or applied in this section;
# the line is kept only so the name stays defined for any later cell that expects it.
standardizer = StandardScaler()

# Initialize classifier (seeded so that repeated runs build the same tree)
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=42)

# Cross-validation is run on the TRAINING data. cross_val_score refits a fresh clone of the
# estimator on every fold, so running it on the test set (as before) scored models that had
# been trained on 2020 data and said nothing about the model evaluated further down.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=4)

# Evaluate pipeline without oversampling
scores = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC (without oversampling): %.3f' % np.mean(scores))

# Evaluate pipeline with oversampling. SMOTE lives inside the pipeline so that it is fitted on
# the training part of each fold only; the held-out part of the fold stays free of synthetic rows.
# (Previously this second score was computed on exactly the same inputs as the first one.)
smote_tree = ImbPipeline(steps=[('smote', SMOTE(random_state=42)), ('tree', clf)])
scores = cross_val_score(smote_tree, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)

# Handle class imbalance with SMOTE (training data only)
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

# Summarize new class distribution
counter = Counter(y_train)
print("Class distribution after SMOTE:", counter)

print('Mean ROC AUC (with oversampling): %.3f' % np.mean(scores))

# Train the final model on the balanced training set. The original cell fitted the tree
# before resampling, so the SMOTE output was never used by the evaluated model.
clf = clf.fit(X_train, y_train)

# Predictions on the untouched 2020 test set
yhat = clf.predict(X_test)
yhat_prob = clf.predict_proba(X_test)

# Unified Evaluation
print("\nClassification Report:\n", classification_report(y_test, yhat))
accuracy = accuracy_score(y_test, yhat)
print("Accuracy:", accuracy)
cm = confusion_matrix(y_test, yhat)
print("Confusion Matrix:\n", cm)
# Column 1 of predict_proba is used as the score of the positive class
auc = roc_auc_score(y_test, yhat_prob[:, 1])
print("AUC-ROC:", auc)

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, yhat_prob[:, 1])
plt.plot(fpr, tpr, label=f"Decision Tree, AUC={auc:.2f}")
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.show()

# Visualize Decision Tree: export to DOT text, render with pydotplus, save and display the PNG
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data, filled=True, rounded=True, special_characters=True,
                     feature_names=['access', 'tests', 'assignments'], class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('tree.png')
Image(graph.create_png())

In [None]:
# Section 2: Logistic Regression

# Copy of data for this model
X_train = np.copy(X_train_common)
y_train = np.copy(y_train_common)
X_test = np.copy(X_test_common)
y_test = np.copy(y_test_common)

# Display basic information
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)
print("Train data columns:", list(train_data.columns))

# Check class distribution
print("\nTrain graduate distribution:\n", train_data['graduate'].value_counts())
print("Test graduate distribution:\n", test_data['graduate'].value_counts())

# Display samples
print(cl('\nX_train samples:', attrs=['bold']), X_train[:5])
print(cl('y_train samples:', attrs=['bold']), y_train[:5])
print(cl('\nX_test samples:', attrs=['bold']), X_test[:5])
print(cl('y_test samples:', attrs=['bold']), y_test[:5])

# Normalize test data
# X_test_scaled is used ONLY for the descriptive printout below; the model in this
# section is trained and evaluated on the raw (unscaled) features.
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test)
print("\nNormalized X_test description:\n", pd.DataFrame(X_test_scaled).describe())

# Visualize original class distribution (feature 0 = access on x, feature 1 = tests on y)
counter = Counter(y_train)
print("\nOriginal class distribution:", counter)
for label in counter:
    row_ix = np.where(y_train == label)[0]
    plt.scatter(X_train[row_ix, 0], X_train[row_ix, 1], label=f'Class {label}')
plt.legend()
plt.title("Original Class Distribution")
plt.show()

# Apply SMOTE to balance training data
sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Visualize resampled class distribution
counter_res = Counter(y_train_res)
print("\nResampled class distribution:", counter_res)
for label in counter_res:
    row_ix = np.where(y_train_res == label)[0]
    plt.scatter(X_train_res[row_ix, 0], X_train_res[row_ix, 1], label=f'Class {label}')
plt.legend()
plt.title("Resampled Class Distribution (SMOTE)")
plt.show()

# Train Logistic Regression model
lr_model = LogisticRegression()
lr_model.fit(X_train_res, y_train_res)
print(cl('\nLogistic Regression Model:', attrs=['bold']), lr_model)

# Evaluate model with cross-validation.
# cross_val_score refits a clone of the estimator on every fold, so it has to be given the
# TRAINING data: running it on the test set (as before) scored models trained on 2020 data
# rather than the training procedure used above. SMOTE sits inside the pipeline so that each
# fold is oversampled using its own training part only.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
lr_cv_pipeline = ImbPipeline(steps=[('smote', SMOTE(random_state=2)), ('lr', LogisticRegression())])
scores = cross_val_score(lr_cv_pipeline, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % np.mean(scores))

# Make predictions on the held-out 2020 test set
yhat = lr_model.predict(X_test)
yhat_prob = lr_model.predict_proba(X_test)

# Unified Evaluation
print("\nClassification Report:\n", classification_report(y_test, yhat))
accuracy = accuracy_score(y_test, yhat)
print("Accuracy:", accuracy)
cm = confusion_matrix(y_test, yhat)
print("Confusion Matrix:\n", cm)
# Column 1 of predict_proba is used as the score of the positive class
auc = roc_auc_score(y_test, yhat_prob[:, 1])
print("AUC-ROC:", auc)
# Confusion Matrix Plot
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix. Rows are labelled 'True label' and columns 'Predicted label'.
    classes : sequence
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its sum before drawing, so both the colours
        and the printed numbers are per-true-class fractions.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.

    Returns
    -------
    None
        The plot is drawn with pyplot; the caller creates the figure and calls ``plt.show()``.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing. Previously this happened after imshow(), so the colours
    # showed raw counts while the text showed fractions.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples would divide by zero; keep such rows at 0 instead of NaN.
        cm = cm.astype(float) / np.where(row_totals == 0, 1, row_totals)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is, fractions with two decimals (instead of full float repr).
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Cells darker than half of the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format), horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Recall of class 1 is read from the second row of the matrix: TP / (FN + TP)
cnf_matrix = confusion_matrix(y_test, yhat)
fn_count, tp_count = cnf_matrix[1, 0], cnf_matrix[1, 1]
print(f"\nRecall: {100 * tp_count / (fn_count + tp_count):.2f}%")
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=[0, 1], title='Confusion Matrix')
plt.show()

# ROC Curve (scores are the class-1 column of predict_proba)
fpr, tpr, _ = roc_curve(y_test, yhat_prob[:, 1])
roc_ax = plt.gca()
roc_ax.plot(fpr, tpr, label=f"Logistic Regression, AUC={auc:.3f}")
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.legend(loc=4)
roc_ax.set_title('ROC Curve')
plt.show()

# Fit logistic regression using statsmodels
# Same three predictors, fitted on the original (non-resampled) training frame,
# to obtain the coefficient table printed by summary().
logit_formula = "graduate ~ access + assignments + tests"
logit_model = smf.logit(logit_formula, data=train_data)
results = logit_model.fit()
print("\nStatsmodels Logistic Regression Summary:")
print(results.summary())

In [None]:
# Section 3: Naive Bayes

# Copy of data for this model
X_train = np.copy(X_train_common)
y_train = np.copy(y_train_common)
X_test = np.copy(X_test_common)
y_test = np.copy(y_test_common)

# Set plot size
# NOTE: this changes matplotlib's global default, so it also applies to figures
# created by later cells that do not pass an explicit figsize.
plt.rcParams['figure.figsize'] = (20, 10)

# Print training data preview
print("Training data preview:\n", train_data.head())

# Initialize and train Naive Bayes model (no oversampling is applied in this section)
model = GaussianNB(priors=None, var_smoothing=1e-09)
model.fit(X_train, y_train)

# Evaluate model with cross-validation.
# cross_val_score refits a clone of the estimator on every fold, so it is given the TRAINING
# data; on the test set (as before) it scored models trained on 2020 data, not the model
# fitted above.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=4)
scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
print(f"\nMean ROC AUC: {np.mean(scores):.3f}")

# Make predictions on the held-out 2020 test set
predictions = model.predict(X_test)
# Column 1 of predict_proba is used as the score of the positive class
yhat_prob = model.predict_proba(X_test)[:, 1]

# Unified Evaluation
print("\nClassification Report:\n", classification_report(y_test, predictions))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
cm = confusion_matrix(y_test, predictions)
print("Confusion Matrix:\n", cm)
auc = roc_auc_score(y_test, yhat_prob)
print("AUC-ROC:", auc)

# Confusion Matrix Plot
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix. Rows are labelled 'True Label' and columns 'Predicted Label'.
    classes : sequence
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its sum before drawing, so both the colours
        and the printed numbers are per-true-class fractions.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.

    Returns
    -------
    None
        The plot is drawn with pyplot; the caller creates the figure and calls ``plt.show()``.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing. Previously this happened after imshow(), so the colours
    # showed raw counts while the text showed fractions.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples would divide by zero; keep such rows at 0 instead of NaN.
        cm = cm.astype(float) / np.where(row_totals == 0, 1, row_totals)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is, fractions with two decimals (instead of full float repr).
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Cells darker than half of the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format), horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

# Recall of class 1 is read from the second row of the matrix: TP / (FN + TP)
cnf_matrix = confusion_matrix(y_test, predictions)
fn_count, tp_count = cnf_matrix[1, 0], cnf_matrix[1, 1]
recall = 100 * tp_count / (fn_count + tp_count)
print(f"\nRecall metric in the testing dataset: {recall:.2f}%")
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=[0, 1], title='Confusion Matrix')
plt.show()

# ROC Curve (yhat_prob already holds the class-1 probabilities)
fpr, tpr, _ = roc_curve(y_test, yhat_prob)
roc_ax = plt.gca()
roc_ax.plot(fpr, tpr, label=f"Naive Bayes, AUC={auc:.2f}")
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend(loc=4)
plt.show()

In [None]:
# Section 4: MLP (Improved MLP)

# Private copies of the shared arrays for this model
X_train, y_train = np.copy(X_train_common), np.copy(y_train_common)
X_test, y_test = np.copy(X_test_common), np.copy(y_test_common)

# Feature standardization: statistics are learned on the training set and reused for the test set
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# SMOTE (applied to the training data only)
oversample = SMOTE(random_state=42)
X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)

# To PyTorch tensors: float32 features, targets reshaped into a single float column
X_train_tensor = torch.tensor(X_train_res, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_res, dtype=torch.float32).reshape(-1, 1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)

# Split train/val: 20% of the resampled training tensors are held out for early stopping.
# From here on X_train / y_train are tensors, no longer numpy arrays.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_tensor,
    y_train_tensor,
    test_size=0.2,
    random_state=42,
)

# DataLoaders: only the training loader is shuffled
train_dataset, val_dataset, test_dataset = (
    TensorDataset(X_train, y_train),
    TensorDataset(X_val, y_val),
    TensorDataset(X_test_tensor, y_test_tensor),
)

mlp_batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=mlp_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=mlp_batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=mlp_batch_size, shuffle=False)

# EnhancedMLP model
class EnhancedMLP(nn.Module):
    """Binary classifier: three Linear -> ReLU -> Dropout stages followed by a sigmoid output unit.

    Parameters
    ----------
    input_size : int
        Number of input features.
    hidden_size1, hidden_size2, hidden_size3 : int
        Widths of the three hidden layers, in order.
    dropout_rate : float, default 0.3
        Dropout probability applied after every hidden activation.

    The forward pass returns one value per sample, squashed to (0, 1) by the sigmoid.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3, dropout_rate=0.3):
        super().__init__()
        # Sub-modules are registered as fc{k} / relu{k} / dropout{k}, k = 1..3, in that order,
        # followed by the output layer fc4 and the sigmoid.
        layer_widths = (input_size, hidden_size1, hidden_size2, hidden_size3)
        for stage, (n_in, n_out) in enumerate(zip(layer_widths, layer_widths[1:]), start=1):
            setattr(self, f"fc{stage}", nn.Linear(n_in, n_out))
            setattr(self, f"relu{stage}", nn.ReLU())
            setattr(self, f"dropout{stage}", nn.Dropout(dropout_rate))
        self.fc4 = nn.Linear(hidden_size3, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output for a batch ``x`` whose last dimension is ``input_size``."""
        hidden = x
        for stage in (1, 2, 3):
            linear = getattr(self, f"fc{stage}")
            activation = getattr(self, f"relu{stage}")
            dropout = getattr(self, f"dropout{stage}")
            hidden = dropout(activation(linear(hidden)))
        return self.sigmoid(self.fc4(hidden))

# Seed torch's global RNG so that weight initialisation, mini-batch shuffling and dropout
# are repeatable from run to run.
torch.manual_seed(42)

input_size = 3
hidden_size1 = 128
hidden_size2 = 64
hidden_size3 = 32
dropout_rate = 0.3

model = EnhancedMLP(input_size, hidden_size1, hidden_size2, hidden_size3, dropout_rate)

# The model already ends in a sigmoid, hence plain BCELoss on its output
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Early stopping: stop after `patience` consecutive epochs without a new best validation loss
patience = 10
best_val_loss = float('inf')
best_state = None  # copy of the weights that achieved best_val_loss
epochs_no_improve = 0

epochs = 100
train_losses = []
val_losses = []

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Mean of the per-batch losses
    train_loss /= len(train_loader)
    train_losses.append(train_loss)

    # ---- validation pass (dropout disabled, no gradients) ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            output = model(X_batch)
            loss = criterion(output, y_batch)
            val_loss += loss.item()

    val_loss /= len(val_loader)
    val_losses.append(val_loss)

    print(f"Epoch [{epoch + 1}/{epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # Snapshot the best weights; state_dict() returns live references, so clone them.
        best_state = {name: value.detach().clone() for name, value in model.state_dict().items()}
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping triggered after epoch {epoch + 1}")
            break

# Restore the weights of the best validation epoch. Without this the evaluated model is the
# one from the LAST epoch, i.e. after `patience` epochs that did not improve.
if best_state is not None:
    model.load_state_dict(best_state)

# Evaluation on the held-out 2020 test set
model.eval()
with torch.no_grad():
    y_pred_prob = model(X_test_tensor)
    y_pred = (y_pred_prob > 0.5).float()

# The metrics work on 1-D arrays; the tensors are (N, 1) columns, so flatten them once.
y_true_np = y_test_tensor.numpy().ravel()
y_pred_np = y_pred.numpy().ravel()
y_prob_np = y_pred_prob.numpy().ravel()

# Unified Evaluation
print("\nClassification Report:\n", classification_report(y_true_np, y_pred_np))
accuracy = accuracy_score(y_true_np, y_pred_np)
print("Accuracy:", accuracy)
cm = confusion_matrix(y_true_np, y_pred_np)
print("Confusion Matrix:\n", cm)
auc = roc_auc_score(y_true_np, y_prob_np)
print("AUC-ROC:", auc)

plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Passed', 'Passed'],
            yticklabels=['Not Passed', 'Passed'])
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

fpr, tpr, _ = roc_curve(y_true_np, y_prob_np)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'Enhanced MLP, AUC={auc:.3f}', color='darkblue')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

# Learning curves (one point per completed epoch)
plt.figure(figsize=(8, 6))
plt.plot(train_losses, label="Train Loss", color='blue')
plt.plot(val_losses, label="Val Loss", color='orange')
plt.title('Model Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Per-sample predictions for later inspection
results = pd.DataFrame({
    'Actual': y_true_np,
    'Predicted': y_pred_np,
    'Probability': y_prob_np
})
results.to_csv('enhanced_mlp_predictions.csv', index=False)
print("\nPredictions saved to 'enhanced_mlp_predictions.csv'")

In [None]:
# Section 5: Neural Network

# Copy of data for this model
X_train = np.copy(X_train_common)
y_train = np.copy(y_train_common)
X_test = np.copy(X_test_common)
y_test = np.copy(y_test_common)

# Standardize features.
# The scaler is fitted on the training set and the SAME mean/std are applied to the test set.
# The previous preprocessing.scale() calls standardised each set with its own statistics,
# so train and test features ended up on different scales.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# SMOTE (training data only; seeded so the synthetic samples are repeatable)
oversample = SMOTE(random_state=42)
X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)
print("\nResampled training data shape:", X_train_res.shape)

# To PyTorch tensors: float32 features, targets reshaped into a single float column
X_train_tensor = torch.tensor(X_train_res, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_res, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

# MLPModel
class MLPModel(nn.Module):
    """Small fully connected binary classifier: input_dim -> 10 -> 5 -> 1 with ReLU activations.

    Parameters
    ----------
    input_dim : int
        Number of input features.

    The forward pass returns one sigmoid-activated value per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # First hidden layer
        self.fc1 = nn.Linear(input_dim, 10)
        self.relu1 = nn.ReLU()
        # Second hidden layer
        self.fc2 = nn.Linear(10, 5)
        self.relu2 = nn.ReLU()
        # Output unit
        self.fc3 = nn.Linear(5, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output for a batch ``x`` whose last dimension is ``input_dim``."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        return self.sigmoid(self.fc3(hidden))

# Seed torch's global RNG so that weight initialisation and mini-batch shuffling are repeatable.
torch.manual_seed(42)

model = MLPModel(input_dim=3)

# The model already ends in a sigmoid, hence plain BCELoss on its output
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=5, shuffle=True)

# Fixed number of epochs; this section uses no validation split or early stopping
epochs = 100
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Mean of the per-batch losses
    train_loss /= len(train_loader)
    print(f"Epoch [{epoch + 1}/{epochs}], Train Loss: {train_loss:.4f}")

# Evaluation on the held-out test set: y_pred holds probabilities, y_pred_binary the 0/1 decisions
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    y_pred_binary = (y_pred > 0.5).float()

# The metrics work on 1-D arrays; the tensors are (N, 1) columns, so flatten them once.
y_true_np = y_test_tensor.numpy().ravel()
y_pred_np = y_pred_binary.numpy().ravel()
y_prob_np = y_pred.numpy().ravel()

# Unified Evaluation
print("\nClassification Report:\n", classification_report(y_true_np, y_pred_np))
accuracy = accuracy_score(y_true_np, y_pred_np)
print("Accuracy:", accuracy)
cm = confusion_matrix(y_true_np, y_pred_np)
print("Confusion Matrix:\n", cm)
auc = roc_auc_score(y_true_np, y_prob_np)
print("AUC-ROC:", auc)

# The larger font scale is applied through a context manager so it is limited to this figure.
# The previous sns.set(font_scale=1.4) changed the global seaborn/matplotlib theme and thereby
# the look of every figure drawn by later cells.
with sns.plotting_context("notebook", font_scale=1.4):
    plt.figure(figsize=(7, 5))
    sns.heatmap(cm, annot=True, annot_kws={"size": 16}, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix Heatmap')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

In [None]:
# Section 6: Random Forest

# Copy of data for this model
X_train = np.copy(X_train_common)
y_train = np.copy(y_train_common)
X_test = np.copy(X_test_common)
y_test = np.copy(y_test_common)

print("imblearn version:", imblearn.__version__)

print("Training data preview:\n", train_data.head())

# Visualize original class distribution (feature 0 = access on x, feature 1 = tests on y)
counter = Counter(y_train)
print("\nOriginal class distribution:", counter)
for label in counter:
    row_ix = np.where(y_train == label)[0]
    plt.scatter(X_train[row_ix, 0], X_train[row_ix, 1], label=f'Class {label}')
plt.legend()
plt.title('Original Data Distribution')
plt.xlabel('Access')
plt.ylabel('Tests')
plt.show()

# Normalize data: min/max are learned on the training set and reused for the test set
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# SMOTE (training data only; seeded so the synthetic samples, and therefore the fitted
# forest, are the same on every run)
oversample = SMOTE(random_state=42)
X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)

# Visualize resampled (axes are now in the min-max scaled units)
counter_res = Counter(y_train_res)
print("\nResampled class distribution:", counter_res)
for label in counter_res:
    row_ix = np.where(y_train_res == label)[0]
    plt.scatter(X_train_res[row_ix, 0], X_train_res[row_ix, 1], label=f'Class {label}')
plt.legend()
plt.title('Resampled Data Distribution (SMOTE)')
plt.xlabel('Access')
plt.ylabel('Tests')
plt.show()

# Train Random Forest
# n_jobs=-1 only parallelises tree building across CPU cores; with a fixed random_state the
# fitted forest is the same as in a single-threaded run.
forest = RandomForestClassifier(
    random_state=1,
    n_estimators=1000,
    max_features='sqrt',
    max_depth=50,
    bootstrap=False,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1
)
forest.fit(X_train_res, y_train_res)

def evaluate(model, X, y):
    """Print and return a simple accuracy figure for ``model`` on ``(X, y)``.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
        Fitted estimator.
    X : array-like
        Feature matrix passed unchanged to ``model.predict``.
    y : array-like
        True labels, same length as the predictions.

    Returns
    -------
    float
        ``100 - 100 * mean(|prediction - y|)``. For labels coded as 0/1 this equals the
        classification accuracy in percent; for other label codings it is NOT an accuracy.

    Raises
    ------
    ValueError
        If there are no samples or predictions and labels differ in length.
    """
    # Accept any array-like (lists, Series, ...) and compare element by element.
    predictions = np.ravel(np.asarray(model.predict(X), dtype=float))
    targets = np.ravel(np.asarray(y, dtype=float))
    if predictions.size == 0:
        raise ValueError("evaluate() needs at least one sample")
    if predictions.size != targets.size:
        raise ValueError(
            f"predictions and y differ in length: {predictions.size} != {targets.size}"
        )

    errors = np.abs(predictions - targets)
    # Mean absolute error. The previous code computed this same quantity twice (once by hand,
    # once via mean_absolute_error with swapped arguments) and called it 'mape', which it is not.
    mean_abs_error = np.mean(errors)
    accuracy = 100 - mean_abs_error * 100
    print("\nModel Performance:")
    print(f"Average Error: {mean_abs_error:.4f}")
    print(f"Accuracy: {accuracy:.2f}%")
    return accuracy

# Custom performance summary first, then the standard accuracy score on the same test set
base_accuracy = evaluate(forest, X_test, y_test)

# Hard class predictions, reused by the confusion matrix and report further down
y_pred_test = forest.predict(X_test)

rf_accuracy_score = accuracy_score(y_test, y_pred_test)
print(f"\nAccuracy Score: {rf_accuracy_score:.3f}")

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix. Rows are labelled 'True Label' and columns 'Predicted Label'.
    classes : sequence
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its sum before drawing, so both the colours
        and the printed numbers are per-true-class fractions.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.

    Returns
    -------
    None
        The plot is drawn with pyplot; the caller creates the figure and calls ``plt.show()``.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing. Previously this happened after imshow(), so the colours
    # showed raw counts while the text showed fractions.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples would divide by zero; keep such rows at 0 instead of NaN.
        cm = cm.astype(float) / np.where(row_totals == 0, 1, row_totals)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is, fractions with two decimals (instead of full float repr).
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Cells darker than half of the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format), horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

# Recall of class 1 is read from the second row of the matrix: TP / (FN + TP)
cnf_matrix = confusion_matrix(y_test, y_pred_test)
fn_count, tp_count = cnf_matrix[1, 0], cnf_matrix[1, 1]
recall = 100 * tp_count / (fn_count + tp_count)
print(f"\nRecall metric in the testing dataset: {recall:.2f}%")
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=[0, 1], title='Confusion Matrix')
plt.show()

rf_report = classification_report(y_test, y_pred_test)
print("\nClassification Report:\n", rf_report)

# ROC curve and AUC from the class-1 probabilities
y_pred_proba = forest.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)
print("AUC-ROC:", auc)
roc_ax = plt.gca()
roc_ax.plot(fpr, tpr, label=f"Random Forest, AUC={auc:.3f}")
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend(loc=4)
plt.show()

# Per-sample predictions for later inspection
results = pd.DataFrame(dict(
    Actual=y_test,
    Predicted=y_pred_test,
    Probability=y_pred_proba,
))
results.to_csv('rf_predictions.csv', index=False)
print("\nPredictions saved to 'rf_predictions.csv'")

In [None]:
# Section 7: Support Vector Machine

# Copy of data for this model
X_train = np.copy(X_train_common)
y_train = np.copy(y_train_common)
X_test = np.copy(X_test_common)
y_test = np.copy(y_test_common)

print("imblearn version:", imblearn.__version__)

print("Training data preview:\n", train_data.head())

# SMOTE (training data only; seeded so the synthetic samples are repeatable)
oversample = SMOTE(random_state=42)
X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)
print("\nResampled training data shape:", X_train_res.shape)

# Train SVM on the balanced, unscaled features.
# NOTE(review): scikit-learn documents `degree` as used by the 'poly' kernel only and `gamma`
# by 'rbf'/'poly'/'sigmoid', so both should have no effect with kernel='linear' — confirm
# before relying on them.
svclassifier = SVC(
    C=1.0,
    kernel='linear',
    degree=3,
    gamma='auto',
    probability=True
)
svclassifier.fit(X_train_res, y_train_res)

# Cross-val
# cross_val_score refits a clone of the estimator on every fold, so it is given the TRAINING
# data; on the test set (as before) it scored models trained on 2020 data, not the model
# fitted above. SMOTE sits inside the pipeline so each fold is oversampled from its own
# training part only. Probability calibration is switched off for the CV copies: the
# 'roc_auc' scorer can rank samples with the SVM decision function.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
svm_cv_params = {**svclassifier.get_params(), 'probability': False}
svm_cv_pipeline = ImbPipeline(steps=[('smote', SMOTE(random_state=42)), ('svm', SVC(**svm_cv_params))])
scores = cross_val_score(svm_cv_pipeline, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
print(f"\nMean ROC AUC: {np.mean(scores):.3f}")

# Predictions on the held-out 2020 test set
yhat = svclassifier.predict(X_test)
# Column 1 of predict_proba is used as the score of the positive class
yhat_prob = svclassifier.predict_proba(X_test)[:, 1]

# Unified Evaluation
print("\nClassification Report:\n", classification_report(y_test, yhat))
accuracy = accuracy_score(y_test, yhat)
print("Accuracy:", accuracy)
cm = confusion_matrix(y_test, yhat)
print("Confusion Matrix:\n", cm)
auc = roc_auc_score(y_test, yhat_prob)
print("AUC-ROC:", auc)

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current matplotlib figure.

    Parameters
    ----------
    cm : numpy.ndarray of shape (n_classes, n_classes)
        Confusion matrix. Rows are labelled 'True Label' and columns 'Predicted Label'.
    classes : sequence
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, each row is divided by its sum and values are printed with two decimals;
        otherwise the values are printed with the integer format ``'d'``.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.

    Nothing is returned; the caller creates the figure and calls ``plt.show()``.
    """
    # Row-normalise first so that colours and printed numbers show the same quantity
    matrix = cm.astype(float) / cm.sum(axis=1, keepdims=True) if normalize else cm
    value_format = '.2f' if normalize else 'd'

    plt.imshow(matrix, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=22)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45, fontsize=13)
    plt.yticks(positions, classes, fontsize=13)

    # Cells above half of the maximum value get white text so the number stays readable
    midpoint = matrix.max() / 2.
    for row in range(matrix.shape[0]):
        for col in range(matrix.shape[1]):
            value = matrix[row, col]
            text_color = 'white' if value > midpoint else 'black'
            plt.text(col, row, format(value, value_format),
                     horizontalalignment='center',
                     fontsize=15,
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True Label', fontsize=16)
    plt.xlabel('Predicted Label', fontsize=16)

# labels=[1, 0] puts class 1 in the first row/column, matching the tick labels passed below
label_order = [1, 0]
cnf_matrix = confusion_matrix(y_test, yhat, labels=label_order)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['graduate=1', 'graduate=0'],
    normalize=False,
    title='Confusion Matrix',
)
# Save before show(): the figure is written to disk while it is still the current figure
plt.savefig('confusion_matrix.png')
plt.show()

# ROC curve from the class-1 probabilities
fpr, tpr, _ = roc_curve(y_test, yhat_prob)
plt.figure()
roc_ax = plt.gca()
roc_ax.plot(fpr, tpr, label=f"Support Vector Machine, AUC={auc:.3f}")
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend(loc=4)
plt.show()

In [None]:
# Section 8: TabNet

# Private copies of the shared arrays for this model
X_train, y_train = np.copy(X_train_common), np.copy(y_train_common)
X_test, y_test = np.copy(X_test_common), np.copy(y_test_common)

# Standardization: statistics are learned on the training set and reused for the test set
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# SMOTE (training data only)
oversample = SMOTE(random_state=42)
X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)

# Split: 20% of the resampled training data becomes the validation set passed to fit()
X_train, X_val, y_train, y_val = train_test_split(
    X_train_res,
    y_train_res,
    test_size=0.2,
    random_state=42,
)

# TabNet model
tabnet_params = dict(
    n_d=32,
    n_a=32,
    n_steps=7,
    gamma=1.3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=0.002),
    verbose=1,
)
model = TabNetClassifier(**tabnet_params)

# Training options; the validation set is registered under the name 'val'
fit_options = dict(
    eval_set=[(X_val, y_val)],
    eval_name=['val'],
    eval_metric=['auc', 'accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=32,
    virtual_batch_size=32,
    num_workers=0,
    drop_last=False,
)
model.fit(X_train=X_train, y_train=y_train, **fit_options)

# Class-1 probabilities on the test set, thresholded at 0.5 for the hard predictions
y_pred_prob = model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_prob > 0.5).astype(int)

# Unified Evaluation
print("\nClassification Report:\n", classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
auc = roc_auc_score(y_test, y_pred_prob)
print("AUC-ROC:", auc)

# Confusion matrix heat map
outcome_labels = ['Not Passed', 'Passed']
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=outcome_labels, yticklabels=outcome_labels)
cm_ax = plt.gca()
cm_ax.set_title("Confusion Matrix")
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("True")
plt.show()

# ROC curve with the diagonal as reference line
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
plt.figure(figsize=(8, 6))
roc_ax = plt.gca()
roc_ax.plot(fpr, tpr, label=f'Enhanced TabNet, AUC={auc:.3f}', color='darkblue')
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend(loc='lower right')
plt.show()

# Per-sample predictions for later inspection
results = pd.DataFrame(dict(
    Actual=y_test,
    Predicted=y_pred,
    Probability=y_pred_prob,
))
results.to_csv('enhanced_tabnet_predictions.csv', index=False)
print("\nPredictions saved to 'enhanced_tabnet_predictions.csv'")

In [None]:
# Section 9: Attention-Augmented-Autoencoder

# Private copies of the shared arrays for this model
X_train, y_train = np.copy(X_train_common), np.copy(y_train_common)
X_test, y_test = np.copy(X_test_common), np.copy(y_test_common)

# Standardization: statistics are learned on the training set and reused for the test set
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# SMOTE (training data only)
oversample = SMOTE(random_state=42)
X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)

# Tensors: float32 features, targets reshaped into a single float column
X_train_tensor = torch.tensor(X_train_res, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_res, dtype=torch.float32).reshape(-1, 1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)

# Split: 20% of the resampled training tensors are held out as validation data.
# From here on X_train / y_train are tensors, no longer numpy arrays.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_tensor,
    y_train_tensor,
    test_size=0.2,
    random_state=42,
)

# DataLoaders for autoencoder (input = output): train/val datasets pair every sample
# with itself, while the test dataset keeps the real labels.
train_dataset, val_dataset, test_dataset = (
    TensorDataset(X_train, X_train),
    TensorDataset(X_val, X_val),
    TensorDataset(X_test_tensor, y_test_tensor),
)

ae_batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=ae_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=ae_batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=ae_batch_size, shuffle=False)

# AttentionLayer
class AttentionLayer(nn.Module):
    """Soft attention that re-weights the input with learned, softmax-normalised scores.

    Parameters
    ----------
    feature_size : int
        Size of the last dimension of the input.

    Forward
    -------
    * 2-D input ``(batch, feature_size)``: every feature of a sample gets its own score
      ``softmax_f(x_f * w_f)``; the scores of one sample sum to 1 and the output is
      ``x * scores`` with the same shape as ``x``.
    * Input with more than two dimensions: unchanged from the original implementation
      (one score per position from ``x @ w``, softmax over dimension 1).
    """

    def __init__(self, feature_size):
        super(AttentionLayer, self).__init__()
        # One learnable weight per feature. The (feature_size, 1) shape is kept so that
        # state dicts saved with the previous implementation still load.
        self.attention_weights = nn.Parameter(torch.randn(feature_size, 1))

    def forward(self, x):
        if x.dim() > 2:
            # Original behaviour: scores have shape (batch, positions, 1), softmax over positions.
            attention_scores = torch.matmul(x, self.attention_weights)
            attention_scores = torch.softmax(attention_scores, dim=1)
            return x * attention_scores

        # 2-D input. The original matmul collapsed the feature axis into a single column, so
        # the softmax over dim=1 ran over ONE element and always returned 1: the layer was an
        # identity and attention_weights never received a gradient. Score each feature instead.
        attention_scores = x * self.attention_weights.squeeze(-1)
        attention_scores = torch.softmax(attention_scores, dim=-1)
        weighted_x = x * attention_scores
        return weighted_x

# AttentionAutoencoder
class AttentionAutoencoder(nn.Module):
    """Autoencoder whose encoder contains an AttentionLayer between its two linear layers.

    Parameters
    ----------
    input_size : int
        Number of input features (also the size of the reconstruction).
    encoding_size : int
        Size of the encoded representation produced by ``self.encoder``.

    ``forward`` returns the reconstruction; the encoding alone is available by calling
    ``self.encoder`` directly.
    """

    def __init__(self, input_size, encoding_size):
        super(AttentionAutoencoder, self).__init__()
        # input -> 64 -> attention re-weighting -> encoding (ReLU, so encodings are >= 0)
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            AttentionLayer(64),
            nn.Linear(64, encoding_size),
            nn.ReLU()
        )
        # encoding -> 64 -> reconstruction.
        # The output layer is linear: the autoencoder in this notebook is trained to reproduce
        # StandardScaler-transformed features, which take negative values that the previous
        # trailing ReLU could never output. The removed ReLU had no parameters, so the
        # remaining layers keep their indices (decoder.0, decoder.2) in the state dict.
        self.decoder = nn.Sequential(
            nn.Linear(encoding_size, 64),
            nn.ReLU(),
            nn.Linear(64, input_size)
        )

    def forward(self, x):
        """Encode ``x`` and return its reconstruction (same shape as ``x``)."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Classifier
class Classifier(nn.Module):
    """Binary classification head: one linear unit followed by a sigmoid.

    Args:
        encoding_size: number of features in each input row.

    ``forward`` maps ``(N, encoding_size)`` inputs to ``(N, 1)`` values in
    the open interval (0, 1).
    """

    def __init__(self, encoding_size):
        super().__init__()
        self.fc = nn.Linear(encoding_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of the linear projection of ``x``."""
        return self.sigmoid(self.fc(x))

# Seed torch so weight initialisation and DataLoader shuffling are repeatable
# on a fresh "Restart & Run All" (42 matches the random_state used for the
# train/validation split).
torch.manual_seed(42)

# Take the feature count from the data instead of hard-coding it, so the
# model keeps working if the set of input columns changes.
input_size = X_train.shape[1]
encoding_size = 32

attention_autoencoder = AttentionAutoencoder(input_size, encoding_size)
classifier = Classifier(encoding_size)

# Stage 1: unsupervised pre-training. The autoencoder learns to reconstruct
# its own input; the labels are not used here.
ae_criterion = nn.SmoothL1Loss()
ae_optimizer = optim.Adam(attention_autoencoder.parameters(), lr=0.001)

epochs_ae = 50
for epoch in range(epochs_ae):
    attention_autoencoder.train()
    train_loss = 0
    # train_loader yields (input, target) pairs where target is the input
    # itself, so the second element is ignored.
    for X_batch, _ in train_loader:
        ae_optimizer.zero_grad()
        output = attention_autoencoder(X_batch)
        loss = ae_criterion(output, X_batch)
        loss.backward()
        ae_optimizer.step()
        train_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    train_loss /= len(train_loader)
    print(f"Attention Autoencoder Epoch [{epoch + 1}/{epochs_ae}], Train Loss: {train_loss:.4f}")

# Stage 2 input: freeze the autoencoder and replace every split by its
# latent representation.
attention_autoencoder.eval()


def _encode_features(features):
    """Return the encoder output for `features` as a new float32 tensor."""
    with torch.no_grad():
        latent = attention_autoencoder.encoder(features).cpu().numpy()
    return torch.tensor(latent, dtype=torch.float32)


X_train_encoded, X_val_encoded, X_test_encoded = (
    _encode_features(split) for split in (X_train, X_val, X_test_tensor)
)

# Pair the latent features with their labels; only the training loader
# shuffles.
train_dataset_class = TensorDataset(X_train_encoded, y_train)
train_loader_class = DataLoader(train_dataset_class, batch_size=32, shuffle=True)

val_dataset_class = TensorDataset(X_val_encoded, y_val)
val_loader_class = DataLoader(val_dataset_class, batch_size=32, shuffle=False)

test_dataset_class = TensorDataset(X_test_encoded, y_test_tensor)
test_loader_class = DataLoader(test_dataset_class, batch_size=32, shuffle=False)

# Stage 2: train the classification head on the encoded features.
clf_criterion = nn.BCELoss()
clf_optimizer = optim.Adam(classifier.parameters(), lr=0.001)

# Early stopping: stop after `patience` consecutive epochs without a new best
# validation loss, then roll back to the weights of the best epoch.
patience = 5
best_val_loss = float('inf')
epochs_no_improve = 0
best_classifier_state = None

epochs_clf = 50
for epoch in range(epochs_clf):
    classifier.train()
    train_loss = 0
    for X_batch, y_batch in train_loader_class:
        clf_optimizer.zero_grad()
        output = classifier(X_batch)
        loss = clf_criterion(output, y_batch)
        loss.backward()
        clf_optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_loader_class)

    classifier.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader_class:
            output = classifier(X_batch)
            loss = clf_criterion(output, y_batch)
            val_loss += loss.item()

    val_loss /= len(val_loader_class)
    print(f"Classifier Epoch [{epoch + 1}/{epochs_clf}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # state_dict() tensors share storage with the live parameters, so
        # they must be cloned to survive further optimizer steps.
        best_classifier_state = {
            name: tensor.detach().clone()
            for name, tensor in classifier.state_dict().items()
        }
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping triggered after epoch {epoch + 1}")
            break

# Without this the classifier would be evaluated with the weights of the last
# epoch, i.e. up to `patience` epochs past its best validation loss.
if best_classifier_state is not None:
    classifier.load_state_dict(best_classifier_state)

# Score the test split with the trained classifier; probabilities above 0.5
# are labelled as the positive class.
classifier.eval()
with torch.no_grad():
    y_pred_prob = classifier(X_test_encoded)
y_pred = (y_pred_prob > 0.5).float()

# Convert once so every metric below works on the same arrays.
y_true_np = y_test_tensor.numpy()
y_hat_np = y_pred.numpy()
y_score_np = y_pred_prob.numpy()

# Unified Evaluation
report = classification_report(y_true_np, y_hat_np)
print("\nClassification Report:\n", report)
accuracy = accuracy_score(y_true_np, y_hat_np)
print("Accuracy:", accuracy)
cm = confusion_matrix(y_true_np, y_hat_np)
print("Confusion Matrix:\n", cm)
auc = roc_auc_score(y_true_np, y_score_np)
print("AUC-ROC:", auc)

# Confusion-matrix heatmap
class_names = ['Not Passed', 'Passed']
fig_cm, ax_cm = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names,
            yticklabels=class_names, ax=ax_cm)
ax_cm.set_title("Confusion Matrix")
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("True")
plt.show()

# ROC curve with the chance diagonal for reference
fpr, tpr, _ = roc_curve(y_true_np, y_score_np)
fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
ax_roc.plot(fpr, tpr, label=f'Attention Autoencoder, AUC={auc:.3f}', color='darkblue')
ax_roc.plot([0, 1], [0, 1], 'k--')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend(loc='lower right')
plt.show()

# Persist per-sample predictions next to the notebook.
results = pd.DataFrame({
    'Actual': y_true_np.flatten(),
    'Predicted': y_hat_np.flatten(),
    'Probability': y_score_np.flatten()
})
results.to_csv('attention_autoencoder_predictions.csv', index=False)
print("\nPredictions saved to 'attention_autoencoder_predictions.csv'")

**版权声明（Copyright Notice）**

本项目中所有源代码及相关文件均由**姚宇倬**独立开发与完成，享有全部知识产权及最终解释权。

本代码及相关资源仅限于科研与学术研究用途。
严禁任何形式的商业使用（包括但不限于：产品化、盈利性服务、商业推广、技术转让等）。

任何未经授权的复制、修改、传播、再发布、出售或商业化利用，均构成对本人知识产权的严重侵犯。
本人将依法追究侵权者的法律责任，直至刑事责任。

特此声明。

作者：姚宇倬——华北理工大学
研究方向：机器人工程、计算机视觉与深度学习
电子邮箱：yaoyuzhuo6@gmail.com

---

**Copyright Notice**

All source code and related files in this project are independently developed and completed by **Yao Yuzhuo**, who retains full intellectual property rights and final interpretation rights.

The code and related resources are provided strictly for academic and research purposes only.
Any form of commercial use is strictly prohibited, including but not limited to productization, profit-making services, commercial promotion, or technology transfer.

Any unauthorized reproduction, modification, distribution, republication, sale, or commercialization constitutes a serious infringement of the author's intellectual property rights.
The author reserves the right to pursue legal liability, including civil and criminal responsibility, against any infringer.

Author: Yao Yuzhuo — North China University of Science and Technology
Research Focus: Robotics Engineering, Computer Vision, Deep Learning
Email: yaoyuzhuo6@gmail.com

---

二零二五年-九月七日-白露
2025-9-7