In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from google.colab import files
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import io

UPLOAD FILES

In [None]:
# Step 1: upload the feature workbook through the Colab file picker and load
# it into `X`, the feature DataFrame used by the later cells.
print("--- Step 1: Upload FEATURES Data ---")
print("Please upload your final feature file (e.g., dummy_features_noise.xlsx).")
uploaded = files.upload()

# `uploaded` is indexed by file name and holds the raw file bytes. If nothing
# was uploaded (e.g. the picker was dismissed) the mapping is empty, which used
# to surface as an opaque "list index out of range" error.
if not uploaded:
    raise ValueError(
        "No feature file was uploaded. Re-run this cell and select an .xlsx file."
    )

# Only the first uploaded file is used; any additional files are ignored.
X_file_name = next(iter(uploaded))

# The upload is an in-memory XLSX workbook, so wrap the bytes in a file-like
# object and parse it with read_excel (openpyxl engine).
X_final_features = pd.read_excel(io.BytesIO(uploaded[X_file_name]), engine='openpyxl')

# Replace whatever index the sheet produced with a fresh 0..n-1 RangeIndex.
X = X_final_features.reset_index(drop=True)
print(f"Features loaded successfully from: {X_file_name}")
print(f"X (Features) shape: {X.shape}")

--- Step 1: Upload FEATURES Data ---
Please upload your final feature file (e.g., dummy_features_noise.xlsx).


Saving dummy_features_noise.xlsx to dummy_features_noise (1).xlsx
Features loaded successfully from: dummy_features_noise (1).xlsx
X (Features) shape: (1000, 15)


In [None]:
# Step 2: upload the target workbook through the Colab file picker and load
# it into `Y`, the target DataFrame used by the later cells.
print("\n--- Step 2: Upload TARGETS Data ---")
print("Please upload your original target file (e.g., dummy_target.xlsx).")

uploaded = files.upload()

# `uploaded` is indexed by file name and holds the raw file bytes. If nothing
# was uploaded (e.g. the picker was dismissed) the mapping is empty, which used
# to surface as an opaque "list index out of range" error.
if not uploaded:
    raise ValueError(
        "No target file was uploaded. Re-run this cell and select an .xlsx file."
    )

# Only the first uploaded file is used; any additional files are ignored.
Y_file_name = next(iter(uploaded))

# Parse the in-memory XLSX workbook with read_excel (openpyxl engine).
Y_targets = pd.read_excel(io.BytesIO(uploaded[Y_file_name]), engine='openpyxl')

# Replace whatever index the sheet produced with a fresh 0..n-1 RangeIndex.
# NOTE(review): rows of Y are presumably aligned positionally with the rows of
# the feature file loaded in the previous cell -- confirm against the data.
Y = Y_targets.reset_index(drop=True)

print(f"Targets loaded successfully from: {Y_file_name}")
print(f"Y (Targets) shape: {Y.shape}")


--- Step 2: Upload TARGETS Data ---
Please upload your original target file (e.g., target.xlsx).


Saving dummy_target_noise.xlsx to dummy_target_noise.xlsx
Targets loaded successfully from: dummy_target_noise.xlsx
Y (Targets) shape: (1000, 30)


In [None]:
# --- 1. DATA SPLITTING (85% Train, 15% Test) ---

SEED = 42

# Seed Python, NumPy and TensorFlow in one call so weight initialisation,
# dropout masks and batch shuffling are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(SEED)

# Single hold-out split: 85% of the rows train the model, 15% are kept for the
# final evaluation. No separate validation split is carved out.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.15, random_state=SEED, shuffle=True
)

# NOTE: X_val / Y_val are plain aliases of the TEST split (not of the training
# data). They are not passed to model.fit() below and exist only so that code
# referring to these names keeps working. Do not use them for model selection.
X_val = X_test
Y_val = Y_test

print("Data Split Complete (Simplified):")
print(f"Training Samples (85%): {X_train.shape[0]}")
print(f"Testing Samples (15%): {X_test.shape[0]}")


# --- 2. SIMPLIFIED MODEL ARCHITECTURE ---

N_FEATURES = X_train.shape[1]  # number of feature columns (taken from the data)
N_TARGETS = Y_train.shape[1]   # number of target columns (one sigmoid unit each)
LEARNING_RATE = 0.001
EPOCHS = 100
BATCH_SIZE = 4  # small mini-batches

# Input layer; the layer name is derived from the data so it cannot go stale.
input_layer = Input(shape=(N_FEATURES,), name=f'Input_{N_FEATURES}_Features')

# Single hidden layer of 16 ReLU units followed by 20% dropout.
h1 = Dense(units=16, activation='relu', name='Hidden_16')(input_layer)
h1 = Dropout(0.2)(h1)

# Output layer: one independent sigmoid per target column (multi-label setup).
# NOTE(review): assumes every target column holds 0/1 labels -- confirm.
output_layer = Dense(
    units=N_TARGETS,
    activation='sigmoid',
    name=f'Output_{N_TARGETS}_Targets'
)(h1)

model = Model(inputs=input_layer, outputs=output_layer)


# --- 3. COMPILE AND TRAIN ---
# BinaryAccuracy is requested explicitly. The bare string 'accuracy' is resolved
# by Keras from the output shape and, for a multi-unit output, can end up as
# categorical (argmax) accuracy, which is meaningless for independent sigmoid
# labels. The metric keeps the name 'accuracy' so history/log keys are unchanged.
model.compile(
    optimizer=Adam(learning_rate=LEARNING_RATE),
    loss='binary_crossentropy',
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.5),
        tf.keras.metrics.AUC(name='auc'),
    ]
)

print("\nModel Architecture Summary (Simplified):")
model.summary()

# Early stopping watches the TRAINING loss because no validation data is given
# to fit(). It therefore stops on a training plateau, not on overfitting.
early_stop = EarlyStopping(
    monitor='loss',
    patience=10,
    restore_best_weights=True
)

print("\n--- Starting DNN Training ---")
history = model.fit(
    X_train, Y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop],
    verbose=1
)

# --- 4. Evaluate the Model on the Test Set ---
# evaluate() returns [loss, *metrics] in the order given to compile().
loss, accuracy, auc = model.evaluate(X_test, Y_test, verbose=0)
print(f"\nModel Evaluation (Final Test Set):")
print(f"  Loss (Binary Crossentropy): {loss:.4f}")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  AUC (Area Under Curve): {auc:.4f}")

Data Split Complete (Simplified):
Training Samples (85%): 850
Testing Samples (15%): 150

Model Architecture Summary (Simplified):



--- Starting DNN Training ---
Epoch 1/100
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.0527 - auc: 0.6297 - loss: 0.6265
Epoch 2/100
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.1699 - auc: 0.8795 - loss: 0.3338
Epoch 3/100
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.1699 - auc: 0.9072 - loss: 0.2968
Epoch 4/100
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1688 - auc: 0.9283 - loss: 0.2653
Epoch 5/100
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1819 - auc: 0.9390 - loss: 0.2474
Epoch 6/100
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1681 - auc: 0.9491 - loss: 0.2272
Epoch 7/100
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1439 - auc: 0.9513 - loss: 0.2233
Epoch 8/100
[1m213/213

In [None]:
# --- 1. Generate Predictions ---
# Per-label probabilities (sigmoid outputs) for every row of the held-out test set.
probabilities = model.predict(X_test, verbose=0)
print("Predicted Probabilities (Test Set):\n")
print(probabilities)

# --- 2. Convert Probabilities to Binary Predictions ---
# A label is recommended (1) when its probability is strictly above the
# threshold, otherwise it is 0. The cut-off used here is 0.49.
RECOMMENDATION_THRESHOLD = 0.49
predictions = (probabilities > RECOMMENDATION_THRESHOLD).astype(int)

# --- 3. Display Results (First Test Sample) ---
print("\n-----------------------------------------------------")
print(f"Analysis of First Test Sample (Index {X_test.index[0]}):")
print("-----------------------------------------------------")

# Column names of the targets, used to label each prediction.
target_columns = Y_test.columns.tolist()

# One row per target: its name, the true label and the predicted label for the
# first test sample. Predicted positives are listed first, then true positives.
results = pd.DataFrame({
    'Resource_Name': target_columns,
    'True_Value': Y_test.iloc[0].values,
    'Predicted_Value': predictions[0]
}).sort_values(by=['Predicted_Value', 'True_Value'], ascending=False).reset_index(drop=True)

# Actually show the table: previously it was built but never displayed, so the
# header above was followed by no output.
print(results.to_string(index=False))



Predicted Probabilities (Test Set):

[[6.4624670e-14 2.9177032e-03 3.9230844e-08 ... 8.4339816e-04
  2.0727924e-09 9.7618706e-04]
 [5.5137938e-14 2.5725323e-03 3.5362376e-08 ... 8.7013590e-04
  2.9179683e-09 1.0123588e-03]
 [7.4113323e-14 3.2078517e-03 4.3474635e-08 ... 8.5346715e-04
  2.0334920e-09 1.0240596e-03]
 ...
 [3.4415604e-09 9.8664445e-01 1.2899892e-04 ... 2.2728713e-03
  1.1702628e-11 5.2520086e-04]
 [1.0407197e-09 7.6428837e-01 6.9897735e-01 ... 1.0148716e-02
  1.7620852e-06 1.3763702e-02]
 [4.6953681e-09 9.8680860e-01 1.3860787e-04 ... 2.3843891e-03
  1.0063219e-11 5.4603937e-04]]

-----------------------------------------------------
Analysis of First Test Sample (Index 521):
-----------------------------------------------------


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

# --- 1. Get Probabilities ---
# Use the trained model to get per-label probabilities for the test set.
probabilities = model.predict(X_test, verbose=0)
# Flatten the (samples x targets) label matrix so every (sample, label) pair is
# scored as one binary decision, i.e. the metrics below are micro-averaged.
Y_true_flat = Y_test.values.flatten()

# NOTE(review): thresholds are selected on the same test split that is used for
# the final report, so the "best" scores are optimistically biased. Prefer
# tuning on a separate validation split.

# --- 2. Iterate and Evaluate Thresholds ---
# Sweep 0.01 .. 0.99 in steps of 0.01. The earlier 0.01 .. 0.50 sweep was still
# improving at its upper bound, so the best threshold may lie above 0.50.
thresholds = np.linspace(0.01, 0.99, 99)
best_results = {'accuracy': 0, 'f1_score': 0, 'threshold_acc': 0, 'threshold_f1': 0}

all_results = []

for t in thresholds:
    # Binarise with the current threshold 't' (strictly greater than).
    Y_pred_flat = (probabilities > t).astype(int).flatten()

    # zero_division=0 keeps F1 at 0 (without warnings) when a threshold yields
    # no positive predictions at all.
    acc = accuracy_score(Y_true_flat, Y_pred_flat)
    f1 = f1_score(Y_true_flat, Y_pred_flat, zero_division=0)

    all_results.append({'Threshold': f'{t:.3f}', 'Accuracy': f'{acc:.4f}', 'F1_Score': f'{f1:.4f}'})

    # Track the best scores. The strict '>' means ties keep the LOWEST threshold.
    if acc > best_results['accuracy']:
        best_results['accuracy'] = acc
        best_results['threshold_acc'] = t

    if f1 > best_results['f1_score']:
        best_results['f1_score'] = f1
        best_results['threshold_f1'] = t

# --- 3. Display Results ---
print("\n--- Threshold Optimization Results ---")

# Convert results to a DataFrame for clean printing.
results_df = pd.DataFrame(all_results)
# Only show the first 10 rows and the last 10 rows for brevity.
print("Sample of Threshold vs. Metrics:")
print(pd.concat([results_df.head(10), results_df.tail(10)]).to_markdown(index=False))

print("\n--- Best Thresholds Found ---")
print(f"To Maximize Accuracy: Threshold = {best_results['threshold_acc']:.3f} (Accuracy = {best_results['accuracy']:.4f})")
print(f"To Maximize F1-Score: Threshold = {best_results['threshold_f1']:.3f} (F1-Score = {best_results['f1_score']:.4f})")


--- Threshold Optimization Results ---
Sample of Threshold vs. Metrics:
|   Threshold |   Accuracy |   F1_Score |
|------------:|-----------:|-----------:|
|        0.01 |     0.8158 |     0.6815 |
|        0.02 |     0.8847 |     0.7729 |
|        0.03 |     0.9138 |     0.8197 |
|        0.04 |     0.9216 |     0.8333 |
|        0.05 |     0.9304 |     0.8493 |
|        0.06 |     0.9367 |     0.8609 |
|        0.07 |     0.9391 |     0.8654 |
|        0.08 |     0.9451 |     0.8771 |
|        0.09 |     0.948  |     0.8826 |
|        0.1  |     0.95   |     0.8866 |
|        0.41 |     0.9873 |     0.9686 |
|        0.42 |     0.9878 |     0.9697 |
|        0.43 |     0.9878 |     0.9697 |
|        0.44 |     0.9878 |     0.9697 |
|        0.45 |     0.9882 |     0.9708 |
|        0.46 |     0.9893 |     0.9735 |
|        0.47 |     0.9898 |     0.9745 |
|        0.48 |     0.99   |     0.9751 |
|        0.49 |     0.9902 |     0.9756 |
|        0.5  |     0.9902 |     0.9756 |

--

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Take the F1-optimal threshold from the sweep cell (`best_results`) instead of
# a hand-copied constant, so the value printed as "Optimal Threshold" cannot
# drift from what the sweep actually found.
FINAL_RECOMMENDATION_THRESHOLD = float(best_results['threshold_f1'])

# 1. Get per-label probabilities from the trained model.
probabilities = model.predict(X_test, verbose=0)

# 2. Flatten true labels and predictions so every (sample, label) pair counts
#    as one binary decision (micro-averaged metrics).
Y_true_flat = Y_test.values.flatten()
# A label is predicted positive when its probability is strictly above the threshold.
Y_pred_flat = (probabilities > FINAL_RECOMMENDATION_THRESHOLD).astype(int).flatten()

# 3. Calculate final metrics.
# NOTE(review): the threshold was chosen on this same test split, so these
# numbers are optimistic estimates.
final_accuracy = accuracy_score(Y_true_flat, Y_pred_flat)
final_f1_score = f1_score(Y_true_flat, Y_pred_flat)

print("--- Final Optimized Model Metrics ---")
print(f"Optimal Threshold: {FINAL_RECOMMENDATION_THRESHOLD:.3f}")
print(f"Test Accuracy: {final_accuracy:.4f} ({final_accuracy * 100:.2f}%)")
print(f"Test F1-Score: {final_f1_score:.4f}")

--- Final Optimized Model Metrics ---
Optimal Threshold: 0.500
Test Accuracy: 0.9902 (99.02%)
Test F1-Score: 0.9756
