In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from google.colab import files
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import io

UPLOAD FILES

In [None]:
# 1. LOAD FEATURES DATA (X)
# --------------------------------------------------------
# Prompts for the feature file through the Colab upload widget and parses it
# into `X`, re-indexed from 0.
print("--- Step 1: Upload FEATURES Data ---")
print("Please upload your final feature file ('X_final_optimized.csv').")
uploaded = files.upload()

# An empty result (e.g. the dialog was dismissed) would otherwise surface as an
# opaque IndexError on the next line.
if not uploaded:
    raise ValueError("No feature file was uploaded; re-run this cell and select a file.")

# Only the first uploaded file is used.
X_file_name = list(uploaded.keys())[0]
X_file_bytes = io.BytesIO(uploaded[X_file_name])

# Choose the parser from the extension: the feature table may arrive as an
# Excel workbook rather than a CSV, and read_csv cannot parse a workbook.
if X_file_name.lower().endswith(('.xlsx', '.xlsm')):
    X_final_features = pd.read_excel(X_file_bytes, engine='openpyxl')
else:
    X_final_features = pd.read_csv(X_file_bytes)

X = X_final_features.reset_index(drop=True)
print(f"Features loaded successfully from: {X_file_name}")
print(f"X (Features) shape: {X.shape}")

--- Step 1: Upload FEATURES Data ---
Please upload your final feature file ('X_final_optimized.csv').


Saving X_final_optimized.xlsx to X_final_optimized (4).xlsx
Features loaded successfully from: X_final_optimized (4).xlsx
X (Features) shape: (31, 15)


In [None]:
# 2. LOAD TARGETS DATA (Y)
# --------------------------------------------------------
# Prompts for the target file through the Colab upload widget and parses it
# into `Y`, re-indexed from 0.
print("\n--- Step 2: Upload TARGETS Data ---")
print("Please upload your original target file (e.g., target.xlsx).")

uploaded = files.upload()

# An empty result (e.g. the dialog was dismissed) would otherwise surface as an
# opaque IndexError on the filename lookup below.
if not uploaded:
    raise ValueError("No target file was uploaded; re-run this cell and select a file.")

# Only the first uploaded file is used.
Y_file_name = list(uploaded.keys())[0]
Y_file_bytes = io.BytesIO(uploaded[Y_file_name])

# Excel (via openpyxl) remains the default; a .csv upload is parsed as CSV so
# this loader accepts the same formats as the features loader.
if Y_file_name.lower().endswith('.csv'):
    Y_targets = pd.read_csv(Y_file_bytes)
else:
    Y_targets = pd.read_excel(Y_file_bytes, engine='openpyxl')

Y = Y_targets.reset_index(drop=True)

print(f"Targets loaded successfully from: {Y_file_name}")
print(f"Y (Targets) shape: {Y.shape}")


--- Step 2: Upload TARGETS Data ---
Please upload your original target file (e.g., target.xlsx).


Saving target.xlsx to target (5).xlsx
Targets loaded successfully from: target (5).xlsx
Y (Targets) shape: (31, 20)


DATA SPLITTING, MODEL DEFINITION, AND TRAINING

In [None]:
import numpy as np
import tensorflow as tf
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# --- SEEDING FOR REPRODUCIBILITY ---
SEED = 42
# NOTE: PYTHONHASHSEED is read when the interpreter starts, so assigning it here
# does not change hashing in the already-running kernel; it is kept for any
# subprocesses started afterwards.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
# -------------------------------------------

# --- 1. SIMPLIFIED DATA SPLITTING (85% Train, 15% Test) ---

# Requires `X` and `Y` from the upload cells to already exist in the kernel.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.15, random_state=SEED, shuffle=True
)

# NOTE(review): these are aliases of the test split, not an independent
# validation set, and they are not passed to model.fit() below. They are kept
# only so any later cell that refers to X_val / Y_val keeps working.
X_val = X_test
Y_val = Y_test

print("Data Split Complete (Simplified):")
print(f"Training Samples (85%): {X_train.shape[0]}")
print(f"Testing Samples (15%): {X_test.shape[0]}")


# --- 2. SIMPLIFIED MODEL ARCHITECTURE ---

N_FEATURES = X_train.shape[1]
N_TARGETS = Y_train.shape[1]
LEARNING_RATE = 0.001
EPOCHS = 100
BATCH_SIZE = 4

# Input layer: one unit per feature column.
input_layer = Input(shape=(N_FEATURES,), name='Input_15_Features')

# Single hidden layer followed by dropout.
h1 = Dense(units=16, activation='relu', name='Hidden_16')(input_layer)
h1 = Dropout(0.2)(h1)

# Output layer: one independent sigmoid per target column (multi-label).
output_layer = Dense(
    units=N_TARGETS,
    activation='sigmoid',
    name='Output_20_Targets'
)(h1)

model = Model(inputs=input_layer, outputs=output_layer)


# --- 3. COMPILE AND TRAIN ---
# The accuracy metric is given explicitly as BinaryAccuracy. With a multi-unit
# sigmoid output the bare string 'accuracy' can be resolved by Keras to
# categorical (argmax) accuracy, which is meaningless for multi-label targets;
# the previously recorded log (accuracy dropping towards 0 while AUC improved)
# is consistent with that. The metric keeps the name 'accuracy' so history keys
# and the evaluate() return order are unchanged.
model.compile(
    optimizer=Adam(learning_rate=LEARNING_RATE),
    loss='binary_crossentropy',
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.AUC(name='auc'),
    ]
)

print("\nModel Architecture Summary (Simplified):")
model.summary()

# Early stopping watches the *training* loss, because fit() receives no
# validation data; the best-loss weights are restored when training stops.
early_stop = EarlyStopping(
    monitor='loss',
    patience=10,
    restore_best_weights=True
)

print("\n--- Starting DNN Training ---")
history = model.fit(
    X_train, Y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop],
    verbose=1
)

# --- 4. Evaluate the Model on the Test Set ---
# evaluate() returns the loss followed by the compiled metrics in order.
loss, accuracy, auc = model.evaluate(X_test, Y_test, verbose=0)
print(f"\nModel Evaluation (Final Test Set):")
print(f"  Loss (Binary Crossentropy): {loss:.4f}")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  AUC (Area Under Curve): {auc:.4f}")

Data Split Complete (Simplified):
Training Samples (85%): 26
Testing Samples (15%): 5

Model Architecture Summary (Simplified):



--- Starting DNN Training ---
Epoch 1/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step - accuracy: 0.1403 - auc: 0.5003 - loss: 0.6884
Epoch 2/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.1655 - auc: 0.5668 - loss: 0.6766
Epoch 3/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.1397 - auc: 0.5887 - loss: 0.6702     
Epoch 4/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.1856 - auc: 0.6483 - loss: 0.6600
Epoch 5/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.0690 - auc: 0.7349 - loss: 0.6383    
Epoch 6/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0507 - auc: 0.6971 - loss: 0.6428    
Epoch 7/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.0244 - auc: 0.7216 - loss: 0.6298    
Epoch 8/100
[1m7/7[0m [32

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Decision threshold applied to the sigmoid outputs in this cell.
# NOTE(review): this value appears to come from the threshold sweep cell, which
# scores candidate thresholds on the same test split - confirm before treating
# the numbers below as an unbiased estimate.
FINAL_RECOMMENDATION_THRESHOLD = 0.470

# Score the held-out samples with the trained model.
probabilities = model.predict(X_test, verbose=0)

# Collapse the (sample, target) grids into 1-D vectors so that every
# sample/target pair is scored as one binary decision.
Y_true_flat = Y_test.to_numpy().flatten()
above_threshold = probabilities > FINAL_RECOMMENDATION_THRESHOLD
Y_pred_flat = above_threshold.astype(int).flatten()

# Accuracy and F1 over the flattened decisions.
final_accuracy = accuracy_score(Y_true_flat, Y_pred_flat)
final_f1_score = f1_score(Y_true_flat, Y_pred_flat)

report_lines = [
    "--- Final Optimized Model Metrics ---",
    f"Optimal Threshold: {FINAL_RECOMMENDATION_THRESHOLD:.3f}",
    f"Test Accuracy: {final_accuracy:.4f} ({final_accuracy * 100:.2f}%)",
    f"Test F1-Score: {final_f1_score:.4f}",
]
for report_line in report_lines:
    print(report_line)

--- Final Optimized Model Metrics ---
Optimal Threshold: 0.470
Test Accuracy: 0.8400 (84.00%)
Test F1-Score: 0.6923


In [None]:
# --- 1. Score the test split ---
# One row of per-target probabilities (each in [0, 1]) per test sample.
probabilities = model.predict(X_test, verbose=0)
print("Predicted Probabilities (Test Set):\n")
print(probabilities)

# --- 2. Binarise the probabilities ---
# A target counts as recommended (1) when its probability is strictly above
# the cut-off, otherwise 0.
RECOMMENDATION_THRESHOLD = 0.47
predictions = (probabilities > RECOMMENDATION_THRESHOLD).astype(int)

# --- 3. Report on the first test sample ---
print("\n-----------------------------------------------------")
print(f"Analysis of First Test Sample (Index {X_test.index[0]}):")
print("-----------------------------------------------------")

# Target column names label each row of the report.
target_columns = Y_test.columns.to_list()

# Line up name / ground truth / prediction for sample 0, then list predicted
# positives first and, within ties, true positives first.
first_sample_table = pd.DataFrame({
    'Resource_Name': target_columns,
    'True_Value': Y_test.iloc[0].to_numpy(),
    'Predicted_Value': predictions[0]
})
first_sample_sorted = first_sample_table.sort_values(
    by=['Predicted_Value', 'True_Value'], ascending=False
)
results = first_sample_sorted.reset_index(drop=True)

print(results)

Predicted Probabilities (Test Set):

[[0.00961911 0.31833366 0.02011937 0.07917637 0.07927252 0.9415242
  0.03302092 0.87965554 0.02233612 0.8141981  0.14455356 0.18062694
  0.00939604 0.7254489  0.3349391  0.02657234 0.21130437 0.80102116
  0.01184929 0.03657625]
 [0.00779925 0.3952497  0.01606472 0.0374951  0.0306546  0.91792977
  0.02531291 0.8618502  0.0171031  0.7913621  0.11087084 0.16638483
  0.0060689  0.6474162  0.3349969  0.02940946 0.22276461 0.6715669
  0.01110223 0.03526424]
 [0.02624219 0.41767618 0.07645883 0.1255779  0.13447835 0.74572426
  0.10602131 0.6816961  0.08839978 0.7355762  0.21760301 0.19996622
  0.04248987 0.58358705 0.32143778 0.10989536 0.2099521  0.55977
  0.08168489 0.12140927]
 [0.01616536 0.40148744 0.02675549 0.071816   0.06721929 0.9028901
  0.04385141 0.85721666 0.03217662 0.76616204 0.12816386 0.19506244
  0.01863431 0.6647363  0.3353171  0.04346571 0.24623129 0.64518666
  0.02036992 0.05111176]
 [0.00815558 0.38865516 0.02115965 0.03486349 0.02995

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

# --- 1. Score the test split ---
probabilities = model.predict(X_test, verbose=0)
# Ground truth as a single 1-D vector, one entry per sample/target pair.
Y_true_flat = Y_test.to_numpy().flatten()

# --- 2. Sweep candidate thresholds ---
# 50 evenly spaced cut-offs covering 0.01 .. 0.50 inclusive.
thresholds = np.linspace(0.01, 0.50, 50)
best_results = dict(accuracy=0, f1_score=0, threshold_acc=0, threshold_f1=0)

all_results = []

for t in thresholds:
    # Binary decisions at this cut-off, flattened to match Y_true_flat.
    Y_pred_flat = (probabilities > t).astype(int).flatten()

    acc = accuracy_score(Y_true_flat, Y_pred_flat)
    f1 = f1_score(Y_true_flat, Y_pred_flat)

    # Values are stored pre-formatted as strings for the printed table.
    all_results.append(
        dict(Threshold=f'{t:.3f}', Accuracy=f'{acc:.4f}', F1_Score=f'{f1:.4f}')
    )

    # Strict comparisons: on ties the earliest (lowest) threshold is kept.
    if best_results['accuracy'] < acc:
        best_results.update(accuracy=acc, threshold_acc=t)

    if best_results['f1_score'] < f1:
        best_results.update(f1_score=f1, threshold_f1=t)

# --- 3. Report ---
print("\n--- Threshold Optimization Results ---")

results_df = pd.DataFrame(all_results)
# Show only the first ten and last ten rows of the sweep.
sweep_preview = pd.concat([results_df.head(10), results_df.tail(10)])
print("Sample of Threshold vs. Metrics:")
print(sweep_preview.to_markdown(index=False))

top_acc, top_acc_at = best_results['accuracy'], best_results['threshold_acc']
top_f1, top_f1_at = best_results['f1_score'], best_results['threshold_f1']

print("\n--- Best Thresholds Found ---")
print(f"To Maximize Accuracy: Threshold = {top_acc_at:.3f} (Accuracy = {top_acc:.4f})")
print(f"To Maximize F1-Score: Threshold = {top_f1_at:.3f} (F1-Score = {top_f1:.4f})")


--- Threshold Optimization Results ---
Sample of Threshold vs. Metrics:
|   Threshold |   Accuracy |   F1_Score |
|------------:|-----------:|-----------:|
|        0.01 |       0.34 |     0.4211 |
|        0.02 |       0.39 |     0.4404 |
|        0.03 |       0.48 |     0.48   |
|        0.04 |       0.55 |     0.5161 |
|        0.05 |       0.58 |     0.5333 |
|        0.06 |       0.61 |     0.5412 |
|        0.07 |       0.62 |     0.5476 |
|        0.08 |       0.64 |     0.561  |
|        0.09 |       0.65 |     0.5679 |
|        0.1  |       0.65 |     0.5679 |
|        0.41 |       0.84 |     0.7241 |
|        0.42 |       0.84 |     0.7241 |
|        0.43 |       0.84 |     0.7241 |
|        0.44 |       0.85 |     0.7368 |
|        0.45 |       0.85 |     0.7368 |
|        0.46 |       0.85 |     0.7368 |
|        0.47 |       0.86 |     0.75   |
|        0.48 |       0.86 |     0.75   |
|        0.49 |       0.84 |     0.7037 |
|        0.5  |       0.85 |     0.717  |

--