In [None]:
import os

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from keras.layers import Input
from keras.models import Sequential
from keras.models import load_model
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Define Focal Loss
def focal_loss(gamma=2., alpha=0.25):
    """Build a focal-loss callable for use with ``model.compile``.

    Args:
        gamma: Exponent applied to the ``(1 - p)`` modulating factor.
        alpha: Constant factor that scales every loss term.

    Returns:
        A function ``focal_loss_fixed(y_true, y_pred)`` that returns the
        batch-mean loss. The inner name is what ``load_model`` looks up through
        ``custom_objects``, so it must stay ``focal_loss_fixed``.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Clamp the predictions away from 0 and 1 so the logarithm stays finite.
        eps = tf.keras.backend.epsilon()
        probs = tf.clip_by_value(y_pred, eps, 1 - eps)

        # Per-class term: -alpha * (1 - p)^gamma * log(p).
        modulating = tf.pow((1 - probs), gamma)
        per_class = - alpha * modulating * tf.math.log(probs)

        # y_true weights the per-class terms; sum over axis 1, then average.
        per_sample = tf.keras.backend.sum(per_class * y_true, axis=1)
        return tf.keras.backend.mean(per_sample)
    return focal_loss_fixed

# Define the model architecture
def create_model(input_shape, num_classes):
    """Assemble and compile the 1D-convolutional classifier.

    Args:
        input_shape: Shape of one sample, passed straight to ``Input``.
        num_classes: Width of the final softmax layer.

    Returns:
        A compiled ``Sequential`` model (Adam at 1e-3, focal loss, accuracy).
    """
    weight_decay = 0.01

    stack = [Input(shape=input_shape)]  # Specify the input shape here

    # Three convolutional stages; only the first two carry a pooling layer.
    # pool_size=1 is kept exactly as configured in the original architecture.
    for filters, with_pooling in ((64, True), (128, True), (256, False)):
        stack.append(Conv1D(filters, kernel_size=3, activation='relu',
                            kernel_regularizer=l2(weight_decay), padding='same'))
        stack.append(BatchNormalization())
        if with_pooling:
            stack.append(MaxPooling1D(pool_size=1))
        stack.append(Dropout(0.3))

    # Dense classification head.
    stack.extend([
        Flatten(),
        Dense(128, activation='relu', kernel_regularizer=l2(weight_decay)),
        BatchNormalization(),
        Dropout(0.4),
        Dense(num_classes, activation='softmax'),
    ])

    model = Sequential(stack)
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss=focal_loss(gamma=2, alpha=.25),
        metrics=['accuracy'],
    )
    return model

# Load your dataset
file_path = 'cleaned_second_module_input1.csv'  # Update with your file path
data = pd.read_csv(file_path)

# Prepare the data: 10 vote columns, 10 rating columns and one score column
features = [f'votes_{i}' for i in range(1, 11)] + [f'rating_{i}' for i in range(1, 11)] + ['calculated_score']
X = data[features].values
y = data['predicted_popularity_class'].values

# Encode the target labels as integers (kept for stratification and reporting)
label_encoder = LabelEncoder()
y_labels = label_encoder.fit_transform(y)

# One-hot encode the labels for the softmax / focal-loss head
y_encoded = to_categorical(y_labels)

# Stratified K-Fold Cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Lists to store results
histories = []
y_true_all = []
y_pred_all = []
best_val_accuracy = 0  # Track the best validation accuracy
best_model_filename = 'best_model.keras'  # Filename to save the best model
# The scaler is fitted per fold, so the one that belongs to the best model has
# to be persisted with it; the inference cell loads it from this path.
scaler_filename = 'scaler.joblib'

for fold, (train_index, val_index) in enumerate(skf.split(X, y_labels), 1):
    print(f"Fold {fold}")

    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y_encoded[train_index], y_encoded[val_index]

    # Apply SMOTE to the training split only, so validation data stays untouched
    smote = SMOTE(random_state=42, k_neighbors=2)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Scale features (statistics come from the training split only)
    scaler = StandardScaler()
    X_train_resampled = scaler.fit_transform(X_train_resampled)
    X_val = scaler.transform(X_val)

    # Add a trailing channel axis: (samples, features) -> (samples, features, 1)
    X_train_resampled = X_train_resampled.reshape((X_train_resampled.shape[0], X_train_resampled.shape[1], 1))
    X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))

    # Create and train the model; derive the input shape from the data instead
    # of hard-coding the feature count.
    model = create_model((X_train_resampled.shape[1], 1), y_encoded.shape[1])

    # Each fold checkpoints to its own file. Writing the checkpoint to
    # best_model_filename would let a later, worse fold overwrite the
    # cross-fold best model saved below.
    fold_checkpoint_filename = f'fold_{fold}_checkpoint.keras'
    callbacks = [
        EarlyStopping(patience=10, restore_best_weights=True),
        ReduceLROnPlateau(factor=0.2, patience=5, min_lr=0.00001),
        ModelCheckpoint(fold_checkpoint_filename, monitor='val_accuracy', save_best_only=True, mode='max', verbose=0)
    ]

    history = model.fit(X_train_resampled, y_train_resampled,
                        epochs=100,
                        batch_size=32,
                        validation_data=(X_val, y_val),
                        callbacks=callbacks,
                        verbose=0)

    # Evaluate the model on the validation set
    val_accuracy = model.evaluate(X_val, y_val, verbose=0)[1]  # [1] is accuracy

    # Check if this fold produced the best validation accuracy
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        model.save(best_model_filename)  # Save the best model to file
        joblib.dump(scaler, scaler_filename)  # Save the scaler this model was trained with
        print(f"New best model saved with val_accuracy: {best_val_accuracy:.4f} (Fold {fold})")

    histories.append(history)

    # Predictions for analysis
    y_pred = model.predict(X_val)
    y_pred_classes = np.argmax(y_pred, axis=1)

    y_true_all.extend(np.argmax(y_val, axis=1))  # Convert one-hot to integers for true labels
    y_pred_all.extend(y_pred_classes)

# Print overall results. These are pooled out-of-fold predictions: every sample
# is scored by the model of the fold in which it was held out.
# Passing `labels` pins the class order to the encoder, so target_names always
# lines up even if a class never shows up in the predictions.
class_ids = np.arange(len(label_encoder.classes_))
print("Classification Report for Best Model Across All Folds:")
print(classification_report(y_true_all, y_pred_all, labels=class_ids, target_names=label_encoder.classes_))
print(confusion_matrix(y_true_all, y_pred_all, labels=class_ids))

# Load the best model for further use
best_model = load_model(best_model_filename, custom_objects={'focal_loss_fixed': focal_loss()})


Fold 1
New best model saved with val_accuracy: 0.9956 (Fold 1)
22/22 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step
Fold 2
22/22 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step
Fold 3
22/22 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step
Fold 4


In [44]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from keras.models import load_model
import tensorflow as tf
import joblib

# Define Focal Loss (used during model training)
def focal_loss(gamma=2., alpha=0.25):
    """Recreate the focal-loss callable so the saved model can be deserialized.

    Args:
        gamma: Exponent applied to the ``(1 - p)`` modulating factor.
        alpha: Constant factor that scales every loss term.

    Returns:
        A function ``focal_loss_fixed(y_true, y_pred)`` that returns the
        batch-mean loss. ``load_model`` finds it through the
        ``'focal_loss_fixed'`` key in ``custom_objects``.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Clamp the predictions away from 0 and 1 so the logarithm stays finite.
        eps = tf.keras.backend.epsilon()
        probs = tf.clip_by_value(y_pred, eps, 1 - eps)

        # Per-class term: -alpha * (1 - p)^gamma * log(p).
        modulating = tf.pow((1 - probs), gamma)
        per_class = - alpha * modulating * tf.math.log(probs)

        # y_true weights the per-class terms; sum over axis 1, then average.
        per_sample = tf.keras.backend.sum(per_class * y_true, axis=1)
        return tf.keras.backend.mean(per_sample)
    return focal_loss_fixed

# Restore the fitted scaler from disk so the sample is scaled by a stored scaler
# (NOTE(review): confirm this file was produced together with best_model.keras)
scaler = joblib.load('scaler.joblib')

# One raw sample with 21 values, grouped as 10 + 10 + 1
# (presumably votes_1..10, rating_1..10, calculated_score; verify against training)
input_example = [
    2911.0, 141.0, 14021.0, 3220.0, 25927.0, 2156.0, 1354.0, 172.0, 376.0, 798.0,
    1.0, 1.2, 1.5, 1.2, 1.8, 1.7, 1.4, 1.2, 1.7, 1.4,
    0.15,
]

# Step 1: build a single-row 2D array and scale it with the loaded scaler
input_array = np.array([input_example])
input_scaled = scaler.transform(input_array)

# Step 2: append a channel axis so the shape is (samples, features, 1)
input_scaled_reshaped = input_scaled[:, :, np.newaxis]

# Step 3: load the trained model; the custom loss must be supplied by name
custom_objects = {'focal_loss_fixed': focal_loss()}
model = load_model('best_model.keras', custom_objects=custom_objects)

# Step 4: predict class probabilities and take the most likely class index
predicted_probabilities = model.predict(input_scaled_reshaped)
predicted_class = predicted_probabilities.argmax(axis=1)

# Step 5: class names, listed in the order used by the classification report / confusion matrix
labels = ["A", "AA", "F", "H", "SDH", "SH"]

# Step 6: translate the winning index into its label and report it
predicted_label = labels[predicted_class[0]]

print(f"Predicted Label: {predicted_label}")


1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 1s/step
Predicted Label: F


In [None]:
# Second raw sample with 21 values, laid out as 10 + 10 + 1 like the example above
input_example = [
    # first ten values
    2917451.0, 1600541.0, 1464021.0, 322800.0, 2590527.0,
    2154266.0, 1340354.0, 179212.0, 37654.0, 78938.0,
    # next ten values
    9.0, 8.2, 8.5, 7.2, 8.8, 8.7, 8.4, 6.2, 6.7, 6.4,
    # final value
    0.95,
]
