In [1]:
import pandas as pd
import numpy as np

# chia dữ liệu
from sklearn.model_selection import train_test_split

# tiền xử lý dữ liệu encode, scaler các thứ
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split


# các thông số đánh giá mô hình
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
df =  pd.read_excel("..\store\\fact.xlsx")

  df =  pd.read_excel("..\store\\fact.xlsx")


In [3]:
# Tạo results để lưu các thông số đánh giá mô hình
results_df = pd.DataFrame(columns=["Model", "Epochs", "Accuracy", "Precision", "Recall", "F1_score"])

def evaluate_and_save_model(name_model, epochs, model, X_test, y_test):
    global results_df
    
    # Dự đoán từ mô hình
    y_pred = model.predict(X_test)
    if len(y_pred.shape) > 1:  # Trường hợp output là xác suất (probability)
        y_pred = y_pred.argmax(axis=1)  # Chọn lớp có xác suất cao nhất
    
    # Tính các chỉ số đánh giá
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')  # Sử dụng weighted nếu có nhiều lớp
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Lưu kết quả vào DataFrame
    new_result = pd.DataFrame({
        "Epochs": epochs,
        "Model": [name_model],
        "Accuracy": [accuracy],
        "Precision": [precision],
        "Recall": [recall],
        "F1_score": [f1],
    })
    
    results_df = pd.concat([results_df, new_result], ignore_index=True)
    return results_df

In [4]:
df.drop(columns=['mssv', 'khoahoc_chuan'], axis=1, inplace=True)

In [5]:
def preprocess_and_train_mlp(epochs, df=df):
    # Define categorical and numerical columns
    categorical_cols = ["gioitinh", "noisinh", "khoa", "hedt"]
    numerical_cols = ["diem_tt"] + [col for col in df.columns if "dtbhk_hk_" in col or "sotchk_hk_" in col or "drl_hk_" in col]
    
    # One-Hot Encoding for categorical data
    encoder = OneHotEncoder()
    encoded_features = encoder.fit_transform(df[categorical_cols]).toarray()

    # Normalize numerical data
    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(df[numerical_cols])

    # Combine features
    X = np.hstack([scaled_features, encoded_features])
    
    # Convert target variable to numerical labels
    y = df["xeploai"].astype("category").cat.codes 

    # Split the data into training, validation, and test sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # Define MLP model
    mlp_model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(len(np.unique(y)), activation='softmax')
    ])

    # Compile the model
    mlp_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    mlp_history = mlp_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=32)

    # Evaluate the model on the test set
    evaluate_and_save_model("MLP (Multi-Layer Perceptron) " + str(epochs), epochs, mlp_model, X_test, y_test)

In [6]:
def preprocess_and_train_lstm(epochs, df=df):
    # Define sequential features
    seq_features = [f"dtbhk_hk_{i}" for i in range(1, 6)] + \
                   [f"sotchk_hk_{i}" for i in range(1, 6)] + \
                   [f"drl_hk_{i}" for i in range(1, 6)]
    
    # Convert DataFrame to NumPy array
    df_numpy = df[seq_features].values
    
    # Reshape: (samples, timesteps, features)
    sequence_data = df_numpy.reshape(df_numpy.shape[0], 5, -1)
    
    # Normalize data
    scaler = MinMaxScaler()
    sequence_data = sequence_data.reshape(-1, sequence_data.shape[-1])
    sequence_data = scaler.fit_transform(sequence_data)
    sequence_data = sequence_data.reshape(-1, 5, len(seq_features) // 5)

    # Prepare labels
    y = df["xeploai"].astype("category").cat.codes

    # Split the data into training, validation, and test sets
    X_train, X_temp, y_train, y_temp = train_test_split(sequence_data, y, test_size=0.2, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # Define LSTM model
    lstm_model = tf.keras.Sequential([
        tf.keras.layers.LSTM(64, input_shape=(5, len(seq_features) // 5), return_sequences=True),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.LSTM(32),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(len(np.unique(y)), activation='softmax')
    ])

    # Compile the model
    lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    lstm_history = lstm_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=32)

    # Evaluate the model on the test set
    evaluate_and_save_model("LSTM (Long Short-Term Memory) "+ str(epochs), epochs, lstm_model, X_test, y_test)

In [7]:
def preprocess_and_train_cnn(epochs, df=df):
    # Define sequential features
    seq_features = [f"dtbhk_hk_{i}" for i in range(1, 6)] + \
                   [f"sotchk_hk_{i}" for i in range(1, 6)] + \
                   [f"drl_hk_{i}" for i in range(1, 6)]
    
    # Convert DataFrame to NumPy array
    sequence_data = df[seq_features].values
    
    # Reshape: (samples, timesteps, features)
    sequence_data = sequence_data.reshape(sequence_data.shape[0], 5, -1)
    
    # Normalize data
    scaler = MinMaxScaler()
    sequence_data = sequence_data.reshape(-1, sequence_data.shape[-1])
    sequence_data = scaler.fit_transform(sequence_data)
    sequence_data = sequence_data.reshape(-1, 5, len(seq_features) // 5)

    # Prepare labels
    y = df["xeploai"].astype("category").cat.codes

    # Split the data into training, validation, and test sets
    X_train, X_temp, y_train, y_temp = train_test_split(sequence_data, y, test_size=0.2, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # Define CNN model
    cnn_model = tf.keras.Sequential([
        tf.keras.layers.Conv1D(32, kernel_size=2, activation='relu', input_shape=(5, len(seq_features) // 5)),
        tf.keras.layers.MaxPooling1D(pool_size=2),
        tf.keras.layers.Conv1D(64, kernel_size=2, activation='relu'),
        tf.keras.layers.GlobalMaxPooling1D(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(len(np.unique(y)), activation='softmax')
    ])

    # Compile the model
    cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    cnn_history = cnn_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=32)

    # Evaluate the model on the test set
    evaluate_and_save_model("CNN (Convolutional Neural Network) " + str(epochs), epochs, cnn_model, X_test, y_test)

In [8]:
def run_all_models_with_epochs(df):
    results = {}
    # Tạo danh sách epoch từ 5 đến 200, cách nhau 5
    epoch_list = list(range(5, 151, 20))  # từ 5 đến 150, cách nhau 10

    # Lặp qua mỗi giá trị epoch
    for epochs in epoch_list:
        print(f"Training models for {epochs} epochs...")
        results[epochs] = {
            'MLP': preprocess_and_train_mlp(epochs),
            'LSTM': preprocess_and_train_lstm(epochs),
            'CNN': preprocess_and_train_cnn(epochs)
        }

In [None]:
run_all_models_with_epochs(df)

Training models for 5 epochs...
Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.5488 - loss: 1.1304 - val_accuracy: 0.6441 - val_loss: 0.8552
Epoch 2/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6161 - loss: 0.9237 - val_accuracy: 0.6497 - val_loss: 0.7962
Epoch 3/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6420 - loss: 0.8742 - val_accuracy: 0.6780 - val_loss: 0.7539
Epoch 4/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6659 - loss: 0.8053 - val_accuracy: 0.7006 - val_loss: 0.7371
Epoch 5/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6659 - loss: 0.7663 - val_accuracy: 0.6780 - val_loss: 0.6874
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
Epoch 1/5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  results_df = pd.concat([results_df, new_result], ignore_index=True)
  super().__init__(**kwargs)


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.4430 - loss: 1.3070 - val_accuracy: 0.6441 - val_loss: 0.9511
Epoch 2/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5998 - loss: 1.0259 - val_accuracy: 0.6441 - val_loss: 0.9041
Epoch 3/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6163 - loss: 0.9650 - val_accuracy: 0.6441 - val_loss: 0.8982
Epoch 4/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6238 - loss: 0.9459 - val_accuracy: 0.6441 - val_loss: 0.8998
Epoch 5/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6211 - loss: 0.9416 - val_accuracy: 0.6441 - val_loss: 0.8911
[1m1/6[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m1s[0m 391ms/step

In [None]:
results_df