In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, f1_score


In [None]:
# Read the raw CSV into a DataFrame.
data = pd.read_csv('/content/new_dataset.csv')

# Report how many missing values each column contains.
print("NaN values in the dataset:", data.isna().sum(), sep="\n")

# Keep only the rows in which every column is populated.
data_cleaned = data.dropna(axis=0, how='any')

In [None]:
# Columns fed to the model as the numerical input branch.
numerical_features = [
    'word density',
    'avg line length',
    'mean_perplexity',
    'burstiness1',
    'flesch_kincaid_score',
    'gunning_fog_score',
    'burstiness2',
]

# Pull the three model inputs/targets out of the cleaned frame as arrays.
X_numerical = data_cleaned[numerical_features].values
X_text = data_cleaned['text'].values
y = data_cleaned['label'].values

# Hold out 20% of the rows as the test set; the remaining 80% is used for
# cross-validated training. The fixed random_state keeps the split repeatable.
(
    X_text_trainval, X_text_test,
    X_num_trainval, X_num_test,
    y_trainval, y_test,
) = train_test_split(X_text, X_numerical, y, test_size=0.2, random_state=42)

In [None]:
# Text preprocessing settings: tokenizer word limit and padded sequence length.
max_words = 10000
max_length = 350

# The tokenizer vocabulary is learned from the train+val texts only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_text_trainval)

# Convert each split to integer sequences, then pad to a common length.
X_text_trainval_seq, X_text_test_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_text_trainval, X_text_test)
)
X_text_trainval_padded, X_text_test_padded = (
    pad_sequences(sequences, maxlen=max_length)
    for sequences in (X_text_trainval_seq, X_text_test_seq)
)

# Standardize the numerical features; the scaler statistics come from the
# train+val split and are then applied unchanged to the test split.
scaler = StandardScaler().fit(X_num_trainval)
X_num_trainval_scaled = scaler.transform(X_num_trainval)
X_num_test_scaled = scaler.transform(X_num_test)

In [None]:
# Two-input CNN-BiLSTM classifier
def create_model(max_words, max_length, num_numerical_features):
    """Build the (uncompiled) two-input CNN-BiLSTM Keras model.

    Args:
        max_words: Input dimension (vocabulary size) of the embedding layer.
        max_length: Length of each integer token sequence fed to the text input.
        num_numerical_features: Width of the numerical feature vector.

    Returns:
        A ``Model`` taking ``[text_input, numerical_input]`` and producing a
        single sigmoid output per sample.
    """
    # Text branch: embedding -> two Conv1D/MaxPooling1D stages -> BiLSTM.
    tokens_in = Input(shape=(max_length,))
    text_branch = Embedding(max_words, 100)(tokens_in)

    for n_filters in (64, 128):
        text_branch = Conv1D(n_filters, 5, activation='relu')(text_branch)
        text_branch = MaxPooling1D(pool_size=4)(text_branch)

    # return_sequences=False: only the final BiLSTM output is passed on.
    text_branch = Bidirectional(LSTM(64, return_sequences=False))(text_branch)

    # Numerical branch: the features are used as-is, with no extra layers.
    features_in = Input(shape=(num_numerical_features,))

    # Join both branches and classify with a small dense head.
    merged = Concatenate()([text_branch, features_in])
    hidden = Dense(64, activation='relu')(merged)
    hidden = Dropout(0.5)(hidden)
    probability = Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=[tokens_in, features_in], outputs=probability)

In [None]:
# Set up K-Fold cross-validation over the train+val split
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

fold_accuracies = []  # validation accuracy of each fold, in fold order
fold_histories = []   # object returned by model.fit() for each fold

# NOTE(review): with epochs=3 below, patience=3 looks unreachable, so this
# callback probably never stops training early -- confirm the intended epoch budget.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Start below any achievable accuracy so the first fold always becomes the
# initial best model. Starting at 0 with a strict '>' comparison left
# best_model as None (and crashed on save) if no fold scored above 0.
best_accuracy = float('-inf')
best_model = None

# The optimizer step size is the same for every fold, so set it once.
learning_rate = 2e-3

for fold, (train_index, val_index) in enumerate(kf.split(X_text_trainval_padded), 1):
    print(f"Training on fold {fold}/{n_folds}...")

    # Slice the text, numerical and label arrays with the same fold indices
    # so the three stay row-aligned.
    X_text_train, X_text_val = X_text_trainval_padded[train_index], X_text_trainval_padded[val_index]
    X_num_train, X_num_val = X_num_trainval_scaled[train_index], X_num_trainval_scaled[val_index]
    y_train, y_val = y_trainval[train_index], y_trainval[val_index]

    # A fresh model per fold so no weights carry over between folds.
    model = create_model(max_words, max_length, X_num_trainval.shape[1])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    history = model.fit(
        [X_text_train, X_num_train], y_train,
        validation_data=([X_text_val, X_num_val], y_val),
        epochs=3,
        batch_size=32,
        callbacks=[early_stopping],
        verbose=1
    )

    # Threshold the sigmoid outputs at 0.5 to get hard class labels.
    y_pred = model.predict([X_text_val, X_num_val])
    y_pred_classes = (y_pred > 0.5).astype(int).flatten()
    fold_accuracy = accuracy_score(y_val, y_pred_classes)
    fold_accuracies.append(fold_accuracy)
    fold_histories.append(history)

    print(f"Fold {fold} accuracy: {fold_accuracy:.4f}")
    print(classification_report(y_val, y_pred_classes))
    print(confusion_matrix(y_val, y_pred_classes))
    print("\n")

    # Keep the model from the fold with the highest validation accuracy
    # (ties keep the earlier fold).
    if fold_accuracy > best_accuracy:
        best_accuracy = fold_accuracy
        best_model = model

best_model.save('best_cnn_bilstm_model.h5')
print("Best model saved to 'best_cnn_bilstm_model.h5'")

print(f"Average accuracy across all folds: {np.mean(fold_accuracies):.4f}")
print(f"Standard deviation of accuracy: {np.std(fold_accuracies):.4f}")

In [None]:
# Plot the per-epoch training curves recorded for every fold
plt.figure(figsize=(12, 10))

# One row per panel:
# (subplot position, train key, validation key, title, y-axis label, legend location)
curve_panels = (
    (1, 'accuracy', 'val_accuracy', 'Model Accuracy Across Folds', 'Accuracy', 'lower right'),
    (2, 'loss', 'val_loss', 'Model Loss Across Folds', 'Loss', 'upper right'),
)

for position, train_key, val_key, panel_title, y_label, legend_loc in curve_panels:
    plt.subplot(2, 1, position)
    # Each fold contributes a train curve and a validation curve.
    for fold_number, history in enumerate(fold_histories, start=1):
        plt.plot(history.history[train_key], label=f'Train (Fold {fold_number})')
        plt.plot(history.history[val_key], label=f'Validation (Fold {fold_number})')
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(loc=legend_loc)

plt.tight_layout()
plt.show()

# Final validation accuracy of each fold, as a single line chart
plt.figure(figsize=(10, 6))
plt.plot(range(1, n_folds + 1), fold_accuracies, marker='o')
plt.title('Model Accuracy for Each Fold')
plt.xlabel('Fold')
plt.ylabel('Accuracy')

# Write each fold's accuracy just above its marker.
for fold_number, acc in enumerate(fold_accuracies, start=1):
    plt.text(fold_number, acc, f'{acc:.4f}', ha='center', va='bottom')

# Pad the y-axis by 0.05 on both sides of the observed range.
lowest, highest = min(fold_accuracies), max(fold_accuracies)
plt.ylim(lowest - 0.05, highest + 0.05)
plt.show()

# Evaluate the best cross-validation model on the held-out test set.
# This runs once: the banner and the predict/threshold steps used to be
# duplicated, which printed the message twice and ran inference twice.
print("\nEvaluating the best model on the test set:")
test_loss, test_accuracy = best_model.evaluate([X_text_test_padded, X_num_test_scaled], y_test, verbose=0)

# Keep the raw sigmoid scores (for AUC/ROC) and the 0.5-thresholded labels.
y_test_pred = best_model.predict([X_text_test_padded, X_num_test_scaled])
y_test_pred_classes = (y_test_pred > 0.5).astype(int).flatten()

In [None]:
# Calculate metrics
# Flatten the predicted scores to 1-D once, so both ROC helpers receive one
# score per sample rather than relying on implicit column coercion.
y_test_scores = np.ravel(y_test_pred)

test_f1 = f1_score(y_test, y_test_pred_classes)
test_auc = roc_auc_score(y_test, y_test_scores)

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Test AUC: {test_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred_classes))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_test_pred_classes))

# Plot ROC curve
fpr, tpr, _ = roc_curve(y_test, y_test_scores)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {test_auc:.2f})')
# Diagonal reference line drawn for comparison with the model's curve.
plt.plot([0, 1], [0, 1], linestyle='--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()

In [None]:
# Side-by-side bar charts of test accuracy and test loss
plt.figure(figsize=(12, 5))

# One row per panel: (subplot position, bar label / title, value, fix y-axis to [0, 1]?)
# Only the accuracy panel gets the fixed [0, 1] axis; the loss axis is left to autoscale.
for position, bar_label, value, unit_axis in (
    (1, 'Test Accuracy', test_accuracy, True),
    (2, 'Test Loss', test_loss, False),
):
    plt.subplot(1, 2, position)
    plt.bar([bar_label], [value])
    plt.title(bar_label)
    if unit_axis:
        plt.ylim(0, 1)
    # Print the value on top of the single bar (x position 0).
    plt.text(0, value, f'{value:.4f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side bar charts of test F1 score and test AUC
plt.figure(figsize=(12, 5))

# One row per panel: (subplot position, bar label, panel title, value).
# Both panels use a fixed [0, 1] y-axis.
for position, bar_label, panel_title, value in (
    (1, 'F1 Score', 'Test F1 Score', test_f1),
    (2, 'AUC', 'Test AUC', test_auc),
):
    plt.subplot(1, 2, position)
    plt.bar([bar_label], [value])
    plt.title(panel_title)
    plt.ylim(0, 1)
    # Print the value on top of the single bar (x position 0).
    plt.text(0, value, f'{value:.4f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()