In [None]:
#importing libraries
import random

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, SimpleRNN, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
#use a fixed seed to ensure the same results
SEED = 42

# Seed each random number generator this notebook touches:
# Python's `random`, NumPy's legacy global RNG, and TensorFlow's global RNG.
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(SEED)

In [None]:
#Load processed dataset
# Path is relative to the notebook's working directory.
DATASET_CSV = "../data/processed/clean_dataset_full.csv"

df = pd.read_csv(DATASET_CSV)

# Quick look at the first rows to confirm the file loaded as expected.
df.head()

In [None]:
#Encode gender as a numerical feature
# NOTE: these integer codes are arbitrary identifiers for a nominal category;
# they do not imply any ordering between the groups.
GENDER_CODES = {'Male': 0, 'Female': 1, 'Other': 2}

gender_encoded = df['Gender'].map(GENDER_CODES)

# Series.map() returns NaN for every value that is not a key of GENDER_CODES
# (missing entries, different casing, stray whitespace, ...). StandardScaler
# keeps NaN as NaN, so such rows would feed NaN into the network and turn the
# loss into NaN. Fail here, with the offending values listed, instead.
unmapped_rows = gender_encoded.isna()
if unmapped_rows.any():
    unexpected = df.loc[unmapped_rows, 'Gender'].unique().tolist()
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) have a 'Gender' value outside "
        f"{list(GENDER_CODES)}: {unexpected}. Clean or map these values before encoding."
    )

df['Gender_encoded'] = gender_encoded
df[['Gender', 'Gender_encoded']].head()

In [None]:
#Standardize Numeric Features
numeric_features = df[['Age', 'Symptom_Count', 'Gender_encoded']].values.astype(np.float32)
scaler = StandardScaler()
numeric_features = scaler.fit_transform(numeric_features)
numeric_features[:5]

In [None]:
#Tokenize & Pad Text Data
X_text = df['clean_symptoms'].values
max_len = 50

tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X_text)

X_text_seq = tokenizer.texts_to_sequences(X_text)
X_text_seq = pad_sequences(X_text_seq, maxlen=max_len, padding='post')

X_text_seq[:2]


In [None]:
#One-Hot Encode Target Labels
y_labels = df['label'].values

lb = LabelBinarizer()
y = lb.fit_transform(y_labels)

num_classes = y.shape[1]
num_classes

In [None]:
#Train , Validation , Test Split
X_train_text, X_temp_text, X_train_num, X_temp_num, y_train, y_temp = train_test_split(
    X_text_seq,
    numeric_features,
    y,
    test_size=0.3,
    random_state=SEED,
    stratify=y_labels
)

X_val_text, X_test_text, X_val_num, X_test_num, y_val, y_test = train_test_split(
    X_temp_text,
    X_temp_num,
    y_temp,
    test_size=0.5,
    random_state=SEED,
    stratify=np.argmax(y_temp, axis=1)
)

In [None]:
# Build the RNN Model
embedding_dim = 50
rnn_units = 64
vocab_size = len(tokenizer.word_index) + 1   # +1 because index 0 is reserved for padding

# Text input: integer token ids, right-padded with 0 up to max_len.
text_input = Input(shape=(max_len,), name="text_input")

# mask_zero=True marks the 0-padded positions so the recurrent layer skips
# them. Without the mask, post-padded sequences make the SimpleRNN keep
# updating its state over the trailing padding, so the final state mostly
# reflects padding instead of the actual symptom text.
x_text = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    mask_zero=True
)(text_input)

# Only the final hidden state is kept (one vector of rnn_units per sample).
x_text = SimpleRNN(rnn_units)(x_text)

# Numeric input: one value per numeric feature column.
numeric_input = Input(shape=(X_train_num.shape[1],), name="numeric_input")

# Concatenate the text representation with the numeric features
x = Concatenate()([x_text, numeric_input])
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)

# Output layer: one probability per class
output = Dense(num_classes, activation='softmax')(x)

# Build model
model = Model(inputs=[text_input, numeric_input], outputs=output)

# Compile model (targets are one-hot encoded, hence categorical cross-entropy)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# A functional model is already built once Model(inputs, outputs) is created,
# so no explicit model.build(...) call is needed before summarizing.
model.summary()


In [None]:
#Train the Model with Early Stopping
# Watch the validation loss; give up after 3 epochs without improvement and
# ask Keras to restore the weights from the best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# The model takes two inputs, in the order [text sequences, numeric features].
train_inputs = [X_train_text, X_train_num]
val_inputs = [X_val_text, X_val_num]

history = model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_data=(val_inputs, y_val),
    callbacks=[early_stop],
)

In [None]:
#Final Test Evaluation
# evaluate() returns the loss followed by the compiled metrics (accuracy).
test_inputs = [X_test_text, X_test_num]
test_loss, test_acc = model.evaluate(test_inputs, y_test)

print(f" Test Accuracy: {test_acc:.4f}")

In [None]:
#Classification Report and Confusion Matrix
y_prob = model.predict([X_test_text, X_test_num])
y_pred = np.argmax(y_prob, axis=1)
y_true = np.argmax(y_test, axis=1)   # because y_test is one-hot encoded

# Column i of the one-hot targets corresponds to lb.classes_[i], so the
# integer ids used below can be shown with their original class names.
class_names = [str(c) for c in lb.classes_]
class_ids = np.arange(len(class_names))

# Classification report
# Passing `labels` keeps every class in the report even when it does not
# occur in the test split (its metrics are then 0 via zero_division=0).
print("Classification Report:")
print(classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=class_names,
    digits=3,
    zero_division=0
))

# Confusion matrix
# Without `labels`, classes absent from both y_true and y_pred are dropped
# and the remaining rows/columns no longer line up with the class ids.
cm = confusion_matrix(y_true, y_pred, labels=class_ids)

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(cm,
            cmap="Blues",
            cbar=True,
            square=True,
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("RNN Confusion Matrix (Text + Numeric Features)")
plt.show()

In [None]:
#Accuracy Curve Plot
# One line per recorded metric: training accuracy and validation accuracy,
# each indexed by epoch.
fig, ax = plt.subplots()
for metric_key, curve_label in (('accuracy', 'train_acc'), ('val_accuracy', 'val_acc')):
    ax.plot(history.history[metric_key], label=curve_label)

ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('RNN Accuracy (Text + Numeric Features)')
ax.legend()
plt.show()

In [None]:
#Save Model & Preprocessing Tools
# joblib is imported with the other libraries in the first cell.

# Trained network in the native Keras format.
model.save("../data/processed/rnn_model_full.keras")

# Fitted preprocessing objects; the same tokenizer, scaler and label encoder
# are needed to prepare inputs and decode predictions at inference time.
# NOTE(review): these are pickle-based files - only load them from trusted sources.
for artifact, artifact_path in (
    (tokenizer, "../data/processed/tokenizer.pkl"),
    (scaler, "../data/processed/numeric_scaler.pkl"),
    (lb, "../data/processed/label_encoder.pkl"),
):
    joblib.dump(artifact, artifact_path)
