In [None]:
#Import libraries
import pandas as pd
import numpy as np
import random
import tensorflow as tf
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from scipy import sparse
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Pin every random number generator used in this notebook (Python's `random`,
# NumPy and TensorFlow) to one seed so that re-running gives the same results.
SEED = 42
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

In [None]:
# Read the processed dataset from disk and preview its first rows.
DATASET_PATH = "../data/processed/clean_dataset_full.csv"
df = pd.read_csv(DATASET_PATH)
df.head()


In [None]:
# Encode gender as a numerical feature
GENDER_CODES = {'Male': 0, 'Female': 1, 'Other': 2}
df['Gender_encoded'] = df['Gender'].map(GENDER_CODES)

# Series.map yields NaN for every value that is not a key of the mapping
# (different casing, stray whitespace, missing entries). Those NaNs would flow
# into the numeric feature matrix built from 'Gender_encoded', so stop here
# with a message that names the offending values instead.
unmapped_gender = df.loc[df['Gender_encoded'].isna(), 'Gender']
if not unmapped_gender.empty:
    raise ValueError(
        "Unmapped values in 'Gender': "
        f"{sorted(unmapped_gender.astype(str).unique())}"
    )

# Side-by-side preview of the original and the encoded column.
df[['Gender', 'Gender_encoded']].head()


In [None]:
# Target variable: the 'label' column of the dataset, taken as an array.
TARGET_COLUMN = 'label'
y = df[TARGET_COLUMN].values

In [None]:
# Train / test split of the symptom text: 15% of the rows are held out,
# stratified on the label and seeded for repeatability.
# No separate validation set is created for the TF-IDF model.
split_options = dict(test_size=0.15, random_state=SEED, stratify=y)
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['clean_symptoms'], y, **split_options
)

In [None]:
# TF-IDF vectorization of the symptom text using unigrams and bigrams.
NGRAM_RANGE = (1, 2)
tfidf = TfidfVectorizer(ngram_range=NGRAM_RANGE)

# The vectorizer is fitted on the training text only; the test text is
# transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

In [None]:
# Combine Text and Numeric Features
NUMERIC_COLUMNS = ['Age', 'Symptom_Count', 'Gender_encoded']
X_numeric = df[NUMERIC_COLUMNS].values

# Positions of the train / test rows inside `df`, in the order produced by
# train_test_split. A boolean mask such as `df.index.isin(...)` would return
# the rows in the original `df` order instead, so numeric row i would no
# longer describe the same sample as TF-IDF row i and y_train[i].
# get_indexer raises if the index of `df` is not unique.
train_positions = df.index.get_indexer(X_train_text.index)
test_positions = df.index.get_indexer(X_test_text.index)

# Fit the scaler on the training rows only, so that test-set statistics do not
# leak into the features the model is trained on.
scaler = StandardScaler()
scaler.fit(X_numeric[train_positions])
X_numeric_scaled = scaler.transform(X_numeric)

# Split numeric features using the same rows, in the same order, as the text split
X_train_numeric = X_numeric_scaled[train_positions]
X_test_numeric = X_numeric_scaled[test_positions]

# Each numeric block must have exactly one row per TF-IDF row.
assert X_train_numeric.shape[0] == X_train_tfidf.shape[0]
assert X_test_numeric.shape[0] == X_test_tfidf.shape[0]

# Final design matrices: TF-IDF columns first, scaled numeric columns last.
X_train = sparse.hstack([
    X_train_tfidf,
    sparse.csr_matrix(X_train_numeric)
])

X_test = sparse.hstack([
    X_test_tfidf,
    sparse.csr_matrix(X_test_numeric)
])


In [None]:
# Train the logistic regression model on the combined feature matrix.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the installed version before upgrading.
logreg_params = {
    'solver': 'saga',
    'multi_class': 'multinomial',
    'class_weight': 'balanced',
    'max_iter': 4000,
    'random_state': SEED,
    'n_jobs': -1,
}
clf = LogisticRegression(**logreg_params)

clf.fit(X_train, y_train)

In [None]:
# Evaluation metrics on the held-out test set.
y_pred = clf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)

print("\n Classification Report:\n")
report_text = classification_report(y_test, y_pred)
print(report_text)


In [None]:
# Confusion matrix of the test-set predictions, drawn as a heatmap
# (cell counts are not annotated).
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cm, annot=False, cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("TF-IDF + Logistic Regression (Balanced)")
plt.show()


In [None]:
# Save the model and every fitted preprocessing object for deployment.
joblib.dump(clf, "../data/processed/tfidf_logreg_model.pkl")

# The classifier was trained on scaled numeric features, so the fitted scaler
# is needed alongside the vectorizer to rebuild the same feature matrix later.
joblib.dump(scaler, "../data/processed/numeric_scaler.pkl")

joblib.dump(tfidf, "../data/processed/tfidf_vectorizer.pkl")
