Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


Loading Dataset

In [None]:
# Read the training CSV into `df` and preview its first rows.
csv_path = "/content/Training.csv"  # Replace with actual filename
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,panic disorder,1,0,1,1,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,panic disorder,0,0,1,1,0,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,panic disorder,1,1,1,1,0,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,panic disorder,1,0,0,1,0,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,panic disorder,1,1,0,0,0,0,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Data Inspection and Cleaning

In [None]:
# Summarise the raw frame: dimensions, column labels and per-column null counts.
print("Shape:", df.shape)
print("\nColumns:\n", df.columns)
null_counts = df.isna().sum()
print("\nNull values:\n", null_counts)

# Discard every row that has at least one missing value, then report how many
# distinct disease labels remain.
df = df.dropna(axis=0, how="any")
n_unique_diseases = df["diseases"].nunique()
print("\nUnique diseases:", n_unique_diseases)


Shape: (12212, 378)

Columns:
 Index(['diseases', 'anxiety and nervousness', 'depression',
       'shortness of breath', 'depressive or psychotic symptoms',
       'sharp chest pain', 'dizziness', 'insomnia',
       'abnormal involuntary movements', 'chest tightness',
       ...
       'stuttering or stammering', 'problems with orgasm', 'nose deformity',
       'lump over jaw', 'sore in nose', 'hip weakness', 'back swelling',
       'ankle stiffness or tightness', 'ankle weakness', 'neck weakness'],
      dtype='object', length=378)

Null values:
 diseases                            0
anxiety and nervousness             0
depression                          0
shortness of breath                 0
depressive or psychotic symptoms    0
                                   ..
hip weakness                        1
back swelling                       1
ankle stiffness or tightness        1
ankle weakness                      1
neck weakness                       1
Length: 378, dtype: int64

U

Data Preprocessing

In [None]:
# Build the feature matrix X and the integer-encoded target y.
#
# The raw 'diseases' column in `df` is deliberately left as strings: overwriting
# it with integer codes makes this cell non-idempotent (a second run would
# "encode" the integers and lose the disease names from `disease_mapping`).

# Count disease instances (on the raw string labels)
counts = df['diseases'].value_counts()

# Filter out diseases with only 1 instance; the stratified split performed
# later needs at least two samples of every class.
df = df[df['diseases'].isin(counts[counts > 1].index)]

# Encode target labels AFTER filtering, so the integer codes are the contiguous
# range 0..n_classes-1 with no gaps left behind by the removed diseases.
le = LabelEncoder()
y = pd.Series(le.fit_transform(df['diseases']), index=df.index, name='diseases')

# Save reverse mapping: encoded integer label -> disease name
disease_mapping = dict(enumerate(le.classes_))

# Features are every column except the target
X = df.drop('diseases', axis=1)



Split

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


Model

In [None]:
# Random forest of 100 trees with a fixed seed, fitted on the training split.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)


Evaluate

In [None]:
# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)

# Confusion Matrix (cell annotations are turned off)
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(cm, annot=False, cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()


Predicting on Custom Input

In [1]:
# Predict the five most probable diseases for a hand-entered list of symptoms.
# Requires X, model and disease_mapping from the earlier cells.

# Get list of symptoms, in the column order the model was trained on
all_symptoms = X.columns.tolist()
symptom_position = {name: pos for pos, name in enumerate(all_symptoms)}

# User input symptoms (example)
input_symptoms = ['fever', 'headache', 'nausea', 'fatigue', 'vomiting']  # Modify as needed

# Create zero-vector
user_input = np.zeros(len(all_symptoms))

# Set 1 for entered symptoms
matched_count = 0
for symptom in input_symptoms:
    if symptom in symptom_position:
        user_input[symptom_position[symptom]] = 1
        matched_count += 1
    else:
        print(f"Warning: '{symptom}' not found in dataset")

# An all-zero vector would still yield a prediction; flag it as meaningless.
if matched_count == 0:
    print("Warning: none of the entered symptoms were found in the dataset; "
          "the prediction below is not meaningful")

# Predict probabilities. A one-row DataFrame with the training column names is
# passed because the model was fitted on a DataFrame; a bare list/array makes
# scikit-learn warn about missing feature names.
user_frame = pd.DataFrame([user_input], columns=all_symptoms)
probas = model.predict_proba(user_frame)[0]
top_indices = np.argsort(probas)[::-1][:5]  # Top 5 (column positions in probas)

# probas[i] belongs to the class model.classes_[i]. The position i is NOT the
# encoded label itself when some labels are absent from the training data, so
# translate position -> encoded label -> disease name.
top_names = [disease_mapping[model.classes_[i]] for i in top_indices]
top_probas = [probas[i] for i in top_indices]

# Display top diseases
print("\nTop Predicted Diseases:")
for name, proba in zip(top_names, top_probas):
    print(f"{name}: {proba*100:.2f}%")

# Plot
plt.figure(figsize=(8, 5))
sns.barplot(x=top_probas, y=top_names, palette='viridis')
plt.xlabel("Probability")
plt.title("Top Predicted Diseases")
plt.show()


NameError: name 'X' is not defined

Feature Importance Plot

In [None]:
# Pair each symptom column with the model's feature_importances_ value and
# draw the 20 largest as a horizontal bar chart.
feat_importances = pd.Series(data=model.feature_importances_, index=X.columns)
top_20_symptoms = feat_importances.nlargest(20)
top_20_symptoms.plot.barh(figsize=(8, 6))
plt.title("Top 20 Symptom Importance")
plt.show()
