<a href="https://colab.research.google.com/github/yasithS/DiseasePrediction/blob/main/diseasePrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing libraries

In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split ,cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle
import warnings
warnings.filterwarnings('ignore')

## Loading and exploring

In [21]:
# Load the raw disease/symptom table from the CSV in the working directory
dataset_path = 'dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Preview the first rows of the loaded table (shown as the cell output)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded table
df.shape

In [None]:
# Per-column count of missing (NaN) entries
missing_mask = df.isna()
missing_mask.sum()

In [25]:
# Replace NaN values with empty strings '' so every cell can be handled as text
df = df.fillna('')

In [None]:
# Collect, in table order, every column whose name starts with 'Symptom_'
symptom_columns = []
for column_name in df.columns:
    if column_name.startswith('Symptom_'):
        symptom_columns.append(column_name)
print(symptom_columns)

In [None]:
# Distinct disease labels, in order of first appearance; print how many and which
disease_column = df['Disease']
unique_diseases = disease_column.unique()
print(len(unique_diseases), unique_diseases, sep='\n')

In [None]:
# Count the occurrences of each disease (most frequent first)
disease_labels = df['Disease']
disease_counts = disease_labels.value_counts()
print(disease_counts)

In [None]:
# Bar chart of the 15 most frequent diseases
plt.figure(figsize=(12, 6))
top_diseases = disease_counts.head(15)
# A Series plot draws on the current figure, so the size set above applies
axis = top_diseases.plot.bar()
axis.set_title('Top 15 Diseases by Frequency')
axis.set_xlabel('Disease')
axis.set_ylabel('Count')
plt.xticks(rotation=90, ha='right')
plt.tight_layout()
plt.show()

In [None]:

# Show the symptom column names again before extracting their values
print(symptom_columns)

In [37]:
# Gather every distinct, non-empty symptom name found in any symptom column
unique_symptoms = set()
for column in symptom_columns:
    # Strip surrounding whitespace so differently padded values compare equal
    cleaned = df[column].str.strip()
    non_empty = cleaned[cleaned != '']
    unique_symptoms |= set(non_empty.unique())

In [None]:
# Inspect the collected set of symptom names
print(unique_symptoms)

In [39]:
# Drop the empty string if it slipped into the set (no-op when it is absent)
unique_symptoms.discard('')

In [None]:
# Alphabetically ordered list of symptoms; its order fixes the feature columns
all_symptoms = sorted(unique_symptoms)
print(all_symptoms)

In [41]:
# Get symptom frequency
#
# Count how often each symptom appears across all symptom columns.
# The comparison is an exact match on the whitespace-stripped cell value.
# Series.str.contains would treat the symptom as a regular expression and
# match substrings: a name containing parentheses would never match its own
# literal text, and a symptom that is a substring of another would be
# over-counted.
stripped_columns = [df[column].str.strip() for column in symptom_columns]

symptom_counts = {}
for symptom in all_symptoms:
    symptom_counts[symptom] = sum(
        (values == symptom).sum() for values in stripped_columns
    )


In [None]:
# Inspect the symptom -> occurrence count mapping
print(symptom_counts)

In [43]:
# Tabulate the counts (one row per symptom), most frequent symptom first
symptom_names = [*symptom_counts]
occurrence_totals = [*symptom_counts.values()]
symptom_df = pd.DataFrame(
    {'Symptom': symptom_names, 'Count': occurrence_totals}
).sort_values(by='Count', ascending=False)


In [None]:
# Plot the most common symptoms.
# DataFrame.plot opens a new figure unless it is handed an Axes, so the chart
# is drawn explicitly on the axes of the figure created here; otherwise the
# requested size would apply to an empty figure and the chart would use the
# default size.
top_n = 25
plt.figure(figsize=(45, 8))
symptom_df.head(top_n).plot(kind='bar', x='Symptom', y='Count', ax=plt.gca())
# Keep the title in sync with the number of bars actually drawn
plt.title(f'Top {top_n} Most Common Symptoms')
plt.xlabel('Symptom')
plt.ylabel('Frequency')
plt.xticks(rotation=90, ha='right')
plt.tight_layout()
plt.show()

## Data Preprocessing

In [54]:
# Allocate the symptom indicator matrix: one row per record in df and one
# column per entry of all_symptoms, initialised to 0 (symptom absent)
n_records = df.shape[0]
n_features = len(all_symptoms)
X = np.zeros(shape=(n_records, n_features))

In [55]:
# Fill X: set X[r, c] = 1 when row r of df lists the symptom all_symptoms[c].

# Map each symptom name to its column position once, so the lookup inside the
# loop is a dict access instead of a linear scan of all_symptoms per cell.
symptom_to_index = {name: idx for idx, name in enumerate(all_symptoms)}

# Walk the rows by position rather than by index label, so the right row of X
# is written even when df does not carry the default 0..n-1 index.
for row_pos, row_values in enumerate(df[symptom_columns].to_numpy()):
    for raw_value in row_values:
        symptom = raw_value.strip()
        if symptom:  # skip the empty padding cells
            X[row_pos, symptom_to_index[symptom]] = 1

In [56]:
# Turn the disease names (target variable) into integer class ids
label_encoder = LabelEncoder()
disease_labels = df['Disease']
label_encoder.fit(disease_labels)
y = label_encoder.transform(disease_labels)

In [60]:
# Lookup table from encoded class id back to the disease name
disease_mapping = dict(enumerate(label_encoder.classes_))

## Splitting the dataset

In [61]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Report the feature-matrix shape of each split
for split_array, caption in (
    (X_train, " -> x train shape"),
    (X_test, " -> x test shape"),
):
    print(split_array.shape, caption)

## Building the machine learning model

In [64]:
# Fit a random forest (100 trees, fixed seed) on the training split
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

## Evaluating the model

In [67]:
# Predicted class ids for the held-out test rows
y_pred = model.predict(X_test)

In [None]:
# Share of test rows whose predicted class equals the true class
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_line = f"\nModel Accuracy: {accuracy:.4f}"
print(accuracy_line)

In [None]:
# Per-class report, labelled with the disease names
report_text = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print("\nClassification Report:")
print(report_text)

In [None]:
# Confusion matrix of true vs. predicted classes, drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)
class_names = label_encoder.classes_

plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## Cross-validation

In [None]:
# 5-fold cross-validation of the same estimator over the full feature matrix
cv_scores = cross_val_score(model, X, y, cv=5)
mean_cv_score = cv_scores.mean()
print(f"\nCross-validation scores: {cv_scores}")
print(f"Mean CV score: {mean_cv_score:.4f}")