In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import joblib
import os
from flask import Flask, request, jsonify

In [4]:
# loading the csv data to a panda DataFrmae
disease_dataset = pd.read_csv('disease_dataset.csv')

In [5]:
# Check the first few rows of the dataset to inspect its structure
disease_dataset.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [6]:
# Fill missing values with 'None'
disease_dataset.fillna('None', inplace=True)

In [7]:
# Print first five rows of the dataset
print(disease_dataset.head())

            Disease   Symptom_1              Symptom_2              Symptom_3  \
0  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
1  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   
2  Fungal infection     itching   nodal_skin_eruptions    dischromic _patches   
3  Fungal infection     itching              skin_rash    dischromic _patches   
4  Fungal infection     itching              skin_rash   nodal_skin_eruptions   

              Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0   dischromic _patches      None      None      None      None      None   
1                  None      None      None      None      None      None   
2                  None      None      None      None      None      None   
3                  None      None      None      None      None      None   
4                  None      None      None      None      None      None   

  Symptom_10 Symptom_11 Symptom_12 Symptom_13 Symp

In [8]:
# Check the shape of the dataset
print(f"Dataset shape: {disease_dataset.shape}")

Dataset shape: (410, 18)


In [9]:
# Check for missing values
print(disease_dataset.isnull().sum())

Disease       0
Symptom_1     0
Symptom_2     0
Symptom_3     0
Symptom_4     0
Symptom_5     0
Symptom_6     0
Symptom_7     0
Symptom_8     0
Symptom_9     0
Symptom_10    0
Symptom_11    0
Symptom_12    0
Symptom_13    0
Symptom_14    0
Symptom_15    0
Symptom_16    0
Symptom_17    0
dtype: int64


In [10]:
# Get info about the dataset
print(disease_dataset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410 entries, 0 to 409
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Disease     410 non-null    object
 1   Symptom_1   410 non-null    object
 2   Symptom_2   410 non-null    object
 3   Symptom_3   410 non-null    object
 4   Symptom_4   410 non-null    object
 5   Symptom_5   410 non-null    object
 6   Symptom_6   410 non-null    object
 7   Symptom_7   410 non-null    object
 8   Symptom_8   410 non-null    object
 9   Symptom_9   410 non-null    object
 10  Symptom_10  410 non-null    object
 11  Symptom_11  410 non-null    object
 12  Symptom_12  410 non-null    object
 13  Symptom_13  410 non-null    object
 14  Symptom_14  410 non-null    object
 15  Symptom_15  410 non-null    object
 16  Symptom_16  410 non-null    object
 17  Symptom_17  410 non-null    object
dtypes: object(18)
memory usage: 57.8+ KB
None


In [11]:
# Get statistical summary
print(disease_dataset.describe())

                 Disease  Symptom_1  Symptom_2 Symptom_3 Symptom_4 Symptom_5  \
count                410        410        410       410       410       410   
unique                41         34         48        55        51        39   
top     Fungal infection   vomiting   vomiting   fatigue      None      None   
freq                  10         67         65        51        57       121   

       Symptom_6 Symptom_7 Symptom_8 Symptom_9 Symptom_10 Symptom_11  \
count        410       410       410       410        410        410   
unique        33        27        22        23         22         19   
top         None      None      None      None       None       None   
freq         191       232       256       278        288        331   

       Symptom_12 Symptom_13 Symptom_14 Symptom_15 Symptom_16 Symptom_17  
count         410        410        410        410        410        410  
unique         12          9          5          4          4          2  
top          

In [12]:
# Check the column names to confirm the target variable
print(disease_dataset.columns)

Index(['Disease', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4',
       'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9',
       'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14',
       'Symptom_15', 'Symptom_16', 'Symptom_17'],
      dtype='object')


In [13]:
# Separate features (symptoms) and target (disease)
X = disease_dataset.iloc[:, 1:]  # Symptom columns
y = disease_dataset['Disease']    # Target column (Disease)

In [14]:
# Print feature columns and target
print(X.columns)
print(y.head())

Index(['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Symptom_5',
       'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9', 'Symptom_10',
       'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14', 'Symptom_15',
       'Symptom_16', 'Symptom_17'],
      dtype='object')
0    Fungal infection
1    Fungal infection
2    Fungal infection
3    Fungal infection
4    Fungal infection
Name: Disease, dtype: object


In [15]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shape of the data
print(f"X shape: {X.shape}, X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")


X shape: (410, 17), X_train shape: (328, 17), X_test shape: (82, 17)


In [16]:
# Create a pipeline with Imputer, OneHotEncoder, and RandomForestClassifier
model_pipeline = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='None'),  # Handle missing values
    OneHotEncoder(handle_unknown='ignore'),                # Encode categorical symptoms
    RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)  # Handle class imbalance
)

In [17]:
# Train the model
model_pipeline.fit(X_train, y_train)

In [18]:
# Test the model
y_pred = model_pipeline.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Print confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Model Accuracy: 97.56%
Confusion Matrix:
[[2 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 2 ... 0 0 0]
 ...
 [0 0 0 ... 4 0 0]
 [0 0 0 ... 0 2 0]
 [0 0 0 ... 0 0 3]]

Classification Report:
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         2
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         2
                       Bronchial Asthma       1.00      1.00      1.00         3
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       1.00      1.00      1.00         2
                    Chronic cholestasis       1.00      1.00      1.00         4
                            Common Cold       1.00      1.00      1.00         1
                                 Dengue       1.00      1.00      1.00         1
    

In [19]:
# Perform cross-validation with 5 folds
cross_val_scores = cross_val_score(model_pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation Accuracy: {cross_val_scores.mean() * 100:.2f}%")




Cross-validation Accuracy: 98.77%


In [20]:
# Function to make predictions based on user input
def predict_disease(symptoms):
    # Convert the symptoms into a DataFrame with the same structure as the training set
    input_data = pd.DataFrame([symptoms], columns=X.columns)

    try:
        # Make prediction
        prediction = model_pipeline.predict(input_data)
        return f"Predicted Disease: {prediction[0]}"
    except Exception as e:
        return f"Unable to predict the disease due to invalid input: {str(e)}"


In [21]:
# Function to make predictions based on user input
def predict_disease(symptoms):
    # Convert the symptoms into a DataFrame with the same structure as the training set
    input_data = pd.DataFrame([symptoms], columns=X.columns)

    try:
        # Make prediction
        prediction = model_pipeline.predict(input_data)
        return f"Predicted Disease: {prediction[0]}"
    except Exception as e:
        return f"Unable to predict the disease due to invalid input: {str(e)}"


In [22]:
model_path = 'disease_prediction_model.pkl'
joblib.dump(model_pipeline, model_path)
print(f"Model saved as {model_path}")

Model saved as disease_prediction_model.pkl
