In [1]:
#imports
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score
import joblib
import os

In [2]:
# Load CSVs from your paths
dataset_path = '../dataset/dataset.csv'  # Symptom_Details.csv
symptom_description_path = '../dataset/symptom_Description.csv'
symptom_severity_path = '../dataset/Symptom-severity.csv'
symptom_precaution_path = '../dataset/symptom_precaution.csv'

dataset_df = pd.read_csv(dataset_path)
symptom_severity_df = pd.read_csv(symptom_severity_path)
precaution_df=pd.read_csv(symptom_precaution_path)

In [3]:
# Build symptom vocab from severity file
symptom_vocab = sorted(symptom_severity_df['Symptom'].str.lower().str.strip().unique())
symptom_index = {symptom: idx for idx, symptom in enumerate(symptom_vocab)}

In [4]:
dataset = pd.read_csv('../dataset/dataset.csv')
severity = pd.read_csv('../dataset/Symptom-severity.csv')
precautions = pd.read_csv('../dataset/symptom_precaution.csv')


In [5]:
print(dataset.shape)
print(dataset.isnull().sum())
print(dataset['Disease'].value_counts())


(4920, 18)
Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
Symptom_11    3726
Symptom_12    4176
Symptom_13    4416
Symptom_14    4614
Symptom_15    4680
Symptom_16    4728
Symptom_17    4848
dtype: int64
Disease
Fungal infection                           120
Hepatitis C                                120
Hepatitis E                                120
Alcoholic hepatitis                        120
Tuberculosis                               120
Common Cold                                120
Pneumonia                                  120
Dimorphic hemmorhoids(piles)               120
Heart attack                               120
Varicose veins                             120
Hypothyroidism                             120
Hyperthyroidism                            120
Hypoglycemia                               120
Osteoarthristis

In [6]:
symptom_vocab = sorted(severity['Symptom'].str.lower().str.strip().unique())
symptom_index = {symptom: i for i, symptom in enumerate(symptom_vocab)}

def encode_symptoms(row):
    vec = [0]*len(symptom_vocab)
    for val in row:
        if pd.notna(val):
            val = val.strip().lower()
            if val in symptom_index:
                vec[symptom_index[val]] = 1
    return vec

dataset['symptom_vector'] = dataset[[f'Symptom_{i}' for i in range(1,18)]].apply(encode_symptoms, axis=1)


In [7]:
X = np.array(dataset['symptom_vector'].to_list())
y = dataset['Disease']


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)


In [9]:
model = RandomForestClassifier()
model.fit(X_train, y_train)


In [10]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        24
                                   AIDS       1.00      1.00      1.00        24
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        24
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      1.00      1.00        24
                       Bronchial Asthma       1.00      1.00      1.00        24
                   Cervical spondylosis       1.00      1.00      1.00        24
                            Chicken pox       1.00      1.00      1.00        24
                    Chronic cholestasis       1.00      1.00      1.00        24
                            Common Cold       1.00      1.00      1.00        24
                           

In [11]:
# joblib.dump(model, 'disease_predictor.pkl')

In [12]:
import random

def inject_noise(row, symptom_cols, symptom_vocab, drop_prob=0.2, add_prob=0.2):
    symptoms = [s.lower().strip() for s in row[symptom_cols].dropna().tolist()]
    
    # Drop symptoms randomly
    symptoms = [s for s in symptoms if random.random() > drop_prob]
    
    # Add random unrelated symptoms
    if random.random() < add_prob:
        num_add = random.randint(1, 2)
        new_symptoms = random.sample(list(set(symptom_vocab) - set(symptoms)), num_add)
        symptoms.extend(new_symptoms)
    
    # Pad or trim to 17
    symptoms = symptoms[:17] + [None]*(17 - len(symptoms))
    
    return pd.Series(symptoms)


In [13]:
symptom_cols = [f"Symptom_{i}" for i in range(1, 18)]
symptom_vocab = severity['Symptom'].str.lower().str.strip().unique()

noisy_df = dataset.copy()
noisy_df[symptom_cols] = noisy_df.apply(lambda row: inject_noise(row, symptom_cols, symptom_vocab), axis=1)


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Step 1: Noise Injector
class SymptomNoiseInjector(BaseEstimator, TransformerMixin):
    def __init__(self, symptom_cols, symptom_vocab, drop_prob=0.2, add_prob=0.2):
        self.symptom_cols = symptom_cols
        self.symptom_vocab = symptom_vocab
        self.drop_prob = drop_prob
        self.add_prob = add_prob

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        def inject(row):
            symptoms = [s.lower().strip() for s in row[self.symptom_cols].dropna().tolist()]
            symptoms = [s for s in symptoms if random.random() > self.drop_prob]
            if random.random() < self.add_prob:
                num_add = random.randint(1, 2)
                added = random.sample(list(set(self.symptom_vocab) - set(symptoms)), num_add)
                symptoms.extend(added)
            symptoms = symptoms[:17] + [None]*(17 - len(symptoms))
            return pd.Series(symptoms)

        X[self.symptom_cols] = X.apply(inject, axis=1)
        return X

# Step 2: Encoder
class SymptomEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, vocab):
        self.vocab = vocab
        self.symptom_index = {s: i for i, s in enumerate(vocab)}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        vectors = []
        for _, row in X.iterrows():
            vec = [0]*len(self.vocab)
            for val in row.dropna():
                val = str(val).strip().lower()
                if val in self.symptom_index:
                    vec[self.symptom_index[val]] = 1
            vectors.append(vec)
        return np.array(vectors)


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

symptom_cols = [f'Symptom_{i}' for i in range(1, 18)]
symptom_vocab = severity['Symptom'].str.lower().str.strip().unique()

# Split first to avoid leak
train_df, test_df = train_test_split(dataset, test_size=0.2, stratify=dataset['Disease'], random_state=42)
y_train = train_df['Disease']
y_test = test_df['Disease']

# Build pipeline
pipeline = Pipeline([
    ('noise', SymptomNoiseInjector(symptom_cols, symptom_vocab)),
    ('encode', SymptomEncoder(symptom_vocab)),
    ('clf', RandomForestClassifier())
])

# Train
pipeline.fit(train_df[symptom_cols], y_train)

# Predict + Evaluate
y_pred = pipeline.predict(test_df[symptom_cols])
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[self.symptom_cols] = X.apply(inject, axis=1)


                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        24
                                   AIDS       1.00      1.00      1.00        24
                                   Acne       1.00      0.92      0.96        24
                    Alcoholic hepatitis       1.00      1.00      1.00        24
                                Allergy       1.00      0.96      0.98        24
                              Arthritis       1.00      1.00      1.00        24
                       Bronchial Asthma       1.00      1.00      1.00        24
                   Cervical spondylosis       1.00      1.00      1.00        24
                            Chicken pox       1.00      1.00      1.00        24
                    Chronic cholestasis       1.00      1.00      1.00        24
                            Common Cold       1.00      1.00      1.00        24
                           

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[self.symptom_cols] = X.apply(inject, axis=1)


In [16]:
class SymptomNoiseInjector(BaseEstimator, TransformerMixin):
    def __init__(self, symptom_cols, symptom_vocab, important_symptoms_map=None, swap_prob=0.3, label_flip_prob=0.05):
        self.symptom_cols = symptom_cols
        self.symptom_vocab = list(set(symptom_vocab))
        self.important_symptoms_map = important_symptoms_map
        self.swap_prob = swap_prob
        self.label_flip_prob = label_flip_prob
        self.is_train = True  # Track mode

    def fit(self, X, y=None):
        self.y = y
        self.is_train = True
        return self

    def transform(self, X):
        # Apply label flipping only during training
        if self.is_train and self.y is not None:
            y_vals = self.y.copy()
            for i in range(len(y_vals)):
                if random.random() < self.label_flip_prob:
                    y_vals.iloc[i] = random.choice(y_vals.values)
            self.y = y_vals

        def corrupt(row):
            symptoms = [s.lower().strip() for s in row[self.symptom_cols].dropna().tolist()]
            
            # Remove 2–5 actual symptoms
            if len(symptoms) > 0 and random.random() < 0.9:
                for _ in range(random.randint(2, 5)):
                    if symptoms:
                        symptoms.pop(random.randint(0, len(symptoms) - 1))
            
            # Add 2–6 wrong/random symptoms
            if random.random() < 0.9:
                for _ in range(random.randint(2, 6)):
                    symptoms.append(random.choice(self.symptom_vocab))
            
            # Shuffle order
            random.shuffle(symptoms)
            
            symptoms = symptoms[:17] + [None] * (17 - len(symptoms))
            return pd.Series(symptoms)


        X_copy = X.copy()
        X_copy[self.symptom_cols] = X_copy.apply(corrupt, axis=1)
        self.is_train = False  # Switch to test mode after training
        return X_copy


In [17]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# --- Config ---
symptom_cols = [f'Symptom_{i}' for i in range(1, 18)]
symptom_vocab = severity['Symptom'].str.lower().str.strip().unique()

# --- Data Split ---
train_df, test_df = train_test_split(
    dataset, 
    test_size=0.2, 
    stratify=dataset['Disease'], 
    random_state=42
)
y_train = train_df['Disease']
y_test = test_df['Disease']

# --- Pipeline Definition ---
pipeline = Pipeline([
    ('noise', SymptomNoiseInjector(
        symptom_cols=symptom_cols,
        symptom_vocab=symptom_vocab,
        important_symptoms_map=None,  # Optional for now
        swap_prob=0.9,
        label_flip_prob=0.25
    )),
    ('encode', SymptomEncoder(symptom_vocab)),
    ('clf', RandomForestClassifier(random_state=42))
])

# --- Model Training ---
pipeline.fit(train_df[symptom_cols], y_train)

# --- Evaluation ---
y_pred = pipeline.predict(test_df[symptom_cols])
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       0.38      0.54      0.45        24
                                   AIDS       0.47      0.29      0.36        24
                                   Acne       0.34      0.42      0.38        24
                    Alcoholic hepatitis       0.75      0.88      0.81        24
                                Allergy       0.38      0.38      0.38        24
                              Arthritis       0.65      0.54      0.59        24
                       Bronchial Asthma       0.74      0.71      0.72        24
                   Cervical spondylosis       0.62      0.62      0.62        24
                            Chicken pox       1.00      1.00      1.00        24
                    Chronic cholestasis       0.56      0.62      0.59        24
                            Common Cold       1.00      1.00      1.00        24
                           