In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
print('CUDA:',torch.version.cuda)

CUDA: 12.1


In [3]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (accuracy_score,
                            confusion_matrix,
                            classification_report)

In [4]:
df = pd.read_csv('/content/drive/MyDrive/contents/Symptoms/dataset.csv')

df.head(5)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [5]:
df1 = pd.read_csv('/content/drive/MyDrive/contents/Symptoms/Symptom-severity.csv')

df1.head(10)

Unnamed: 0,Symptom,weight
0,itching,1
1,skin_rash,3
2,nodal_skin_eruptions,4
3,continuous_sneezing,4
4,shivering,5
5,chills,3
6,joint_pain,3
7,stomach_pain,5
8,acidity,3
9,ulcers_on_tongue,4


In [6]:
df.describe()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
count,4920,4920,4920,4920,4572,3714,2934,2268,1944,1692,1512,1194,744,504,306,240,192,72
unique,41,34,48,54,50,38,32,26,21,22,21,18,11,8,4,3,3,1
top,Fungal infection,vomiting,vomiting,fatigue,high_fever,headache,nausea,abdominal_pain,abdominal_pain,yellowing_of_eyes,yellowing_of_eyes,irritability,malaise,muscle_pain,chest_pain,chest_pain,blood_in_sputum,muscle_pain
freq,120,822,870,726,378,348,390,264,276,228,198,120,126,72,96,144,72,72


In [8]:
print(f"Length of dataset: {len(df)}")

print(f"\nNA values in dataset: \n{df.isna().sum()}")


Length of dataset: 4920

NA values in dataset: 
Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
Symptom_11    3726
Symptom_12    4176
Symptom_13    4416
Symptom_14    4614
Symptom_15    4680
Symptom_16    4728
Symptom_17    4848
dtype: int64


In [9]:
cols = df.columns
data = df[cols].values.flatten()
print(f"Shape: {data.shape}")
data

Shape: (88560,)


array(['Fungal infection', 'itching', ' skin_rash', ..., nan, nan, nan],
      dtype=object)

In [10]:
series_data = pd.Series(data)
series_data = series_data.str.strip()
series_data = series_data.values.reshape(df.shape)

df = pd.DataFrame(series_data, columns=df.columns)

df = df.fillna(0)
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,itching,skin_rash,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
vals = df.values
symptoms = df1['Symptom'].unique()

for i in range(len(symptoms)):
    vals[vals == symptoms[i]] = df1[df1['Symptom'] == symptoms[i]]['weight'].values[0]

d = pd.DataFrame(vals, columns=cols)

d = d.replace('dischromic _patches', 0)
d = d.replace('spotting_ urination',0)
df = d.replace('foul_smell_of urine',0)
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
(df[cols] == 0).all()

df['Disease'].value_counts()

df['Disease'].unique()

data = df.iloc[:,1:].values
labels = df['Disease'].values

In [13]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, shuffle=True, test_size = 0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(3936, 17) (984, 17) (3936,) (984,)


In [14]:
def confusion_plot(model, X_test, y_test):
    plt.figure(figsize=(8, 8), dpi=150)

    y_pred = model.predict(X_test)

    conf_mat = confusion_matrix(y_test, y_pred)
    df_cm = pd.DataFrame(conf_mat, index=df['Disease'].unique(), columns=df['Disease'].unique())

    sns.heatmap(df_cm, annot=True)

In [15]:
def create_report(model, X_test, y_test):
    y_pred = model.predict(X_test)

    report = classification_report(y_test, y_pred)

    acc = accuracy_score(y_test, y_pred)

    print(f"Accuracy : {acc*100:.4f} %\n")
    print("Classification report: \n")
    print(report)

In [16]:
lr_model = LogisticRegression(solver='saga', max_iter=2500)
lr_model.fit(X_train, y_train)



In [17]:
create_report(lr_model, X_test, y_test)


Accuracy : 91.5650 %

Classification report: 

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       0.89      0.77      0.83        22
                                   AIDS       0.83      0.87      0.85        23
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      0.86      0.92        28
                                Allergy       0.78      0.85      0.82        34
                              Arthritis       0.81      0.94      0.87        18
                       Bronchial Asthma       0.71      0.67      0.69        30
                   Cervical spondylosis       0.52      0.59      0.55        22
                            Chicken pox       1.00      1.00      1.00        30
                    Chronic cholestasis       0.91      1.00      0.95        29
                            Common Cold       1.00      1.00 

In [18]:
svc_model = SVC()
svc_model.fit(X_train, y_train)

In [19]:
create_report(svc_model, X_test, y_test)

Accuracy : 93.6992 %

Classification report: 

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       0.85      0.77      0.81        22
                                   AIDS       0.96      1.00      0.98        23
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      0.93      0.96        28
                                Allergy       0.91      0.85      0.88        34
                              Arthritis       1.00      0.94      0.97        18
                       Bronchial Asthma       0.86      0.60      0.71        30
                   Cervical spondylosis       0.72      0.82      0.77        22
                            Chicken pox       1.00      1.00      1.00        30
                    Chronic cholestasis       0.83      1.00      0.91        29
                            Common Cold       1.00      1.00 

In [20]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

In [21]:
create_report(dt_model, X_test, y_test)

Accuracy : 99.5935 %

Classification report: 

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        22
                                   AIDS       1.00      1.00      1.00        23
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        28
                                Allergy       0.94      1.00      0.97        34
                              Arthritis       1.00      1.00      1.00        18
                       Bronchial Asthma       1.00      1.00      1.00        30
                   Cervical spondylosis       0.96      1.00      0.98        22
                            Chicken pox       1.00      1.00      1.00        30
                    Chronic cholestasis       1.00      1.00      1.00        29
                            Common Cold       1.00      1.00 

In [22]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

In [23]:
create_report(rf_model, X_test, y_test)

Accuracy : 99.5935 %

Classification report: 

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        22
                                   AIDS       1.00      1.00      1.00        23
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        28
                                Allergy       0.94      1.00      0.97        34
                              Arthritis       1.00      1.00      1.00        18
                       Bronchial Asthma       1.00      1.00      1.00        30
                   Cervical spondylosis       0.96      1.00      0.98        22
                            Chicken pox       1.00      1.00      1.00        30
                    Chronic cholestasis       1.00      1.00      1.00        29
                            Common Cold       1.00      1.00 

In [24]:
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

In [25]:
create_report(knn_model, X_test, y_test)

Accuracy : 99.5935 %

Classification report: 

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        22
                                   AIDS       1.00      1.00      1.00        23
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        28
                                Allergy       0.94      1.00      0.97        34
                              Arthritis       1.00      1.00      1.00        18
                       Bronchial Asthma       1.00      1.00      1.00        30
                   Cervical spondylosis       1.00      0.95      0.98        22
                            Chicken pox       1.00      1.00      1.00        30
                    Chronic cholestasis       1.00      1.00      1.00        29
                            Common Cold       1.00      1.00 

In [30]:
import joblib
joblib.dump(rf_model, 'Symptoms.pkl')


['Symptoms.pkl']