In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import _tree

In [34]:
# Load the datasets
df = pd.read_csv('/content/Training.csv')
test_df = pd.read_csv('/content/Testing.csv')

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Columns: 134 entries, itching to Unnamed: 133
dtypes: float64(1), int64(132), object(1)
memory usage: 5.0+ MB


In [36]:
df.isnull().sum()

itching                    0
skin_rash                  0
nodal_skin_eruptions       0
continuous_sneezing        0
shivering                  0
                        ... 
blister                    0
red_sore_around_nose       0
yellow_crust_ooze          0
prognosis                  0
Unnamed: 133            4920
Length: 134, dtype: int64

In [37]:
df.drop(columns= "Unnamed: 133", inplace = True)

In [38]:
df.isnull().sum()

itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
prognosis               0
Length: 133, dtype: int64

In [39]:
# Encode categorical features and labels
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)

In [40]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [56]:
model = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=0)

In [57]:
# Train and evaluate models

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")

Accuracy: 0.95
Precision: 0.98
Recall: 0.95
F1-score: 0.96


In [58]:
# Save the columns
cols = df.columns[:-1]

In [79]:
def execute_bot():
    print("Please reply with yes/Yes or no/No for the following symptoms")

    def print_disease(node):
        node = node[0]
        val = node.nonzero()
        disease = labelencoder.inverse_transform(val[0])
        return disease

    def tree_to_code(tree, feature_names):
        tree_ = tree.tree_
        feature_name = [
            feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
            for i in tree_.feature
        ]
        symptoms_present = []

        def recurse(node, depth):
            if tree_.feature[node] != _tree.TREE_UNDEFINED:
                name = feature_name[node]
                threshold = tree_.threshold[node]
                print(name + " ?")
                ans = input().lower()
                if ans == 'yes':
                    val = 1
                else:
                    val = 0
                if val <= threshold:
                    recurse(tree_.children_left[node], depth + 1)
                else:
                    symptoms_present.append(name)
                    recurse(tree_.children_right[node], depth + 1)
            else:
                present_disease = print_disease(tree_.value[node])
                print("You may have " + present_disease[0])
                print("Symptoms present: " + str(symptoms_present))
                red_cols = df.columns
                coded_disease = labelencoder.transform([present_disease[0]])
                coded_disease = coded_disease[0]
                # print(coded_disease)
                symptoms_given = red_cols[df.iloc[coded_disease].values.nonzero()]
                print("Symptoms given: " + str(list(symptoms_given)))
                confidence_level = (1.0 * len(symptoms_present)) / len(symptoms_given)
                print("Confidence level: " + str(confidence_level))

        recurse(0, 1)

    tree_to_code(model, cols)

In [80]:
# Execute the bot
execute_bot()

Please reply with yes/Yes or no/No for the following symptoms
fatigue ?
Yes
loss_of_appetite ?
yes
malaise ?
Yes
yellowing_of_eyes ?
Yes
blood_in_sputum ?
no
You may have Hepatitis B
Symptoms present: ['fatigue', 'loss_of_appetite', 'malaise', 'yellowing_of_eyes']
19
Symptoms given: ['continuous_sneezing', 'shivering', 'chills', 'watering_from_eyes', 'prognosis']
Confidence level: 0.8
