In [1]:
import os

import mlflow
import numpy as np
import pandas as pd

In [2]:
# Point MLflow at the tracking server *before* selecting the experiment:
# set_experiment() looks up (and, if missing, creates) the experiment in
# whichever tracking store is active at the moment it is called.
mlflow.set_tracking_uri('http://localhost:5000')
mlflow.set_experiment('medicine-recommendation')


2024/10/21 21:58:04 INFO mlflow.tracking.fluent: Experiment with name 'medicine-recommendation' does not exist. Creating a new experiment.


In [4]:
# Directory holding the CSV inputs, relative to the notebook's working directory.
# The trailing slash matters: file names are appended to it directly.
dataset_dir = './dataset/'

# Training table: one column per symptom plus the `prognosis` label column.
data = pd.read_csv(f'{dataset_dir}Training.csv')
# Bare last expression so the notebook renders the frame as the cell output.
data

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,(vertigo) Paroymsal Positional Vertigo
4916,0,1,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,Acne
4917,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Urinary tract infection
4918,0,1,0,0,0,0,1,0,0,0,...,0,0,1,1,1,1,0,0,0,Psoriasis


In [5]:
# Count missing values per column (isna is the canonical name for isnull).
data.isna().sum()

itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
prognosis               0
Length: 133, dtype: int64

In [6]:
# Inspect the dtype of every column to confirm which ones are numeric and which hold text.
data.dtypes

itching                  int64
skin_rash                int64
nodal_skin_eruptions     int64
continuous_sneezing      int64
shivering                int64
                         ...  
inflammatory_nails       int64
blister                  int64
red_sore_around_nose     int64
yellow_crust_ooze        int64
prognosis               object
Length: 133, dtype: object

In [7]:
# List the distinct prognosis labels.
# NOTE(review): the recorded output shows labels with trailing spaces
# ('Diabetes ', 'Hypertension ') and a double space in
# '(vertigo) Paroymsal  Positional Vertigo'; any exact-match lookup by disease
# name has to use these spellings verbatim.
data['prognosis'].unique()

array(['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis',
       'Drug Reaction', 'Peptic ulcer diseae', 'AIDS', 'Diabetes ',
       'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine',
       'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice',
       'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'hepatitis A',
       'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
       'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia',
       'Dimorphic hemmorhoids(piles)', 'Heart attack', 'Varicose veins',
       'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia',
       'Osteoarthristis', 'Arthritis',
       '(vertigo) Paroymsal  Positional Vertigo', 'Acne',
       'Urinary tract infection', 'Psoriasis', 'Impetigo'], dtype=object)

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import joblib

In [61]:
# Separate the label column from the remaining (feature) columns.
y = data['prognosis']
X = data.drop(columns = 'prognosis')

In [62]:
# Encode the string prognosis labels as integers. The encoder is fitted on the
# original column rather than on `y`, so re-running this cell never re-fits it
# on already-encoded integers (which would make le.inverse_transform() return
# numbers instead of disease names).
le = LabelEncoder()
y = le.fit_transform(data['prognosis'])

# Persist the fitted encoder so predictions can be mapped back to label names.
# joblib.dump() does not create missing directories, so make sure it exists.
os.makedirs('artifacts', exist_ok = True)
joblib.dump(le, 'artifacts/label_encoder.pkl')

['artifacts/label_encoder.pkl']

In [11]:
# Hold-out configuration: fraction of rows kept for testing and the shuffle seed.
test_size = 0.2
random_state = 42

# Shuffled split that keeps the class proportions of `y` in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = test_size,
    random_state = random_state,
    shuffle = True,
    stratify = y,
)

In [63]:
def train_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and report its hold-out performance.

    Prints the accuracy, the weighted F1 score, the classification report and
    the confusion matrix, all computed on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Hold-out features.
        y_test: Hold-out labels.

    Returns:
        tuple: ``(accuracy, f1)`` measured on the hold-out split.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    f1 = f1_score(y_test, predicted, average = 'weighted')
    print(f'Accuracy: {accuracy}')
    print(f'F1 Score: {f1}')

    # Detailed per-class diagnostics, printed in the same order as before.
    for build_report in (classification_report, confusion_matrix):
        print(build_report(y_test, predicted))

    return accuracy, f1
    

In [64]:
# Candidate models for the grid search. Each entry provides:
#   'model'  - an unfitted estimator instance,
#   'name'   - label used for the MLflow run and the saved artifact file,
#   'params' - hyper-parameter grid handed to GridSearchCV.
list_of_models = [
    {
        'model': DecisionTreeClassifier(),
        'name': 'DecisionTreeClassifier',
        'params': {
            'max_depth': [None, 5, 10, 15, 20, 25, 30, 35, 40],
            # NOTE(review): this searches over the seed as if it were a
            # hyper-parameter; kept as-is so results stay comparable with
            # earlier runs.
            'random_state': [42, 68]
        }
    },
    {
        'model': SVC(),
        'name': 'SVC',
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
            # No 'random_state' entry: SVC only uses it for probability
            # estimates and ignores it while probability=False (the default),
            # so searching over it just repeated every fit with identical
            # results.
        }
    }
]

In [65]:
# Grid-search every candidate model, evaluate the refit best estimator on the
# hold-out split and record the outcome in its own MLflow run.

# joblib.dump() does not create missing directories.
os.makedirs('artifacts', exist_ok = True)

# `model_spec` (not `model`) so the loop does not leave a config dict behind
# under the name later used for the loaded estimator.
for model_spec in list_of_models:
    model_name = model_spec['name']
    with mlflow.start_run(run_name = model_name, nested = True):

        mlflow.log_param('model', model_name)
        grid = GridSearchCV(model_spec['model'], model_spec['params'], refit = True, verbose = 3, n_jobs = -1)
        # Search configuration (estimator defaults, param_grid, cv, ...).
        mlflow.log_params(grid.get_params())

        accuracy, f1 = train_model(grid, X_train, y_train, X_test, y_test)
        mlflow.log_metric('accuracy', accuracy)
        mlflow.log_metric('f1', f1)

        # The search configuration alone does not say which combination won:
        # also record the selected hyper-parameters and their mean CV score.
        # The 'best_' prefix keeps these keys distinct from the ones logged above.
        mlflow.log_params({f'best_{key}': value for key, value in grid.best_params_.items()})
        mlflow.log_metric('best_cv_score', grid.best_score_)

        # Keep a local pickle and an MLflow-logged copy of the fitted search.
        joblib.dump(grid, f'artifacts/{model_name}.pkl')
        mlflow.sklearn.log_model(grid, model_name)

        print(f'{model_name} done')

print('All models trained and saved')

Fitting 5 folds for each of 18 candidates, totalling 90 fits


2024/10/21 22:29:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run DecisionTreeClassifier at: http://localhost:5000/#/experiments/771296367808407745/runs/075e901096cf4f8bb4802099cc693350.
2024/10/21 22:29:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/771296367808407745.


Accuracy: 1.0
F1 Score: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        24
           1       1.00      1.00      1.00        24
           2       1.00      1.00      1.00        24
           3       1.00      1.00      1.00        24
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        24
           6       1.00      1.00      1.00        24
           7       1.00      1.00      1.00        24
           8       1.00      1.00      1.00        24
           9       1.00      1.00      1.00        24
          10       1.00      1.00      1.00        24
          11       1.00      1.00      1.00        24
          12       1.00      1.00      1.00        24
          13       1.00      1.00      1.00        24
          14       1.00      1.00      1.00        24
          15       1.00      1.00      1.00        24
          16       1.00      1.00      1.00        24

2024/10/21 22:30:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run SVC at: http://localhost:5000/#/experiments/771296367808407745/runs/606a53ab03294dd9b9de497e577ce583.
2024/10/21 22:30:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/771296367808407745.


Accuracy: 1.0
F1 Score: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        24
           1       1.00      1.00      1.00        24
           2       1.00      1.00      1.00        24
           3       1.00      1.00      1.00        24
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        24
           6       1.00      1.00      1.00        24
           7       1.00      1.00      1.00        24
           8       1.00      1.00      1.00        24
           9       1.00      1.00      1.00        24
          10       1.00      1.00      1.00        24
          11       1.00      1.00      1.00        24
          12       1.00      1.00      1.00        24
          13       1.00      1.00      1.00        24
          14       1.00      1.00      1.00        24
          15       1.00      1.00      1.00        24
          16       1.00      1.00      1.00        24

In [67]:

# Save the feature column names, one per line, in the order used for training.
model_inputs = X.columns
with open('artifacts/model_inputs.txt', 'w') as f:
    f.writelines("%s\n" % item for item in model_inputs)


In [68]:
# Lookup tables read with their first CSV column used as the row index.
sym_des, precautions, workout = (
    pd.read_csv(csv_path, index_col = 0)
    for csv_path in (
        f"{dataset_dir}symtoms_df.csv",
        f"{dataset_dir}precautions_df.csv",
        f"{dataset_dir}workout_df.csv",
    )
)

# Lookup tables read with the default integer index.
description, medications, diets = (
    pd.read_csv(csv_path)
    for csv_path in (
        f"{dataset_dir}description.csv",
        f"{dataset_dir}medications.csv",
        f"{dataset_dir}diets.csv",
    )
)

In [69]:
def get_description(disease):
    """Return the ``Description`` values of the rows in ``description`` whose
    ``Disease`` equals ``disease`` exactly (empty when nothing matches)."""
    is_match = description['Disease'] == disease
    return description.loc[is_match, 'Description'].values

def get_precautions(disease):
    """Return a 2-D array with the four ``Precaution_*`` columns of every row
    in ``precautions`` whose ``Disease`` equals ``disease`` exactly."""
    precaution_columns = ['Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4']
    is_match = precautions['Disease'] == disease
    return precautions.loc[is_match, precaution_columns].values

def get_workout(disease):
    """Return the ``workout`` values of the rows in ``workout`` whose
    ``disease`` (lower-case column name) equals ``disease`` exactly."""
    is_match = workout['disease'] == disease
    return workout.loc[is_match, 'workout'].values

def get_medications(disease):
    """Return the ``Medication`` values of the rows in ``medications`` whose
    ``Disease`` equals ``disease`` exactly (empty when nothing matches)."""
    is_match = medications['Disease'] == disease
    return medications.loc[is_match, 'Medication'].values

def get_diets(disease):
    """Return the ``Diet`` values of the rows in ``diets`` whose ``Disease``
    equals ``disease`` exactly (empty when nothing matches)."""
    is_match = diets['Disease'] == disease
    return diets.loc[is_match, 'Diet'].values


In [70]:

def predict_disease(symptoms, model, le):
    """Predict the disease label for an encoded symptom input.

    Args:
        symptoms: Feature input passed unchanged to ``model.predict``.
        model: Fitted estimator exposing ``predict``.
        le: Fitted label encoder exposing ``inverse_transform``.

    Returns:
        The decoded label of the first prediction.
    """
    encoded_labels = model.predict(symptoms)
    decoded_labels = le.inverse_transform(encoded_labels)
    first_label = decoded_labels[0]
    return first_label
    
def get_recommendations(disease):
    """Collect every lookup result for ``disease`` into a single dict.

    Returns:
        dict: Keys ``'description'``, ``'precautions'``, ``'workout'``,
        ``'medications'`` and ``'diets'`` (in that order), each mapped to the
        value returned by the corresponding ``get_*`` helper.
    """
    lookups = (
        ('description', get_description),
        ('precautions', get_precautions),
        ('workout', get_workout),
        ('medications', get_medications),
        ('diets', get_diets),
    )
    return {section: lookup(disease) for section, lookup in lookups}
    
def get_symptoms(symptoms, model_inputs):
    """Encode a comma-separated symptom string as a one-row feature frame.

    Args:
        symptoms: Symptom names separated by commas, e.g.
            ``"itching, skin_rash"``. Whitespace around each name and letter
            case are ignored when matching.
        model_inputs: Ordered feature names the model expects.

    Returns:
        pandas.DataFrame: A single row with one column per entry of
        ``model_inputs``, holding 1 where that symptom was supplied and 0
        otherwise. Supplied names that match no feature are ignored.
    """
    def _normalize(name):
        # Users naturally type "a, b" or "Itching"; compare on a canonical form
        # so those are not silently dropped.
        return str(name).strip().lower()

    # A set gives constant-time membership checks for every feature column.
    selected = {_normalize(token) for token in symptoms.split(',')}
    row = [1 if _normalize(feature) in selected else 0 for feature in model_inputs]
    return pd.DataFrame([row], columns = model_inputs)
    

In [71]:
# Disabled alternative: load a model logged to MLflow by run id instead of the
# local pickle (the stray leading space in the 'runs:/' URI has been removed).
# run_id = input('Enter the run_id of the model you want to use: ')
# model_name = input('Enter the model name you want to use: ')
# model = mlflow.sklearn.load_model('runs:/' + run_id + '/' + model_name)
# Load the fitted SVC grid search that the training cell saved with joblib.
# NOTE(review): joblib.load unpickles the file, so only load artifacts that
# were produced by a trusted run of this notebook.
model = joblib.load('artifacts/SVC.pkl')

In [72]:
# Interactive demo: read a comma-separated list of symptom names, encode it as
# a one-row feature frame, predict the disease and look up its recommendations.
# NOTE(review): input() waits for the user, so this cell blocks an unattended
# "Run All".
symptoms = input('Enter the symptoms separated by commas: ')
# Rebinds `symptoms` from the raw string to the encoded DataFrame.
symptoms = get_symptoms(symptoms, model_inputs)
disease = predict_disease(symptoms, model, le)
recommendations = get_recommendations(disease)
print(f'The disease is: {disease}')
# Prints the dict of arrays returned by the lookup helpers as-is.
print(recommendations)

The disease is: Fungal infection
{'description': array(['Fungal infection is a common skin condition caused by fungi.'],
      dtype=object), 'precautions': array([['bath twice', 'use detol or neem in bathing water',
        'keep infected area dry', 'use clean cloths']], dtype=object), 'workout': array(['Avoid sugary foods', 'Consume probiotics',
       'Increase intake of garlic', 'Include yogurt in diet',
       'Limit processed foods', 'Stay hydrated', 'Consume green tea',
       'Eat foods rich in zinc', 'Include turmeric in diet',
       'Eat fruits and vegetables'], dtype=object), 'medications': array(["['Antifungal Cream', 'Fluconazole', 'Terbinafine', 'Clotrimazole', 'Ketoconazole']"],
      dtype=object), 'diets': array(["['Antifungal Diet', 'Probiotics', 'Garlic', 'Coconut oil', 'Turmeric']"],
      dtype=object)}
