# Neural network disease prediction model

# Importer les bibliothèques nécessaires

In [102]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Charger les datasets

In [103]:
dataset_path = './models/dataset.csv'
severity_path = './models/Symptom-severity.csv'

df = pd.read_csv(dataset_path)
severity_df = pd.read_csv(severity_path)
df

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,,,,,,,,,,,
4916,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,,,,,,,,,,,,,
4917,Urinary tract infection,burning_micturition,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,,,,,,,,,,,,,
4918,Psoriasis,skin_rash,joint_pain,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,,,,,,,,,,,


In [104]:
unique_diseases = df['Disease'].nunique()

print(f"There are {unique_diseases} unique diseases in the dataset.")

There are 41 unique diseases in the dataset.


# Prétraitement des données

## Nettoyer les données

In [105]:
import re

def clean_symptom(symptom):
    # Remove leading/trailing whitespace, convert to lowercase, replace spaces with underscores
    symptom = symptom.strip().lower().replace(" ", "_")
    # Replace multiple underscores with a single underscore
    symptom = re.sub(r"_+", "_", symptom)
    return symptom

for col in df.columns[1:]:  # Skip the first column if it's 'Disease' or similar
    df[col] = df[col].fillna('').apply(lambda x: clean_symptom(str(x)) if x else '')


# Extract all unique symptoms from the dataset
# Assuming symptom-related columns start from the second column
unique_symptoms = set()

for col in df.columns[1:]:  # Skip the first column if it's 'Disease' or similar
    unique_symptoms.update(df[col].unique())

# Remove any empty strings if present
unique_symptoms = {symptom for symptom in unique_symptoms if symptom}

# Convert to a sorted list for better readability
unique_symptoms_list = sorted(unique_symptoms)

print(f"Unique Symptoms: {unique_symptoms_list}")

symptoms = unique_symptoms
print(len(symptoms))



Unique Symptoms: ['abdominal_pain', 'abnormal_menstruation', 'acidity', 'acute_liver_failure', 'altered_sensorium', 'anxiety', 'back_pain', 'belly_pain', 'blackheads', 'bladder_discomfort', 'blister', 'blood_in_sputum', 'bloody_stool', 'blurred_and_distorted_vision', 'breathlessness', 'brittle_nails', 'bruising', 'burning_micturition', 'chest_pain', 'chills', 'cold_hands_and_feets', 'coma', 'congestion', 'constipation', 'continuous_feel_of_urine', 'continuous_sneezing', 'cough', 'cramps', 'dark_urine', 'dehydration', 'depression', 'diarrhoea', 'dischromic_patches', 'distention_of_abdomen', 'dizziness', 'drying_and_tingling_lips', 'enlarged_thyroid', 'excessive_hunger', 'extra_marital_contacts', 'family_history', 'fast_heart_rate', 'fatigue', 'fluid_overload', 'foul_smell_of_urine', 'headache', 'high_fever', 'hip_joint_pain', 'history_of_alcohol_consumption', 'increased_appetite', 'indigestion', 'inflammatory_nails', 'internal_itching', 'irregular_sugar_level', 'irritability', 'irritati

Toutes les symptomes correspondent bien entre les deux databases

## Création d'une matrice one hot encoding

In [106]:
# Create a binary matrix with columns for each unique symptom and a 'Disease' column
binary_matrix = pd.DataFrame(0, index=df.index, columns=['Disease'] + list(symptoms))

# Copy diseases into the new matrix
binary_matrix['Disease'] = df['Disease']

# Fill the matrix with 1s where symptoms are present
for index, row in df.iterrows():
    symptoms_present = row[1:].dropna().values  # Get non-NaN symptoms
    symptoms_present = [symptom for symptom in symptoms_present if symptom in binary_matrix.columns]
    binary_matrix.loc[index, symptoms_present] = 1

df = binary_matrix
df.head()

Unnamed: 0,Disease,mucoid_sputum,pain_during_bowel_movements,anxiety,pain_behind_the_eyes,brittle_nails,bladder_discomfort,unsteadiness,increased_appetite,polyuria,...,foul_smell_of_urine,runny_nose,knee_pain,headache,weight_loss,loss_of_smell,spotting_urination,blackheads,skin_peeling,spinning_movements
0,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Vérification

Afficher les symptômes présents pour une maladie

In [107]:
present_symptoms = df.iloc[0][df.iloc[0] == 1].index.tolist()
print(f"Symptômes présents pour la première ligne : {present_symptoms}")


Symptômes présents pour la première ligne : ['itching', 'nodal_skin_eruptions', 'skin_rash', 'dischromic_patches']


## Analyse du dataset

### Suppressions des doublons

In [108]:
df.duplicated().head(20)

0     False
1     False
2     False
3     False
4     False
5      True
6      True
7      True
8      True
9      True
10    False
11    False
12    False
13    False
14    False
15     True
16     True
17     True
18     True
19     True
dtype: bool

In [109]:
duplicates = df.duplicated().sum()
print(f"Nombre de lignes dupliquées : {duplicates}")
    
df = df.drop_duplicates()
print(f"Nombre de lignes après suppression des doublons : {len(df)}")

Nombre de lignes dupliquées : 4616
Nombre de lignes après suppression des doublons : 304


# Réseau de neurones

## Préparer les données

In [110]:
X = df.iloc[:, 1:].values  # Symptômes (exclut la colonne Disease)
y = df['Disease'].values  # Maladies
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Encoder les maladies en labels numériques

In [111]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

## Diviser les données en ensembles d'entraînement et de test

In [112]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


## Construire le modèle de réseau de neurones

In [113]:
model = Sequential([
    Dense(128, input_dim=X.shape[1], activation='relu'),  # Couche cachée avec 128 neurones
    Dropout(0.2),  # Dropout pour éviter le sur-apprentissage
    Dense(64, activation='relu'),  # Deuxième couche cachée
    Dense(len(np.unique(y_encoded)), activation='softmax')  # Couche de sortie avec une classe par maladie
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


## Compiler le modèle

In [114]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


## Entraîner le modèle

In [115]:
predicted_labels = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0275 - loss: 3.7256 - val_accuracy: 0.0328 - val_loss: 3.6337
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1056 - loss: 3.5652 - val_accuracy: 0.1639 - val_loss: 3.4955
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3382 - loss: 3.4235 - val_accuracy: 0.3770 - val_loss: 3.3401
Epoch 4/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4614 - loss: 3.2569 - val_accuracy: 0.4918 - val_loss: 3.1523
Epoch 5/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6348 - loss: 3.0266 - val_accuracy: 0.6066 - val_loss: 2.9238
Epoch 6/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6318 - loss: 2.7260 - val_accuracy: 0.6066 - val_loss: 2.6558
Epoch 7/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0

# Analyse de la performance

## Accuracy

In [116]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Précision sur les données de test : {test_accuracy * 100:.2f}%")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0261  
Précision sur les données de test : 100.00%


## Matrice de confusion

permet de visualiser les erreurs de classification par classe

In [117]:
# from sklearn.metrics import confusion_matrix
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Créer la matrice de confusion
# conf_matrix = confusion_matrix(y_test, predicted_labels)

# # Afficher la matrice sous forme de heatmap
# plt.figure(figsize=(10, 8))
# sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
# plt.xlabel('Prédictions')
# plt.ylabel('Vérités')
# plt.title('Matrice de confusion')
# plt.show()

## Visualiser les métriques pendant l'entraînement

In [118]:

# plt.plot(history.history['accuracy'], label='Précision entraînement')
# plt.plot(history.history['val_accuracy'], label='Précision validation')
# plt.xlabel('Époques')
# plt.ylabel('Précision')
# plt.legend()
# plt.title('Courbe de précision')
# plt.show()

# plt.plot(history.history['loss'], label='Perte entraînement')
# plt.plot(history.history['val_loss'], label='Perte validation')
# plt.xlabel('Époques')
# plt.ylabel('Perte')
# plt.legend()
# plt.title('Courbe de perte')
# plt.show()

## Validation croisée

In [119]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Configuration de la validation croisée
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)

# Évaluation avec la validation croisée
from sklearn.metrics import make_scorer, accuracy_score

accuracy_scores = []
for train_index, test_index in kfold.split(X, y_encoded):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    # Entraîner un modèle pour ce fold
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)  # Ajustez les paramètres ici
    scores = model.evaluate(X_test, y_test, verbose=0)
    accuracy_scores.append(scores[1])  # Ajouter l'accuracy
    
print(f"Précision moyenne sur les 5 folds : {np.mean(accuracy_scores):.2f}")


Précision moyenne sur les 5 folds : 1.00


## Vérifier la certitude des prédictions 

In [133]:
def predict_disease_with_certainty(symptoms_list, df, model, label_encoder):
    """
    Prédire la maladie à partir d'une liste de symptômes et retourner la certitude associée.

    :param symptoms_list: Liste de symptômes donnés (strings).
    :param df: DataFrame contenant les colonnes des symptômes (tous les symptômes possibles).
    :param model: Modèle entraîné.
    :param label_encoder: Encodeur pour décoder la maladie prédite.
    :return: Nom de la maladie prédite et certitude associée.
    """
    # Obtenir les colonnes des symptômes
    symptom_columns = df.columns[1:]  # Exclure la colonne 'Disease'
    
    # Encoder les symptômes donnés en un vecteur binaire
    encoded_vector = np.zeros(len(symptom_columns))
    for symptom in symptoms_list:
        if symptom in symptom_columns:
            encoded_vector[symptom_columns.get_loc(symptom)] = 1

    print(encoded_vector)
    # Faire la prédiction
    probabilities = model.predict(encoded_vector.reshape(1, -1))
    predicted_label = np.argmax(probabilities)
    predicted_disease = label_encoder.inverse_transform([predicted_label])[0]
    
    # Certitude (probabilité associée à la classe prédite)
    certainty = probabilities[0][predicted_label]

    return predicted_disease, certainty


In [121]:
# Symptômes donnés pour le test
example_symptoms = ["skin_rash", "itching", "nodal_skin_eruptions","dischromic_patches"]

# Prédiction avec certitude
predicted_disease, certainty = predict_disease_with_certainty(example_symptoms, df, model, label_encoder)

# Affichage des résultats
print("Symptômes donnés :")
for symptom in example_symptoms:
    print(f"- {symptom}")
print(f"\nMaladie prédite : {predicted_disease}")
print(f"Certitude du modèle : {certainty * 100:.2f}%")

# Symptômes donnés pour le test
example_symptoms = ["skin_rash", "itching", "nodal_skin_eruptions"]

# Prédiction avec certitude
predicted_disease, certainty = predict_disease_with_certainty(example_symptoms, df, model, label_encoder)

# Affichage des résultats
print("Symptômes donnés :")
for symptom in example_symptoms:
    print(f"- {symptom}")
print(f"\nMaladie prédite : {predicted_disease}")
print(f"Certitude du modèle : {certainty * 100:.2f}%")

# Symptômes donnés pour le test
example_symptoms = ["skin_rash", "itching"]

# Prédiction avec certitude
predicted_disease, certainty = predict_disease_with_certainty(example_symptoms, df, model, label_encoder)

# Affichage des résultats
print("Symptômes donnés :")
for symptom in example_symptoms:
    print(f"- {symptom}")
print(f"\nMaladie prédite : {predicted_disease}")
print(f"Certitude du modèle : {certainty * 100:.2f}%")

# Symptômes donnés pour le test
example_symptoms = ["skin_rash"]

# Prédiction avec certitude
predicted_disease, certainty = predict_disease_with_certainty(example_symptoms, df, model, label_encoder)

# Affichage des résultats
print("Symptômes donnés :")
for symptom in example_symptoms:
    print(f"- {symptom}")
print(f"\nMaladie prédite : {predicted_disease}")
print(f"Certitude du modèle : {certainty * 100:.2f}%")

# Symptômes donnés pour le test
example_symptoms = ["itching"]

# Prédiction avec certitude
predicted_disease, certainty = predict_disease_with_certainty(example_symptoms, df, model, label_encoder)

# Affichage des résultats
print("Symptômes donnés :")
for symptom in example_symptoms:
    print(f"- {symptom}")
print(f"\nMaladie prédite : {predicted_disease}")
print(f"Certitude du modèle : {certainty * 100:.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Symptômes donnés :
- skin_rash
- itching
- nodal_skin_eruptions
- dischromic_patches

Maladie prédite : Fungal infection
Certitude du modèle : 99.97%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
Symptômes donnés :
- skin_rash
- itching
- nodal_skin_eruptions

Maladie prédite : Fungal infection
Certitude du modèle : 99.22%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
Symptômes donnés :
- skin_rash
- itching

Maladie prédite : Fungal infection
Certitude du modèle : 49.40%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
Symptômes donnés :
- skin_rash

Maladie prédite : Acne
Certitude du modèle : 18.91%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
Symptômes donnés :
- itching

Maladie prédite : Fungal infection
Certitude du modèle : 29.89%


# Test

In [136]:
symptom_columns = df.columns[1:]
print(symptom_columns)
print(symptom_columns[4])

import pickle

with open('symptom_columns.pkl', 'wb') as f:
    pickle.dump(symptom_columns, f)

Index(['mucoid_sputum', 'pain_during_bowel_movements', 'anxiety',
       'pain_behind_the_eyes', 'brittle_nails', 'bladder_discomfort',
       'unsteadiness', 'increased_appetite', 'polyuria', 'hip_joint_pain',
       ...
       'foul_smell_of_urine', 'runny_nose', 'knee_pain', 'headache',
       'weight_loss', 'loss_of_smell', 'spotting_urination', 'blackheads',
       'skin_peeling', 'spinning_movements'],
      dtype='object', length=131)
brittle_nails


Si je prédit une maladie avec exactement le bon nombre de symptome la certitude est quasit parfaite. Par exemple pour Symptômes donnés :

In [134]:
# Symptômes donnés pour le test
example_symptoms = ["skin_rash", "itching", "nodal_skin_eruptions"]

# Prédiction avec certitude
predicted_disease, certainty = predict_disease_with_certainty(example_symptoms, df, model, label_encoder)

# Affichage des résultats
print("Symptômes donnés :")
for symptom in example_symptoms:
    print(f"- {symptom}")
print(f"\nMaladie prédite : {predicted_disease}")
print(f"Certitude du modèle : {certainty * 100:.2f}%")


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
Symptômes donnés :
- skin_rash
- itching
- nodal_skin_eruptions

Maladie prédite : Fungal infection
Certitude du modèle : 99.22%


Par contre si j'essaye de prédire une maladie avec uniquement une partie des symptomes, la certitude diminue.

In [123]:
# Symptômes donnés pour le test
example_symptoms = ["skin_rash", "itching"]

# Prédiction avec certitude
predicted_disease, certainty = predict_disease_with_certainty(example_symptoms, df, model, label_encoder)

# Affichage des résultats
print("Symptômes donnés :")
for symptom in example_symptoms:
    print(f"- {symptom}")
print(f"\nMaladie prédite : {predicted_disease}")
print(f"Certitude du modèle : {certainty * 100:.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
Symptômes donnés :
- skin_rash
- itching

Maladie prédite : Fungal infection
Certitude du modèle : 49.40%


In [126]:
# Symptômes donnés pour le test
example_symptoms = ['fatigue', 'high_fever', 'cough', 'breathlessness']

# Prédiction avec certitude
predicted_disease, certainty = predict_disease_with_certainty(example_symptoms, df, model, label_encoder)

# Affichage des résultats
print("Symptômes donnés :")
for symptom in example_symptoms:
    print(f"- {symptom}")
print(f"\nMaladie prédite : {predicted_disease}")
print(f"Certitude du modèle : {certainty * 100:.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Symptômes donnés :
- fatigue
- high_fever
- cough
- breathlessness

Maladie prédite : Bronchial Asthma
Certitude du modèle : 95.55%


# Sauvegarde du modèle

In [None]:
# import joblib

# # Sauvegarder le modèle et le label encoder
# joblib.dump(model, 'disease_prediction_model.pkl')  # Sauvegarder le modèle
# joblib.dump(label_encoder, 'label_encoder.pkl')  # Sauvegarder l'encodeur de labels
# print("Modèle et encodeur sauvegardés sous 'disease_prediction_model.pkl' et 'label_encoder.pkl'")


Modèle et encodeur sauvegardés sous 'disease_prediction_model.pkl' et 'label_encoder.pkl'
