In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.stats as stats
from sklearn import preprocessing
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector as sfs
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from scipy import stats
from scipy.stats import linregress
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification


In [None]:
# Load the train / validation / test CSV files, in that order.
split_paths = (
    '../data/trainProcessed.csv',
    '../data/validateProcessed.csv',
    '../data/testProcessed.csv',
)
train, validate, test = map(pd.read_csv, split_paths)

In [None]:
# Our group decided to use a random forest to predict the pathology.

def data_pre(df, target='PATHOLOGY',
             drop_cols=('DIFFERENTIAL_DIAGNOSIS', 'INITIAL_EVIDENCE')):
    """Split a DataFrame into a feature matrix and a label vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the label column and the feature columns.
        It is not modified.
    target : str, default 'PATHOLOGY'
        Column returned as ``y``; it is also removed from ``X``.
    drop_cols : str or iterable of str, default ('DIFFERENTIAL_DIAGNOSIS', 'INITIAL_EVIDENCE')
        Additional columns to exclude from ``X``.

    Returns
    -------
    X : pandas.DataFrame
        ``df`` without ``target`` and ``drop_cols``.
    y : pandas.Series
        The ``target`` column.

    Raises
    ------
    KeyError
        If ``target`` or any of ``drop_cols`` is not a column of ``df``.
    """
    # A bare string would otherwise be unpacked character by character.
    if isinstance(drop_cols, str):
        drop_cols = [drop_cols]

    # De-duplicate while preserving order (target may also appear in drop_cols).
    excluded = list(dict.fromkeys([*drop_cols, target]))

    # Fail early with a message that names every missing column at once.
    missing = [col for col in excluded if col not in df.columns]
    if missing:
        raise KeyError(f"data_pre: column(s) not found in DataFrame: {missing}")

    y = df[target]
    X = df.drop(columns=excluded)
    return X, y
# Feature matrix and labels for the training split.
X, y = data_pre(train)

In [None]:
# NOTE(review): identical to the call in the previous cell; data_pre only reads
# `train`, so running it again yields the same X and y.
X, y = data_pre(train)

To find the best hyper-parameters for the model, our group decided to use a grid search with 5-fold cross-validation.

In [None]:
# Hyper-parameter search for the random forest.
# RandomForestClassifier and GridSearchCV are already imported in the first
# cell of the notebook, so they are not imported again here.

# Feature matrix and labels for the training split.
X, y = data_pre(train)

# Candidate values: 2 * 3 * 2 * 2 * 2 = 48 combinations. With cv=5 this means
# 240 cross-validation fits, which is why this cell is slow.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
}

# Fixed seed so that repeated runs of the search give the same forests.
myclf = RandomForestClassifier(random_state=0)

grid_search = GridSearchCV(
    estimator=myclf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,  # run the fits in parallel on all available cores
    verbose=2,
    scoring='accuracy',
)
grid_search.fit(X, y)

# Best parameter combination according to mean cross-validated accuracy.
best_params_ = grid_search.best_params_


After finding the best parameters, we use them in the model and evaluate its performance on the validation set.

In [None]:
# GridSearchCV refits the best configuration on the whole training split by
# default (refit=True). That estimator is a clone of `myclf` (random_state=0,
# default bootstrap=True) with the best parameters, fitted on the same X and y,
# so it is the same forest the explicit re-fit would produce. Reusing it avoids
# training the model a second time.
clf = grid_search.best_estimator_
clf

**Evaluate on validation dataset**

In [None]:
# Prepare the validation split the same way as the training split, then
# predict a pathology for every validation row.
X_validate, y_validate = data_pre(validate)
y_pred_validate = clf.predict(X_validate)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Validation-set metrics.
# NOTE(review): precision and recall are macro-averaged, while the F1 score is
# support-weighted, so the three numbers are not directly comparable.
accuracy_v = accuracy_score(y_validate, y_pred_validate)
precision_v = precision_score(y_validate, y_pred_validate, average='macro')
recall_v = recall_score(y_validate, y_pred_validate, average='macro')
f1 = f1_score(y_validate, y_pred_validate, average='weighted')

# One metric per line.
print(
    f"Accuracy: {accuracy_v}",
    f"Precision: {precision_v}",
    f"Recall: {recall_v}",
    f"F1 Score: {f1}",
    sep="\n",
)

Accuracy: 0.9933709833293066
Precision: 0.9940952646224286
Recall: 0.9868095249391869
F1 Score: 0.9931304922059677


On the validation set, both the accuracy and the F1 score are high, so we proceed to evaluate the model on the test set. The high accuracy and F1 score are likely due to the very large size of our dataset.

**Result on Test dataset**

In [None]:
# Reuse data_pre so the test split is prepared exactly like the training and
# validation splits (same label column, same dropped columns).
X_test, y_test = data_pre(test)
y_pred_test = clf.predict(X_test)

In [None]:
from metric_utils import calculate_metric

# Evaluate the test predictions with the project helper and keep only the
# row labelled "data".
test_metric = calculate_metric(y_test, y_pred_test, index=["data"]).loc['data']

# One metric per line.
print(
    f"Accuracy: {test_metric['accuracy']}",
    f"Precision: {test_metric['precision']}",
    f"Recall: {test_metric['recall']}",
    f"F1 Score: {test_metric['f1 score']}",
    f"Balanced Accuracy: {test_metric['balanced accuracy']}",
    sep="\n",
)

Accuracy: 0.9936519263504523
Precision: 0.9941703785457606
Recall: 0.9858327643299465
F1 Score: 0.9934309272140678
Balanced Accuracy: 0.9858327643299465


Our final result on the test set is shown above. We reached an accuracy of 0.9937 and an F1 score of 0.9934, which is high enough to predict the pathology.

**Accuracy for each pathology**

In [20]:
# One-vs-rest accuracy per pathology: each class in turn is treated as the
# positive label and every other class as negative.
# NOTE(review): this score also counts true negatives (rows that neither are
# nor are predicted as the class), so it is close to 1 for rare classes;
# per-class recall may be more informative.
classes = np.unique(y_test)
accuracies = {
    cls: accuracy_score(
        (y_test == cls).astype(int),
        (y_pred_test == cls).astype(int),
    )
    for cls in classes
}

for cls, acc in accuracies.items():
    print(f"Accuracy for class {cls}: {acc}")

Accuracy for class Anaphylaxie: 1.0
Accuracy for class Angine instable: 0.9993086992395691
Accuracy for class Angine stable: 0.9993235659225892
Accuracy for class Anémie: 1.0
Accuracy for class Asthme exacerbé ou bronchospasme: 1.0
Accuracy for class Attaque de panique: 1.0
Accuracy for class Bronchiectasies: 1.0
Accuracy for class Bronchiolite: 0.9999776999754699
Accuracy for class Bronchite: 0.99994796660943
Accuracy for class Chagas: 1.0
Accuracy for class Coqueluche: 0.99998513331698
Accuracy for class Céphalée en grappe: 1.0
Accuracy for class Ebola: 0.99996283329245
Accuracy for class Embolie pulmonaire: 1.0
Accuracy for class Exacerbation aigue de MPOC et/ou surinfection associée: 1.0
Accuracy for class Fibrillation auriculaire/Flutter auriculaire: 1.0
Accuracy for class Fracture de côte spontanée: 0.9997695664131897
Accuracy for class Hernie inguinale: 1.0
Accuracy for class IVRS ou virémie: 1.0
Accuracy for class Laryngite aigue: 0.9996357662660096
Accuracy for class Laryngo-t


However, there are some limitations: the grid search takes a long time, and our group is still looking for a less time-consuming approach. Also, our dataset only contains pathologies related to breathing, the throat, and the lungs. The model's performance on datasets covering a wider range of pathologies is still unclear.