In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

### Load Dataset

In [4]:
# read csv function
def read_csv(file):
    dataset = pd.read_csv(file)
    return dataset

In [5]:
train = read_csv('/Users/ag/Desktop/trainProcessed.csv')
validate = read_csv('/Users/ag/Desktop/validateProcessed.csv')
test = read_csv('/Users/ag/Desktop/testProcessed.csv')

In [6]:
train.head(3)

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,INITIAL_EVIDENCE,I30,diarrhee,bode,lesions_peau_endroitducorps_@_face_dorsale_main_D_,douleurxx_irrad_@_sous_la_machoire,...,etourdissement,hernie_hiatale,douleurxx_irrad_@_trachée,douleurxx_endroitducorps_@_orteil__1__G_,ww_dd,lesions_peau_endroitducorps_@_petite_lèvre_G_,lesions_peau_elevee_@_2,j17_j18,lesions_peau_intens_@_0,lesions_peau_endroitducorps_@_vagin
0,18,"[['Bronchite', 0.19171203430383882], ['Pneumon...",0,IVRS ou virémie,fievre,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21,"[['VIH (Primo-infection)', 0.5189500564407601]...",0,VIH (Primo-infection),diaph,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,19,"[['Bronchite', 0.11278064619119596], ['Pneumon...",1,Pneumonie,expecto,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


### Split dataset to x and y

In [7]:
# function for spliting dataset and drop columns
def data_pre(df, target_column):
    targets = {}
    for column in target_columns:
        targets[column] = df[column].copy()
        df = df.drop(column, axis=1)
    
    data_X = df
    return data_X, targets

Train

In [13]:
target_columns = ['PATHOLOGY', 'DIFFERENTIAL_DIAGNOSIS', 'INITIAL_EVIDENCE']
train_X, train_targets = data_pre(train, target_columns)

In [14]:
train_targets_df = pd.DataFrame(train_targets)

In [15]:
# Extract the column as train_y
trainPathology_y = train_targets_df['PATHOLOGY'].copy()

In [16]:
train_X.head(2)

Unnamed: 0,AGE,SEX,I30,diarrhee,bode,lesions_peau_endroitducorps_@_face_dorsale_main_D_,douleurxx_irrad_@_sous_la_machoire,douleurxx_irrad_@_cartilage_thyroidien,douleurxx_irrad_@_arrière_de_tête,douleurxx_endroitducorps_@_hypochondre_G_,...,etourdissement,hernie_hiatale,douleurxx_irrad_@_trachée,douleurxx_endroitducorps_@_orteil__1__G_,ww_dd,lesions_peau_endroitducorps_@_petite_lèvre_G_,lesions_peau_elevee_@_2,j17_j18,lesions_peau_intens_@_0,lesions_peau_endroitducorps_@_vagin
0,18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Validate

In [17]:
target_columns = ['PATHOLOGY', 'DIFFERENTIAL_DIAGNOSIS', 'INITIAL_EVIDENCE']
validate_X, validate_targets = data_pre(validate, target_columns)

In [18]:
validate_targets_df = pd.DataFrame(validate_targets)

In [19]:
# Extract the column as test_y
validatePathology_y = validate['PATHOLOGY'].copy()

In [20]:
validate_X.head(3)

Unnamed: 0,AGE,SEX,I30,diarrhee,bode,lesions_peau_endroitducorps_@_face_dorsale_main_D_,douleurxx_irrad_@_sous_la_machoire,douleurxx_irrad_@_cartilage_thyroidien,douleurxx_irrad_@_arrière_de_tête,douleurxx_endroitducorps_@_hypochondre_G_,...,etourdissement,hernie_hiatale,douleurxx_irrad_@_trachée,douleurxx_endroitducorps_@_orteil__1__G_,ww_dd,lesions_peau_endroitducorps_@_petite_lèvre_G_,lesions_peau_elevee_@_2,j17_j18,lesions_peau_intens_@_0,lesions_peau_endroitducorps_@_vagin
0,55,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,10,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,68,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


Test

In [21]:
target_columns = ['PATHOLOGY', 'DIFFERENTIAL_DIAGNOSIS', 'INITIAL_EVIDENCE']
test_X, test_targets = data_pre(test, target_columns)

In [22]:
test_targets_df = pd.DataFrame(test_targets)

In [23]:
# Extract the column as test_y
testPathology_y = test['PATHOLOGY'].copy()

In [24]:
test_X.head(2)

Unnamed: 0,AGE,SEX,I30,diarrhee,bode,lesions_peau_endroitducorps_@_face_dorsale_main_D_,douleurxx_irrad_@_sous_la_machoire,douleurxx_irrad_@_cartilage_thyroidien,douleurxx_irrad_@_arrière_de_tête,douleurxx_endroitducorps_@_hypochondre_G_,...,etourdissement,hernie_hiatale,douleurxx_irrad_@_trachée,douleurxx_endroitducorps_@_orteil__1__G_,ww_dd,lesions_peau_endroitducorps_@_petite_lèvre_G_,lesions_peau_elevee_@_2,j17_j18,lesions_peau_intens_@_0,lesions_peau_endroitducorps_@_vagin
0,49,1,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Logistic Regression

In [14]:
# Logistic Regression model
# logistic_model = LogisticRegression(solver='saga', max_iter=100)
logistic_model = LogisticRegression()

In [15]:
# Fit the dataset
logistic_model.fit(train_X, trainPathology_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

### Fit and evaluate on validation dataset

In [16]:
# Make predictions on validate dataset
y_pred_validate = logistic_model.predict(validate_X)

In [21]:
# Evaluate the model
accuracy_val = accuracy_score(validatePathology_y, y_pred_validate)
print(f"Model accuracy between train and validation is: {accuracy_val:.5f}")

Model accuracy between train and validation is: 0.99199


### Fit and evaluate on test dataset

In [22]:
# Make predictions on test dataset
y_pred = logistic_model.predict(test_X)

In [24]:
# Evaluate the model
accuracy = accuracy_score(testPathology_y, y_pred)
print(f"Model accuracy between train and test is: {accuracy:.5f}")

Model accuracy between train and test is: 0.99193


In [27]:
precision = precision_score(testPathology_y, y_pred, average='macro')
print(f"Precision between train and test is: {precision: 5f}")

Precision between train and test is:  0.966378


  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
recall = recall_score(testPathology_y, y_pred, average='macro')
print(f"Recall between train and test is: {recall: 5f}")

Recall between train and test is:  0.946374


In [30]:
f1 = f1_score(testPathology_y, y_pred, average='macro')
print(f"F1 Score: {f1: 5f}")

F1 Score:  0.949807


In [32]:
balanced_accuracy = balanced_accuracy_score(testPathology_y, y_pred)

print(f"Balanced Accuracy: {balanced_accuracy: 5f}")

Balanced Accuracy:  0.946374
