In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
import sklearn.metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

### Load Dataset

In [2]:
# read csv function
def read_csv(file):
    dataset = pd.read_csv(file)
    return dataset

In [3]:
# apply the read_csv function into train, validate, and test datasets reading
train = read_csv('/Users/ag/Desktop/trainProcessed.csv')
validate = read_csv('/Users/ag/Desktop/validateProcessed.csv')
test = read_csv('/Users/ag/Desktop/testProcessed.csv')

In [4]:
train.head(3) # see the first three rows of the train dataset

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,INITIAL_EVIDENCE,I30,diarrhee,bode,lesions_peau_endroitducorps_@_face_dorsale_main_D_,douleurxx_irrad_@_sous_la_machoire,...,etourdissement,hernie_hiatale,douleurxx_irrad_@_trachée,douleurxx_endroitducorps_@_orteil__1__G_,ww_dd,lesions_peau_endroitducorps_@_petite_lèvre_G_,lesions_peau_elevee_@_2,j17_j18,lesions_peau_intens_@_0,lesions_peau_endroitducorps_@_vagin
0,18,"[['Bronchite', 0.19171203430383882], ['Pneumon...",0,IVRS ou virémie,fievre,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21,"[['VIH (Primo-infection)', 0.5189500564407601]...",0,VIH (Primo-infection),diaph,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,19,"[['Bronchite', 0.11278064619119596], ['Pneumon...",1,Pneumonie,expecto,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


### Split dataset to x and y

In [5]:
# a function for spliting dataset and drop columns
def data_pre(df, target_column):
    targets = {}
    for column in target_columns:
        targets[column] = df[column].copy()
        df = df.drop(column, axis=1)
    
    data_X = df
    return data_X, targets

Train

In [6]:
# the target columns we want to move from the train dataset
target_columns = ['PATHOLOGY', 'DIFFERENTIAL_DIAGNOSIS', 'INITIAL_EVIDENCE']
# apply the function to train dataset
train_X, train_targets = data_pre(train, target_columns)

In [7]:
# convert the dataset into pandas dataframe
train_targets_df = pd.DataFrame(train_targets)

In [8]:
# Extract the column as train_y
trainPathology_y = train_targets_df['PATHOLOGY'].copy()

In [9]:
train_X.head(2) # see the first two rows of the train dataset

Unnamed: 0,AGE,SEX,I30,diarrhee,bode,lesions_peau_endroitducorps_@_face_dorsale_main_D_,douleurxx_irrad_@_sous_la_machoire,douleurxx_irrad_@_cartilage_thyroidien,douleurxx_irrad_@_arrière_de_tête,douleurxx_endroitducorps_@_hypochondre_G_,...,etourdissement,hernie_hiatale,douleurxx_irrad_@_trachée,douleurxx_endroitducorps_@_orteil__1__G_,ww_dd,lesions_peau_endroitducorps_@_petite_lèvre_G_,lesions_peau_elevee_@_2,j17_j18,lesions_peau_intens_@_0,lesions_peau_endroitducorps_@_vagin
0,18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Validate

In [10]:
# the target columns we want to move from the validate dataset
target_columns = ['PATHOLOGY', 'DIFFERENTIAL_DIAGNOSIS', 'INITIAL_EVIDENCE']
# apply the function to validate dataset
validate_X, validate_targets = data_pre(validate, target_columns)

In [11]:
# convert the dataset into pandas dataframe
validate_targets_df = pd.DataFrame(validate_targets)

In [12]:
# Extract the column as test_y
validatePathology_y = validate['PATHOLOGY'].copy()

In [13]:
validate_X.head(3) # see the first three rows of the train dataset

Unnamed: 0,AGE,SEX,I30,diarrhee,bode,lesions_peau_endroitducorps_@_face_dorsale_main_D_,douleurxx_irrad_@_sous_la_machoire,douleurxx_irrad_@_cartilage_thyroidien,douleurxx_irrad_@_arrière_de_tête,douleurxx_endroitducorps_@_hypochondre_G_,...,etourdissement,hernie_hiatale,douleurxx_irrad_@_trachée,douleurxx_endroitducorps_@_orteil__1__G_,ww_dd,lesions_peau_endroitducorps_@_petite_lèvre_G_,lesions_peau_elevee_@_2,j17_j18,lesions_peau_intens_@_0,lesions_peau_endroitducorps_@_vagin
0,55,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,10,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,68,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


Test

In [14]:
# the target columns we want to move from the test dataset
target_columns = ['PATHOLOGY', 'DIFFERENTIAL_DIAGNOSIS', 'INITIAL_EVIDENCE']
# apply the function to test dataset
test_X, test_targets = data_pre(test, target_columns)

In [15]:
# convert the dataset into pandas dataframe
test_targets_df = pd.DataFrame(test_targets)

In [16]:
# Extract the column as test_y
testPathology_y = test['PATHOLOGY'].copy()

In [17]:
test_X.head(2)  # see the first two rows of the train dataset

Unnamed: 0,AGE,SEX,I30,diarrhee,bode,lesions_peau_endroitducorps_@_face_dorsale_main_D_,douleurxx_irrad_@_sous_la_machoire,douleurxx_irrad_@_cartilage_thyroidien,douleurxx_irrad_@_arrière_de_tête,douleurxx_endroitducorps_@_hypochondre_G_,...,etourdissement,hernie_hiatale,douleurxx_irrad_@_trachée,douleurxx_endroitducorps_@_orteil__1__G_,ww_dd,lesions_peau_endroitducorps_@_petite_lèvre_G_,lesions_peau_elevee_@_2,j17_j18,lesions_peau_intens_@_0,lesions_peau_endroitducorps_@_vagin
0,49,1,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Logistic Regression

In [18]:
# Logistic Regression model
# logistic_model = LogisticRegression(solver='saga', max_iter=100)
logistic_model = LogisticRegression()

In [19]:
# Fit the dataset
logistic_model.fit(train_X, trainPathology_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

### Fit and evaluate on validation dataset

In [20]:
# a function for evaluating the model and get the metric
def calculate_metric(pathology, prediction, index=["data"]):
    return pd.DataFrame({
        "accuracy": sklearn.metrics.accuracy_score(pathology, prediction),
        "precision": sklearn.metrics.precision_score(pathology, prediction, average="macro"),
        "recall": sklearn.metrics.recall_score(pathology, prediction, average="macro"),
        "f1 score": sklearn.metrics.f1_score(pathology, prediction, average="macro"),
        "balanced accuracy": sklearn.metrics.balanced_accuracy_score(pathology, prediction)
        }, index=index)

In [21]:
# make predictions on validate dataset
y_pred_validate = logistic_model.predict(validate_X)

In [22]:
# apply the function to get the evaluation metric
validation_metric = calculate_metric(validatePathology_y, y_pred_validate, index=["data"])

  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
# show the result
validation_metric

Unnamed: 0,accuracy,precision,recall,f1 score,balanced accuracy
data,0.991989,0.965997,0.94765,0.950901,0.94765


### Fit and evaluate on test dataset

In [24]:
# make predictions on test dataset
y_pred = logistic_model.predict(test_X)

In [25]:
# apply the function to get the evaluation metric
test_metric = calculate_metric(testPathology_y, y_pred, index=["data"])

  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
# show the result
test_metric

Unnamed: 0,accuracy,precision,recall,f1 score,balanced accuracy
data,0.991927,0.966378,0.946374,0.949807,0.946374
