In [173]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn import preprocessing

# read data from dementia_patients_health_data 2.csv file into a pandas dataframe
patients_health_df = pd.read_csv(f'{os.getcwd()}/dementia_patients_health_data 2.csv')

# Drop columns 
patients_health_df.drop(columns = ["Dosage in mg", "Education_Level" ], inplace=True)

# transform categorical column Smoking_Status to numeric
smoker_enc = preprocessing.OrdinalEncoder()
smoking_status = patients_health_df['Smoking_Status'].unique()
smoker_enc.fit(smoking_status.reshape(-1, 1))
ct = ColumnTransformer( transformers = [( "", smoker_enc, ["Smoking_Status"])])
patients_health_df['Smoking_Status'] = ct.fit_transform(patients_health_df)

# transform categorical column Physical_Activity to numeric
physical_activity_enc = preprocessing.OrdinalEncoder()
physical_activity_values = patients_health_df['Physical_Activity'].unique()
physical_activity_enc.fit(physical_activity_values.reshape(-1, 1))
physical_activity_ct = ColumnTransformer( transformers = [( "", physical_activity_enc, ["Physical_Activity"])])
physical_activity_ct.fit_transform(patients_health_df)
patients_health_df['Physical_Activity'] = physical_activity_ct.fit_transform(patients_health_df)

# transform categorical column Nutrition_Diet to numeric
diet_enc = preprocessing.OrdinalEncoder()
diet_values = patients_health_df['Nutrition_Diet'].unique()
diet_enc.fit(diet_values.reshape(-1, 1))
diet_ct = ColumnTransformer( transformers = [( "", diet_enc, ["Nutrition_Diet"])])
diet_ct.fit_transform(patients_health_df)
patients_health_df['Nutrition_Diet'] = diet_ct.fit_transform(patients_health_df)

# transform categorical column Chronic_Health_Conditions to numeric
patients_health_df['Chronic_Health_Conditions'] = np.where(patients_health_df['Chronic_Health_Conditions'].isna(), 'None', patients_health_df['Chronic_Health_Conditions'])
chronic_conditions_enc = preprocessing.OrdinalEncoder()
chronic_conditions_values = patients_health_df['Chronic_Health_Conditions'].unique()
chronic_conditions_enc.fit(chronic_conditions_values.reshape(-1, 1))
chronic_conditions_ct = ColumnTransformer( transformers = [( "", chronic_conditions_enc, ["Chronic_Health_Conditions"])])
chronic_conditions_ct.fit_transform(patients_health_df)
patients_health_df['Chronic_Health_Conditions'] = chronic_conditions_ct.fit_transform(patients_health_df)

# transform categorical column Prescription to numeric
patients_health_df['Prescription'] = np.where(patients_health_df['Prescription'].isna(), 0, 1)

# transform categorical column Gender to numeric and rename column to isMale
patients_health_df.rename(columns={'Gender': 'isMale'}, inplace=True)
patients_health_df['isMale'] = np.where(patients_health_df['isMale'] == 'Male', 1, 0)

# transform categorical column APOE_ε4 to numeric and rename column to APOE_ε4_Positive
patients_health_df.rename(columns={'APOE_ε4': 'APOE_ε4_Positive'}, inplace=True)
patients_health_df['APOE_ε4_Positive'] = np.where(patients_health_df['APOE_ε4_Positive'] == 'Positive', 1, 0)

# rename column Dominant_Hand to Dominant_Hand_Left and transform to numerical value
patients_health_df.rename(columns={'Dominant_Hand': 'Dominant_Hand_Left'}, inplace=True)
patients_health_df['Dominant_Hand_Left'] = np.where(patients_health_df['Dominant_Hand_Left'] == 'Left', 1, 0)

# transform Family_History column to numerical value
patients_health_df['Family_History'] = np.where(patients_health_df['Family_History'] == 'Yes', 1, 0)

# rename column Sleep_Quality to Sleep_Quality_Good and transform to numerical value
patients_health_df.rename(columns={'Sleep_Quality': 'Sleep_Quality_Good'}, inplace=True)
patients_health_df['Sleep_Quality_Good'] = np.where(patients_health_df['Sleep_Quality_Good'] == 'Good', 1, 0)

# patients_health_df.info()
features = ["isMale", "Family_History", "APOE_ε4_Positive", "Diabetic", "Prescription", "Dominant_Hand_Left", "Physical_Activity", "Nutrition_Diet", "Sleep_Quality_Good", "Chronic_Health_Conditions"]
label = "Dementia"

X, y = patients_health_df[features].values, patients_health_df[label].values

# print(patients_health_df)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
# print (f'X_train: {X_train.shape} \nX_val: {X_val.shape} \ny_train: {y_train.shape} \ny_val: {y_val.shape}')

# perform standard scaling on data
standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_val = standard_scaler.transform(X_val)

# Train the model
from sklearn.linear_model import LogisticRegression

# Set regularization rate
reg = 0.01

# train a logistic regression model on the training set
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)
print (model)

predictions = model.predict(X_val)
# print('Predicted labels: ', predictions)
# print('Actual labels:    ' ,y_val)

print(f"Accuracy score: {accuracy_score(y_val, predictions)}")
print(classification_report(y_val, predictions))

print("Overall Precision:",precision_score(y_val, predictions))
print("Overall Recall:",recall_score(y_val, predictions))

LogisticRegression(C=100.0, solver='liblinear')
Accuracy score: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       111
           1       1.00      1.00      1.00        89

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Overall Precision: 1.0
Overall Recall: 1.0
