In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import f1_score, accuracy_score, recall_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
import warnings

# NOTE(review): this installs a blanket 'ignore' filter for every warning
# category for the rest of the kernel session, so any diagnostics raised by
# the libraries used below (for example solver-convergence or
# chained-assignment warnings, if they occur) will not be displayed.
warnings.filterwarnings('ignore')

In [None]:
def apply_lr_all_events(my_dataset_symptoms):
    """Train and evaluate a class-balanced logistic-regression baseline.

    The rows are shuffled, split 70/30 into train and test partitions, the
    features are standardised with statistics taken from the training
    partition only, and an L2-regularised logistic regression is fitted on the
    standardised training features.  The test-set ROC points are written to
    ``auc_baseline_LR.csv`` (overwritten on every call) and the ROC curve is
    plotted.

    Parameters
    ----------
    my_dataset_symptoms : pandas.DataFrame
        Feature columns followed by the label in the LAST column.  The metrics
        use ``average='binary'``, so the label is expected to be binary with 1
        as the positive class.

    Returns
    -------
    tuple
        ``(train_recall, test_recall, train_f1, test_f1, train_acc, test_acc)``.
    """
    # Shuffle so the split does not depend on the caller's row order.
    dataset_symptoms = shuffle(my_dataset_symptoms).reset_index(drop=True)

    # Split features / label, then training / test partitions.
    X = dataset_symptoms.iloc[:, :-1]
    Y = dataset_symptoms.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=1234
    )

    # Normalisation: the scaler is fitted on the training partition only so no
    # test-set statistics leak into the model.
    X_scaler = StandardScaler().fit(X_train)
    standardized_X_train = X_scaler.transform(X_train)
    standardized_X_test = X_scaler.transform(X_test)

    # Initialise and train the model.  It must be fitted on the SAME
    # standardised representation that is used for prediction below; fitting
    # on the raw features while predicting on scaled ones makes every metric
    # meaningless (and the 'sag' solver expects similarly scaled features).
    log_reg = linear_model.LogisticRegression(
        penalty='l2',
        C=0.0001,
        solver='sag',
        class_weight='balanced',
        random_state=10,
    )
    log_reg.fit(standardized_X_train, y_train)

    # Hard class predictions for the threshold-based metrics.
    pred_train = log_reg.predict(standardized_X_train)
    pred_test = log_reg.predict(standardized_X_test)

    train_recall = recall_score(y_train, pred_train, average='binary')
    train_f1 = f1_score(y_train, pred_train, average='binary')
    test_recall = recall_score(y_test, pred_test, average='binary')
    test_f1 = f1_score(y_test, pred_test, average='binary')
    train_acc = accuracy_score(y_train, pred_train)
    test_acc = accuracy_score(y_test, pred_test)

    # Predicted probability of the positive class, computed from the same
    # standardised test features as the metrics above.
    probs = log_reg.predict_proba(standardized_X_test)[:, 1]

    # TPR and FPR over all decision thresholds (thresholds themselves unused).
    fpr, tpr, _ = roc_curve(y_test, probs)

    # Persist the ROC points; the file is overwritten on each call.
    auc_data = pd.DataFrame({'fpr': fpr, 'tpr': tpr})
    auc_data.to_csv('auc_baseline_LR.csv')

    # Plot the ROC curve.
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()

    return train_recall, test_recall, train_f1, test_f1, train_acc, test_acc

In [None]:
# Load the baseline "all events" table; the path is relative to the
# notebook's working directory.
DATA_DIR_BASELINE = 'dataset_baseline_all_events.csv'
dataset_baseline = pd.read_csv(filepath_or_buffer=DATA_DIR_BASELINE)
dataset_baseline  # last expression of the cell: rendered as its output

In [None]:
# pre-processing dataset
# Event flags: a row is labelled positive when at least one of them equals 1.
EVENT_COLUMNS = [
    'disturbed_sleep_pattern',
    'agitation_irritability_aggression',
    'depressed_anxiety',
    'accidental_fall',
    'motor_function_behavior',
    'period_of_confusion',
    'hospital',
    'uti',
]
# Identifier, sensor-feature and label columns kept for modelling.
MODEL_COLUMNS = [
    'patient_id', 'week',
    'bathroom_daytime', 'bedroom_daytime', 'hallway_daytime', 'kitchen_daytime', 'lounge_daytime',
    'bathroom_night', 'bedroom_night', 'hallway_night', 'kitchen_night', 'lounge_night',
    'label',
]

# Drop the first column of the loaded table and work on an independent copy so
# that adding the label never writes into a view of `dataset_baseline`.
dataset_all_events = dataset_baseline.iloc[:, 1:].copy()

# Label rows directly from a boolean mask instead of merging the positive
# subset back on every shared column: the mask can neither duplicate rows nor
# depend on how the join treats repeated or missing key values.
# Positive rows get 1.0; all remaining rows get the placeholder 0.5, which the
# evaluation cell later maps to class 0.
has_any_event = dataset_all_events[EVENT_COLUMNS].eq(1).any(axis=1)
dataset_all_events['label'] = np.where(has_any_event, 1.0, 0.5)

# Kept because the original cell also exposed the positive subset by this name.
dataset_all_events_positive = dataset_all_events.loc[has_any_event].copy()

# Explicit column selection raises KeyError on a misspelled column instead of
# silently creating an all-NaN column that `dropna` would then wipe out.
dataset_all_events = dataset_all_events.loc[:, MODEL_COLUMNS].dropna()
dataset_all_events.head()

In [None]:
# model and test, lr
N_RUNS = 30            # number of independent balanced-resampling rounds
RESAMPLING_SEED = 0    # fixed so the reported metrics can be reproduced
FEATURE_COLUMNS = [
    'bathroom_daytime', 'bedroom_daytime', 'lounge_daytime', 'kitchen_daytime', 'hallway_daytime',
    'bathroom_night', 'bedroom_night', 'hallway_night', 'kitchen_night', 'lounge_night',
]
# Same order as the tuple returned by apply_lr_all_events.
METRIC_NAMES = ['train_recall', 'test_recall', 'train_f1', 'test_f1', 'train_acc', 'test_acc']
# optimise the format of the labels: placeholder 0.5 -> class 0, 1.0 -> class 1
mapping = {0.5: 0, 1.0: 1}

# `DataFrame.sample` below and the unseeded `shuffle` inside
# apply_lr_all_events both draw from NumPy's global generator when no
# random_state is given, so seeding it once makes all rounds reproducible.
np.random.seed(RESAMPLING_SEED)

# Loop-invariant pieces: minority-class size and the two class subsets.
dataset_num = dataset_all_events.label.value_counts().min()
negative_rows = dataset_all_events[dataset_all_events['label'] == 0.5]
positive_rows = dataset_all_events[dataset_all_events['label'] == 1.0]

run_metrics = []
for _ in range(N_RUNS):
    # balance negative and positive labels by down-sampling both classes to
    # the size of the smaller one
    dataset_symptoms_negative = negative_rows.sample(dataset_num)
    dataset_symptoms_positive = positive_rows.sample(dataset_num)
    dataset_symptoms = (
        pd.concat([dataset_symptoms_negative, dataset_symptoms_positive])
        .loc[:, FEATURE_COLUMNS + ['label']]          # label must stay the last column
        .assign(label=lambda frame: frame['label'].map(mapping))
    )

    run_metrics.append(apply_lr_all_events(dataset_symptoms))

# One row per round, one column per metric.
all_events_lr = pd.DataFrame(run_metrics, columns=METRIC_NAMES)

# Single-column views kept under their original names for any later cell that
# still refers to them.
train_recall_lr = all_events_lr[['train_recall']]
test_recall_lr = all_events_lr[['test_recall']]
train_f1_lr = all_events_lr[['train_f1']]
test_f1_lr = all_events_lr[['test_f1']]
train_acc_lr = all_events_lr[['train_acc']]
test_acc_lr = all_events_lr[['test_acc']]

all_events_lr.boxplot()
print(all_events_lr.mean())
plt.ylim(0, 1)

In [None]:
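# Optional export, disabled by default: uncomment to save the per-run metrics
# table (`all_events_lr`) to disk.
# all_events_lr.to_csv('n_evaluation_baseline_LR.csv')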