In [5]:
import pandas as pd

# Pull the three VitalDB tables used by this notebook straight from the public API.

# Clinical information for the cases.
dfCases = pd.read_csv(filepath_or_buffer="https://api.vitaldb.net/cases")

# Track list.
dfTrack = pd.read_csv(filepath_or_buffer="https://api.vitaldb.net/trks")

# Laboratory results.
dflabs = pd.read_csv(filepath_or_buffer="https://api.vitaldb.net/labs")

# A1. Evaluate the confusion matrix for your classification problem. From the confusion matrix, compute the other performance metrics (precision, recall, and F1-score) for both the training and test data. Based on your observations, infer the model's learning outcome (underfit / regular fit / overfit).


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

def _score_split(y_true, y_pred, labels, pos_label):
    """Return (confusion matrix, precision, recall, F1) for one data split."""
    # Passing `labels` pins the row/column order of the matrix instead of
    # relying on the alphabetical default, and keeps the matrix 2x2 even if
    # one class happens to be absent from a split.
    matrix = confusion_matrix(y_true, y_pred, labels=labels)
    precision = precision_score(y_true, y_pred, pos_label=pos_label)
    recall = recall_score(y_true, y_pred, pos_label=pos_label)
    f1 = f1_score(y_true, y_pred, pos_label=pos_label)
    return matrix, precision, recall, f1


def a1(duration_threshold=240 * 60, n_neighbors=3, test_size=0.3, random_state=42,
       feature_cols=("opstart", "opend", "op_duration")):
    """Train a k-NN 'long' vs 'short' surgery classifier and score it on train and test data.

    Called with no arguments this reproduces the original experiment exactly.

    Side effect: adds (or overwrites) the columns ``op_duration`` and
    ``surgery_duration_class`` on the module-level ``dfCases`` frame.

    WARNING (target leakage): the label is computed directly from
    ``op_duration``, and the default ``feature_cols`` contain ``op_duration``
    itself plus the two columns it is derived from. The classifier is therefore
    handed the answer, so the resulting scores say nothing about underfitting
    or overfitting. Pass ``feature_cols`` with columns that are not derived
    from the operation start/end times to get a meaningful evaluation.

    Parameters
    ----------
    duration_threshold : number, default 240 * 60
        Cases whose ``opend - opstart`` is strictly greater than this value are
        labelled 'long', all others 'short'. It is expressed in the same unit
        as ``opstart``/``opend``; the default corresponds to 240 minutes if
        those columns are in seconds (NOTE(review): confirm the unit against
        the dataset documentation).
    n_neighbors : int, default 3
        Number of neighbours used by ``KNeighborsClassifier``.
    test_size : float, default 0.3
        Fraction of rows held out for testing.
    random_state : int, default 42
        Seed for the train/test split, so repeated runs give the same split.
    feature_cols : sequence of str
        Columns of ``dfCases`` used as model inputs.

    Returns
    -------
    tuple
        ``(conf_matrix_train, conf_matrix_test, precision_train, recall_train,
        f1_train, precision_test, recall_test, f1_test)``. Confusion matrices
        are ordered ['long', 'short'] on both axes; precision, recall and F1
        treat 'long' as the positive class.
    """
    positive, negative = "long", "short"

    # Derived duration column (kept on dfCases, as before).
    dfCases["op_duration"] = dfCases["opend"] - dfCases["opstart"]

    # Vectorised labelling. A missing duration compares False and is therefore
    # labelled 'short', exactly as the previous row-wise lambda did.
    is_long = dfCases["op_duration"] > duration_threshold
    dfCases["surgery_duration_class"] = is_long.map({True: positive, False: negative})

    X = dfCases[list(feature_cols)]
    y = dfCases["surgery_duration_class"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(X_train, y_train)

    labels = [positive, negative]
    conf_matrix_train, precision_train, recall_train, f1_train = _score_split(
        y_train, model.predict(X_train), labels, positive
    )
    conf_matrix_test, precision_test, recall_test, f1_test = _score_split(
        y_test, model.predict(X_test), labels, positive
    )

    return (conf_matrix_train, conf_matrix_test,
            precision_train, recall_train, f1_train,
            precision_test, recall_test, f1_test)


In [7]:
# Run the A1 evaluation and print the tuple it returns, in this order:
# (train confusion matrix, test confusion matrix, train precision, train recall,
#  train F1, test precision, test recall, test F1).
print(a1())

(array([[ 640,    1],
       [   1, 3829]], dtype=int64), array([[ 263,    3],
       [   0, 1651]], dtype=int64), 0.9984399375975039, 0.9984399375975039, 0.9984399375975039, 1.0, 0.9887218045112782, 0.9943289224952742)
