In [13]:
import numpy as np
import pandas as pd

from typing import List, Tuple
import matplotlib.pyplot as plt
import math

from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.neural_network import MLPClassifier

# Fix the random seed for reproducibility
# !! Important !! : do not change this
# hw1_seed = 1234
# np.random.seed(hw1_seed)  
# pd.options.mode.chained_assignment = None 

In [None]:
from ucimlrepo import fetch_ucirepo

# Download UCI repository dataset #296
# ("Diabetes 130-US Hospitals for Years 1999-2008").
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296)

# Feature table and target table (pandas DataFrames).
X = diabetes_130_us_hospitals_for_years_1999_2008.data.features
y = diabetes_130_us_hospitals_for_years_1999_2008.data.targets

# Dataset-level metadata first, then the per-variable description table.
print(diabetes_130_us_hospitals_for_years_1999_2008.metadata)
print(diabetes_130_us_hospitals_for_years_1999_2008.variables)


## train / validation / test partition ##

# Step 1: hold out 20% of all rows as the final test set.
X_temp, X_te, y_temp, y_te = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1234,
    shuffle=True,
)

# Step 2: split the remaining 80% into 75% train / 25% validation,
# i.e. roughly 60% / 20% of the full dataset.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=1234,
    shuffle=True,
)



  df = pd.read_csv(data_url)


{'uci_id': 296, 'name': 'Diabetes 130-US Hospitals for Years 1999-2008', 'repository_url': 'https://archive.ics.uci.edu/dataset/296/diabetes+130-us+hospitals+for+years+1999-2008', 'data_url': 'https://archive.ics.uci.edu/static/public/296/data.csv', 'abstract': 'The dataset represents ten years (1999-2008) of clinical care at 130 US hospitals and integrated delivery networks. Each row concerns hospital records of patients diagnosed with diabetes, who underwent laboratory, medications, and stayed up to 14 days. The goal is to determine the early readmission of the patient within 30 days of discharge.\nThe problem is important for the following reasons. Despite high-quality evidence showing improved clinical outcomes for diabetic patients who receive various preventive and therapeutic interventions, many patients do not receive them. This can be partially attributed to arbitrary diabetes management in hospital environments, which fail to attend to glycemic control. Failure to provide pro

In [7]:
#rebecca 

In [None]:
#amy


Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
13394,AfricanAmerican,Male,[60-70),,8,1,1,1,,,...,No,No,No,No,No,No,No,No,No,No
38437,Hispanic,Female,[60-70),,2,1,7,3,OG,Surgery-General,...,No,No,Up,No,No,No,No,No,Ch,Yes
26896,AfricanAmerican,Female,[50-60),,1,1,7,1,,InternalMedicine,...,No,No,Down,No,No,No,No,No,Ch,Yes
93936,Asian,Female,[50-60),,1,1,7,1,SP,Emergency/Trauma,...,No,No,Steady,No,No,No,No,No,Ch,Yes
39543,Caucasian,Male,[70-80),,1,1,7,9,MC,InternalMedicine,...,No,No,Steady,No,No,No,No,No,Ch,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73632,Caucasian,Female,[70-80),,1,1,7,2,UN,InternalMedicine,...,No,No,No,No,No,No,No,No,No,Yes
55753,Caucasian,Female,[40-50),,1,1,7,5,,,...,No,No,No,No,No,No,No,No,No,No
48948,Caucasian,Female,[80-90),,1,1,7,5,,,...,No,No,Steady,No,No,No,No,No,Ch,Yes
27009,AfricanAmerican,Female,[70-80),,2,1,1,3,MC,,...,No,No,Steady,No,No,No,No,No,No,Yes


In [15]:
#varsh
#logistic regression
seed = 1234


def _as_1d_labels(y):
    """Flatten ``y`` to 1-D when it is a 2-D container with exactly one column.

    Any other input is returned unchanged, so genuinely multi-column targets
    still reach scikit-learn as-is instead of being silently reshaped.
    """
    labels = np.asarray(y)
    if labels.ndim == 2 and labels.shape[1] == 1:
        return labels.ravel()
    return y


def logistic_classification(X, y, test_size=0.3, random_state=None):
    """Fit an L1-regularised logistic regression and report its accuracy.

    The data is split internally into a training part and a held-out part;
    the model is fitted on the training part and scored on both.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn (must expose ``.shape``).
    y : labels; a single-column DataFrame / 2-D array is flattened to 1-D
        before fitting so scikit-learn does not emit a DataConversionWarning.
    test_size : fraction of rows held out for testing (default 0.3, the value
        that used to be hard-coded).
    random_state : seed for both the split and the solver. ``None`` (default)
        means "use the module-level ``seed``", read at call time.

    Returns
    -------
    (classifier, train_accuracy, test_accuracy) where the accuracies are the
    raw ``accuracy_score`` values (they are printed multiplied by 100).
    """
    if random_state is None:
        random_state = seed

    # Seeded split so repeated runs see the same train/test rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    y_train = _as_1d_labels(y_train)
    y_test = _as_1d_labels(y_test)

    # 'l1' penalty, 'liblinear' solver, intercept enabled. liblinear shuffles
    # the data internally, so it has to be seeded too for repeatable results.
    classifier = LogisticRegression(
        penalty='l1',
        solver='liblinear',
        fit_intercept=True,
        random_state=random_state,
    )

    print('\nTraining a model with', X_train.shape[0], 'examples.....')
    classifier.fit(X_train, y_train)

    # Accuracy on the rows the model was fitted on.
    train_predictions = classifier.predict(X_train)
    train_accuracy = accuracy_score(y_train, train_predictions)
    print('\nTraining accuracy:', format(100 * train_accuracy, '.2f'))

    # Accuracy on the held-out rows.
    test_predictions = classifier.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    print('\nTesting accuracy:', format(100 * test_accuracy, '.2f'))

    return classifier, train_accuracy, test_accuracy


#feedforward neural network

def train_mlp_with_hyperparameters(hyper_params, X_tr_10k, y_tr_10k, X_val_10k, y_val_10k, X_te_scaled, y_te, seed) -> Tuple[float, float, float]:
    """Fit an ``MLPClassifier`` and report train / validation / test accuracy.

    Parameters
    ----------
    hyper_params : dict of keyword arguments forwarded to ``MLPClassifier``.
        It must not contain ``shuffle``, ``random_state`` or ``verbose``,
        which are set here.
    X_tr_10k, y_tr_10k : training features and labels.
    X_val_10k, y_val_10k : validation features and labels.
    X_te_scaled, y_te : test features and labels.
    seed : value passed as the classifier's ``random_state``.

    NOTE: despite the ``_10k`` / ``_scaled`` parameter names, this function
    does no subsampling or scaling itself; the inputs are used as given.

    Returns
    -------
    (train_accuracy, val_accuracy, test_accuracy) as returned by
    ``MLPClassifier.score``.
    """
    def _flat(labels):
        # A single-column DataFrame / 2-D array becomes a 1-D array so that
        # scikit-learn does not raise a DataConversionWarning; anything else
        # is passed through untouched.
        arr = np.asarray(labels)
        if arr.ndim == 2 and arr.shape[1] == 1:
            return arr.ravel()
        return labels

    y_train = _flat(y_tr_10k)
    y_valid = _flat(y_val_10k)
    y_test = _flat(y_te)

    mlp = MLPClassifier(**hyper_params, shuffle=True, random_state=seed, verbose=False)
    mlp.fit(X_tr_10k, y_train)

    train_accuracy = mlp.score(X_tr_10k, y_train)
    val_accuracy = mlp.score(X_val_10k, y_valid)
    test_accuracy = mlp.score(X_te_scaled, y_test)

    print('Hyperparameters performance:')
    print(f'Training accuracy: {train_accuracy}')
    print(f'Validation accuracy: {val_accuracy}')
    print(f'Test accuracy: {test_accuracy}')
    return train_accuracy, val_accuracy, test_accuracy

# Keyword arguments forwarded to MLPClassifier by train_mlp_with_hyperparameters.
best_hyperparams = dict(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='sgd',
    alpha=.1,
    learning_rate='constant',
    learning_rate_init=.05,
    max_iter=100,
    n_iter_no_change=1,
)

# Drop every object/category column from each split, keeping all other
# columns; the three split names are rebound to the filtered frames.
X_tr, X_val, X_te = (
    split.select_dtypes(exclude=['object', 'category'])
    for split in (X_tr, X_val, X_te)
)

# Logistic-regression baseline (performs its own internal split of X_tr / y_tr).
logistic_classification(X_tr, y_tr)

# Feed-forward network scored on the train / validation / test splits; left as
# the last expression so the returned accuracy tuple is shown as cell output.
train_mlp_with_hyperparameters(best_hyperparams, X_tr, y_tr, X_val, y_val, X_te, y_te, seed)



Training a model with 42741 examples.....


  y = column_or_1d(y, warn=True)



Training accuracy: 56.70

Testing accuracy: 56.82


  y = column_or_1d(y, warn=True)


Hyperparameters performance:
Training accuracy: 0.5393471887846182
Validation accuracy: 0.5449319510637253
Test accuracy: 0.5380760538469097


(0.5393471887846182, 0.5449319510637253, 0.5380760538469097)