<a href="https://colab.research.google.com/github/taegeonyu/HDS-5230-07/blob/main/week12/Week12_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required R libraries
!Rscript -e 'install.packages(c("mlbench", "purrr"))'

Installing packages into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/src/contrib/mlbench_2.1-6.tar.gz'
trying URL 'https://cran.rstudio.com/src/contrib/purrr_1.0.4.tar.gz'
* installing *source* package ‘mlbench’ ...
** this is package ‘mlbench’ version ‘2.1-6’
** package ‘mlbench’ successfully unpacked and MD5 sums checked
** using staged installation
** libs
using C compiler: ‘gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0’
gcc -I"/usr/share/R/include" -DNDEBUG       -fpic  -g -O2 -ffile-prefix-map=/build/r-base-TYDrW1/r-base-4.5.0=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c init.c -o init.o
gcc -I"/usr/share/R/include" -DNDEBUG       -fpic  -g -O2 -ffile-prefix-map=/build/r-base-TYDrW1/r-base-4.5.0=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c waveform.c -o waveform.o
gcc -shared -L/usr/lib/R/lib -Wl,-Bsymbolic-functions -flto=auto -ffat-

In [2]:
# install rpy2
!pip install -q rpy2

In [3]:
# load rpy2 extension
%load_ext rpy2.ipython

In [4]:
# run R code to generate data
%%R

library(mlbench)
library(purrr)

data("PimaIndiansDiabetes2")
ds <- as.data.frame(na.omit(PimaIndiansDiabetes2))

## Fit logistic regression
logmodel <- glm(diabetes ~ ., data = ds, family = "binomial")

cfs <- coefficients(logmodel)
prednames <- variable.names(ds)[-9]

sz <- 100000

dfdata <- map_dfc(prednames,
                  function(nm){
                    eval(parse(text = paste0("sample(ds$", nm,
                                             ", size = sz, replace = T)")))
                  })

names(dfdata) <- prednames

pvec <- map((1:8),
            function(pnum){
              cfs[pnum+1] * eval(parse(text = paste0("dfdata$", prednames[pnum])))
            }) %>%
  reduce(`+`) + cfs[1]

dfdata$outcome <- ifelse(1/(1 + exp(-pvec)) > 0.5, 1, 0)

write.csv(dfdata, file = "dfdata.csv", row.names = FALSE)

New names:
• `` -> `...1`
• `` -> `...2`
• `` -> `...3`
• `` -> `...4`
• `` -> `...5`
• `` -> `...6`
• `` -> `...7`
• `` -> `...8`


## Preparation

In [5]:
# import necessary libraries for the task
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import BinaryCrossentropy

In [6]:
# read data
df = pd.read_csv('/content/dfdata.csv')
df.shape

(100000, 9)

In [7]:
df.head()

Unnamed: 0,pregnant,glucose,pressure,triceps,insulin,mass,pedigree,age,outcome
0,1,187,80,48,112,37.2,0.507,36,1
1,7,77,68,22,130,38.7,0.219,26,0
2,8,189,50,14,144,30.2,0.968,33,1
3,6,109,78,10,106,37.8,0.443,25,0
4,8,179,58,35,63,34.9,0.366,33,1


## Deep Learning Pipeline

In [8]:
class DLPipeline:
    def __init__(self, data):
        # load dataframe
        self.df = data
        self.results = []

    # prepare data
    def prepare_data(self, data_size):
        # sample data according to data_size
        data = self.df.sample(n = data_size, random_state = 42)

        # train/test split
        X = data.drop('outcome', axis=1)
        y = data['outcome']
        return train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 42)

    # model building
    def build_model(self, input_dim, hidden_layers = 1, hidden_nodes = 4):

        # keras sequential model
        model = Sequential()

        # input layer
        model.add(Dense(hidden_nodes, activation='relu', input_shape=(input_dim,)))

        # hidden layers
        for _ in range(hidden_layers - 1):
          model.add(Dense(hidden_nodes, activation='relu'))

        # output layer
        model.add(Dense(1, activation = 'sigmoid'))

        # compile rmsprop optimizer for binary classification with accuracy
        model.compile(
            optimizer = 'rmsprop',
            loss = 'binary_crossentropy',
            metrics = [tf.keras.metrics.BinaryAccuracy(name = 'accuracy')]
        )

        return model

    # run the model with sample data
    def run_sample(self, data_size, hidden_layers = 1):
        # prepare training/test data
        X_train, X_test, y_train, y_test = self.prepare_data(data_size)

        # scaling
        scaler = StandardScaler()

        # fit & transform training
        X_train_scaled = scaler.fit_transform(X_train)

        # transform only for test
        X_test_scaled = scaler.transform(X_test)

        # build model
        model = self.build_model(input_dim = X_train.shape[1],
                                  hidden_layers = hidden_layers,
                                  hidden_nodes = 4)

        # train and time execution

        # start time
        start = time.time()

        # fit the model
        mod = model.fit(
            X_train_scaled, y_train,
            epochs = 20,
            batch_size = 32,
            verbose = 0,
            validation_data = (X_test_scaled, y_test)
        )

        # end time
        end = time.time()

        # extract metrics, the last results of final epoch
        train_loss = round(mod.history['loss'][-1], 4)
        val_loss = round(mod.history['val_loss'][-1], 4)
        train_acc = round(mod.history['accuracy'][-1], 4)
        val_acc = round(mod.history['val_accuracy'][-1], 4)
        time_taken = round(end - start, 2)

        # store results
        config = f"{hidden_layers} hidden layer{'s' if hidden_layers > 1 else ''} of 4 nodes"
        result = {
            'Data size': data_size,
            'Configuration': config,
            'Training error (loss)': train_loss,
            'Validation error (loss)': val_loss,
            'Training accuracy': train_acc,
            'Validation accuracy': val_acc,
            'Time of execution (secs)': time_taken
        }

        self.results.append(result)
        return pd.DataFrame([result])

In [9]:
# load data to the pipeline
pipeline = DLPipeline(df)

In [10]:
# 1 hidden layer of 4 nodes with 1000 observations
sample1 = pipeline.run_sample(data_size = 1000, hidden_layers = 1)
sample1

Unnamed: 0,Data size,Configuration,Training error (loss),Validation error (loss),Training accuracy,Validation accuracy,Time of execution (secs)
0,1000,1 hidden layer of 4 nodes,0.3445,0.3484,0.9112,0.94,3.05


In [11]:
# 1 hidden layer of 4 nodes with 10000 observations
sample2 = pipeline.run_sample(data_size = 10000, hidden_layers = 1)
sample2

Unnamed: 0,Data size,Configuration,Training error (loss),Validation error (loss),Training accuracy,Validation accuracy,Time of execution (secs)
0,10000,1 hidden layer of 4 nodes,0.0283,0.0308,0.996,0.9975,10.85


In [12]:
# 1 hidden layer of 4 nodes with 100000 observations
sample3 = pipeline.run_sample(data_size = 100000, hidden_layers = 1)
sample3

Unnamed: 0,Data size,Configuration,Training error (loss),Validation error (loss),Training accuracy,Validation accuracy,Time of execution (secs)
0,100000,1 hidden layer of 4 nodes,0.0067,0.0072,0.9986,0.998,90.05


In [13]:
# 2 hidden layer of 4 nodes with 1000 observations
sample4 = pipeline.run_sample(data_size = 1000, hidden_layers = 2)
sample4

Unnamed: 0,Data size,Configuration,Training error (loss),Validation error (loss),Training accuracy,Validation accuracy,Time of execution (secs)
0,1000,2 hidden layers of 4 nodes,0.3958,0.3835,0.7987,0.805,2.87


In [14]:
# 2 hidden layer of 4 nodes with 10000 observations
sample5 = pipeline.run_sample(data_size = 10000, hidden_layers = 2)
sample5

Unnamed: 0,Data size,Configuration,Training error (loss),Validation error (loss),Training accuracy,Validation accuracy,Time of execution (secs)
0,10000,2 hidden layers of 4 nodes,0.0133,0.0155,0.9975,0.998,11.37


In [15]:
# 2 hidden layer of 4 nodes with 100000 observations
sample6 = pipeline.run_sample(data_size = 100000, hidden_layers = 2)
sample6

Unnamed: 0,Data size,Configuration,Training error (loss),Validation error (loss),Training accuracy,Validation accuracy,Time of execution (secs)
0,100000,2 hidden layers of 4 nodes,0.0039,0.0065,0.9985,0.9977,94.11


In [16]:
# combined results
samples = [sample1, sample2, sample3, sample4, sample5, sample6]
combined_samples = pd.concat(samples, ignore_index = True)
combined_samples

Unnamed: 0,Data size,Configuration,Training error (loss),Validation error (loss),Training accuracy,Validation accuracy,Time of execution (secs)
0,1000,1 hidden layer of 4 nodes,0.3445,0.3484,0.9112,0.94,3.05
1,10000,1 hidden layer of 4 nodes,0.0283,0.0308,0.996,0.9975,10.85
2,100000,1 hidden layer of 4 nodes,0.0067,0.0072,0.9986,0.998,90.05
3,1000,2 hidden layers of 4 nodes,0.3958,0.3835,0.7987,0.805,2.87
4,10000,2 hidden layers of 4 nodes,0.0133,0.0155,0.9975,0.998,11.37
5,100000,2 hidden layers of 4 nodes,0.0039,0.0065,0.9985,0.9977,94.11
