From 22df4c98b99d0bd64a758d634c776f280de4acab Mon Sep 17 00:00:00 2001 From: thomaswiemann Date: Mon, 6 Nov 2023 20:21:06 -0600 Subject: [PATCH] kcmeans --- DESCRIPTION | 4 +- NAMESPACE | 2 + R/Fhat.R | 121 +++++++++++++++++++++++++++++++++++-- R/ols.R | 57 +++++++++++++++++ man/kcmeans.Rd | 28 +++++++++ man/ols.Rd | 13 ++++ man/predict.kcmeans.Rd | 51 ++++++++++++++++ man/toy_fun.Rd | 19 +----- tests/testthat/test-Fhat.R | 40 +++++++++++- 9 files changed, 309 insertions(+), 26 deletions(-) create mode 100644 R/ols.R create mode 100644 man/kcmeans.Rd create mode 100644 man/ols.Rd create mode 100644 man/predict.kcmeans.Rd diff --git a/DESCRIPTION b/DESCRIPTION index af0ed95..74ee170 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,7 +17,9 @@ Depends: R (>= 3.6) Imports: AER, - Ckmeans.1d.dp + Ckmeans.1d.dp, + MASS, + Matrix Suggests: testthat (>= 3.0.0), covr, diff --git a/NAMESPACE b/NAMESPACE index ecc6d4b..38718e5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,3 +1,5 @@ # Generated by roxygen2: do not edit by hand +S3method(predict,kcmeans) +export(kcmeans) export(toy_fun) diff --git a/R/Fhat.R b/R/Fhat.R index 923591b..105bf2a 100644 --- a/R/Fhat.R +++ b/R/Fhat.R @@ -17,8 +17,119 @@ #' @examples #' res <- toy_fun(rnorm(100)) #' res$fun -toy_fun <- function(y) { - output <- list(fun = TRUE, y = y) - class(output) <- "toy_fun" # define S3 class - return(output) -}#TOY_FUN +kcmeans <- function(y, X, K) { + + # Data parameters + nobs <- length(y) + + # Check whether additional features are included, residualize accordingly + if (length(X) > nobs) { + Z <- X[, 1] # categorical variable + X <- X[, -1, drop = FALSE] # additional features + # Compute \pi and residualize y + nX <- ncol(X) + Z_mat <- model.matrix(~ 0 + as.factor(Z)) + ols_fit <- ols(y, cbind(X, Z_mat)) # ols w/ generalized inverse + pi <- ols_fit$coef[1:nX] + y <- y - X %*% pi + } else { + Z <- X # categorical variable + pi <- NULL + }#IFELSE + + # Prepare data and prepare the cluster map + unique_Z <- unique(Z) + cluster_map <- t(simplify2array(lapply(unique_X, function (x) { + c(x, mean(y[Z == x]), mean(Z == x)) + })))#LAPPLY + + # Estimate kmeans on means of D given Z = z + kmeans_fit <- Ckmeans.1d.dp::Ckmeans.1d.dp(x = cluster_map[, 2], k = K, + y = cluster_map[, 3]) + + # Amend the cluster map + cluster_map <- cbind(cluster_map, kmeans_fit$cluster, + kmeans_fit$centers[kmeans_fit$cluster]) + colnames(cluster_map) <- c("x", "EYx", "Px", "gx", "mx") + + # Compute the unconditional mean + mean_y <- mean(y) + + # Prepare and return the model fit object + mdl_fit <- list(cluster_map = cluster_map, + mean_y = mean_y, pi = pi) + class(mdl_fit) <- "kcmeans" # define S3 class + return(mdl_fit) +}#kcmeans + + +#' Inference Methods for Partially Linear Estimators. +#' +#' @seealso [sandwich::vcovHC()] +#' +#' @description Inference methods for partially linear estimators. Simple +#' wrapper for [sandwich::vcovHC()]. +#' +#' @param object An object of class \code{ddml_plm}, \code{ddml_pliv}, or +#' \code{ddml_fpliv} as fitted by [ddml::ddml_plm()], [ddml::ddml_pliv()], +#' and [ddml::ddml_fpliv()], respectively. +#' @param ... Additional arguments passed to \code{vcovHC}. See +#' [sandwich::vcovHC()] for a complete list of arguments. +#' +#' @return An array with inference results for each \code{ensemble_type}. +#' +#' @references +#' Zeileis A (2004). "Econometric Computing with HC and HAC Covariance Matrix +#' Estimators.” Journal of Statistical Software, 11(10), 1-17. +#' +#' Zeileis A (2006). “Object-Oriented Computation of Sandwich Estimators.” +#' Journal of Statistical Software, 16(9), 1-16. +#' +#' Zeileis A, Köll S, Graham N (2020). “Various Versatile Variances: An +#' Object-Oriented Implementation of Clustered Covariances in R.” Journal of +#' Statistical Software, 95(1), 1-36. +#' +#' @export +#' +#' @examples +#' # Construct variables from the included Angrist & Evans (1998) data +#' y = AE98[, "worked"] +#' D = AE98[, "morekids"] +#' X = AE98[, c("age","agefst","black","hisp","othrace","educ")] +#' +#' # Estimate the partially linear model using a single base learner, ridge. +#' plm_fit <- ddml_plm(y, D, X, +#' learners = list(what = mdl_glmnet, +#' args = list(alpha = 0)), +#' sample_folds = 2, +#' silent = TRUE) +#' summary(plm_fit) +predict.kcmeans <- function(object, newdata, clusters = FALSE, ...) { + + # Check whether additional features are included, compute X\pi if needed + if (!is.null(object$pi)) { + Z <- newdata[, 1] + X <- newdata[, -1, drop = FALSE] + if(!clusters) Xpi <- X %*% object$pi + } else { + Z <- newdata + Xpi <- 0 + }#IFELSE + + # Construct fitted values from cluster map + fitted_mat <- merge(Z, object$cluster_map, + by.x = 1, by.y = 1, all.x = TRUE) + + # Construct predictions + if (clusters) { + # Return estimated cluster assignment + return(fitted_mat[, "gx"]) + } else { + # Replace unseen categories with unconditional mean of y - X\pi + fitted_mat[is.na(fitted_mat[, "mx"]), 5] <- object$mean_y + # Construct and return fitted values + fitted <- fitted_mat[, "mx"] + Xpi + return(fitted) + }#IFELSE + +}#PREDICT.KCMEANS diff --git a/R/ols.R b/R/ols.R new file mode 100644 index 0000000..8707f4b --- /dev/null +++ b/R/ols.R @@ -0,0 +1,57 @@ +#' Ordinary least squares with generalized inverse from the ddml package +#' See ?ddml::ols +ols <- function(y, X, + const = FALSE, + w = NULL) { + # Add constant (optional) + if (const) X <- cbind(1, X) + + # Data parameters + calc_wls <- !is.null(w) + + # Compute OLS coefficient + if (!calc_wls) { + XX_inv <- csolve(as.matrix(Matrix::crossprod(X))) + coef <- XX_inv %*% Matrix::crossprod(X, y) + } else { # Or calculate WLS coefficient whenever weights are specified + Xw <- X * w # weight rows + XX_inv <- csolve(as.matrix(Matrix::crossprod(Xw, X))) + coef <- XX_inv %*% Matrix::crossprod(Xw, y) + }#IFELSE + # Return estimate + coef <- as.matrix(coef) + try(rownames(coef) <- colnames(X)) # assign coefficient names + output <- list(coef = coef, y = y, X = X, + const = const, w = w) + class(output) <- "ols" # define S3 class + return(output) +}#OLS + +# Complementary methods ======================================================== + +# Constructed fitted values +predict.ols <- function(object, newdata = NULL, ...){ + # Obtain datamatrix + if (is.null(newdata)) { + newdata <- object$X + } else if (object$const) { + newdata <- cbind(1, newdata) + }#IFELSE + # Calculate and return fitted values with the OLS coefficient + fitted <- newdata%*%object$coef + return(fitted) +}#PREDICT.OLS + +# help function for generalized inverse ======================================== + +# Simple generalized inverse wrapper. +csolve <- function(X) { + # Attempt inversion + X_inv <- tryCatch(solve(X), error = function(e) NA) + # If inversion failed, calculate generalized inverse + if (any(is.na(X_inv))) { + X_inv <- MASS::ginv(X) + }#IF + # Return (generalized) inverse + return(X_inv) +}#CSOLVE diff --git a/man/kcmeans.Rd b/man/kcmeans.Rd new file mode 100644 index 0000000..9bf3f2f --- /dev/null +++ b/man/kcmeans.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Fhat.R +\name{kcmeans} +\alias{kcmeans} +\title{First stage estimator.} +\usage{ +kcmeans(y, X, K) +} +\arguments{ +\item{x}{A numerical vector.} +} +\value{ +\code{toy_fun} returns an object of S3 class +\code{toy_fun}. An object of class \code{toy_fun} is a list containing +the following components: +\describe{ +\item{\code{fun}}{A boolean on whether you had fun..} +\item{\code{y}}{Pass-through of the ser-provided arguments. +See above.} +} +} +\description{ +This is a simple toy function. +} +\examples{ +res <- toy_fun(rnorm(100)) +res$fun +} diff --git a/man/ols.Rd b/man/ols.Rd new file mode 100644 index 0000000..640082b --- /dev/null +++ b/man/ols.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ols.R +\name{ols} +\alias{ols} +\title{Ordinary least squares with generalized inverse from the ddml package +See ?ddml::ols} +\usage{ +ols(y, X, const = FALSE, w = NULL) +} +\description{ +Ordinary least squares with generalized inverse from the ddml package +See ?ddml::ols +} diff --git a/man/predict.kcmeans.Rd b/man/predict.kcmeans.Rd new file mode 100644 index 0000000..d1cd0bb --- /dev/null +++ b/man/predict.kcmeans.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Fhat.R +\name{predict.kcmeans} +\alias{predict.kcmeans} +\title{Inference Methods for Partially Linear Estimators.} +\usage{ +\method{predict}{kcmeans}(object, newdata, clusters = FALSE, ...) +} +\arguments{ +\item{object}{An object of class \code{ddml_plm}, \code{ddml_pliv}, or +\code{ddml_fpliv} as fitted by \code{\link[ddml:ddml_plm]{ddml::ddml_plm()}}, \code{\link[ddml:ddml_pliv]{ddml::ddml_pliv()}}, +and \code{\link[ddml:ddml_fpliv]{ddml::ddml_fpliv()}}, respectively.} + +\item{...}{Additional arguments passed to \code{vcovHC}. See +\code{\link[sandwich:vcovHC]{sandwich::vcovHC()}} for a complete list of arguments.} +} +\value{ +An array with inference results for each \code{ensemble_type}. +} +\description{ +Inference methods for partially linear estimators. Simple +wrapper for \code{\link[sandwich:vcovHC]{sandwich::vcovHC()}}. +} +\examples{ +# Construct variables from the included Angrist & Evans (1998) data +y = AE98[, "worked"] +D = AE98[, "morekids"] +X = AE98[, c("age","agefst","black","hisp","othrace","educ")] + +# Estimate the partially linear model using a single base learner, ridge. +plm_fit <- ddml_plm(y, D, X, + learners = list(what = mdl_glmnet, + args = list(alpha = 0)), + sample_folds = 2, + silent = TRUE) +summary(plm_fit) +} +\references{ +Zeileis A (2004). "Econometric Computing with HC and HAC Covariance Matrix +Estimators.” Journal of Statistical Software, 11(10), 1-17. + +Zeileis A (2006). “Object-Oriented Computation of Sandwich Estimators.” +Journal of Statistical Software, 16(9), 1-16. + +Zeileis A, Köll S, Graham N (2020). “Various Versatile Variances: An +Object-Oriented Implementation of Clustered Covariances in R.” Journal of +Statistical Software, 95(1), 1-36. +} +\seealso{ +\code{\link[sandwich:vcovHC]{sandwich::vcovHC()}} +} diff --git a/man/toy_fun.Rd b/man/toy_fun.Rd index 69b4af8..8aaeb05 100644 --- a/man/toy_fun.Rd +++ b/man/toy_fun.Rd @@ -1,26 +1,15 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Fhat.R, R/toy_fun.R +% Please edit documentation in R/toy_fun.R \name{toy_fun} \alias{toy_fun} -\title{First stage estimator.} +\title{Toy function.} \usage{ -toy_fun(y) - toy_fun(y) } \arguments{ \item{x}{A numerical vector.} } \value{ -\code{toy_fun} returns an object of S3 class -\code{toy_fun}. An object of class \code{toy_fun} is a list containing -the following components: -\describe{ -\item{\code{fun}}{A boolean on whether you had fun..} -\item{\code{y}}{Pass-through of the ser-provided arguments. -See above.} -} - \code{toy_fun} returns an object of S3 class \code{toy_fun}. An object of class \code{toy_fun} is a list containing the following components: @@ -31,14 +20,10 @@ See above.} } } \description{ -This is a simple toy function. - This is a simple toy function. } \examples{ res <- toy_fun(rnorm(100)) res$fun -res <- toy_fun(rnorm(100)) -res$fun } \concept{toys} diff --git a/tests/testthat/test-Fhat.R b/tests/testthat/test-Fhat.R index 4be29bf..ccf4380 100644 --- a/tests/testthat/test-Fhat.R +++ b/tests/testthat/test-Fhat.R @@ -1,8 +1,42 @@ -test_that("Fhat computes", { - # Generate data +test_that("kcmenas computes", { + # Get data from the included SimDat data + y <- SimDat$D + X <- SimDat$Z + # Compute kcmeans + kcmeans_fit <- kcmeans(y, X, K = 3) + # Check output with expectations + expect_equal(dim(kcmeans_fit), c(60, 5)) +})#TEST_THAT + +test_that("kcmenas computes with additional controls", { + # Get data from the included SimDat data + y <- SimDat$D + X <- cbind(SimDat$Z, SimDat$X) + + + # Compute kcmeans + kcmeans_fit <- kcmeans(y, X, K = 3) + + # Check output with expectations + expect_equal(dim(kcmeans_fit), c(60, 5)) +})#TEST_THAT + +test_that("predict.kcmenas computes w/ unseen categories", { + # Get data from the included SimDat data + y <- SimDat$D + X <- cbind(SimDat$Z, SimDat$X) + + # Compute kcmeans + kcmeans_fit <- kcmeans(y, X, K = 3) + + # Compute predictions w/ unseen categories + newdata <- X + newdata[1:20, 1] <- -22 + + fitted_values <- predict(kcmeans_fit, newdata) # Check output with expectations - expect_equal(length(myfun), 1) + expect_equal(dim(kcmeans_fit), c(60, 5)) })#TEST_THAT