From 22df4c98b99d0bd64a758d634c776f280de4acab Mon Sep 17 00:00:00 2001
From: thomaswiemann <wiemann@uchicago.edu>
Date: Mon, 6 Nov 2023 20:21:06 -0600
Subject: [PATCH] kcmeans

---
 DESCRIPTION                |   4 +-
 NAMESPACE                  |   2 +
 R/Fhat.R                   | 121 +++++++++++++++++++++++++++++++++++--
 R/ols.R                    |  57 +++++++++++++++++
 man/kcmeans.Rd             |  28 +++++++++
 man/ols.Rd                 |  13 ++++
 man/predict.kcmeans.Rd     |  51 ++++++++++++++++
 man/toy_fun.Rd             |  19 +-----
 tests/testthat/test-Fhat.R |  40 +++++++++++-
 9 files changed, 309 insertions(+), 26 deletions(-)
 create mode 100644 R/ols.R
 create mode 100644 man/kcmeans.Rd
 create mode 100644 man/ols.Rd
 create mode 100644 man/predict.kcmeans.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index af0ed95..74ee170 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -17,7 +17,9 @@ Depends:
     R (>= 3.6)
 Imports: 
     AER,
-    Ckmeans.1d.dp
+    Ckmeans.1d.dp,
+    MASS,
+    Matrix
 Suggests: 
     testthat (>= 3.0.0),
     covr,
diff --git a/NAMESPACE b/NAMESPACE
index ecc6d4b..38718e5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,3 +1,5 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(predict,kcmeans)
+export(kcmeans)
 export(toy_fun)
diff --git a/R/Fhat.R b/R/Fhat.R
index 923591b..105bf2a 100644
--- a/R/Fhat.R
+++ b/R/Fhat.R
@@ -17,8 +17,119 @@
 #' @examples
 #' res <- toy_fun(rnorm(100))
 #' res$fun
-toy_fun <- function(y) {
-  output <- list(fun = TRUE, y = y)
-  class(output) <- "toy_fun" # define S3 class
-  return(output)
-}#TOY_FUN
+kcmeans <- function(y, X, K) {
+
+  # Data parameters
+  nobs <- length(y)
+
+  # Check whether additional features are included, residualize accordingly
+  if (length(X) > nobs) {
+    Z <- X[, 1] # categorical variable
+    X <- X[, -1, drop = FALSE] # additional features
+    # Compute \pi and residualize y
+    nX <- ncol(X)
+    Z_mat <- model.matrix(~ 0 + as.factor(Z))
+    ols_fit <- ols(y, cbind(X, Z_mat)) # ols w/ generalized inverse
+    pi <- ols_fit$coef[1:nX]
+    y <- y - X %*% pi
+  } else {
+    Z <- X # categorical variable
+    pi <- NULL
+  }#IFELSE
+
+  # Prepare data and prepare the cluster map
+  unique_Z <- unique(Z)
+  cluster_map <- t(simplify2array(lapply(unique_X, function (x) {
+    c(x, mean(y[Z == x]), mean(Z == x))
+    })))#LAPPLY
+
+  # Estimate kmeans on means of D given Z = z
+  kmeans_fit <- Ckmeans.1d.dp::Ckmeans.1d.dp(x = cluster_map[, 2], k = K,
+                                             y = cluster_map[, 3])
+
+  # Amend the cluster map
+  cluster_map <- cbind(cluster_map, kmeans_fit$cluster,
+                   kmeans_fit$centers[kmeans_fit$cluster])
+  colnames(cluster_map) <- c("x", "EYx", "Px", "gx", "mx")
+
+  # Compute the unconditional mean
+  mean_y <- mean(y)
+
+  # Prepare and return the model fit object
+  mdl_fit <- list(cluster_map = cluster_map,
+                  mean_y = mean_y, pi = pi)
+  class(mdl_fit) <- "kcmeans" # define S3 class
+  return(mdl_fit)
+}#kcmeans
+
+
+#' Inference Methods for Partially Linear Estimators.
+#'
+#' @seealso [sandwich::vcovHC()]
+#'
+#' @description Inference methods for partially linear estimators. Simple
+#'     wrapper for [sandwich::vcovHC()].
+#'
+#' @param object An object of class \code{ddml_plm}, \code{ddml_pliv}, or
+#'     \code{ddml_fpliv} as fitted by [ddml::ddml_plm()], [ddml::ddml_pliv()],
+#'     and [ddml::ddml_fpliv()], respectively.
+#' @param ... Additional arguments passed to \code{vcovHC}. See
+#'     [sandwich::vcovHC()] for a complete list of arguments.
+#'
+#' @return An array with inference results for each \code{ensemble_type}.
+#'
+#' @references
+#' Zeileis A (2004). "Econometric Computing with HC and HAC Covariance Matrix
+#'     Estimators.” Journal of Statistical Software, 11(10), 1-17.
+#'
+#' Zeileis A (2006). “Object-Oriented Computation of Sandwich Estimators.”
+#'     Journal of Statistical Software, 16(9), 1-16.
+#'
+#' Zeileis A, Köll S, Graham N (2020). “Various Versatile Variances: An
+#'     Object-Oriented Implementation of Clustered Covariances in R.” Journal of
+#'     Statistical Software, 95(1), 1-36.
+#'
+#' @export
+#'
+#' @examples
+#' # Construct variables from the included Angrist & Evans (1998) data
+#' y = AE98[, "worked"]
+#' D = AE98[, "morekids"]
+#' X = AE98[, c("age","agefst","black","hisp","othrace","educ")]
+#'
+#' # Estimate the partially linear model using a single base learner, ridge.
+#' plm_fit <- ddml_plm(y, D, X,
+#'                     learners = list(what = mdl_glmnet,
+#'                                     args = list(alpha = 0)),
+#'                     sample_folds = 2,
+#'                     silent = TRUE)
+#' summary(plm_fit)
+predict.kcmeans <- function(object, newdata, clusters = FALSE, ...) {
+
+  # Check whether additional features are included, compute X\pi if needed
+  if (!is.null(object$pi)) {
+    Z <- newdata[, 1]
+    X <- newdata[, -1, drop = FALSE]
+    if(!clusters) Xpi <- X %*% object$pi
+  } else {
+    Z <- newdata
+    Xpi <- 0
+  }#IFELSE
+
+  # Construct fitted values from cluster map
+  fitted_mat <- merge(Z, object$cluster_map,
+                      by.x = 1, by.y = 1, all.x = TRUE)
+
+  # Construct predictions
+  if (clusters) {
+    # Return estimated cluster assignment
+    return(fitted_mat[, "gx"])
+  } else {
+    # Replace unseen categories with unconditional mean of y - X\pi
+    fitted_mat[is.na(fitted_mat[, "mx"]), 5] <- object$mean_y
+    # Construct and return fitted values
+    fitted <- fitted_mat[, "mx"] + Xpi
+    return(fitted)
+  }#IFELSE
+
+}#PREDICT.KCMEANS
diff --git a/R/ols.R b/R/ols.R
new file mode 100644
index 0000000..8707f4b
--- /dev/null
+++ b/R/ols.R
@@ -0,0 +1,57 @@
+#' Ordinary least squares with generalized inverse from the ddml package
+#' See ?ddml::ols
+ols <- function(y, X,
+                const = FALSE,
+                w = NULL) {
+  # Add constant (optional)
+  if (const) X <- cbind(1, X)
+
+  # Data parameters
+  calc_wls <- !is.null(w)
+
+  # Compute OLS coefficient
+  if (!calc_wls) {
+    XX_inv <- csolve(as.matrix(Matrix::crossprod(X)))
+    coef <- XX_inv %*% Matrix::crossprod(X, y)
+  } else { # Or calculate WLS coefficient whenever weights are specified
+    Xw <- X * w # weight rows
+    XX_inv <- csolve(as.matrix(Matrix::crossprod(Xw, X)))
+    coef <- XX_inv %*% Matrix::crossprod(Xw, y)
+  }#IFELSE
+  # Return estimate
+  coef <- as.matrix(coef)
+  try(rownames(coef) <- colnames(X)) # assign coefficient names
+  output <- list(coef = coef, y = y, X = X,
+                 const = const, w = w)
+  class(output) <- "ols" # define S3 class
+  return(output)
+}#OLS
+
+# Complementary methods ========================================================
+
+# Constructed fitted values
+predict.ols <- function(object, newdata = NULL, ...){
+  # Obtain datamatrix
+  if (is.null(newdata)) {
+    newdata <- object$X
+  } else if (object$const) {
+    newdata <- cbind(1, newdata)
+  }#IFELSE
+  # Calculate and return fitted values with the OLS coefficient
+  fitted <- newdata%*%object$coef
+  return(fitted)
+}#PREDICT.OLS
+
+# help function for generalized inverse ========================================
+
+# Simple generalized inverse wrapper.
+csolve <- function(X) {
+  # Attempt inversion
+  X_inv <- tryCatch(solve(X), error = function(e) NA)
+  # If inversion failed, calculate generalized inverse
+  if (any(is.na(X_inv))) {
+    X_inv <- MASS::ginv(X)
+  }#IF
+  # Return (generalized) inverse
+  return(X_inv)
+}#CSOLVE
diff --git a/man/kcmeans.Rd b/man/kcmeans.Rd
new file mode 100644
index 0000000..9bf3f2f
--- /dev/null
+++ b/man/kcmeans.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/Fhat.R
+\name{kcmeans}
+\alias{kcmeans}
+\title{First stage estimator.}
+\usage{
+kcmeans(y, X, K)
+}
+\arguments{
+\item{x}{A numerical vector.}
+}
+\value{
+\code{toy_fun} returns an object of S3 class
+\code{toy_fun}. An object of class \code{toy_fun} is a list containing
+the following components:
+\describe{
+\item{\code{fun}}{A boolean on whether you had fun..}
+\item{\code{y}}{Pass-through of the ser-provided arguments.
+See above.}
+}
+}
+\description{
+This is a simple toy function.
+}
+\examples{
+res <- toy_fun(rnorm(100))
+res$fun
+}
diff --git a/man/ols.Rd b/man/ols.Rd
new file mode 100644
index 0000000..640082b
--- /dev/null
+++ b/man/ols.Rd
@@ -0,0 +1,13 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ols.R
+\name{ols}
+\alias{ols}
+\title{Ordinary least squares with generalized inverse from the ddml package
+See ?ddml::ols}
+\usage{
+ols(y, X, const = FALSE, w = NULL)
+}
+\description{
+Ordinary least squares with generalized inverse from the ddml package
+See ?ddml::ols
+}
diff --git a/man/predict.kcmeans.Rd b/man/predict.kcmeans.Rd
new file mode 100644
index 0000000..d1cd0bb
--- /dev/null
+++ b/man/predict.kcmeans.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/Fhat.R
+\name{predict.kcmeans}
+\alias{predict.kcmeans}
+\title{Inference Methods for Partially Linear Estimators.}
+\usage{
+\method{predict}{kcmeans}(object, newdata, clusters = FALSE, ...)
+}
+\arguments{
+\item{object}{An object of class \code{ddml_plm}, \code{ddml_pliv}, or
+\code{ddml_fpliv} as fitted by \code{\link[ddml:ddml_plm]{ddml::ddml_plm()}}, \code{\link[ddml:ddml_pliv]{ddml::ddml_pliv()}},
+and \code{\link[ddml:ddml_fpliv]{ddml::ddml_fpliv()}}, respectively.}
+
+\item{...}{Additional arguments passed to \code{vcovHC}. See
+\code{\link[sandwich:vcovHC]{sandwich::vcovHC()}} for a complete list of arguments.}
+}
+\value{
+An array with inference results for each \code{ensemble_type}.
+}
+\description{
+Inference methods for partially linear estimators. Simple
+wrapper for \code{\link[sandwich:vcovHC]{sandwich::vcovHC()}}.
+}
+\examples{
+# Construct variables from the included Angrist & Evans (1998) data
+y = AE98[, "worked"]
+D = AE98[, "morekids"]
+X = AE98[, c("age","agefst","black","hisp","othrace","educ")]
+
+# Estimate the partially linear model using a single base learner, ridge.
+plm_fit <- ddml_plm(y, D, X,
+                    learners = list(what = mdl_glmnet,
+                                    args = list(alpha = 0)),
+                    sample_folds = 2,
+                    silent = TRUE)
+summary(plm_fit)
+}
+\references{
+Zeileis A (2004). "Econometric Computing with HC and HAC Covariance Matrix
+Estimators.” Journal of Statistical Software, 11(10), 1-17.
+
+Zeileis A (2006). “Object-Oriented Computation of Sandwich Estimators.”
+Journal of Statistical Software, 16(9), 1-16.
+
+Zeileis A, Köll S, Graham N (2020). “Various Versatile Variances: An
+Object-Oriented Implementation of Clustered Covariances in R.” Journal of
+Statistical Software, 95(1), 1-36.
+}
+\seealso{
+\code{\link[sandwich:vcovHC]{sandwich::vcovHC()}}
+}
diff --git a/man/toy_fun.Rd b/man/toy_fun.Rd
index 69b4af8..8aaeb05 100644
--- a/man/toy_fun.Rd
+++ b/man/toy_fun.Rd
@@ -1,26 +1,15 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/Fhat.R, R/toy_fun.R
+% Please edit documentation in R/toy_fun.R
 \name{toy_fun}
 \alias{toy_fun}
-\title{First stage estimator.}
+\title{Toy function.}
 \usage{
-toy_fun(y)
-
 toy_fun(y)
 }
 \arguments{
 \item{x}{A numerical vector.}
 }
 \value{
-\code{toy_fun} returns an object of S3 class
-\code{toy_fun}. An object of class \code{toy_fun} is a list containing
-the following components:
-\describe{
-\item{\code{fun}}{A boolean on whether you had fun..}
-\item{\code{y}}{Pass-through of the ser-provided arguments.
-See above.}
-}
-
 \code{toy_fun} returns an object of S3 class
 \code{toy_fun}. An object of class \code{toy_fun} is a list containing
 the following components:
@@ -31,14 +20,10 @@ See above.}
 }
 }
 \description{
-This is a simple toy function.
-
 This is a simple toy function.
 }
 \examples{
 res <- toy_fun(rnorm(100))
 res$fun
-res <- toy_fun(rnorm(100))
-res$fun
 }
 \concept{toys}
diff --git a/tests/testthat/test-Fhat.R b/tests/testthat/test-Fhat.R
index 4be29bf..ccf4380 100644
--- a/tests/testthat/test-Fhat.R
+++ b/tests/testthat/test-Fhat.R
@@ -1,8 +1,42 @@
-test_that("Fhat computes", {
-  # Generate data
+test_that("kcmenas computes", {
+  # Get data from the included SimDat data
+  y <- SimDat$D
+  X <- SimDat$Z
 
+  # Compute kcmeans
+  kcmeans_fit <- kcmeans(y, X, K = 3)
 
+  # Check output with expectations
+  expect_equal(dim(kcmeans_fit), c(60, 5))
+})#TEST_THAT
+
+test_that("kcmenas computes with additional controls", {
+  # Get data from the included SimDat data
+  y <- SimDat$D
+  X <- cbind(SimDat$Z, SimDat$X)
+
+
+  # Compute kcmeans
+  kcmeans_fit <- kcmeans(y, X, K = 3)
+
+  # Check output with expectations
+  expect_equal(dim(kcmeans_fit), c(60, 5))
+})#TEST_THAT
+
+test_that("predict.kcmenas computes w/ unseen categories", {
+  # Get data from the included SimDat data
+  y <- SimDat$D
+  X <- cbind(SimDat$Z, SimDat$X)
+
+  # Compute kcmeans
+  kcmeans_fit <- kcmeans(y, X, K = 3)
+
+  # Compute predictions w/ unseen categories
+  newdata <- X
+  newdata[1:20, 1] <- -22
+
+  fitted_values <- predict(kcmeans_fit, newdata)
 
   # Check output with expectations
-  expect_equal(length(myfun), 1)
+  expect_equal(dim(kcmeans_fit), c(60, 5))
 })#TEST_THAT