diff --git a/paper/joint.png b/paper/joint.png index baeb239..03626fa 100644 Binary files a/paper/joint.png and b/paper/joint.png differ diff --git a/paper/jones.R b/paper/jones.R index b329374..5dfdc1f 100644 --- a/paper/jones.R +++ b/paper/jones.R @@ -42,14 +42,16 @@ ggplot(mp.int[fun == "variance"], aes(Petal.Width, Petal.Length)) + ggsave("mp_int_var.png", width = 8, height = 4) ## marginalization example -mp = function(n, sigma, f) { +mp = function(n, sigma, f, ...) { aggregate.fun = function(x, w) { mu = weighted.mean(x, w) sigma = (sum(w * x^2) * sum(w) - sum(w * x)^2) / (sum(w)^2 - sum(w^2)) list( "mean" = mu, "lower" = mu + qnorm(.025) * sigma, - "upper" = mu + qnorm(.975) * sigma) + "upper" = mu + qnorm(.975) * sigma, + "sigma" = sigma + ) } points = list("X1" = seq(-4, 4, length.out = 100)) @@ -58,7 +60,7 @@ mp = function(n, sigma, f) { p.marginal = function(design, data, sigma) dnorm(design$X2) X = rmvnorm(n, sigma = sigma) - data = data.frame(y = f(NULL, X), X) + data = data.frame(y = f(NULL, data.frame(X)), X) list( "joint" = marginalPrediction(data[, c("X1", "X2")], "X1", c(25, n), NULL, points = points, weight.fun = p.joint, predict.fun = f, @@ -72,19 +74,24 @@ mp = function(n, sigma, f) { n = 10000 sigma.diagonal = diag(2) sigma.dependent = matrix(c(1, .5, .5, 1), 2, 2) +sigma.het = matrix(c(1, .9, .9, 1), 2, 2) X = data.frame(rmvnorm(500, sigma = sigma.dependent)) ggplot(X, aes(X1, X2)) + geom_point() ggsave("joint.png", width = 7, height = 6) -f.additive = function(object, newdata) as.numeric(as.matrix(newdata) %*% c(1, 1)) -f.interaction = function(object, newdata) - as.numeric(as.matrix(data.frame(newdata, newdata[, 1] * newdata[, 2])) %*% c(1, 1, .5)) +f.additive = function(object, newdata, theta = c(1, 1)) + newdata$X1 * theta[1] + newdata$X2 * theta[2] +f.interaction = function(object, newdata, theta = c(1, 1, .5)) + f.additive(object, newdata, theta[1:2]) + newdata$X1 * newdata$X2 * theta[3] +## f.breakit = function(object, newdata) ## f.interaction(object, newdata, c(1, 15, 10)) plt = list( "additive" = mp(n, sigma.dependent, f.additive), - "interaction" = mp(n, sigma.dependent, f.interaction) + "interaction" = mp(n, sigma.dependent, f.interaction)## , + ## "broken" = mp(n, sigma.het, f.breakit) ) plt = rbindlist(lapply(plt, rbindlist, idcol = "estimation"), idcol = "sim.type") @@ -95,3 +102,8 @@ ggplot(plt, aes(X1, preds.mean, color = estimation)) + labs(y = expression(f(X[1])), x = expression(X[1])) + geom_line(aes(X1, X1), linetype = "dashed", color = "black") ggsave("mvj.png", width = 8, height = 4) + +ggplot(plt[estimation == "marginal", ], aes(X1, preds.sigma)) + geom_line() + + facet_wrap(~ sim.type) + + labs(y = expression(var(f(X[1]))), x = expression(X[1])) +ggsave("mvv.png", width = 8, height = 4) diff --git a/paper/jones.bib b/paper/jones.bib index dfa8381..53439c9 100644 --- a/paper/jones.bib +++ b/paper/jones.bib @@ -111,3 +111,147 @@ @article{jones2016 Forests}, journal = {The Journal of Open Source Software} } + +@article{metropolis1949monte, + title = {The Monte Carlo method}, + author = {Metropolis, Nicholas and Ulam, Stanislaw}, + journal = {Journal of the American Statistical Association}, + volume = 44, + number = 247, + pages = {335--341}, + year = 1949, + publisher = {Taylor \& Francis Group} +} + +@incollection{hammersley1964general, + title = {General principles of the Monte Carlo method}, + author = {Hammersley, John Michael and Handscomb, David Christopher}, + booktitle = {Monte Carlo Methods}, + pages = {50--75}, + year = 1964,
publisher = {Springer} +} + +@article{athey2015machine, + title={Machine learning methods for estimating heterogeneous causal effects}, + author={Athey, Susan and Imbens, Guido W}, + journal={stat}, + volume={1050}, + number={5}, + year={2015} +} + +@article{wager2017estimation, + title={Estimation and inference of heterogeneous treatment effects using random forests}, + author={Wager, Stefan and Athey, Susan}, + journal={Journal of the American Statistical Association}, + number={just-accepted}, + year={2017}, + publisher={Taylor \& Francis} +} + +@inproceedings{athey2015policy, + title={Machine learning and causal inference for policy evaluation}, + author={Athey, Susan}, + booktitle={Proceedings of the 21st ACM SIGKDD international conference on knowledge discovery and data mining}, + pages={5--6}, + year={2015}, + organization={ACM} +} + +@article{mullainathan2017machine, + title={Machine learning: an applied econometric approach}, + author={Mullainathan, Sendhil and Spiess, Jann}, + journal={Journal of Economic Perspectives}, + volume={31}, + number={2}, + pages={87--106}, + year={2017} +} + +@article{athey2016recursive, + title={Recursive partitioning for heterogeneous causal effects}, + author={Athey, Susan and Imbens, Guido}, + journal={Proceedings of the National Academy of Sciences}, + volume={113}, + number={27}, + pages={7353--7360}, + year={2016}, + publisher={National Acad Sciences} +} + +@article{athey2017state, + title={The state of applied econometrics: Causality and policy evaluation}, + author={Athey, Susan and Imbens, Guido W}, + journal={Journal of Economic Perspectives}, + volume={31}, + number={2}, + pages={3--32}, + year={2017} +} + + +@article{LupuJones, + title = {Is There More Violence in the Middle?}, + author = {Lupu, Yonatan and Jones, Zachary M.}, + journal = {American Journal of Political Science}, + year = 2018 +} + +@article{hill2014empirical, + title = {An Empirical Evaluation of Explanations for State + Repression}, + author = {Hill, Daniel W and Jones, Zachary M}, + journal = {American Political Science Review}, + volume = 108, + number = 03, + pages = {661--687}, + year = 2014, + publisher = {Cambridge Univ Press} +} + +@article{doi:10.1162/003465304323023651, + author = {Guido W. Imbens}, + title = {Nonparametric Estimation of Average Treatment Effects Under Exogeneity: A Review}, + journal = {The Review of Economics and Statistics}, + volume = {86}, + number = {1}, + pages = {4--29}, + year = {2004}, + doi = {10.1162/003465304323023651}, + url = {https://doi.org/10.1162/003465304323023651} +}
+ +@article{berk2009forecasting, + title={Forecasting murder within a population of probationers and parolees: a high stakes application of statistical learning}, + author={Berk, Richard and Sherman, Lawrence and Barnes, Geoffrey and Kurtz, Ellen and Ahlman, Lindsay}, + journal={Journal of the Royal Statistical Society: Series A (Statistics in Society)}, + volume={172}, + number={1}, + pages={191--211}, + year={2009}, + publisher={Wiley Online Library} +} + +@article{blair2017predicting, + title={Predicting local violence: Evidence from a panel survey in Liberia}, + author={Blair, Robert A and Blattman, Christopher and Hartman, Alexandra}, + journal={Journal of Peace Research}, + volume={54}, + number={2}, + pages={298--312}, + year={2017}, + publisher={SAGE Publications Sage UK: London, England} +} \ No newline at end of file diff --git a/paper/jones.pdf b/paper/jones.pdf index 02f3e86..1909eb7 100644 Binary files a/paper/jones.pdf and b/paper/jones.pdf differ diff --git a/paper/jones.tex b/paper/jones.tex index a0dc555..7774bf6 100644 --- a/paper/jones.tex +++ b/paper/jones.tex @@ -5,18 +5,20 @@ \maketitle \abstract{ -Machine learning methods can often learn high-dimensional functions which generalize well but are not human interpretable. \code{mmpf} marginalizes prediction functions using Monte-Carlo methods, allowing users to investigate the behavior of these learned functions as on a lower dimensional subset of input features: partial dependence and variations thereof. This makes machine learning methods more useful in situations where accurate prediction is not the only goal, such as in the social sciences where linear models are commonly used because of their interpretability. +Machine learning methods can often learn high-dimensional functions which generalize well but are not human interpretable. \code{mmpf} marginalizes prediction functions using Monte-Carlo methods, allowing users to investigate the behavior of these learned functions on subsets of input features: partial dependence and variations thereof. This makes machine learning methods more useful in situations where accurate prediction is not the only goal. } -Many methods for estimating prediction functions produce estimated functions which are not directly human-interpretable because of their complexity: they may include high-dimensional interactions and/or complex nonlinearities. While a learning method's capacity to automatically learn interactions and nonlinearities is attractive when the goal is prediction, there are many cases where users want good predictions \textit{and} the ability to understand how predictions depend on the features. \code{mmpf} implements general methods for interpreting prediction functions using Monte-Carlo methods.
These methods allow any function which generates predictions to be be interpreted. \code{mmpf} is currently used in other packages for machine learning like \code{edarf} and \code{mlr} \citep{jones2016,JMLR:v17:15-066}. +Many methods for estimating prediction functions produce estimated functions which are not directly human-interpretable because of their complexity: they may include high-dimensional interactions and/or complex nonlinearities. Additionally, with many machine learning methods, the resulting prediction function is difficult to interpret even when the function it represents is simple enough to be human-interpretable (e.g., a random forest fit to data from a linear and additive statistical model). While a learning method's capacity to automatically learn interactions and nonlinearities is attractive when the goal is prediction, there are many cases where users want good predictions \textit{and} the ability to understand how predictions depend on the features. \code{mmpf} implements general methods for interpreting prediction functions using Monte-Carlo methods \citep{friedman2001greedy}. These methods allow any function which generates predictions to be interpreted. \code{mmpf} is currently used in other packages for machine learning like \code{edarf} and \code{mlr} \citep{jones2016,JMLR:v17:15-066}. + +Machine learning has found application in a number of areas where this package may correspondingly be of use. Within economics and political science, for example, useful areas of application might include the estimation of treatment effects and their heterogeneity \citep{doi:10.1162/003465304323023651,athey2017state}, exploratory analysis wherein the goal is to uncover predictively important features for future study \citep{LupuJones,hill2014empirical}, and the analysis of forecast models \citep{berk2009forecasting,blair2017predicting}. \section{Marginalizing Prediction Functions} -The core function of \code{mmpf}, \code{marginalPrediction}, allows marginalization of a prediction function so that it depends on a subset of the features. Say the matrix of features $\mathbf{X}$ is partitioned into two subsets, $\mathbf{X}_u$ and $\mathbf{X}_{-u}$, where the former is of primary interest. A prediction function $f$ which in the regression case maps $\mathbf{X} \rightarrow \mathbf{y}$, where $\mathbf{y}$ is a real-valued vector might not be additive or linear in the columns of $\mathbf{X}_u$, making $f$ difficult to interpret directly. To obtain the marginal relationship between $\mathbf{X}_u$ and $f$ we could marginalize the joint distribution so that we obtain a function $f_u$ which only depends on the relevant subset of the features. +The core function of \code{mmpf}, \code{marginalPrediction}, allows marginalization of a prediction function so that it depends on a subset of the features. Say the matrix of features $\mathbf{X}$ is partitioned into two subsets, $\mathbf{X}_u$ and $\mathbf{X}_{-u}$, where the former is of primary interest. A prediction function $f$ which in the regression case maps $\mathbf{X} \rightarrow \mathbf{y}$, where $\mathbf{y}$ is a real-valued vector, might not be additive or linear in the columns of $\mathbf{X}_u$, making $f$ difficult to interpret directly. To obtain the marginal relationship between $\mathbf{X}_u$ and $f$ we could marginalize the joint distribution so that we obtain a function $f_u$ which only depends on the relevant subset of the features.
$$f_u (\mathbf{X}_u) = \int f(\mathbf{X}_u, \mathbf{X}_{-u}) \mathbb{P}(\mathbf{X}_u | \mathbf{X}_{-u}) \mathbb{P}(\mathbf{X}_{-u}) d \mathbf{X_{-u}} \label{eq:joint}$$ -This however, can distort the relationship between $\mathbf{X}_u$ and $f$ because of the inclusion of dependence between $\mathbf{X}_u$ and $\mathbf{X}_{-u}$ (specifically $\mathbf{X}_u | \mathbf{X}_{-u}$), which is unrelated to $f$. An alternative is to instead integrate against the marginal distribution of $\mathbf{X}_{-u}$ as in \ref{eq:marginal}, as suggested by \citep{friedman2001greedy}. +This, however, can distort the relationship between $\mathbf{X}_u$ and $f$ because of the inclusion of dependence between $\mathbf{X}_u$ and $\mathbf{X}_{-u}$ (specifically $\mathbf{X}_u | \mathbf{X}_{-u}$), which is not directly related to $f$.\footnote{As noted by \cite{hooker2004discovering}, being able to separate $f$ into components additive in $\mathbf{X}_u$ and $\mathbf{X}_{-u}$ does not mean that $\mathbf{X}_u$ and $\mathbf{X}_{-u}$ are independent.} An alternative is to instead integrate against the marginal distribution of $\mathbf{X}_{-u}$ as in \ref{eq:marginal}, as suggested by \citet{friedman2001greedy}. $$\tilde{f}_u (\mathbf{X}_u) = \int f(\mathbf{X}_u, \mathbf{X}_{-u}) \mathbb{P}(\mathbf{X}_{-u}) d \mathbf{X_{-u}} \label{eq:marginal}$$ @@ -27,17 +29,17 @@ \section{Marginalizing Prediction Functions} To illustrate this point, suppose data are generated from an additive model, $f(\cdot) = \mathbf{\mathbf{x}_1} + \mathbf{\mathbf{x}_2}$ and $(\mathbf{x}_1, \mathbf{x}_2) \sim \text{MVN}(\mathbf{0}, \Sigma)$ where the diagonal entries of $\Sigma$ are $1$ and the off-diagonals are $.5$. That is, $(\mathbf{x}_1, \mathbf{x}_2)$ are correlated by construction. Now if we want to understand how $f$ depends on $\mathbf{x}_1$ we could integrate against the true joint distribution as in \ref{eq:joint}. However, this distorts the relationship between $\mathbf{x}_1$ and $f$ because the conditional probability of $\mathbf{x}_1$ given $\mathbf{x}_2$ is higher at values of $(\mathbf{x}_1, \mathbf{x}_2)$ which are more extreme (due to their correlation). Since $\mathbf{x}_2$ is related to $f$ this has the effect of distorting the relationship between $\mathbf{x}_1$ and $f$, and, in this case, makes the relationship appear more extreme than it is, as can be seen in the left panel of Figure \ref{fig:sim}. This distortion of the relationship between $\mathbf{x}_1$ and $f$ can be made more misleading if $\mathbf{x}_2$ interacts with $\mathbf{x}_1$ to produce $f$, or if $\mathbf{x}_2$ has a nonlinear relationship with $f$, as can be seen in the right panel of Figure \ref{fig:sim}. -Integrating against the marginal distribution of $\mathbf{x}_1$ recovers the true additive effect (left panel) and the average marginal effect ($\mathbf{x}_1 + .5 \mathbf{x}_1 \bar{\mathbf{x}_2}$, in the right panel) respectively. +Integrating against the marginal distribution of $\mathbf{x}_2$ recovers the true additive effect (left panel) and the average marginal effect ($\mathbf{x}_1 + .5 \mathbf{x}_1 \bar{\mathbf{x}}_2$, in the right panel), respectively. \section{Using \code{mmpf}} -In practical settings we do not know $\mathbb{P}(\mathbf{X})$ or $f$. We can use $\hat{f}$, estimated from $(\mathbf{X}, \mathbf{y})$ as a plug-in estimator for $f$ and can estimate $\mathbb{P}(\mathbf{X}_{-u})$ from the training data, allowing us to estimate the \emph{partial dependence} of $\mathbf{X}_u$ on $\hat{f}$ \cite{friedman2001greedy}.
+In practical settings we do not know $\mathbb{P}(\mathbf{X})$ or $f$. Using $\hat{f}$, estimated from $(\mathbf{X}, \mathbf{y})$, as a plug-in estimator for $f$ and treating the training data as a realization from $\mathbb{P}(\mathbf{X})$ allows us to estimate the \emph{partial dependence} of $\mathbf{X}_u$ on $\hat{f}$ \citep{friedman2001greedy}. $$\hat{f}_u (\mathbf{X}_u) = \frac{1}{N} \sum_{i = 1}^N \hat{f} (\mathbf{X}_u, \mathbf{X}_{-u}^{(i)}) \label{eq:pd}$$ -This the behavior of the prediction function at a vector or matrix of values for $\mathbf{X}_u$, averaged over the empirical marginal distribution of $\mathbf{X}_{-u}$. +This is the behavior of the prediction function at a vector or matrix of values for $\mathbf{X}_u$, averaged over the empirical marginal distribution of $\mathbf{X}_{-u}$, which is an application of Monte-Carlo integration \citep{metropolis1949monte,hammersley1964general}. -The core function of \code{mmpf}, \code{marginalPrediction}, allows users to compute partial dependence and many variations thereof easily. The key arguments of \code{marginalPrediction} are the prediction function (\code{predict.fun}), the training data (\code{data}), the names of the columns of the training data which are of interest (\code{vars}), the number of points to use in the grid for $\mathbf{X}_u$ and the number of points to sample from $\mathbf{X}_{-u}$ (\code{n}, an integer vector of length 2). Additional arguments control how the grid is constructed (e.g., uniform sampling, user chosen values, non-uniform sampling), allow the use of weights, and how aggregation is done (e.g., deviations from partial dependence). Below is an example using the Iris data \citep{anderson1936species}. +The core function of \code{mmpf}, \code{marginalPrediction}, allows users to compute partial dependence and many variations thereof easily. The key arguments of \code{marginalPrediction} are the prediction function (\code{predict.fun}), the training data (\code{data}), the names of the columns of the training data which are of interest (\code{vars}), and the number of points to use in the grid for $\mathbf{X}_u$ and the number of points to sample from $\mathbf{X}_{-u}$ (\code{n}, an integer vector of length 2). Additional arguments control how the grid is constructed (e.g., uniform sampling, user-chosen values, non-uniform sampling), allow the use of weights (e.g., to exclude certain regions of the feature space), and determine how aggregation is done (e.g., deviations from partial dependence). Below is an example using the Iris data \citep{anderson1936species}. \begin{example} library(mmpf) @@ -72,7 +74,7 @@ \section{Using \code{mmpf}} \caption{The expected value of $\hat{f}$ estimated by a random forest and marginalized by Monte-Carlo integration to depend only on ``Petal.Width.'' \label{figure:mp}} \end{figure} -In fact, \textit{any} function of the marginalized function $\hat{f}_u$ can be computed, including vector-valued functions. For example the expectation and variance of $\hat{f}_u$ can be simultaneously computed, the results of which are shown in Figures \ref{figure:mp_int_mean} and \ref{figure:mp_int_var}. Computing the variance of $\hat{f}_u$ can be used for detecting interactions between $\mathbf{X}_u$ and $\mathbf{X}_{-u}$ \citep{goldstein2015peeking}.
If the variance of $\hat{f}_u(\mathbf{X}_u)$ is constant then this indicates that $\mathbf{X}_{-u}$ does not interact with $\mathbf{X}_u$, since, if it did, this would make $\hat{f}$ more variable in regions of the joint distribution wherein there is interaction between $\mathbf{X}_u$ and $\mathbf{X}_{-u}$. +In fact, \textit{any} function of the marginalized function $\hat{f}_u$ can be computed, including vector-valued functions. For example, the expectation and variance of $\hat{f}_u$ can be simultaneously computed, the results of which are shown in Figures \ref{figure:mp_int_mean} and \ref{figure:mp_int_var}. Computing the variance of $\hat{f}_u$ can be used for detecting interactions between $\mathbf{X}_u$ and $\mathbf{X}_{-u}$ \citep{goldstein2015peeking}. If the variance of $\hat{f}_u(\mathbf{X}_u)$ is constant then this indicates that $\mathbf{X}_{-u}$ does not interact with $\mathbf{X}_u$, since, if it did, this would make $\hat{f}$ more variable in regions of the joint distribution wherein there is interaction between $\mathbf{X}_u$ and $\mathbf{X}_{-u}$. This capability also makes it possible to propagate estimates of sampling uncertainty from the underlying model to the output of \code{marginalPrediction}. \begin{example} mp.int = marginalPrediction(data = iris.features, @@ -114,7 +116,7 @@ \section{Using \code{mmpf}} \bibliography{jones} \address{Zachary M. Jones\\ - Pennsylvania State University\\ - University Park, Pennsylvania\\ + The University of Washington\\ + Seattle, Washington\\ United States\\} \email{zmj@zmjones.com} \ No newline at end of file diff --git a/paper/memo.txt b/paper/memo.txt new file mode 100644 index 0000000..67a2514 --- /dev/null +++ b/paper/memo.txt @@ -0,0 +1,19 @@ + +for reviewer 1. + +there is a bit of literature on treatment effect heterogeneity that seems very similar to what you suggest. i added a paragraph which discusses potential applications of these methods and i reference some papers in this literature there. the consensus amongst the econometricians and statisticians whose work i am aware of seems to be that regularization bias can be a significant problem, and to avoid this ml has to be tuned a bit differently, used in a multi-step process which removes this bias, or formulated differently (i.e., by predicting treatment effects rather than outcomes). really, it seems to be all of these things. discussing all of this i think would be too much (beyond the scope of the paper), so i felt it was appropriate to instead just reference some of the literature, as i haven't made a contribution to ml/causal inference at all. + +since mmpf can operate on prediction functions which produce a vector of outputs for each input (e.g., class probabilities) it can also operate on summaries of the conditional distribution of the outcome which are also vector valued. so you could do quantile regression and then compute partial dependence using mmpf. thus it will work in the case where your prediction model's output is an expected value and a variance or standard error. in that sense i can propagate estimates of sampling uncertainty based on the underlying model. i've now added a sentence mentioning this. in some of the software i've built that uses mmpf this is more of a built-out feature (e.g. in mlr). i don't currently have a way to represent uncertainty from the additional approximation error induced by using the mmpf approximation though. that is something i am working on now as a part of a larger project. (a rough sketch of the vector-valued output idea is appended after this memo.) + +for reviewer 2. + +1. i added some appropriate references.
+1.2. i don't think that is specifically correct. for example suppose we have data from a two parameter linear-additive model y = x * beta + epsilon_w, where the scale of epsilon is 1 when w = 1 and 10 when w = 0. if you fit a model of the form y | x = x * beta_1 i(w = 1) + x * beta_2 i(w = 0) where i(.) is the indicator function, and took the derivative w.r.t. x you'd get beta_1 i(w = 1) + beta_2 i(w = 0). the expectation of this is just beta. i know that is very basic but all i'm saying is that heteroskedasticity can screw up inference which depends on homoskedasticity, but almost by definition it can't screw up the mean function estimation, which is what all of this is about. asymptotics aside i am sure you could get unlucky and have heteroskedasticity look like first order misspecification (i tend to think they both occur simultaneously in practice), but as an exploratory diagnostic i don't think it is problematic for this reason. in any case what you seem to be focused on is the ability of an underlying model to be biased because of heteroskedasticity (i.e. confusing unequal variance for unequal means) which is certainly possible. i view it as advantageous that a diagnostic would faithfully represent to the user what the model learned. i did some simulations using the simulation i used for this paper and at least in that simple case my intuition appears to be right. heteroskedasticity can make integrating against the joint distribution much more distorting as well. + +2.1. i use mmpf in two separate packages (edarf and mlr) that do exactly this, but i don't think this fits here. this was intended to be lightweight, have minimal dependencies, and provide the user with maximum control. it isn't really usable if you don't have some programming ability and i think it is reasonable to expect the user to be able to make basic plots, or they can use one of the more end-to-end solutions i mentioned. + +2.2. in this scenario x2 is indirectly related to f of course. the way i (and giles hooker and also jerome friedman) have posed it, the goal is the accurate (faithful to the fit model) recovery of additive components. to them i guess it is obvious that integrating against the joint distribution is misleading, and my little simulation just highlights that. so in that sense i'm not sure why you'd want to integrate against the joint distribution even if you had it (like i do in the simulation). + +friedman discusses using an estimate of p(x_u | x_{-u}) in his 2001 paper (and dismisses it for the first reason), and hooker (2007) discusses it somewhat indirectly, via a symptom of integrating against the marginal distribution, which is that the estimates of the additive components can, in some scenarios, depend on functional behavior in regions of low density (especially in high dimensions). i.e. if there is strong dependence between x_u and x_{-u} this can mean that their joint distribution is very non-uniform, and what a model learned on such data says in those empty regions depends much more on the model class than on the data. + +in any case you can already do what you suggest by using the weights argument to marginalPrediction (which is how i set up the simulation to show how this is a bad idea). (a small sketch of this usage is appended below as well.)
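
To make the vector-valued-output point in the response to reviewer 1 concrete, here is a rough illustrative sketch. It is not code from jones.R or the paper: the model, data, and quantile choices are assumptions, and it assumes that marginalPrediction aggregates matrix-valued predictions column by column, as its handling of class probabilities suggests. The prediction function returns, for each row of newdata, the forest's aggregate prediction together with quantiles of the per-tree predictions, so a crude measure of the model's sampling variability is carried through to the partial dependence output.

## illustrative sketch only; not part of jones.R
library(mmpf)
library(randomForest)

fit = randomForest(Sepal.Length ~ ., data = iris)

## return a vector per observation: the aggregate prediction plus
## quantiles of the per-tree predictions
predict.quantiles = function(object, newdata) {
  p = predict(object, newdata = newdata, predict.all = TRUE)
  cbind("mean" = p$aggregate,
    "lower" = apply(p$individual, 1, quantile, probs = .05),
    "upper" = apply(p$individual, 1, quantile, probs = .95))
}

## 10 grid points for Petal.Width, 50 sampled rows of the other features
mp.unc = marginalPrediction(iris[, -1], "Petal.Width", c(10, 50), fit,
  predict.fun = predict.quantiles)
mp.unc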
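
And a companion sketch of the final point above, the weights argument: supplying a density through weight.fun makes marginalPrediction reweight the integration points, which approximates integrating against the joint distribution, the thing the paper's simulation shows to be misleading. This imitates, but is not identical to, the setup in jones.R; the weight.fun and aggregate.fun signatures follow the usage there, while the sample size, grid, and covariance are illustrative assumptions.

## illustrative sketch only; imitates the simulation in jones.R
library(mmpf)
library(mvtnorm)

set.seed(1)
sigma = matrix(c(1, .5, .5, 1), 2, 2)
X = data.frame(rmvnorm(1000, sigma = sigma))  ## columns X1, X2
f.additive = function(object, newdata) newdata$X1 + newdata$X2

## weight each integration point by the joint density of (X1, X2)
p.joint = function(design, data, ...)
  dmvnorm(as.matrix(design[, c("X1", "X2")]), sigma = sigma)

mp.joint = marginalPrediction(X, "X1", c(25, 500), NULL,
  weight.fun = p.joint,
  aggregate.fun = function(x, w) weighted.mean(x, w),
  predict.fun = f.additive)

## without weight.fun this is ordinary partial dependence; comparing the two
## reproduces the distortion shown in the paper's simulation figure
mp.marginal = marginalPrediction(X, "X1", c(25, 500), NULL,
  predict.fun = f.additive)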
diff --git a/paper/mp.png b/paper/mp.png index 55d3f8d..f4b25bc 100644 Binary files a/paper/mp.png and b/paper/mp.png differ diff --git a/paper/mp_int_mean.png b/paper/mp_int_mean.png index b8596ac..b576785 100644 Binary files a/paper/mp_int_mean.png and b/paper/mp_int_mean.png differ diff --git a/paper/mp_int_var.png b/paper/mp_int_var.png index 8a8fb82..f07ac49 100644 Binary files a/paper/mp_int_var.png and b/paper/mp_int_var.png differ diff --git a/paper/mvj.png b/paper/mvj.png index d91a566..31e5946 100644 Binary files a/paper/mvj.png and b/paper/mvj.png differ diff --git a/paper/mvv.png b/paper/mvv.png new file mode 100644 index 0000000..ecd0d01 Binary files /dev/null and b/paper/mvv.png differ