% Chapter1.tex

\documentclass[10pt]{beamer}

\setbeamersize{text margin left=0.5cm, text margin right=0.5cm}

\usepackage{alltt}%
%\usetheme{Boadilla}
\usetheme[progressbar = foot, background=light]{metropolis} 
%\useoutertheme{split}

%\usecolortheme{beaver}

%\usepackage{listings}
% \maxwidth: width helper intended for included graphics.
% Expands to \linewidth when \Gin@nat@width is larger than \linewidth,
% and to \Gin@nat@width otherwise.
% \makeatletter/\makeatother are required because the name contains @.
% NOTE(review): \Gin@nat@width is presumably the natural image width set by
% graphicx while it processes \includegraphics -- confirm. \maxwidth is not
% referenced in the visible part of this file.
\makeatletter
\def\maxwidth{ %
  \ifdim\Gin@nat@width>\linewidth
    \linewidth
  \else
    \Gin@nat@width
  \fi
}
\makeatother

% Foreground colour for code chunks (selected with \color{fgcolor} before each
% kframe) and the one-argument \hl* colouring macros.
% Each \hl* macro typesets its argument in a fixed rgb colour; \hlcom also
% italicises it, \hlkwa and \hlkwd also embolden it. \hlipl is an alias of \hlkwb.
% NOTE(review): these look like knitr's syntax-highlighting commands (numbers,
% strings, comments, keywords, ...) -- confirm. None of the \hl* macros is
% called in the visible part of this file.
\definecolor{fgcolor}{rgb}{0.345, 0.345, 0.345}
\newcommand{\hlnum}[1]{\textcolor[rgb]{0.686,0.059,0.569}{#1}}%
\newcommand{\hlstr}[1]{\textcolor[rgb]{0.192,0.494,0.8}{#1}}%
\newcommand{\hlcom}[1]{\textcolor[rgb]{0.678,0.584,0.686}{\textit{#1}}}%
\newcommand{\hlopt}[1]{\textcolor[rgb]{0,0,0}{#1}}%
\newcommand{\hlstd}[1]{\textcolor[rgb]{0.345,0.345,0.345}{#1}}%
\newcommand{\hlkwa}[1]{\textcolor[rgb]{0.161,0.373,0.58}{\textbf{#1}}}%
\newcommand{\hlkwb}[1]{\textcolor[rgb]{0.69,0.353,0.396}{#1}}%
\newcommand{\hlkwc}[1]{\textcolor[rgb]{0.333,0.667,0.333}{#1}}%
\newcommand{\hlkwd}[1]{\textcolor[rgb]{0.737,0.353,0.396}{\textbf{#1}}}%
\let\hlipl\hlkwb

\usepackage{framed}
% kframe: shaded box used around every code chunk in the frames below.
% Begin part:
%   - resets \at@end@of@kframe to empty;
%   - when started in inner horizontal mode, opens a minipage of width
%     \columnwidth and stores the matching \end{minipage} in \at@end@of@kframe;
%   - defines \FrameCommand to put the content in \colorbox{shadecolor},
%     surrounded by skips built from \@totalleftmargin, \fboxsep, \linewidth
%     and \columnwidth;
%   - starts \MakeFramed with \hsize reduced by \width, \@totalleftmargin set
%     to zero and \linewidth set to \hsize.
% End part: closes \MakeFramed, then runs \at@end@of@kframe.
% NOTE(review): this appears to be the standard knitr-generated definition;
% keep it token-for-token unless regenerated by knitr -- confirm.
\makeatletter
\newenvironment{kframe}{%
 \def\at@end@of@kframe{}%
 \ifinner\ifhmode%
  \def\at@end@of@kframe{\end{minipage}}%
  \begin{minipage}{\columnwidth}%
 \fi\fi%
 \def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep
 \colorbox{shadecolor}{##1}\hskip-\fboxsep
     % There is no \@totalrightmargin, so:
     \hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth}%
 \MakeFramed {\advance\hsize-\width
   \@totalleftmargin\z@ \linewidth\hsize
   \@setminipage}}%
 {\par\unskip\endMakeFramed%
 \at@end@of@kframe}
\makeatother

% Colours for code chunks. shadecolor is the background that kframe passes to
% \colorbox; the frames below redefine it locally inside each knitrout.
% messagecolor, warningcolor and errorcolor are defined here but not
% referenced in the visible part of this file.
% knitrout is an empty wrapper environment; the frames use it to group the
% local \definecolor{shadecolor}/\color{fgcolor} settings with a kframe.
\definecolor{shadecolor}{rgb}{.97, .97, .97}
\definecolor{messagecolor}{rgb}{0, 0, 0}
\definecolor{warningcolor}{rgb}{1, 0, 1}
\definecolor{errorcolor}{rgb}{1, 0, 0}
\newenvironment{knitrout}{}{} % an empty environment to be redefined in TeX
    
\usepackage[utf8]{inputenc}
\usepackage{default}

\usepackage{xcolor}%for color mixing

\usepackage{amsmath}%
\usepackage{amsfonts}%
\usepackage{amssymb}%
\usepackage{graphicx}

\usepackage{tikz}


\setbeamertemplate{itemize/enumerate body begin}{\small}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\title{Statistical Thinking in Biology Research}
\subtitle{Chapter 1}
\author[T. Bonnet, T. Neeman]{Timoth\'ee Bonnet \& Terry Neeman}
\institute[RSB/BDSI]{Research School of Biology and Biological Data Science Institute}
\date{\today}

\begin{document}

%\lstset{language=R}%code
\setbeamerfont{section in toc}{size*={14}{16}}
\AtBeginSection[]
{
  \begin{frame}<beamer>
    \frametitle{}
    \tableofcontents[currentsection,sectionstyle=show/shaded,subsectionstyle=show/shaded/hide]% alternative: \tableofcontents[currentsection,currentsubsection,hideothersubsections,sectionstyle=show/hide,subsectionstyle=show/shaded/hide]
  \end{frame}
}


\begin{frame}{}
\maketitle

\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}{A bit of history of statistical methods}
 
 R.A. Fisher: 1890-1962
 \only<1>{\begin{center}
  \includegraphics[width=0.5\textwidth]{Figures/fisher}
 \end{center}}
 
 \only<2>{\begin{center}
  \includegraphics[width=0.9\textwidth]{Figures/fields}
 \end{center}}
 
 Statistical Methods for Research Workers (1925)

\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}{The big picture}
\begin{quote}
To call in the statistician after the experiment is done may be no more than asking him to perform a post-mortem examination: he may be able to say what the experiment died of.
\end{quote}
\textbf{Sir Ronald Fisher} \\ \footnotesize Presidential Address to the First Indian Statistical Congress, 1938. Sankhya 4, 14-17

\pause

\vfill
\textbf{You won't need to call in a statistician too often if you are (almost) one yourself}
\vfill

\pause 

\begin{exampleblock}{Statistics is\dots}
    \begin{enumerate}
     \item interesting
     \item a unifying language of sciences
     \item empowering 
    \end{enumerate}
\end{exampleblock}

\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}{Approximate plan}
\begin{columns}
 \begin{column}{0.2\textwidth}
  
 \end{column}
 \begin{column}{0.8\textwidth}
 \begin{exampleblock}{}
    \begin{itemize}
     \item[Monday morning] Cautionary tales, General approach to modelling
     \item[Monday afternoon] Experimental design
     \item[Tuesday morning] Mean structure, \textit{additive effects and interactions}
     \item[Tuesday afternoon] Variance structure, \textit{mixed models}
     \item[Wednesday] Data generating process, \textit{GLMs}; + practice with your data?
    \end{itemize}
    \end{exampleblock}
    \end{column}
\end{columns}

If you want, send me:
\begin{itemize}
 \item Your data / your planned experiment
 \item Brief explanation of biological system and question
\end{itemize}
\hfill \dots we will look at it together

\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}{Key ideas for today}
\begin{block}{}
\begin{itemize}[<+->]
 \item Statistics in biology = study biological variation
 \item Understanding statistical ideas about biological variation:
    \begin{itemize}
     \item Informs the design of experiments
     \item Informs the analysis of experiments
    \end{itemize}
 \item Statistical thinking is an essential component of scientific thinking
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Cautionary tales from the front}

%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}{Message 1: A small p-value is not always evidence of a treatment effect}

  \begin{columns}
    \begin{column}{0.5\textwidth}
	\begin{center}
	\includegraphics[width=\textwidth]{Figures/message1}
	\end{center}
    \end{column}
    
    \begin{column}{0.5\textwidth}
    \begin{block}{Vaccine challenge experiment:}
      \begin{itemize} 
       \item 6 mice/group (saline/low dose/high dose)
       \item All mice challenged with Shigella
       \item Followed for 14 days
       \item  Outcome: Symptom score average Days 2 - 8
      \end{itemize}
      \end{block}
      
      \begin{alertblock}{}
       One-way ANOVA (post-hoc Bonferroni) p=0.04
      \end{alertblock}

    \end{column}
  \end{columns}
  
  \pause \vspace{0.3cm}
  \emph{\large Do you think the vaccine works? What is strange?}
  

\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}{Message 1: A small p-value is not always evidence of a treatment effect}
\pause
 \vspace{-0.2cm}
 \begin{center}
  \includegraphics[width=\textwidth]{Figures/mice}
 \end{center}
 
\end{frame}
%%%%%%%%%%%

\begin{frame}{Message 2: p-values from simple comparisons cannot tell us when differences are “different”}
 \pause
  \begin{columns}
    \begin{column}{0.5\textwidth}
	\begin{center}
	\includegraphics[width=\textwidth]{Figures/drought}
	\end{center}
    \end{column}
    
    \begin{column}{0.5\textwidth}
    \begin{block}{Are temperature mechanisms modified in a genetically modified tomato plant?}
      \begin{itemize}
	\item Genotypes: WT/mutant 
	\item Water condition: Normal/Drought
	\item Leaf temperature measured
      \end{itemize}
      \end{block}
  
    \end{column}
  \end{columns}
   
   
   \begin{alertblock}{T-tests between water conditions:}
GM: p=0.46 ; Wt: p=0.02
   \pause
    \begin{center}
        Evidence of difference + No evidence of difference \\ $\neq$ \\ Evidence that differences are different.
    \end{center}

  \end{alertblock}
      
\end{frame}
%%%%%%%%%%%


\begin{frame}{Message 3: Interpreting experimental results needs more than t-tests}
 \pause
 Research question: Are mice susceptible to obesity when exposed to a high fat diet?
 
  \begin{columns}
    \begin{column}{0.5\textwidth}
	\begin{center}
	\includegraphics[width=\textwidth]{Figures/message3}
	\end{center}
    \end{column}
    
    \begin{column}{0.5\textwidth}
    \begin{block}{Experimental set-up:}
      \begin{itemize}
	\item 37 mice: 16 NODk /21 WT
	\item Randomised to either regular or high fat diet
	\item Monitored for 14 weeks
	\item Outcome measure: Body weight (g)
	\item Experimental factors: Diet (2), Strain (2), Time (8)
      \end{itemize}
      \end{block}
      \tiny Acknowledgements: Ainy Hussain, PhD student 2013
    \end{column}
  \end{columns}
   
\end{frame}
%%%%%%%%%%%

\begin{frame}{Message 4: Knowing how to combine information across subgroups  can improve inference}
 
 \pause 
 
 Comparing yield in five barley varieties (1930s) \\
 Experimental factors: 5 varieties of barley, 6 locations, 2 time points. Outcome measure: yield
  \begin{columns}
    \begin{column}{0.6\textwidth}
	\begin{center}
	\includegraphics[width=\textwidth]{Figures/message4a}
	\end{center}
    \end{column}
    
    \begin{column}{0.4\textwidth}
  
    \end{column}
  \end{columns}
      \tiny Acknowledgements: MASS R-package
   
\end{frame}
%%%%%%%%%%%


\begin{frame}{Message 4: Knowing how to combine information across subgroups  can improve inference}
 
  
 Comparing yield in five barley varieties (1930s) \\
 Experimental factors: 5 varieties of barley, 6 locations, 2 time points. Outcome measure: yield
  \begin{columns}
    \begin{column}{0.5\textwidth}
	\begin{center}
	\includegraphics[width=\textwidth]{Figures/message4b}
	\end{center}
    \end{column}
    
    \begin{column}{0.5\textwidth}
    \begin{block}{Controlling for other sources of variation:}
      \begin{itemize}
	\item Controlling for year = comparing yield WITHIN years and combining these
      \end{itemize}
      \end{block}
      \tiny Acknowledgements: MASS R-package
    \end{column}
  \end{columns}
   
\end{frame}
%%%%%%%%%%%


\begin{frame}{Message 4: Knowing how to combine information across subgroups  can improve inference}
 
  
 Comparing yield in five barley varieties (1930s) \\
 Experimental factors: 5 varieties of barley, 6 locations, 2 time points. Outcome measure: yield
  \begin{columns}
    \begin{column}{0.5\textwidth}
	\begin{center}
	\includegraphics[width=\textwidth]{Figures/message4c}
	\end{center}
    \end{column}
    
    \begin{column}{0.5\textwidth}
    \begin{block}{Controlling for other sources of variation:}
      \begin{itemize}
	\item Control for year = compare yield WITHIN years and combine these
	\item Control for location = compare yield WITHIN locations and combine these
      \end{itemize}
      \end{block}
      \tiny Acknowledgements: MASS R-package
    \end{column}
  \end{columns}
   
\end{frame}
%%%%%%%%%%%


\begin{frame}{Message 4: Knowing how to combine information across subgroups  can improve inference}
 
  \begin{columns}
    \begin{column}{0.5\textwidth}
	\begin{center}
	\includegraphics[width=\textwidth]{Figures/message4d}
	\end{center}
    \end{column}
    
    \begin{column}{0.5\textwidth}
    \begin{block}{Controlling for other sources of variation:}
      \begin{itemize}
	\item Control for year = compare yield WITHIN years and combine these
	\item Control for location = compare yield WITHIN locations and combine these
      \end{itemize}
      \end{block}
      \tiny Acknowledgements: MASS R-package
    \end{column}
  \end{columns}
   
\end{frame}
%%%%%%%%%%%


\begin{frame}{Message 5: Knowing what factors contribute to the variation in outcome helps design experiments and analyses}
 \pause
 Research question: 
How does cold duration impact upon germination in alpine plant \textit{A. glacialis}?

  \begin{columns}
    \begin{column}{0.4\textwidth}
	\begin{center}
	\includegraphics[width=\textwidth]{Figures/message5}
	\end{center}
    \end{column}
    
    \begin{column}{0.6\textwidth}
    \begin{block}{Experimental set-up:}
      \begin{itemize}
	\item Seed collections from alpine region in Australia
	\item 3 Regions -- low/high altitude
	\item 4 sets of Petri dishes
	\item 4 cabinet shelves with different temperatures
	\item Response - \% germinated
      \end{itemize}
      \end{block}
    \end{column}
  \end{columns}
   \vspace{0.1cm}
  \textbf{\emph{What factors other than temperature to consider? }}
\end{frame}
%%%%%%%%%%%

\begin{frame}{Summary}
 \begin{enumerate}[<+->]
  \item A small p-value is not always evidence of a treatment effect. \textbf{Good experimental design matters.}
  \item p-values from simple comparisons cannot tell us when differences are “different”. \textbf{For each question / comparison, a specific test}
  \item Interpreting experimental results needs more than t-tests. \textbf{Need a statistical model of the experiment, matching scientific question.}
  \item Combining information across subgroups can improve inference. \textbf{A statistical model enables accumulation of evidence across experiments.}
  \item Knowing what factors contribute to the variation in outcome matters. \textbf{A statistical model allows one to incorporate effect of other factors in the analysis.}
 \end{enumerate}

\end{frame}
%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction to Statistical Modelling}

\begin{frame}{Introduction to Statistical Modelling}
 
 \begin{block}{What is a statistical model?}
 \pause
   \begin{itemize}
      \item A formal but simplified representation of the world / an experiment
      \item If the representation is good enough for our goal\dots
      \item \dots we can learn something from data
    \end{itemize}
 \end{block}

 
 \pause
 \includegraphics[width=0.5\textwidth]{Figures/mapaus}
 

\end{frame}
%%%%%%%%%%%%

\begin{frame}{Key components of a statistical model of an experiment}
 \begin{enumerate}
\item \textbf{Outcome measure}
  \begin{itemize}
    \item Response variable
    \item Measure of interest
  \end{itemize}
\item \textbf{Experimental factors}
  \begin{itemize}
    \item Conditions that can be manipulated 
    \item Conditions of interest (e.g. genotype, gender) 
    \item Main questions: do the conditions impact upon the outcome measure?
  \end{itemize}
\item \textbf{Blocking factors}
  \begin{itemize}
    \item Conditions (not of interest) that may impact upon the outcome
    \item Sources of variation in the experiment that need to be controlled for
    \item Clustering of experimental units
 \end{itemize}
\end{enumerate}
\pause
  \alert{ALWAYS BEGIN WITH A RESEARCH QUESTION}\\
  $\Rightarrow$ what is outcome / exp factors / blocking factors; \\
  $\Rightarrow$ Do not make up hypotheses a posteriori

\end{frame}
%%%%%%%%%%%

\begin{frame}{Key Objectives of a statistical model of an experiment}

\begin{itemize}
 \item Compare mean response across different experimental conditions.
  \begin{itemize}
   \item Obtain estimate of “Treatment effect”
   \item Is this “effect” different in subgroups of interest?
  \end{itemize}
 \item What are the most important factors influencing the mean response? 
 \item Subsidiary question: how can we design our experiment in future to more efficiently test our hypotheses?
\end{itemize}


\end{frame}
%%%%%%%%%%%

\begin{frame}{Example 1: Does dark respiration differ between C3 and C4 plants?}

\begin{columns}
 \begin{column}{0.6\textwidth}
  Outcome measure: dark respiration\\
  Experimental factor: Plant type (C4/C3)\\
  Data: 6 plants each of C4, C3

  \begin{block}{Can calculate}
 \begin{itemize}
  \item Observed overall mean
  \item Observed mean C3 plants
  \item Observed mean C4 plants
  \item Variation around each mean
 \end{itemize}
\end{block}
  \end{column}
  \begin{column}{0.4\textwidth}
   \includegraphics[width=0.9\textwidth]{Figures/c34}
  \end{column}

\end{columns}


\end{frame}
%%%%%%%%%%%


\begin{frame}{Example 1: Does dark respiration differ between C3 and C4 plants?}

    \begin{block}{Can calculate}
 \begin{itemize}
  \item Observed overall mean
  \item Observed mean C3 plants
  \item Observed mean C4 plants
  \item Variation around each mean
 \end{itemize}
\end{block}

\textbf{Statistical model}\\
 {\color{purple}{Respiration}} = {\color{blue}{Mean for C3}} + {\color{red}{Difference C4-C3}} * {\color{orange}{(is C4?)}} + {\color{gray}{Noise}}\\
 \pause

${\color{purple}{response}} = {\color{blue}{A}} + {\color{red}{D}} \times {\color{orange}{predictor}} + {\color{gray}{\epsilon}}$

 ${\color{blue}{A}} $ and ${\color{red}{D}}$ are the model PARAMETERS. \\
 We want to infer whether ${\color{red}{D}}$ is different from 0

\end{frame}
%%%%%%%%%%%

\begin{frame}{Example 1: Does dark respiration differ between C3 and C4 plants?}
${\color{purple}{response}} = {\color{blue}{A}} + {\color{red}{D}} \times {\color{orange}{predictor}} + {\color{gray}{\epsilon}}$

  Can we separate the signal ${\color{red}{D}}$ from the noise ${\color{gray}{\epsilon}}$ ?

 \pause
 
 \begin{block}{T-test}
  \begin{itemize}
   \item Outcome is a continuous variable
   \item Experimental factor is one factor with 2 conditions
   \item No blocking factor / corrections
  \end{itemize}
 \end{block}
 
 \pause
 
 $ t = \frac{\color{red}{D}}{\text{\color{gray}{Variation of }}\color{gray}{\epsilon}} \times \frac{\sqrt{\text{Sample Size}}}{\sqrt{2}}$

\end{frame}
%%%%%%%%%%%%

\begin{frame}{When can we know whether $D \neq 0$ ?}

 \begin{columns}
 \begin{column}{0.5\textwidth}
 \includegraphics[width=\textwidth]{Figures/figure/ttestdiff-1}
 \end{column}
 \begin{column}{0.5\textwidth}
  $ t = \frac{\color{red}{D}}{\text{\color{gray}{Variation of }}\color{gray}{\epsilon}} \times \frac{\sqrt{\text{Sample Size}}}{\sqrt{2}}$

  \vspace{1cm}
  Is it easier when the true difference is 0.5 or when it is 3 ?
 \end{column}
 \end{columns}
 
 \pause
 \begin{alertblock}{}
  \begin{enumerate}
   \item Large true difference between the means
  \end{enumerate}
 \end{alertblock}

\end{frame}
%%%%%%%%%%%

\begin{frame}{When can we know whether $D \neq 0$ ?}

 \begin{columns}
 \begin{column}{0.5\textwidth}
 \includegraphics[width=\textwidth]{Figures/figure/ttestsample-1}
 \end{column}
 \begin{column}{0.5\textwidth}
  $ t = \frac{\color{red}{D}}{\text{\color{gray}{Variation of }}\color{gray}{\epsilon}} \times \frac{\sqrt{\text{Sample Size}}}{\sqrt{2}}$

  \vspace{1cm}
  Is it easier when sample size is 4 or when it is 100?
 \end{column}
 \end{columns}
 
 \pause
 \begin{alertblock}{}
  \begin{enumerate}
   \item Large true difference between the means
   \item Large sample size
  \end{enumerate}
 \end{alertblock}

\end{frame}
%%%%%%%%%%%


\begin{frame}{When can we know whether $D \neq 0$ ?}

 \begin{columns}
 \begin{column}{0.5\textwidth}
 \includegraphics[width=0.9\textwidth]{Figures/figure/ttestvar-1}
 \end{column}
 \begin{column}{0.5\textwidth}
  $ t = \frac{\color{red}{D}}{\text{\color{gray}{Variation of }}\color{gray}{\epsilon}} \times \frac{\sqrt{\text{Sample Size}}}{\sqrt{2}}$

  \vspace{1cm}
  Is it easier when unexplained variation is 1 or when it is 3?
 \end{column}
 \end{columns}
 
 \pause
 \begin{alertblock}{What makes $t$ large:}
  \begin{enumerate}
   \item Large true difference between the means
   \item Large sample size
   \item Small unexplained variation
  \end{enumerate}
 \end{alertblock}

\end{frame}
%%%%%%%%%%%


\begin{frame}{When can we know whether $D \neq 0$ ?}
  \centering \includegraphics[width=0.7\textwidth]{Figures/figure/tvalue-1}
\end{frame}
%%%%%%%%%%%%


\begin{frame}{When can we know whether $D \neq 0$ ?}
\textbf{p-value}: probability (area under curve) of getting a value at least as extreme as what you observed, when the true $D=0$
\centering  \includegraphics[width=0.7\textwidth]{Figures/figure/tvalueth-1}
\end{frame}
%%%%%%%%%%%%


\begin{frame}{But really, what is a p-value?}

 \begin{block}{Candy practical}
 \begin{itemize}[<+->]
  \item You got a pack of 20 candies with a mix of Halloween and Fruit candies
  \item You pick one, it's a Halloween one\dots looks quite disgusting. You put it back
  \item You pick a second one. Again a disgusting Halloween candy! You put it back
  \item And so on, until 5 candies. You wonder if you have been cheated.
  \item Are there more Halloween than Fruit candies in that pack?
  \item You decide to use statistics to find out
 \end{itemize}
 \end{block}

\only<6->{
 \begin{exampleblock}{How to?}
  \begin{itemize}
   \item Draw 5 candies out of the pack
   \item Write down how many Halloween candies
   \item How often is it 5?
  \end{itemize}
 \end{exampleblock}
  }
\end{frame}
%%%%%%%%%%%%

\begin{frame}{But really, what is a p-value?}

\url{https://docs.google.com/spreadsheets/d/1Y9512z1xxkphjAZ_dYT9SqfQH02UqTDKlW2X0mDEwZY/edit?usp=sharing}

 \begin{exampleblock}{}
  \begin{itemize}
   \item Draw 5 candies out of the pack
   \item Write down how many Halloween candies
   \item How often is it 5?
   \pause
   \item Estimate the p-value for the test ``candies have same frequency''
   \pause
   \item Redo the experiment in R, using random sampling (rbinom)
   \pause
   \item What is the correct null-distribution?
  \end{itemize}
 \end{exampleblock}
  
\end{frame}
%%%%%%%%%%%%

\begin{frame}[fragile]{Back to C3/C4 plants. Analyse real data in R}

1. Set working directory (\texttt{setwd("/")}) or create an R project\\

2. Load and check data
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{verbatim}
resp <- read.csv("d_respiration.csv")
str(resp)
View(resp)
\end{verbatim}
\end{kframe}
\end{knitrout}

3. Visualize data
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{verbatim}
library(ggplot2)
ggplot(resp,aes(Plant_type,rrarea,colour=Plant_type))+
    geom_point()+facet_wrap(~Variation)
\end{verbatim}
\end{kframe}
\end{knitrout}

\end{frame}
%%%%%%%%%%%%


\begin{frame}[fragile]{Fit a t-test in R: \texttt{t.test()}}

\textbf{Subset data by Variation (High and Low)}

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{verbatim}
resp_H <- subset(resp,Variation == "High")
resp_L <- subset(resp,Variation == "Low")
\end{verbatim}
\end{kframe}
\end{knitrout}

\pause

\textbf{Compare C3 and C4 plants in “High Variation” subset}
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{verbatim}
t.test(rrarea~Plant_type, data=resp_H, var.equal=TRUE)
\end{verbatim}
\end{kframe}
\end{knitrout}

\pause
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.9, 0.9, 0.969}\color{fgcolor}\begin{kframe}
\small
\begin{verbatim}
	Two Sample t-test
data:  rrarea by Plant_type
t = -0.93776, df = 10, p-value = 0.3705
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.7619349  0.7181446
sample estimates:
mean in group C3 mean in group C4 
        2.720021         3.241916 
\end{verbatim}
\end{kframe}
\end{knitrout}

\pause
${\color{purple}{response}} = {\color{blue}{A}} + {\color{red}{D}} \times {\color{orange}{predictor}} + {\color{gray}{\epsilon}}$

\end{frame}
%%%%%%%%%%%

\begin{frame}[fragile]{Fit a t-test in R: \texttt{t.test()}}

\textbf{Compare C3 and C4 plants in “Low Variation” subset}
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{verbatim}
t.test(rrarea~Plant_type, data=resp_L, var.equal=TRUE)
\end{verbatim}
\end{kframe}
\end{knitrout}

\end{frame}
%%%%%%%%%%%

\begin{frame}[fragile]{Fit an anova in R: \texttt{aov()}}

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{verbatim}
aov1 <- aov(rrarea~Plant_type, data=resp_H)
summary(aov1)
\end{verbatim}
\end{kframe}
\end{knitrout}

\vspace{-0.15cm}
\pause
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.9, 0.9, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
            Df Sum Sq Mean Sq F value Pr(>F)
Plant_type   1  0.817  0.8171   0.879   0.37
Residuals   10  9.292  0.9292  
\end{verbatim}
\end{kframe}
\end{knitrout}

\pause
${\color{purple}{response}} = {\color{blue}{A}} + {\color{red}{D}} \times {\color{orange}{predictor}} + {\color{gray}{\epsilon}}$

\end{frame}
%%%%%%%%%%%

\begin{frame}[fragile]{Fit a linear model in R: \texttt{lm()}}

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{verbatim}
lm1<-lm(rrarea ~ Plant_type, data = resp_H)
summary(lm1)
\end{verbatim}
\end{kframe}
\end{knitrout}

\vspace{-0.15cm}
\pause
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.9, 0.9, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
lm(formula = rrarea ~ Plant_type, data = resp_H)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.7380 -0.4201 -0.1437  0.6706  1.6754 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)    2.7200     0.3935   6.912 4.13e-05 ***
Plant_typeC4   0.5219     0.5565   0.938     0.37    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9639 on 10 degrees of freedom
Multiple R-squared:  0.08083,	Adjusted R-squared:  -0.01109 
F-statistic: 0.8794 on 1 and 10 DF,  p-value: 0.3705
\end{verbatim}
\end{kframe}
\end{knitrout}

\pause
${\color{purple}{response}} = {\color{blue}{A}} + {\color{red}{D}} \times {\color{orange}{predictor}} + {\color{gray}{\epsilon}}$

\end{frame}
%%%%%%%%%%%

\begin{frame}[fragile]{Fit a linear model in R: \texttt{lm()}}

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{verbatim}
library(emmeans)
emmeans(lm1, ~Plant_type)
\end{verbatim}
\end{kframe}
\end{knitrout}


\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.9, 0.9, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
 Plant_type   emmean        SE df lower.CL upper.CL
 C3         2.720021 0.3935305 10 1.843180 3.596861
 C4         3.241916 0.3935305 10 2.365076 4.118757

Confidence level used: 0.95 
\end{verbatim}
\end{kframe}
\end{knitrout}

\pause
 ${\color{purple}{response}} = {\color{blue}{A}} + {\color{red}{D}} \times {\color{orange}{predictor}} + {\color{gray}{\epsilon}}$

\end{frame}
%%%%%%%%%%%

\begin{frame}{Compare the output from t.test, aov and lm}
 
\end{frame}
%%%%%%%%%%%%


\begin{frame}{Three equivalent ways to look at data}
 \centering
 \only<1>{T-test, focus on difference between two means\\
 \includegraphics[width=0.8\textwidth]{Figures/figure/ttestrep-1}}
 \only<2>{ANOVA, focus on variation within VS. between\\
 \includegraphics[width=0.8\textwidth]{Figures/figure/aovrep-1}}
 \only<3>{Linear regression, focus on rate of change\\
 \includegraphics[width=0.8\textwidth]{Figures/figure/lmrep-1}}
\end{frame}
%%%%%%%%%%%%


\begin{frame}{All is one\dots}
\pause
  \begin{block}{\dots but \texttt{lm()} rules (IMHO)}
    \begin{itemize}
      \item t-test, ANOVA, regression and others can be mathematically equivalent
      \item In R, \texttt{lm()} and related functions can do them all\dots
      \item \dots and much more!
    \end{itemize}
  \end{block}
\end{frame}
%%%%%%%%%%%


\begin{frame}{All is one\dots}
\centering
    \includegraphics[width=0.8\textwidth]{Figures/modeldecision}\\
ALL can be done as linear models
\end{frame}
%%%%%%%%%%%

% more on LM, general steps, and another example
\begin{frame}{Focus on linear models}
  \textbf{{\color{purple}{Response}} = {\color{blue}{Intercept}} + {\color{red}{Slope}} $\times$ {\color{orange}{Predictor}} + {\color{gray}{Error}}} \\

\centering
\includegraphics[width=0.6\textwidth]{Figures/figure/lmprinc-1}
\end{frame}
%%%%%%%%%%%%


\begin{frame}[fragile]{A simple linear model}
  \textbf{{\color{purple}{Response}} = {\color{blue}{Intercept}} + {\color{red}{Slope}} $\times$ {\color{orange}{Predictor}} + {\color{gray}{Error}}} \\
  \vspace{-0.1cm}
  \begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
  lm(response ~ 1 + predictor1 + predictor2, data=data)
\end{verbatim}
\end{kframe}
\end{knitrout}
equivalent to
  \begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
  lm(response ~ predictor1 + predictor2, data=data)
\end{verbatim}
\end{kframe}
\end{knitrout}
equivalent to
  \begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
  lm(response ~ predictor2 + predictor1, data=data)
\end{verbatim}
\end{kframe}
\end{knitrout}

\begin{itemize}
  \item Intercept can be explicit or implicit
  \item Can remove intercept with \texttt{\dots $\sim $ 0 + \dots}
  \item Error is implicit
  \item Feed the option \texttt{data=} to keep code short, reliable and flexible
  \item Order of predictors does not matter
\end{itemize}

\end{frame}
%%%%%%%%%%%


\section{Another look at essential steps}

 
\begin{frame}{General approach}

\begin{center}
  \begin{tikzpicture}
    \node (sq) at (0,-1) {\color{red}{1. Scientific question}};
    \pause
    \node (mo) at (0,-2) {2. Model and Statistical question};
    \draw[->, thick] (sq)--(mo);
    \pause
    \node (dac) at (6,-2) {\color{red}{3. Data collection}};
    \draw[<->, thick] (mo)--(dac);
    \pause
    \node (est) at (0,-3) {4. Estimation};
        \draw[->, thick] (mo)--(est);
    \node (unc) at (0,-3.5) {4.b Uncertainty and statistical significance};
    \pause
    
    \node (che) at (0,-5) {5. Check assumptions, visualize results};
        \draw[->, thick] (unc)--(che);
    \draw[->, thick] (che.west) to [out=150, in=210] (mo.west);

    \pause
    \node (int) at (0,-6) {\color{red}{6. Interpret and think about the biology}};
        \draw[->, thick] (che)--(int);

  \draw[rounded corners, color=blue] (-4.5,-1.5) rectangle (4,-5.5);
  \node[anchor=north west] (r) at (-4.5,-1.5) {\includegraphics[width=0.1\textwidth]{Figures/r}};
  \end{tikzpicture}
  \end{center}
\end{frame}
%%%%%%%%%%%

\begin{frame}[fragile]{Back to C3/C4}
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{verbatim}
lmL<-lm(rrarea ~ Plant_type, data = resp_L)
summary(lmL)
\end{verbatim}
\end{kframe}
\end{knitrout}

\vspace{-0.15cm}
\pause
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.9, 0.9, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
...
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   2.81857    0.07856  35.878 6.72e-12 ***
Plant_typeC4  0.44235    0.11110   3.982  0.00259 **
---
...
\end{verbatim}
\end{kframe}
\end{knitrout}

\textbf{Estimation:\\}
${\color{purple}{response}} = {\color{blue}{A}} + {\color{red}{D}} \times {\color{orange}{predictor}} + {\color{gray}{\epsilon}}$\\

\only<2>{
${\color{blue}{A}}=?$, 
${\color{red}{D}}=?$
}
\only<3->{
${\color{blue}{A}}=2.81857$, 
${\color{red}{D}}= 0.44235$\\
}

\only<4->{
\textbf{Uncertainty:\\}
For ${\color{red}{D}}$ SE= 0.11110 ; p-value=0.00259\\
}
\vspace{0.2cm}
\only<5->{
\textbf{What do we do next?}
}

\end{frame}
%%%%%%%%%%%

\begin{frame}{Check assumptions, visualize results}
\pause 
Linear model basic assumptions
 \begin{block}{}
     \begin{itemize}[<+->]
       \item Predictors not perfectly correlated \\ \textit{Risk: Model won't run, unstable convergence, or huge SE}
       \item {\color{red!20!black}{Little error in predictors}}\\ \textit{Risk: biased estimates (underestimate with Gaussian error)}
       \item {\color{red!50!black}{Gaussian error distribution}}\\ \textit{Risk: Poor predictions}
       \item {\color{red!70!black}{Homoscedasticity (constant error variance)}}\\ \textit{Risk: Over-optimistic uncertainty, unreliable predictions}
       \item {\color{red!99!black}{Independence of error}}\\ \textit{Risk: Bias and over-optimistic uncertainty}
     \end{itemize}
 \end{block}
\end{frame}
%%%%%%%%%%%

\begin{frame}[fragile]{Check assumptions, visualize results}

Assessing model assumptions in R:

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{verbatim}
lmL<-lm(rrarea ~ Plant_type, data = resp_L)
plot(lmL)
summary(lmL)
\end{verbatim}
\end{kframe}
\end{knitrout}

\end{frame}
%%%%%%%%%%%%

\begin{frame}[fragile]{Check assumptions, visualize results}
Visualize and report results
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
lm1.results<-summary(emmeans(lm1,~Plant_type))

ggplot(lm1.results,aes(Plant_type,emmean, fill=Plant_type))+
  geom_bar(stat="identity", width=.4)+
  geom_errorbar(aes(ymin =lm1.results$lower.CL, 
  ymax = lm1.results$upper.CL), width=.2)+
  ylim(0,4)+
  geom_point(data=resp_L, aes(x=Plant_type, y=rrarea), color="red")+
  labs(y = "Dark Respiration (units)")+
  geom_text(aes(x=1.5, y=3.5, label="p=0.002"))  
\end{verbatim}
\end{kframe}
\end{knitrout}
\end{frame}
%%%%%%%%%%%


\begin{frame}{Check assumptions, visualize results}

\includegraphics[width=0.8\textwidth]{Figures/C3C4output}
\end{frame}
%%%%%%%%%%%


\begin{frame}{Another example}

\begin{block}{\textbf{Compare wheat yields between 3 varieties}}
\begin{itemize}
  \item Outcome measure: Tonnes/hectare
  \item Experimental factor: Variety (new/newPLUS/standard)
  \item Data: 6 plots/variety
\end{itemize}
\end{block}

\begin{columns}
 \begin{column}{0.5\textwidth}
  \includegraphics[width=\textwidth]{Figures/wheat}
 \end{column}
\begin{column}{0.5\textwidth}
    How many parameters in this model?
 \end{column}
\end{columns}

% (stray "." removed: it was typeset as visible text on the slide)
\end{frame}
%%%%%%%%%%%

\begin{frame}{Results from Wheat Yield Experiment with 3 Varieties}
 \includegraphics[width=0.9\textwidth]{Figures/wheatyield}
\end{frame}
%%%%%%%%%%%

\begin{frame}[fragile]{Analyse these data in R}
 
 \begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
wheat2<-read.csv("wheat yield PLUS.csv")
str(wheat2) #check data types for each variable
View(wheat2) #View data
ggplot(wheat2, aes(Variety, Yield, colour=Variety)) +
            geom_point()
\end{verbatim}
\end{kframe}
\end{knitrout}

\end{frame}
%%%%%%%%%%%

\begin{frame}[fragile]{Sample analysis in R: 1-way ANOVA}
 
  \begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
aov1=aov(Yield~Variety, data = wheat2)
summary(aov1)
\end{verbatim}
\end{kframe}
\end{knitrout}
 
   \begin{knitrout}
\definecolor{shadecolor}{rgb}{0.9, 0.9, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
          Df Sum Sq Mean Sq F value  Pr(>F)
Variety    2 0.6820  0.3410   8.951 0.00276 **
Residuals 15 0.5714  0.0381
\end{verbatim}
\end{kframe}
\end{knitrout}
 
 \begin{alertblock}{ANOVA}
 \begin{itemize}
  \item Compares means between TWO or MORE GROUPS
  \item Relies on F-statistic = $\frac{\text{Between-groups variance}}{\text{Within-groups variance}} = \frac{\text{Explained}}{\text{Unexplained}}$
  \item One test for significance of all groups
 \end{itemize}
 \end{alertblock}
 
 \pause
 
 \emph{But which groups are different???}
\end{frame}
%%%%%%%%%%%

\begin{frame}[fragile]{Using emmeans to extract group estimates}
   \begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
emmeans(aov1, ~Variety)
\end{verbatim}
\end{kframe}
\end{knitrout}
 
   \begin{knitrout}
\definecolor{shadecolor}{rgb}{0.9, 0.9, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
 Variety  emmean    SE df lower.CL upper.CL
 Standard   2.81 0.079 15     2.64     2.98
 New        3.26 0.079 15     3.09     3.43
 NewPlus    3.19 0.079 15     3.02     3.36
 
Confidence level used: 0.95 
\end{verbatim}
\end{kframe}
\end{knitrout}

\end{frame}
%%%%%%%%%%%%

\begin{frame}[fragile]{Using emmeans to compare groups}
   \begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
emmeans(aov1, pairwise~Variety)
\end{verbatim}
\end{kframe}
\end{knitrout}
 
   \begin{knitrout}
\definecolor{shadecolor}{rgb}{0.9, 0.9, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
$contrasts
 contrast           estimate    SE df t.ratio p.value
 Standard - New       -0.44  0.112 15  -3.925  0.0036
 Standard - NewPlus   -0.37  0.112 15  -3.330  0.0120
 New - NewPlus         0.067 0.112 15   0.595  0.8248
 
P value adjustment: tukey method for comparing a family of 3 estimates
\end{verbatim}
\end{kframe}
\end{knitrout}

\end{frame}
%%%%%%%%%%%%

\begin{frame}[fragile]{Equivalent with \texttt{lm()}}
    \begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
lm2<-lm(Yield ~ Variety, data = wheat2) 
anova(lm2)
summary(lm2)
\end{verbatim}
\end{kframe}
\end{knitrout}
 
   \begin{knitrout}
\definecolor{shadecolor}{rgb}{0.9, 0.9, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
Analysis of Variance Table
 Response: Yield
          Df  Sum Sq Mean Sq F value   Pr(>F)
Variety    2 0.68203 0.34101  8.9513 0.002764 **
Residuals 15 0.57145 0.03810
\end{verbatim}
\end{kframe}
\end{knitrout}

 
   \begin{knitrout}
\definecolor{shadecolor}{rgb}{0.9, 0.9, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
Coefficients:
               Estimate Std. Error t value Pr(>|t|)
(Intercept)     2.81857    0.07968  35.372 7.26e-16 ***
VarietyNew      0.44235    0.11269   3.925  0.00135 **
VarietyNewPlus  0.37529    0.11269   3.330  0.00457 **
\end{verbatim}
\end{kframe}
\end{knitrout}

\end{frame}
%%%%%%%%%%%%


\begin{frame}[fragile]{Equivalent with \texttt{lm()}}
    \begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
 emmeans(lm2, pairwise~Variety)
\end{verbatim}
\end{kframe}
\end{knitrout}
 
   \begin{knitrout}
\definecolor{shadecolor}{rgb}{0.9, 0.9, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
$emmeans
 Variety  emmean     SE df lower.CL upper.CL
 Standard   2.81 0.0796 15     2.64     2.98
 New        3.26 0.0796 15     3.09     3.43
 NewPlus    3.19 0.0796 15     3.02     3.36
 
Confidence level used: 0.95 
 
$contrasts
 contrast           estimate    SE df t.ratio p.value
 Standard - New       -0.44  0.112 15  -3.925  0.0036
 Standard - NewPlus   -0.37  0.112 15  -3.330  0.0120
 New - NewPlus         0.067 0.112 15   0.595  0.8248
 
P value adjustment: tukey method for comparing a family of 3 estimates 
             
\end{verbatim}
\end{kframe}
\end{knitrout}

\end{frame}
%%%%%%%%%%%%


\begin{frame}[fragile]{Assessing model assumptions for \texttt{lm()}}
     \begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\footnotesize
\begin{verbatim}
 plot(lm2)
\end{verbatim}
\end{kframe}
\end{knitrout}
 
 \begin{figure}
  \includegraphics[width=0.49\textwidth]{Figures/assump1}
  \includegraphics[width=0.49\textwidth]{Figures/assump2}
 \end{figure}

\end{frame}
%%%%%%%%%%%%


\begin{frame}{Summary of results with post hoc comparisons:}
\centering
\includegraphics[width=0.7\textwidth]{Figures/wheatres}
\end{frame}
%%%%%%%%%%%%


\end{document}