%% (Removed: non-LaTeX residue from a web scrape of this file --- page
%% chrome and a line-number gutter that preceded \documentclass and
%% would prevent compilation.)
%% NYU PhD thesis format. Created by Jos Koiller 2007--2008.
%% Use the first of the following lines during production to
%% easily spot "overfull boxes" in the output. Use the second
%% line for the final version.
%\documentclass[12pt,draft,letterpaper]{report}
\documentclass[12pt,letterpaper]{report}
%% Replace the title, name, advisor name, graduation date and dedication below with
%% your own. Graduation months must be January, May or September.
\newcommand{\thesistitle}{A Lanczos Procedure for Approximating Eigenvalues of Large Stochastic Matrices}
\newcommand{\thesisauthor}{William J. DeMeo}
\newcommand{\thesisadvisor}{Professor Jonathan Goodman}
\newcommand{\graddate}{January 1999}
%\urladdr{williamdemeo@gmail.com}
%% \urladdr{http://williamdemeo.org}
%% \address{Department of Mathematics\\
%% University of South Carolina\\Columbia 29208\\USA}
%% If you do not want a dedication, scroll down and comment out
%% the appropriate lines in this file.
\newcommand{\thesisdedication}{To my parents, with love and appreciation.}
%% The following makes chapters and sections, but not subsections,
%% appear in the TOC (table of contents). Increase to 2 or 3 to
%% make subsections or subsubsections appear, respectively. It seems
%% to be usual to use the "1" setting, however.
\setcounter{tocdepth}{1}
%% Sectional units up to subsubsections are numbered. To number
%% subsections, but not subsubsections, decrease this counter to 2.
\setcounter{secnumdepth}{3}
%% Page layout (customized to letter paper and NYU requirements):
\setlength{\oddsidemargin}{.6in}
\setlength{\textwidth}{5.8in}
\setlength{\topmargin}{.1in}
\setlength{\headheight}{0in}
\setlength{\headsep}{0in}
\setlength{\textheight}{8.3in}
\setlength{\footskip}{.5in}
%% Use the following commands, if desired, during production.
%% Comment them out for final version.
%\usepackage{layout} % defines the \layout command, see below
%\setlength{\hoffset}{-.75in} % creates a large right margin for notes and \showlabels
%% Controls spacing between lines (\doublespacing, \onehalfspacing, etc.):
\usepackage{setspace}
%% Use the line below for official NYU version, which requires
%% double line spacing. For all other uses, this is unnecessary,
%% so the line can be commented out.
\doublespacing % requires package setspace, invoked above
%% Each of the following lines defines the \com command, which produces
%% a comment (notes for yourself, for instance) in the output file.
%% Example: \com{this will appear as a comment in the output}
%% Choose (uncomment) only one of the three forms:
%\newcommand{\com}[1]{[/// {#1} ///]} % between [/// and ///].
\newcommand{\com}[1]{\marginpar{\tiny #1}} % as (tiny) margin notes
%\newcommand{\com}[1]{} % suppress all comments.
%% This inputs your auxiliary file with \usepackage's and \newcommand's:
%% It is assumed that that file is called "definitions.tex".
%%
%% \input{wjd_masters.def}
\usepackage{enumerate,amsmath,amssymb,fancyhdr,mathrsfs,amsthm,url,stmaryrd}
\usepackage[colorlinks=true,urlcolor=black,linkcolor=black,citecolor=black]{hyperref}
\usepackage{relsize}
\usepackage{marvosym}
\usepackage{graphicx}
\usepackage{enumerate}
\usepackage{tikz}
\usetikzlibrary{calc}
\usepackage{scalefnt}
\usepackage{verbatim}
\usepackage{color}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{ifthen}
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[chapter]
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{prop}[theorem]{Proposition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{example}[theorem]{Example}
\newtheorem{question}[theorem]{Question}
\newcounter{claim}
\newtheorem{claim}[claim]{Claim}
\newcounter{conjecture}
\newtheorem{conjecture}[conjecture]{Conjecture}
\newtheorem*{problem}{Problem}
\newtheorem{fact}{Fact}[chapter]
\newtheorem{case}{Case}
\theoremstyle{remark}
\newtheorem*{remark}{Remark}
\newtheorem*{remarks}{Remarks}
\newtheorem*{notation}{Notation}
\numberwithin{theorem}{chapter}
\numberwithin{claim}{chapter}
\numberwithin{equation}{chapter}
\numberwithin{conjecture}{chapter}
\renewcommand\S{\ensuremath{\mathcal{S}}}
\newcommand\sK{\ensuremath{\mathcal{K}}}
\newcommand\F{\ensuremath{\mathcal{F}}}
\newcommand\R{\ensuremath{\mathbb{R}}}
\newcommand\bC{\ensuremath{\mathbf{C}}}
\newcommand\C{\ensuremath{\mathrm{C}}}
\renewcommand\P{\ensuremath{\mathrm{P}}}
\newcommand\bP{\ensuremath{\mathbf{P}}}
\newcommand\M{\ensuremath{\mathrm{M}}}
\renewcommand\H{\ensuremath{\mathrm{H}}}
\newcommand\T{\ensuremath{\mathrm{T}}}
\newcommand\I{\ensuremath{\mathrm{I}}}
\newcommand\K{\ensuremath{\mathrm{K}}}
\newcommand\Q{\ensuremath{\mathrm{Q}}}
\newcommand\mR{\ensuremath{\mathrm{R}}}
\newcommand\Z{\ensuremath{\mathrm{Z}}}
\newcommand\E{\ensuremath{\mathrm{E}}}
\newcommand\U{\ensuremath{\mathrm{U}}}
\newcommand\V{\ensuremath{\mathrm{V}}}
\newcommand\bE{\ensuremath{\mathbf{E}}}
\newcommand\Var{\ensuremath{\mathbf{Var}}}
\newcommand\ran{\ensuremath{\operatorname{ran}}}
\newcommand\psip[1]{\ensuremath{\psi^{(#1)}}}
\newcommand\pij{\ensuremath{p_{ij}}}
\newcommand\pji{\ensuremath{p_{ji}}}
\newcommand\<{\ensuremath{\langle}}
\renewcommand\>{\ensuremath{\rangle}}
\newcommand\diag{\ensuremath{\operatorname{diag}}}
\newcommand\one{\ensuremath{\mathbf{1}}}
\newcommand\lambdamax{\ensuremath{\lambda_{\max}(\M)}}
%% Cross-referencing utilities. Use one or the other--whichever you prefer--
%% but comment out both lines for final version.
%\usepackage{showlabels}
%\usepackage{showkeys}
\begin{document}
%% Produces a test "layout" page, for "debugging" purposes only.
%% Comment out for final version.
%\layout % requires package layout (see above, on this same file)
%%%%%% Title page %%%%%%%%%%%
%% Sets page numbering to "roman style" i, ii, iii, iv, etc:
\pagenumbering{roman}
%
%% No numbering in the title page:
\thispagestyle{empty}
%
\begin{center}
{\large\textbf{\thesistitle}}
\vspace{5mm}
by
\vspace{5mm}
\thesisauthor
\vfill
%% \begin{doublespace}
A dissertation submitted in partial satisfaction of the\\[-4pt]
requirements for the degree of\\[-4pt]
Master of Science\\[6pt]
in\\[6pt]
Mathematics\\[6pt]
in the\\[6pt]
GRADUATE DIVISION\\[-4pt]
of the \\[-4pt]
NEW YORK UNIVERSITY\\
%% \end{doublespace}
\end{center}
Committee in charge:\\
\phantom{XXX} Professor Jonathan Goodman, Chair\\[-4pt]
\phantom{XXX} Professor Leslie Greengard
\begin{center}
\graddate
\end{center}
\vfill
\noindent\makebox[\textwidth]{\hfill\makebox[2.5in]{\hrulefill}}\\
\makebox[\textwidth]{\hfill\makebox[2.5in]{\hfill\thesisadvisor\hfill}}
\newpage
%%%%%%%%%%%%% Blank page %%%%%%%%%%%%%%%%%%
\thispagestyle{empty}
\vspace*{0in}
\newpage
%%%%%%%%%%%%%% Dedication %%%%%%%%%%%%%%%%%
%% Comment out the following lines if you do not want to dedicate
%% this to anyone...
\vspace*{\fill}
\begin{center}
\thesisdedication\addcontentsline{toc}{section}{Dedication}
\end{center}
\vfill
\newpage
%%%%%%%%%%%%%% Acknowledgements %%%%%%%%%%%%
%% Comment out the following lines if you do not want to acknowledge
%% anyone's help...
\section*{Acknowledgments}\addcontentsline{toc}{section}{Acknowledgments}
%%%
%%% \input{acknowledge}
%%%
First, I thank my advisor, Professor Jonathan Goodman, for giving me the opportunity to work on
this problem, and helping me arrive at the following exposition. Next, I wish to thank Professor
Helena Frydman for first sparking my interest in Markov chains by giving lucid descriptions of their
many interesting properties and applications. I thank Professors James Demmel and Beresford
Parlett, for answering many questions pertaining to this problem, and Professor Leslie Greengard
for agreeing to review this paper.
Finally, I would like to thank my family, for their support of my interests in research (and
everything else!), and especially my parents, Barbara and Ted Terry, and Benita and Bill DeMeo.
Needless to say, without them this paper would not have been written.
\newpage
%%%% Abstract %%%%%%%%%%%%%%%%%%
\section*{Abstract}\addcontentsline{toc}{section}{Abstract}
%%%----------------------------------------------------------------
%%%
%%% \input{abstract}
%%%
\begin{center}
\newcommand\skipsize{6pt}
A Lanczos Procedure for Approximating Eigenvalues of Large Stochastic Matrices\\[\skipsize]
by\\[\skipsize]
William J. DeMeo\\[\skipsize]
Master of Science in Mathematics\\[\skipsize]
New York University\\[\skipsize]
Professor Jonathan Goodman, Chair
\end{center}
The rate at which a Markov chain converges to a given probability distribution
has long been an active area of research. Well known bounds on this rate of
convergence involve the subdominant eigenvalue of the chain's underlying
transition probability matrix. However, many transition probability matrices are
so large that we are unable to store even a vector of the matrix in fast
computer memory. Thus, traditional methods for approximating eigenvalues are
rendered useless.
In this paper we demonstrate that, if the Markov chain is reversible, and we
understand the structure of the chain, we can derive the coefficients of the
traditional Lanczos algorithm without storing a single vector. We do this by
considering the variational properties of observables on the chain's state
space. In the process we present the classical theory which relates the
information contained in the Lanczos coefficients to the eigenvalues of the
Markov chain.
%%%----------------------------------------------------------------
\newpage
%%%% Table of Contents %%%%%%%%%%%%
\tableofcontents
\newpage
%%%%% List of Figures %%%%%%%%%%%%%
%% Comment out the following two lines if your thesis does not
%% contain any figures. The list of figures contains only
%% those figures included within the "figure" environment.
%% \listoffigures\addcontentsline{toc}{section}{List of Figures}
%% \newpage
%%%%% List of Tables %%%%%%%%%%%%%
%% Comment out the following two lines if your thesis does not
%% contain any tables. The list of tables contains only
%% those tables included within the "table" environment.
%% \listoftables\addcontentsline{toc}{section}{List of Tables}
%% \newpage
%%%%% List of Algorithms %%%%%%%%%%%%%
%% Comment out the following two lines if your thesis does not
%% contain any algorithms. The list of tables contains only
%% those tables included within the "table" environment.
%% \listofalgorithms\addcontentsline{toc}{section}{List of Algorithms}
%% \newpage
%%%%% Body of thesis starts %%%%%%%%%%%%
\pagenumbering{arabic} % switches page numbering to arabic: 1, 2, 3, etc.
%% Introduction. If your thesis has no introduction, or chapter 1 is
%% meant to be the introduction, then comment out the lines below.
\section*{Introduction}\addcontentsline{toc}{section}{Introduction}
%%%----------------------------------------------------------------
%%%
%%% \input{intro}
%%%
The rate at which a Markov chain converges to a given probability distribution
has long been an active area of research. This is not surprising considering
this problem's relevance to the areas of statistics, statistical mechanics, and
computer science. Markov Chain Monte Carlo (MCMC) algorithms provide important
examples. These algorithms come in handy when we encounter a complicated
probability distribution from which we want to draw random samples. In
statistical mechanics, we might wish to estimate the phase average of a function
on the state space. Goodman and Sokal~\cite{GoodmanSokal:1989} examine Monte
Carlo methods in this context. Examples from statistics occur in the Bayesian
paradigm when we are forced to simulate an unwieldy posterior distribution (see,
e.g., Geman and Geman~\cite{Geman:1984}).
To implement the MCMC algorithm, we invent a Markov chain that converges to the
desired distribution (this is often accomplished using the \emph{Metropolis algorithm}
described in Chapter~\ref{cha:an-appl-conv}). Realizations of the chain will
eventually represent samples from this distribution. Sometimes
``eventually''---meaning all but finitely many terms of the chain---is just not
enough. We need more practical results. In particular, we want to know how many
terms of the chain should be discarded before we are sampling from a
distribution that is close (in total variation distance) to
%
%
% ------------- 03.txt -----------------------------------------------------
%
%
the distribution of interest. This is the purpose of bounding convergence rates
for Markov chains.
Often the Markov chains encountered in this context satisfy a condition known in
the physics literature as \emph{detailed balance}. Probabilists call chains with
this property \emph{reversible}. This simply means that the chain has the same
probability law whether time moves forward or backward.\footnote{This is not a
precise definition. In particular the chain must have started from its
stationary distribution. Full rigor is postponed until
Section~\ref{sec:general-theory}.}
In this paper, we consider the rate at which such chains converge to
a \emph{stationary distribution}. This and other italicized terms are
defined in Section~\ref{sec:general-theory}.
There are a number of different methods in common use for bounding convergence
rates of Markov chains, and a good review of these methods with many references
can be found in~\cite{Rosenthal:1995}. More recently developed methods
employing logarithmic Sobolev inequalities are reviewed in~\cite{Diaconis:1996}.
Most of the bounds in common use involve the subdominant eigenvalue of the
Markov chain's transition probability matrix, and thus require good
approximations to such eigenvalues. In many applications, however, the
transition probability matrix is so large that it becomes impossible to store
even a single vector of the matrix in conventional computer memory. These
so-called \emph{out-of-core} problems are not amenable to traditional eigenvalue
algorithms without modification.\footnote{By ``traditional
eigenvalue algorithms'' we mean those found, for example, in Golub and Van
Loan~\cite{Golub:1996}. See also the book by Demmel~\cite{Demmel:1997} for more
recent treatment.}
In this paper we develop such a modification for the Markov chain eigenvalue
problem. In particular we develop a method for approximating the first few
eigenvalues of a transition probability matrix when we know the general
structure of the underlying Markov chain. The method does not require storage
of large matrices or vectors. Instead we need only simulate the Markov chain,
and conduct a statistical analysis of the simulation.
Here is a brief summary of the paper. Section~\ref{sec:general-theory} contains
a review of the relevant Markov chain theory. Readers who already know the
basics of asymptotic theory of Markov chains might wish to skim
Section~\ref{sec:general-theory} if only to familiarize themselves with our
notation. Section~\ref{sec:funct-state-space} describes functions on the state
% 04.txt
space of the Markov process. This section and Chapter~\ref{cha:invar-appr-invar}
develop the context in which we formulate the new ideas of the paper. In the
last section of Chapter~\ref{cha:invar-appr-invar},
Section~\ref{sec:krylov-subspace}, we describe the \emph{Krylov subspace} and
explain why this represents our best approximation to a subspace containing
extremal eigenvectors of the transition probability matrix (more
precisely, a similarity transformation of this matrix). The first section of
Chapter~\ref{cha:lanczos-procedures} describes the \emph{Lanczos algorithm} for
generating an orthonormal basis for the Krylov subspace. As it stands, this
algorithm is useless for an out-of-core problem since, by
definition of such problems, it requires too much data movement; all the
computing time is spent swapping data between slow and fast memory (disk to ram
to cache and back). We develop alternatives to the Lanczos algorithm and
demonstrate that the \emph{Lanczos coefficients} of the algorithm can be
obtained by simulations of the Markov chain, and this allows us to avoid
the standard algorithm altogether. Following this is a chapter describing the
Metropolis algorithm used to produce a reversible stochastic matrix. It is here
that we experiment with the procedure described in
Section~\ref{sec:lancz-proc-mark} and approximate the extremal eigenvalues of
the matrix, without storing any of its vectors. Finally,
Chapter~\ref{cha:conclusion} concludes the paper.
% 05.txt
%%%===================================================================
%%%-------------------------------------------------------------------
%% If your thesis has different "Parts", use commands such as the following:
\part{Theory\label{part:one}}%
%%%-------------------------------------------------------------------
%%%
%%% \input{MarkovChains} % Chapter 2
%%%
\chapter{Markov Chains}
\section{General Theory}
\label{sec:general-theory}
This review of Markov chain theory can be found in any good probability text. The present
discussion is most similar to that of Durrett~\cite{Durret:1996}, to which we
refer the reader desiring greater detail.
\subsection{The Basic Setup}
Heuristically, a Markov chain is a stochastic process with a lack of memory property. Here
this means that the future of the process, given its past behavior and its present state, depends only
on its present state. This is the probabilistic analogue of a familiar property of classical particle
systems. Given the positions and velocities of all particles at time $t$, the equations of motion can be
completely solved for the future evolution of the system. Thus, information describing the behavior
of the process prior to time $t$ is superfluous. To be a bit more precise, if technical, we need the
following definitions.
\begin{definition}
%Definition 2.1.1
Let $(S, \S)$ be a measurable space. A sequence $X_n$, $n\geq 0$, of random variables
taking values in $S$ is said to be a Markov chain with respect to the filtration
$\sigma(X_0, \dots, X_{n})$ if for all $B \in \S$,
\begin{equation}
% Eqn 2.1
\label{eq:1}
P(X_{n+1} \in B \mid \sigma(X_0, \dots, X_{n}))
=P(X_{n+1} \in B \mid \sigma(X_{n})).
\end{equation}
\end{definition}
Equation~(\ref{eq:1}) merely states that if we know the present location or state of $X_{n}$,
then information about earlier locations or states is irrelevant for predicting $X_{n+1}$.
\begin{definition}
%% Definition 2.1.2
A function $p : S \times \S \rightarrow \R$ is said to be a \emph{transition probability} if:
\begin{enumerate}
\item for each $x \in S$, $A \mapsto p(x, A)$ is a probability measure on $(S, \S)$.
\item for each $A \in \S$, $x \mapsto p(x, A)$ is a measurable function.
\end{enumerate}
\end{definition}
We call $X_n$ a Markov chain with transition probabilities $p_n$ if
\begin{equation}
% Eqn 2.2
\label{eq:2.2}
P(X_{n+1} \in B \mid \sigma(X_n)) = p_n(X_n, B).
\end{equation}
The spaces $(S, \S)$ that we encounter below are standard Borel spaces, so the existence of the
transition probabilities follows from the existence of regular conditional
probabilities on Borel spaces---a standard measure theory result
(see e.g.~\cite{Durret:1996}). %[3] page 230
Suppose we are given an initial probability distribution $\mu$ on $(S, \S)$ and a sequence $p_n$ of
transition probabilities. We can define a consistent set of finite dimensional distributions by
\begin{equation}
% Eqn 2.3
\label{eq:2.3}
P(X_j\in B_j ,0 \leq j \leq n) = \int_{B_0} \mu(dx_0) \int_{B_1} p_0(x_0, dx_1)
%\int_{B_2} p_1(x_1, dx_2)
\cdots
\int_{B_n} p_{n-1}(x_{n-1}, dx_n).
\end{equation}
Furthermore, denote our probability space by
\[
(\Omega, \F) = (S^\omega,\S^\omega), \quad \text{ where } \omega = \{0,1,\dots\}.
\]
We call this \emph{sequence space} and it is defined more explicitly by
\[
S^\omega = \{(\omega_0, \omega_1, \dots) : \omega_i \in S\}\quad \text{ and }
\quad
\S^\omega = \sigma(\omega : \omega_i \in A_i \in \S).
\]
% 07.txt
The Markov chain that we will study on this space is simply $X_n(\omega) = \omega_n$, the coordinate maps.
Then, by the Kolmogorov extension theorem, there exists a unique probability measure $P_\mu$ on
$(\Omega, \F)$ so that the $X_n(\omega)$ have finite dimensional distributions~(\ref{eq:2.3}).
If instead of $\mu$ we begin with the initial distribution $\delta_x$, i.e., point mass at $x$, then we
denote the probability measure by $P_x$. With such measures defined for each $x$, we can in turn
define distributions $P_\mu$, given any initial distribution $\mu$, by
\[
P_\mu(A) = \int \mu(dx)P_x(A).
\]
That the foregoing construction---which, recall, was derived merely from an
initial distribution $\mu$ and a sequence $p_n$ of transition
probabilities---satisfies the defining equation~(\ref{eq:2.2}) of a Markov chain is not
obvious, and a proof can be found in~\cite{Durret:1996}.
To state the converse of the foregoing, if $X_n$ is a Markov chain with
transition probabilities $p_n$ and initial distribution $\mu$, then its finite
dimensional distributions are given by~(\ref{eq:2.3}). Proof of this is also found
in~\cite{Durret:1996}.
Now that we have put the theory on a firm, if abstract, foundation, we can bring
the discussion down to earth by making the foregoing a little more
concrete. First, we specialize our study of Markov chains by assuming that our
chain is \emph{temporally homogeneous}, which means that the transition
probabilities do not depend on time; i.e.,
$p_n(\omega_n, B) = p(\omega_n, B)$. (This is the stochastic analogue of a
conservative system.)
Next we assume that our state space $S$ is finite, and suppose for all states
$i,j \in S$ that $p(i, j) \geq 0$, and $\sum_j p(i, j) = 1$ for all $i$. In
this case, equation~(\ref{eq:2.2}) takes a more intuitive form,
\[
P(X_{n+1}=j \mid X_n = i) =p(i,j),
\]
and our transition probabilities become
\[
p(i,A) = \sum_{j\in A} p(i,j).
\]
% 08.txt
If $\P$ is a matrix whose $(i, j)$ element is the transition probability
$p(i, j)$ then $\P$ is a \emph{stochastic matrix}; that is, a matrix with elements
$p_{ij}$ satisfying
\[
\pij \geq 0, \quad \sum_j \pij =1, \quad (i, j =1, 2, \dots, d).
\]
We also refer to $\P$ as the transition probability matrix.
Without loss of generality, we can further suppose our Markov chain is \emph{irreducible}. This
means that, for any states $i, j$, starting in state $i$ the chain will make a
transition to state $j$ at some future time with positive probability. This
state of affairs is often described by saying that all states \emph{communicate}. We
lose no generality with this assumption because any \emph{reducible} Markov chain can
be factored into irreducible classes of states which can each be studied
separately.
The final two conditions we place on the Markov chains considered below will cost us
some generality. Nonetheless, there are many examples of chains meeting these conditions
and making the present study worthwhile. Furthermore, it may be the case that, with a little
more work, we will be able to drop these conditions in future studies. The first condition is that
the chain is \emph{aperiodic}. If we let $I_x = \{n \geq 1: p^n(x,x) > 0\}$, we
call a Markov chain \emph{aperiodic} if, for any state $x$, the greatest common
divisor of $I_x$ is 1. The second assumption is that our chain is
\emph{reversible}. This characterization is understood in terms of the following definition.
\begin{definition}
%% Definition 2.1.3
\label{def:2.1.3}
A measure $\mu$ is called \emph{reversible} if it satisfies
\[
\mu(x)p(x,y) = \mu(y)p(y,x), \quad \text{ for all $x$ and $y$.}
\]
We call a Markov chain \emph{reversible} if its stationary distribution (defined
in Section~\ref{sec:convergence-theorem}) is reversible.
\end{definition}
% ---------- 09.txt ------------------------------------------------------------
\subsection{A Convergence Theorem}
% Section 2.1.2
\label{sec:convergence-theorem}
In succeeding arguments, we use some results concerning the asymptotic behavior of
Markov chains. These results require a few more definitions.
\begin{definition}
%% Definition 2.1.4
\label{def:2.1.4}
A measure $\pi$ is said to be a \emph{stationary measure} if
\begin{equation}
% Eqn 2.4
\label{eq:2.4}
\sum_x \pi(x) p(x,y) = \pi(y).
\end{equation}
\end{definition}
Equation~(\ref{eq:2.4}) says $P_\pi(X_1 = y) = \pi(y)$,
and by induction that $P_\pi(X_n = y) = \pi(y)$ for all $n \geq 1$.
If $\pi$ is a probability measure, then we call $\pi$ a stationary distribution. It
represents an equilibrium for the chain in the sense that, if $X_0$ has
distribution $\pi$, then so does $X_n$ for all $n$.
When the Markov chain is irreducible and aperiodic, the distribution of the state at time
$n$ converges pointwise to $\pi$ as $n \rightarrow \infty$, regardless of the
initial state. It is convenient to state this convergence result in terms of the
Markov chain's transition probability matrix $\P$. Before doing so, we note that
irreducibility of a Markov chain is equivalent to irreducibility (in the usual
matrix theory sense) of its transition probability matrix. Furthermore, it turns
out that a transition probability matrix of an aperiodic Markov chain falls into
that class of matrices often called acyclic, but for simplicity we will call
such stochastic matrices aperiodic. With this terminology, we can state the
convergence theorem in terms of the transition probability matrix \P.
% Theorem 2.1.1
\begin{theorem}
\label{thm-2.1.1}
Suppose $\P$ is irreducible, aperiodic, and has stationary distribution
$\pi$. Then as $n\rightarrow \infty$, $p^n(i,j) \rightarrow \pi(j)$.
\end{theorem}
The notation $p^n(i,j)$ means the $(i,j)$ element of the $n$th power of $\P$.
A Markov chain whose transition probability matrix satisfies the hypotheses of
Theorem~\ref{thm-2.1.1} is called \emph{ergodic}. If we simulate an ergodic chain for
sufficiently many steps, having
%
%
%
% ---------------10.txt --------------------------------------------------
%
%
%
begun in any initial state, the final state is a sample point from a
distribution that is close to $\pi$.
To make this statement more precise requires that we define ``close.''
\begin{definition}
%% Definition 2.1.5
Let $\pi$ be a probability measure on $S$. Then the \emph{total variation distance} at time
$n$ with initial state $x$ is given by
\[
\Delta_x(n) = \|\P^n(x, \cdot) - \pi\|_{TV} = \max_{A\in \S} | \P^n(x,A)-\pi(A)|.
\]
\end{definition}
In what follows, we will measure rate of convergence using the function
$\tau_x(\epsilon)$, defined as the first time after which the total variation
distance is always less than $\epsilon$. That is,
\[
\tau_x(\epsilon) = \min\{m : \Delta_x(n) \leq \epsilon \text{ for all } n\geq
m\}.
\]
To begin our consideration of the connection between convergence rates of Markov
chains and eigenvalues, we first note that a reversible, aperiodic stochastic matrix $\P$ has real
eigenvalues
$1 = \lambda_0 > \lambda_1 \geq \lambda_2 \geq \cdots \geq \lambda_{d-1} \geq -1$,
where $d = |S|$ is the dimension of the state space. For an ergodic chain, the
rate of convergence to the stationary distribution $\pi$ is bounded by a
function of the \emph{subdominant} eigenvalue. By subdominant eigenvalue we mean that
eigenvalue which is second largest in absolute value, and we denote this
eigenvalue by $\lambda_{\max} = \max\{\lambda_1, |\lambda_{d-1}|\}$.
The function bounding the rate of convergence of a Markov chain is given by the
following theorem ($\log$ denotes the natural logarithm):
\begin{theorem}
% Theorem 2.1.2
\label{thm-2.1.2}
The quantity $\tau_x(\epsilon)$ satisfies
\begin{enumerate}
\item
$\tau_x(\epsilon) \leq (1-\lambda_{\max})^{-1}(\log \pi(x)^{-1} + \log \epsilon^{-1})$;
\item $\max_{x \in S} \tau_x(\epsilon) \geq \frac{1}{2} \lambda_{\max}(1-\lambda_{\max})^{-1} \log(2\epsilon)^{-1}$.
\end{enumerate}
\end{theorem}
As this theorem shows, if we have an upper bound on the subdominant eigenvalue, then we have
an upper bound on the function $\tau_x(\epsilon)$. In what follows, we will
derive an approximation to the
% 11.txt
subdominant eigenvalue and supply error bounds. Together, an approximation and error bounds
for $\lambda_{\max}$ provide enough information to make Theorem~\ref{thm-2.1.2} useful.
\section{Functions on the State Space}
% Sec 2.2
\label{sec:funct-state-space}
Recall that $X_n(\omega) = \omega_n\in S$ denotes the state in which the Markov
chain exists at time $n$.
Suppose that
$\Phi = \{\phi_1, \dots, \phi_p\}$
is a collection of $p$ \emph{observables}, or functions defined on the state
space $S$. Furthermore, let these observables be real valued,
$\phi_i : S \rightarrow \R$.
It is often useful to assume that none of the observables is a constant function. Suppose now that
the state space $S$ is finite with $d$ possible states. Then, since an
observable is simply a map of the state space, we can think of each $\phi_i$ as a
vector of $d$ real numbers---the $d$ values that it takes on at the different states.
Now assume the Markov chain is irreducible, and let $\pi$ denote its stationary distribution.
If we start the chain from its stationary distribution---i.e., suppose $X_0$ has
distribution $\pi$---then $X_n$ is a stationary process.
Furthermore, for each $i$, $\phi_i(X_n)$ is a stationary stochastic process with
\emph{mean}
\[
\bE_\pi\phi_i = \sum_{x\in S} \pi(x)\phi_i(x)
\]
and \emph{autocovariance function}
\begin{align}
% Eqn 2.5
\label{eq:2.5}
\bC_\pi (\phi_i(X_n), \phi_i (X_{n+s}))
&= \bE_\pi[(\phi_i(X_n) - \bE_\pi\phi_i)(\phi_i(X_{n+s}) - \bE_\pi\phi_i)]\\
&= \sum_{x, y\in S} P_\pi(X_n = x, X_{n+s} = y) (\phi_i(x)- \bE_\pi\phi_i)(\phi_i(y) - \bE_\pi\phi_i).\nonumber
\end{align}
By the definition of conditional probability, we can write~(\ref{eq:2.5}) as follows:
\[
\sum_{x, y\in S} P_\pi(X_n = x) P_\pi(X_{n+s} = y\mid X_n = x) (\phi_i(x)-
\bE_\pi\phi_i)(\phi_i(y) - \bE_\pi\phi_i).
\]
Equivalently,
\[
\sum_{x, y\in S} \pi(x)p^s_{xy} (\phi_i(x)-\bE_\pi\phi_i)(\phi_i(y) - \bE_\pi\phi_i).
\]
Here $p^s_{xy}$ denotes the element in row $x$ and column $y$ of $\P^s$, the
$s$th power of the transition probability matrix.
Similarly, we define the \emph{cross-covariance} between the function
$\phi_i$ at time $n$ and $\phi_j$ at time $n+s$ as
\begin{align}
% Eqn 2.6
\bC_\pi (\phi_i(X_n), \phi_j (X_{n+s}))
&= \bE_\pi[(\phi_i(X_n) - \bE_\pi\phi_i)(\phi_j(X_{n+s}) - \bE_\pi\phi_j)]\nonumber\\
&= \sum_{x, y\in S} \pi(x)p^s_{xy} (\phi_i(x)-\bE_\pi\phi_i)(\phi_j(y) - \bE_\pi\phi_j).
\end{align}
Now let $\<\Phi\>$ denote the matrix of mean vectors whose $j$th column is
$\bE_\pi\phi_j\one$, where $\one = (1,\dots, 1)^t$,
and let $\Pi = \diag(\pi(\omega_1),\dots, \pi(\omega_d))$ be the $d \times d$
diagonal matrix with stationary probabilities $\pi(\omega)$
on the main diagonal and zeros elsewhere.
Finally, denoting by $\C(s)$ the $p \times p$ covariance matrix
whose $(i,j)$ element is $\bC_\pi(\phi_i(X_n), \phi_j(X_{n+s}))$, we have
\begin{align*}
\C(0) &= \bE(\Phi(X_n) - \<\Phi\>)(\Phi(X_n) - \<\Phi\>)^t\\
&= (\Phi - \<\Phi\>)^t\Pi (\Phi-\<\Phi\>), \\
\C(s) &= \bE(\Phi(X_n) - \<\Phi\>)(\Phi(X_{n+s}) - \<\Phi\>)^t\\
&= (\Phi - \<\Phi\>)^t \Pi \P^s (\Phi - \<\Phi\>).
\end{align*}
Below, we will also find it useful to have at our disposal a new matrix that is \emph{similar} to the
transition probability matrix. We have in mind the matrix
$\M = \Pi^{1/2}\P\Pi^{-1/2}$.
As is easily verified, this allows us to write the covariance matrix as
\begin{align}
\label{eq:2.7}
\C(s) &=
(\Phi - \<\Phi\>)^t \Pi^{\frac{1}{2}t}\M^s\Pi^{\frac{1}{2}}(\Phi - \<\Phi\>)\nonumber\\
&=\Psi^t \M^s \Psi, % Eqn 2.7
\end{align}
%% \\Phi\Phi‘M’\\Phi\Phi (2.7)
where we have defined $\Psi = \Pi^{\frac{1}{2}}(\Phi - \<\Phi\>)$. Recall that
our main motivation is that for out-of-core problems traditional eigenvalue
algorithms are inadequate. With this fact and the form~(\ref{eq:2.7}) % (2.7)
in
%
%
% ------------------ 13.txt ---------------------------------------------------
%
%
mind, we will consider using the covariance of observables on the state space to implement the
%% (otherwise useless)
Rayleigh-Ritz procedure, which we describe below. This procedure requires
that $\M$ be symmetric. As the next fact demonstrates,
%% this required symmetry is the reason for our
%% interest in the reversibility property of the Markov chain.
this need for symmetry is the reason we insist that the Markov chain be reversible.
\begin{fact}
%% Fact 2.2.1
The matrix $\M$ is symmetric if and only if the Markov chain is reversible (i.e., iff the
process satisfies the \emph{detailed balance} condition).
\end{fact}
\begin{proof}
\begin{align*}
\M \text{ is symmetric }
& \quad \Longleftrightarrow \quad
(\Pi^{1/2}\P\Pi^{-1/2})^t = \Pi^{1/2}\P\Pi^{-1/2}\\
& \quad \Longleftrightarrow \quad
\Pi^{-\frac{1}{2}t}\P^t\Pi^{\frac{1}{2}t} = \Pi^{1/2}\P\Pi^{-1/2}\\
& \quad \Longleftrightarrow \quad
\P^t\Pi^t = \Pi\P.
\end{align*}
Elementwise, the final equality is $\pi_i p_{ij} = \pi_j p_{ji}$.
According to Definition~\ref{def:2.1.3}, this states
that $\pi$ is a reversible measure.
\end{proof}
% 14.txt
%%%-------------------------------------------------------------------
%%%
%%% \input{InvariantSubspaces} % Chapter 3
%%%
\chapter{Invariant and Approximate Invariant Subspaces}
\label{cha:invar-appr-invar}
\section{Invariant Subspaces}
\begin{definition}
%% Definition 3.1.1
A subspace $V \subseteq \R^n$ with the property that
\[
v\in V \quad \Longrightarrow \quad \M v \in V
\]
is said to be \emph{invariant} for $\M$.
\end{definition}
Recall that, having chosen observables $\Phi = (\phi_1, \dots, \phi_p)$, we
constructed the covariance matrix
$\C(s) = \Psi^t \M^s \Psi$.
If the column space of $\Psi$, which we denote by $\ran(\Psi)$, is an invariant
subspace for $\M$, the definition implies $\M\psi_j \in \ran(\Psi)$.
That is, for each $\psi_j$ there exists a vector
$t_j$ of coefficients such that $\M \psi_j = \sum_{i=1}^p t_{ij} \psi_i$.
This is true for all $j$ and, putting each vector of
coefficients into a matrix $\T$, we see that
$\M \Psi = \Psi \T$. Conversely, $\M\Psi = \Psi \T$ implies that $\M\psi_j$ is a
linear combination of columns of $\Psi$, so $\ran(\Psi)$ is invariant. We have
thus proved the following
% 15.txt
\begin{fact}
%% Fact 3.1.1
\label{fact:3.1.1}
The subspace $\ran(\Psi)$ is invariant for $\M$ if and only if there exists
$\T \in \R^{p\times p}$ such that $\M\Psi = \Psi \T$.
\end{fact}
Consequently,
\begin{fact}
%% Fact 3.1.2
\label{fact:3.1.2}
$\lambda(\T) \subseteq \lambda(\M)$.
\end{fact}
\begin{proof}
%Proof of 3.1.2:
~
\vskip-2cm
\begin{align*}
\lambda \in \lambda(\T)
& \quad \Longleftrightarrow \quad
(\exists v \in \R^p) \T v = \lambda v\\
& \quad \Longleftrightarrow \quad
\Psi \T v = \lambda \Psi v\\
& \quad \Longleftrightarrow \quad
\M \Psi v = \lambda \Psi v\\
& \quad \Longrightarrow \quad
\lambda \in \lambda(\M).
\end{align*}
The second equivalence follows from Fact~\ref{fact:3.1.1}. %3.1.1.
\end{proof}
%Facts~\ref{fact:3.1.1} and~\ref{fact:3.1.2} are theoretically useful. To see why
To see why Facts~\ref{fact:3.1.1} and~\ref{fact:3.1.2} are theoretically useful,
%consider the equation of Fact~\ref{fact:3.1.1}:
%% \begin{align*}
%% & \Psi \T = \M \Psi \\
%% \Longleftrightarrow \quad & \Psi^t\Psi \T = \Psi^t \M \Psi\\
%% \Longleftrightarrow \quad & \T = (\Psi^t\Psi)^t \Psi^t \M\Psi.
%% \end{align*}
%% \begin{equation*}
%% \Psi \T = \M \Psi \quad\Longleftrightarrow \quad \Psi^t\Psi \T = \Psi^t \M \Psi
%% \Longleftrightarrow \quad \T = (\Psi^t\Psi)^t \Psi^t \M\Psi.
%% \end{equation*}
note that the equation of Fact~\ref{fact:3.1.1}, $\Psi \T = \M \Psi$, is
equivalent to
$\Psi^t\Psi \T = \Psi^t \M \Psi$, which is equivalent to
$\T = (\Psi^t\Psi)^{-1} \Psi^t \M\Psi$.
In this form, we recognize that $\T = \C^{-1}(0)\C(1)$. That is, using only the
covariance of observables on the state space, we can generate a matrix $\T$ with
the property $\lambda(\T) \subseteq \lambda(\M)$. Recalling that
$\M = \Pi^{1/2}\P\Pi^{-1/2}$,
we see that $\M$ is similar to our original stochastic matrix $\P$,
and thus $\lambda(\M) = \lambda(\P)$.
\section{Approximate Invariant Subspaces}
% Sec 3.2
\label{sec:appr-invar-subsp}
%% We qualified the foregoing by stating that the facts are only theoretically
%% useful. This
We noted above that Facts~\ref{fact:3.1.1} and~\ref{fact:3.1.2} are
theoretically useful. Speaking practically now,
%% This is because
when choosing observables on the state space we may not be sure that
they will satisfy the
%
%
% -------------16.txt---------------------------------------------------
%
%
primary assumption underlying the two facts. %Facts 3.1.1 and 3.1.2.
Recall the assumption: $\ran(\Psi)$ is an invariant
subspace for $\M$ where $\Psi = \Pi^{1/2}(\Phi - \<\Phi\>)$. It may well be the
case that there exists $\psi_j\in \ran(\Psi)$ such that
$\M\psi_j \notin \ran(\Psi)$, thereby violating the assumption.
Even if we are lacking an invariant subspace, however, for some applications it
is reasonable to expect that observables can be chosen to provide at least
an \emph{approximate invariant subspace}, which is defined as follows:
\begin{definition}
%% Definition 3.2.1
If the columns of $\Psi\in \R^{d\times p}$ are independent and the norm of the
\emph{residual matrix} $\E = \M\Psi - \Psi \T$ is small for some
$\T \in \R^{p\times p}$, then
$\ran(\Psi)$ defines an \emph{approximate invariant subspace} for $\M$.
\end{definition}
To see how an approximate invariant subspace can be useful for approximating
eigenvalues of $\M$, we recall a theorem from Golub and Van Loan~\cite{Golub:1996}.
\begin{theorem}
% Theorem 3.2.1
\label{thm:3.2.1}
Suppose $\M \in \R^{d\times d}$ and $\T \in \R^{p\times p}$ are symmetric and let
$\E = \M\Q - \Q\T$, where $\Q \in \R^{d\times p}$ is orthonormal
(i.e., $\Q^t\Q = \I$). Then there exist $\mu_1,\dots, \mu_p \in \lambda(\T)$ and
$\lambda_1, \dots, \lambda_p \in \lambda(\M)$ such that
\[
|\mu_k - \lambda_k| \leq \sqrt{2}\|\E\|_2, \quad \text{ for } \quad k = 1, \dots, p.
\]
\end{theorem}
If the subspace $\ran(\Q)$ is an approximate invariant subspace, then the definition implies that
there is a choice $\T$ rendering the error $\|\E\|_2$ small, and thus the
eigenvalues of $\T$ provide a good approximation to those of $\M$.
When considering the foregoing ideas, it is apparent that their application
presents new---but hopefully less prohibitive---problems. As these problems
are the focus of the rest of the paper, now is a good time to examine
them.
First, the preceding theorem assumes a matrix $\Q$ whose columns form an
orthonormal basis for the approximate invariant subspace. For our problem,
derivation of such a $\Q$ is tricky, and we must prepare for this.
Next, having an approximate invariant subspace at our disposal merely tells us
that there exists some matrix $\T$ which makes the error
%
%
% ------------- 17.txt ----------------------------------------------------
%
%
$\|\E\|_2$ small. We must discover the form of such a $\T$. Moreover, it is
natural to seek that $\T$ which minimizes $\|\E\|_2$ for a given approximate
invariant subspace.
Finally, in order to apply these ideas to realistic eigenvalue problems, we must
find a practical way to generate %% the most appropriate
a good approximate invariant subspace.
We now address each of these issues in turn.
%the order raised.
Recall the matrix $\Psi = \Pi^{1/2}(\Phi - \<\Phi\>)$. The columns of this
matrix, though independent (by choice of independent observables), are not
necessarily orthonormal. However, consider the polar decomposition
$\Psi = \Q \Z$, where $\Q^t\Q = \I$ and $\Z^2 = \Psi^t\Psi$ is a symmetric
positive semidefinite matrix.\footnote{Recall that the polar decomposition is
derived from the singular value decomposition, $\Psi = \U\Sigma \V^t$, by
letting $\Q = \U \V^t$ and $\Z = \V\Sigma \V^t$.}
%% ‘Recall that the polar decomposition is derived from the SVD, \Psi = UZJV‘, by letting Q = UV‘ and Z = VEV‘.
Note that $\Z = (\Psi^t\Psi)^{1/2}$ is nonsingular, so $\Q$ has the form
\[
\Q = \Psi \Z^{-1} = \Psi(\Psi^t\Psi)^{-1/2},
\]
and it is clear that $\ran(\Q) = \ran(\Psi)$.
Perhaps $\ran(\Q)$ is a useful approximation to an invariant
subspace for $\M$. If $\ran(\Q)$ is not itself invariant, we have the error
matrix $\E =\M\Q - \Q\T$. Below we show that the
$\T$ which minimizes $\|\E\|_2$ is $\T = \Q^t\M\Q$.
This yields the following
\begin{theorem}
%Theorem 3.2.2
\label{thm:3.2.2}
If $\Psi = \Q\Z$ is the polar decomposition of $\Psi$, then the matrix
\[
\T = \Q^t\M\Q = (\Psi^t\Psi)^{-1/2}\Psi^t \M \Psi (\Psi^t\Psi)^{-1/2}
\]
minimizes $\|\E\|_2 = \|\M\Q - \Q\T\|_2$.
\end{theorem}
\begin{proof}
We prove the result by establishing the following
\\[6pt]
\underline{Claim:} Given $\M \in \R^{d\times d}$, suppose $\Q \in \R^{d\times p}$
satisfies $\Q^t\Q = \I$.
Then,
\[
\min_{\T\in \R^{p\times p}} \|\M\Q - \Q\T\|_2 = \|(\I- \Q\Q^t)\M\Q\|_2,
\]
and $\T = \Q^t\M\Q$ is the minimizer.
%
%
% -------------- 18.txt -------------------------------
%
%
The claim is verified by an easy application of the Pythagorean Theorem:
For any $\T \in \R^{p\times p}$ we have
\begin{align}
\label{eq:3.1}
\|\M\Q - \Q\T\|^2_2 &= \|\M\Q - \Q\Q^t\M\Q + \Q\Q^t\M\Q - \Q\T\|^2_2 \nonumber\\
&=\|(\I - \Q\Q^t)\M\Q + \Q(\Q^t\M\Q - \T)\|^2_2 \nonumber \\
&=\|(\I - \Q\Q^t)\M\Q\|^2_2 + \|\Q(\Q^t\M\Q - \T)\|^2_2 \\
&\geq\|(\I - \Q\Q^t)\M\Q\|^2_2. \nonumber
\end{align}
Equality~(\ref{eq:3.1}) holds since $\I - \Q\Q^t$ projects $\M\Q$ onto the
subspace orthogonal to
$\ran(\Q)$. Thus, the two terms in the expression on the right are orthogonal,
and the Pythagorean theorem yields equality. The concluding inequality
establishes that the minimizing $\T$ is that which annihilates the
second term in~(\ref{eq:3.1}), that is, $\T = \Q^t\M\Q$.
The second equality in Theorem~\ref{thm:3.2.2} is a consequence of the polar decomposition, in
which $\Q = \Psi \Z^{-1} = \Psi(\Psi^t\Psi)^{-1/2}$. Thus,
$\T = (\Psi^t\Psi)^{-1/2}\Psi^t \M \Psi (\Psi^t\Psi)^{-1/2}$
is the minimizer, as claimed.
\end{proof}
\subsection{The Krylov Subspace}
% Sec 3.3
\label{sec:krylov-subspace}
Suppose the columns of a matrix $\Q \in \R^{d\times p}$ give an orthonormal
basis for an approximate invariant subspace. Then, as we have seen,
\begin{enumerate}
\item
$\T = \Q^t\M\Q$ minimizes $\|\E\|_2 = \|\M\Q - \Q\T\|_2$ and
\item there exist $\mu_1, \dots, \mu_p\in \lambda(\T)$ and
$\lambda_1,\dots, \lambda_p \in \lambda(\M)$ such that
\[
|\mu_k-\lambda_k| \leq \sqrt{2}\|\E\|_2, \quad \text{ for $k=1,\dots, p$.}
\]
\end{enumerate}
%
%
% ---------------- 19.txt --------------------------------------------
%
%