In [5]:
import math 
import numpy as np
import pandas as pd
import numpy.linalg as alg
import matplotlib.pyplot as plt

### Simulation Experiments on Phase Transition in PCA

In [None]:
"""
Args:
    iteration: number of trials
    Var_signal: contians the variances of data signal for trial
    S: covatiance matrix in a trial
    L: list of the max eigen_values of every trial
    V: array of the corresponding eigen_vector for the max eigen_value of every trial
"""
p = 100
n = 500
gamma = p/n
sigma = 1
u = np.ones(p)/p
Ip = np.eye(p)
zero_mean = np.zeros(p)
iteration = 2000
Var_signal = [i/iteration for i in range(1, iteration+1)]
S = np.zeros((p, p))
L = []
V = np.zeros(p)

"""Find the max eigen_value and eigen_vector for all cases with different data signal strength"""
"""We denote the strength of data signal by it's variance lambda"""
"""
Args:
    Var_signal: contians the variances of data signal for trial
    j: denotes the index of a sample in a trial
    alpha: is a normal distribution with 0 mean and variance v as data signal
    t: the true direction of data, supposed to be e
    epsilon: p-dim normal distribution, standing for the noise
    S: covatiance matrix in a trial
"""

# For each Trial
for i, v in enumerate(Var_signal):
    # For each sample
    for j in range(n):
        alpha = np.random.normal(loc=0, scale=math.sqrt(v), size=1)
        t = alpha * u
        epsilon = np.random.multivariate_normal(zero_mean, Ip, 1)
        x = t + epsilon
        # Compute the Covariance Matrix
        S += 1/n * np.dot(x, x.T)
    
    # Do EVD
    eigen_values, eigen_vectors = alg.eig(S)
    eigen_pairs = [(eigen_values[i], eigen_vectors[:, i]) for i in range(len(eigen_values))]
    # Sort the eigen_pairs by eigen_value in decreasing order
    eigen_pairs.sort(key= lambda eigen_pairs: eigen_pairs[0], reverse=True)
    # Find the max eigenvalue and corresponding eigenvector
    lambda_max = eigen_pairs[0][0]
    vector_max = eigen_pairs[0][1]
    
    if i%10 == 0:
        print("The {}-th trial with data signal variance {}".format(i+1, v))
        print("The max eigen value is {}".format(lambda_max))
        print("The correspondign eigen vector is\n {}".format(vector_max))
        print("\n")
    
    # Record the max eigenvalue and corresponding eigenvector
    L.append(lambda_max)
    if V.all() == 0:
        V = vector_max
    else:
        V = np.hstack((V, vector_max))
        
        
V = V.astype(np.float64)


The 1-th trial with data signal variance 0.0005
The max eigen value is (10151.2257755271+0j)
The correspondign eigen vector is
 [-0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j
 -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j
 -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j
 -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j
 -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j
 -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j
 -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j
 -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j
 -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j
 -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j
 -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j
 -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+0.j -0.1+

### Conclusions

All basic conclusions can be verified by the simulation experiments. However, the eigen vector corresponding to the max eigen value will be orthogonal to the ture direction u from time to time. I tried a lot of combinations of hyperparameters (p, n iteration), however, the same problem appears every time. 

Besides, I found it impossible to verify the case where p and n go to ifinity at the same time because even a server with 80 gpus will shutdown once I assign p = 5000 and n =10000.