# PCA Implementation

Implements a PCA method that we will test on a small sample of the dataset.

# 1 Set up Spark Environment

Set up the Spark environment so we can partition the data.

References: 
* https://towardsdatascience.com/pyspark-in-google-colab-6821c2faf41c

In [0]:
# Install necessary dependencies, if needed (only need to run once!!!)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.gtlib.gatech.edu/pub/apache/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
# Check that the install worked: the listing should contain the unpacked
# directory spark-2.4.5-bin-hadoop2.7 (next to the downloaded .tgz archive).
!ls

sample_data  spark-2.4.5-bin-hadoop2.7	spark-2.4.5-bin-hadoop2.7.tgz


In [0]:
# Point the JVM and Spark lookups at the locations installed above so that
# PySpark can be started from inside Colab.
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.5-bin-hadoop2.7",
})

In [0]:
# Create (or reuse) the local Spark context and its SQL context
import findspark
findspark.init()  # run before importing pyspark so the SPARK_HOME install is used
import pyspark
from pyspark.sql import SQLContext
# getOrCreate keeps this cell re-runnable: constructing a second SparkContext
# while one is already active raises an error, whereas getOrCreate returns the
# existing context instead.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("pca"))
sqlContext = SQLContext(sc)

## 2 TODO Load Raw Data

See `data_loading.ipynb` for starter code.

The full data is hosted on GCS. We only want a small sample of it, e.g. `val_data.tar` (~2 GB).

References:
* https://stackoverflow.com/questions/51715268/how-to-import-data-from-google-cloud-storage-to-google-colab

In [0]:
from google.cloud import storage
import tarfile

import numpy as np  # used below; no earlier cell imports numpy

# NOTE(review): the recorded run of this cell ended in DefaultCredentialsError,
# so storage.Client() found no credentials. Authenticate the runtime before
# running this cell -- confirm the intended auth mechanism for this project.

# Connect to GCS bucket
bucket_name = "dataproc-staging-us-central1-759291875656-wohgf1sk"
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

# Identifying blobs of data in the bucket
file_prefix = "data/"
blobs = bucket.list_blobs(prefix=file_prefix, delimiter = '/')

# Downloading a specific blob of data
file_name = "val_data.tar"
blob = bucket.get_blob(file_prefix + file_name)
if blob is None:
    # get_blob returns None when the object does not exist; fail with a clear
    # message instead of an AttributeError on the next line.
    raise FileNotFoundError(
        "gs://{0}/{1}{2} not found".format(bucket_name, file_prefix, file_name))
blob.download_to_filename(file_name)

# Reading the data: one uint8 array of raw file bytes per archive member.
arr = []
with tarfile.open(file_name) as tar_file:  # closed even if reading fails
    for member in tar_file.getmembers():
        f = tar_file.extractfile(member)
        if f is None:
            # extractfile returns None for members that are not regular files
            # (e.g. directories); there is nothing to read for those.
            continue
        # frombuffer needs the bytes themselves, not the file object, and the
        # payload is a byte stream, so read it as uint8.
        img_bytes = np.frombuffer(f.read(), dtype=np.uint8)
        arr.append(img_bytes)

DefaultCredentialsError: ignored

In [0]:
# Show the first loaded entry as an image.

import io

import matplotlib.pyplot as plt
from PIL import Image

# Wrap the first entry's raw bytes in an in-memory stream so PIL can decode it.
img_data = arr[0]
img_stream = io.BytesIO(img_data)
image = Image.open(img_stream)
plt.imshow(image)

## 3 Parallelize Data

Parallelize the data so we can distribute work across the cluster.

In [0]:
import numpy as np
import matplotlib.pyplot as plt

In [0]:
# Distribute the dataset as an RDD on the Spark context created above.
# NOTE(review): `val_data` is not defined in any cell visible in this notebook
# (the loading cell builds a list named `arr`) -- confirm which variable is
# meant and that its entries are equal-length numeric vectors, as the PCA
# functions below assume.
par_data = sc.parallelize(val_data)

In [0]:
# Sanity check: report how many records the RDD holds and peek at the first one.
num_records = par_data.count()
print('Size:', num_records)
first_record = par_data.take(1)
print('Sample Entry:', first_record)

## 4 Compute Covariance Matrix

Function to compute the covariance of the data in an RDD.

In [0]:
def compute_covariance(data):
  """
  Covariance matrix of the records in an RDD.

  The records are centered on the RDD mean, the outer product of every centered
  record with itself is summed, and the sum is divided by the record count
  (i.e. normalization by n, not n - 1).

  Args:
    data: (RDD of np arrays) RDD representing the data
  Returns:
    covmat: (np array) covariance matrix of the RDD
  """
  num_records = data.count()
  center = data.mean()

  def centered_outer(record):
    # Outer product of one mean-centered record with itself.
    shifted = record - center
    return np.outer(shifted.T, shifted)

  outer_sum = data.map(centered_outer).reduce(lambda acc, nxt: acc + nxt)
  return outer_sum / float(num_records)

## 5 Eigendecomposition

Perform eigendecomposition of covariance matrix to determine the directions of maximal variance, i.e. the principal components.

In [0]:
from numpy.linalg import eigh

def pca(data, k=2):
  """
  Principal component analysis of an RDD via eigendecomposition of its covariance.

  Args:
    data: (RDD of np arrays) RDD representing the data.
    k: (int) number of principal components to find
  Returns:
    (eigenvectors, scores, eigenvalues): (np.ndarray, RDD of np.ndarray, np.ndarray)
      eigenvectors: the `k` eigenvectors with the largest eigenvalues, as columns.
      scores: each record of `data` (as given, without mean-centering) projected
        onto those `k` eigenvectors.
      eigenvalues: all eigenvalues, sorted in descending order.
  """
  cov = compute_covariance(data)

  # eigh handles the symmetric covariance matrix; eigenvectors come back as columns.
  values, vectors = eigh(cov)

  # Indices of the eigenvalues from largest to smallest.
  order = np.argsort(values)[::-1]
  top_k = order[:k]

  components = vectors[:, top_k]
  eigenvalues = values[order]

  def project(record):
    # Coordinates of one record along the selected components.
    return record.dot(components)

  scores = data.map(project)

  return (components, scores, eigenvalues)

In [0]:
# Apply our PCA to the parallelized data, keeping the top two components
top_comps, top_scores, top_eigenvals = pca(par_data, k=2)

In [0]:
# Inspect the PCA outputs: components, the first three scores, and eigenvalues
print('top components: \n{0}'.format(top_comps))

first_scores = top_scores.take(3)
scores_text = '\n'.join(str(score) for score in first_scores)
print('\ntop scores (first three): \n{0}'.format(scores_text))

print('\ntop eigenvalues: \n{0}'.format(top_eigenvals))

## 6 Test & Visualize Results

Test the PCA implementation on the MNIST dataset.

Plots the original data and its reconstructions using the top `k` principal components returned by our PCA function.

### 6.1 Test on MNIST Dataset

In [0]:
# Imported here so the cell works when the notebook is run top to bottom: the
# only other keras import sits in the later TensorFlow section.
from tensorflow import keras

# Download (or load the cached copy of) the MNIST train/test split
mnist = keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# normalizing the data: scale pixel values by 1/255
x_train, x_test = x_train / 255.0, x_test /255.0

### 6.2 Plot Results

In [0]:
# TODO: implement projection function

In [0]:
# TODO Test & plot it

# PCA using Tensorflow

References:
* https://medium.com/@mukesh.mithrakumar/principal-component-analysis-with-tensorflow-2-0-395aaf96bc

In [1]:
# Install tensorflow_transform and the packages it is used with here.
# Every install is pinned to an exact version, and each import is placed after
# the matching install, so the statement order in this cell matters.
import argparse
import os
import pprint
import tempfile
import urllib.request
import zipfile

print("Installing dependencies for Colab environment")
# -U upgrades an already-installed copy to the pinned version; -q keeps pip quiet
!pip install -Uq grpcio==1.26.0

import tensorflow as tf

print('Installing Apache Beam')
!pip install -Uq apache_beam==2.16.0
import apache_beam as beam

print('Installing TensorFlow Transform')
!pip install -Uq tensorflow-transform==0.15.0

Installing dependencies for Colab environment
[K     |████████████████████████████████| 2.4MB 2.8MB/s 
[?25hInstalling Apache Beam
[K     |████████████████████████████████| 3.0MB 2.8MB/s 
[K     |████████████████████████████████| 81kB 6.1MB/s 
[K     |████████████████████████████████| 61kB 7.9MB/s 
[K     |████████████████████████████████| 1.2MB 26.0MB/s 
[K     |████████████████████████████████| 51kB 8.5MB/s 
[K     |████████████████████████████████| 153kB 36.7MB/s 
[K     |████████████████████████████████| 225kB 32.5MB/s 
[K     |████████████████████████████████| 112kB 32.0MB/s 
[?25h  Building wheel for oauth2client (setup.py) ... [?25l[?25hdone
  Building wheel for avro-python3 (setup.py) ... [?25l[?25hdone
  Building wheel for hdfs (setup.py) ... [?25l[?25hdone
  Building wheel for dill (setup.py) ... [?25l[?25hdone
  Building wheel for httplib2 (setup.py) ... [?25l[?25hdone
[31mERROR: pydrive 1.3.1 has requirement oauth2client>=4.0.0, but you'll have oauth2c

In [0]:
import numpy as np
import tensorflow as tf
import tensorflow_transform as tft
from tensorflow import keras
from PIL import Image
import io
from tempfile import mkdtemp
import pickle as pkl
import tarfile
import scipy
import time

In [3]:
# Mount Google Drive at /content/drive so the preprocessed data file can be read
# from it below. This prompts interactively for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


Getting the preprocessed data

In [0]:
# Memory-map the preprocessed images read-only as float32 (1803460 images of
# 32x32), view them as one flat row per image, then build a float64 tensor.
# NOTE(review): the float64 conversion presumably materializes the whole array
# in memory -- confirm the runtime has enough RAM for it.
filepath = "/content/drive/My Drive/CMU 2020 S1 (Spring)/10-405: Machine Learning with Large Datasets/10605 Term Project/Data/train_img_array.pkl"
fp = np.memmap(
    filepath,
    mode='r',
    dtype='float32',
    shape=(1803460, 32, 32),
).reshape((1803460, -1))
data_tensor = tf.convert_to_tensor(fp, dtype=tf.float64)

print('Finished loading data')

In [1]:
# matplotlib is not part of this section's import cell, so import it here.
import matplotlib.pyplot as plt

# Keep the first 1,803,000 rows as the training set
train_data = fp[:1803000]
print('Data shape:', train_data.shape)
# Each row was flattened from 32x32 to 1024 values when the data was loaded;
# restore the 2-D shape so imshow can draw it.
plt.imshow(train_data[100].reshape(32, 32), cmap='Greys')

NameError: ignored

Running PCA on the data

In [0]:
# Run a full garbage-collection pass before the PCA cell below. As the last
# expression in the cell, the value returned by gc.collect() is displayed.
import gc
gc.collect()

In [0]:
# run pca & time it
# NOTE(review): tft.pca is documented as a TensorFlow Transform analyzer, which
# is normally invoked from inside a preprocessing_fn -- confirm that it behaves
# as intended when called directly on a tensor like this.
start_time = time.time()
# `tft` is the tensorflow_transform alias from the import cell (there is no
# `tf.tft` attribute). dtype must be a TensorFlow float dtype, so use float64
# to match data_tensor.
top_eigenvecs = tft.pca(data_tensor, 128, dtype=tf.float64)
end_time = time.time()
print('Original dimension: %d' % (data_tensor.shape[1]))
print('Output eigenvecs shape:', top_eigenvecs.shape)
# Report the interval measured around the PCA call only, not the prints above.
print("handling %d data: --- %s seconds ---" % (data_tensor.shape[0], end_time - start_time))