In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load IPython's autoreload extension and select mode 2, which reloads imported
# modules before each cell runs so edits to the project's `src` package are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import argparse
import os
import sys
import numpy as np
import torch
from matplotlib import pyplot as plt
import pandas as pd

# Make the tool-presence checkout importable so the `from src import ...`
# lines that follow can resolve. The location can be overridden with the
# TOOL_PRESENCE_ROOT environment variable; when it is unset (or empty) the
# original hard-coded path is used, so existing setups keep working.
module_path = os.path.abspath(
    os.environ.get('TOOL_PRESENCE_ROOT') or '/users/dli44/tool-presence')
# Append only once so re-running the cell does not keep growing sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

from src import constants as c
from src import utils
from src import visualization as v
from src import model as m
from src import gmm

In [None]:
import matplotlib
# Global matplotlib settings: typeset figure text with LaTeX (this needs a
# working LaTeX installation) and render figures at 200 DPI.
matplotlib.rcParams.update({
    'text.usetex': True,
    'figure.dpi': 200,
})

In [None]:
# Build the command-line parser, then parse an empty argument list so every
# option takes its default value inside the notebook.
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--root', type=str, default=os.path.abspath('.'),
                    help='Root directory of tool-presence')
# The path-like options share one shape: a string that defaults to ''.
for flag in ('--train', '--test', '--model-path', '--fit-path'):
    parser.add_argument(flag, type=str, default='')
parser.add_argument('-v', '--verbose', action="store_true",
                    help="increase output verbosity")
args = parser.parse_args([])

# Settings that are not parser options are attached to the namespace directly.
vars(args).update(
    augmentation=False,
    image_size=64,
    image_channels=3,
    batch_size=32,
    data_dir="../data/youtube_data/",
)

In [None]:
# Build the datasets from the settings in `args` using the project helper.
# The call's result is unpacked as a pair: the first element is kept as
# `datasets` (later cells read its 'train' entry) and the second is discarded.
datasets, _ = utils.setup_data(args)

In [None]:
# Turn every training item into a single row vector. Each item is indexed
# with [0] to take its first element (presumably the image tensor — confirm
# against utils.setup_data), converted to NumPy and flattened to shape (1, -1).
a = [
    datasets['train'][idx][0].numpy().reshape(1, -1)
    for idx in range(len(datasets['train']))
]

In [None]:
# Stack the per-item row vectors into one 2-D matrix with one row per
# training item; the column count is inferred from the data.
a = np.reshape(np.array(a), (len(datasets['train']), -1))

In [None]:
from sklearn import decomposition

In [None]:
from tqdm import tnrange

# Fit PCA once with the largest component count of interest (999) instead of
# refitting for every count from 2 to 999. scikit-learn reports
# explained_variance_ratio_ per component, ordered from largest to smallest,
# so the variance explained by the first k components is the running total of
# the first k ratios.
# `pca` and `projected` are the 999-component fit, which later cells reuse.
pca = decomposition.PCA(999)
projected = pca.fit_transform(a)

# explained_variance[j] is the total ratio explained by j + 2 components
# (i.e. 2..999), giving 998 entries in the layout the plotting cell slices.
explained_variance = np.cumsum(pca.explained_variance_ratio_)[1:].tolist()

In [None]:
# Plot total explained variance against component count for 2..99 components
# (the first 98 entries of the sweep) and save the figure next to the notebook.
component_counts = range(2, 100)
plt.plot(component_counts, explained_variance[:98])
plt.xlabel("Number of components")
plt.ylabel("Explained variance ratio")
plt.savefig("pca_plot.png", bbox_inches='tight')

In [None]:
# Sanity check: print how many component counts the sweep recorded.
print(len(explained_variance))

In [None]:
# Report the shape of the flattened data and of its PCA projection, then the
# total explained variance ratio followed by the per-component ratios of the
# currently fitted `pca`.
print(a.shape)
print(projected.shape)
variance_ratios = pca.explained_variance_ratio_
print('Explained variation per principal component: {} {}'.format(
    sum(variance_ratios), variance_ratios))

In [None]:
# Seed NumPy's global generator so the same 10 training indices are drawn on
# every run. randint samples with replacement, so an index may repeat.
np.random.seed(100)
indices = np.random.randint(low=0, high=len(datasets['train']), size=10)

In [None]:
# Map the projected data back into the original feature space using the
# currently fitted `pca`, giving one reconstructed row per training item.
a_post_pca = pca.inverse_transform(projected)

In [None]:
# Pick the 10 selected rows from the original and the reconstructed data,
# reshape each row to (3, 64, 64) and move the size-3 axis last so every
# image is laid out as (64, 64, 3) for plotting.
image_shape = (10, 3, 64, 64)
channels_last = (0, 2, 3, 1)
sample = np.transpose(a[indices].reshape(image_shape), channels_last)
test_sample = np.transpose(a_post_pca[indices].reshape(image_shape), channels_last)

In [None]:
# Show the sampled originals (top strip) above their PCA reconstructions
# (bottom strip): np.hstack joins the images of each set side by side and
# np.vstack stacks the two strips.
plt.figure()
comparison = np.vstack([np.hstack(sample), np.hstack(test_sample)])
plt.imshow(comparison)
plt.axis("off")
# Take the dimensionality from the fitted model rather than hard-coding it,
# so the title always matches the reconstructions that are actually plotted.
plt.title('{}-dimensional PCA reconstructions'.format(pca.n_components_))
plt.savefig("PCA1.png", bbox_inches='tight')