# Quick Start Guide

2FIX: REMEMBER, USERS MAY ARRIVE HERE WITH BASICALLY NO CONTEXT AT ALL

2FIX: ADD TEXT (with references/points (how specific?) to user guide and technical summary)
* mention run_simple method up here?




## Make sure that dqm is in your PYTHONPATH

In [None]:
### uncomment the code below and edit as needed

# NOTE: you need the *parent* folder of the Python 'dqm' folder in the path.
# the 'dqm' folder below is the *outer* folder, containing the README file, etc.

#import os, sys
#sys.path.append(os.path.join(os.path.expanduser('~'), 'dqm'))

## Imports

In [None]:
### imports

import numpy as np
from dqm import dqm, plot_frames, extract_manifolds

# matplotlib is optional: it is only used for the overlap histogram further down.
# Catch ImportError specifically so that Ctrl-C (KeyboardInterrupt) or SystemExit
# raised during the import is not silently swallowed.
try:
    import matplotlib.pyplot as plt
    HAVE_PLT = True
except ImportError:
    HAVE_PLT = False
print('have PyPlot:', HAVE_PLT)

## Create a very simple data set

* 20 dimensions
* 4 spherical clusters grouped in 2 superclusters

In [None]:
def random_points_in_sphere(num_points, num_dims, radius, rand_seed=0):
    '''
    return a (num_points x num_dims) array of random points lying inside a sphere
    of the given radius, centered on the origin

    each point gets a random direction (a draw from the cube [-1, 1)^num_dims, rescaled)
    and a random distance from the origin equal to radius * sqrt(u), with u uniform in [0, 1),
    which skews the distances toward the surface of the sphere.

    rand_seed seeds the random-number generator, so results are reproducible.
    '''

    generator = np.random.default_rng(rand_seed)
    samples = generator.uniform(low=-1, high=1, size=(num_points, num_dims))

    # each 'vec' is a view onto one row of 'samples', so writing to vec[:] rescales that row in place
    for vec in samples:
        target_length = radius * (generator.random() ** 0.5)
        vec[:] = vec * target_length / np.linalg.norm(vec)  # L2 norm is the default
    # end for each point/row

    return samples
# end function random_points_in_sphere


### build the data set

# set parameters
num_points_per_cluster = 100
num_dims = 20
cluster_radius = 4.0
super_sep = 12  # separation between superclusters
sub_sep = 10  # separation of clusters within superclusters


def _random_unit_vector(rng, num_dims):
    '''
    draw a random vector from the cube [-1, 1)^num_dims and rescale it to unit L2 norm
    '''
    vec = rng.uniform(low=-1, high=1, size=num_dims)
    return vec / np.linalg.norm(vec)
# end function _random_unit_vector


# create 4 spherical clusters centered on the origin (cluster i uses random seed i)
cluster0, cluster1, cluster2, cluster3 = (
    random_points_in_sphere(num_points_per_cluster, num_dims, cluster_radius, rand_seed=seed)
    for seed in range(4)
)

rng = np.random.default_rng(17)

# create 2 superclusters by pushing clusters 0 and 1 in a random direction away from the origin and
# pushing clusters 2 and 3 in the opposite direction
# (the name 'direction' is used rather than 'dir' so that the builtin dir() stays usable in the notebook)
super_offset = _random_unit_vector(rng, num_dims) * super_sep / 2
cluster0 += super_offset
cluster1 += super_offset
cluster2 -= super_offset
cluster3 -= super_offset

# separate clusters within superclusters: each supercluster gets its own random direction,
# and its 2 clusters are pushed apart along it
sub_offset_01 = _random_unit_vector(rng, num_dims) * sub_sep / 2
cluster0 += sub_offset_01
cluster1 -= sub_offset_01
sub_offset_23 = _random_unit_vector(rng, num_dims) * sub_sep / 2
cluster2 += sub_offset_23
cluster3 -= sub_offset_23

# cat clusters together into single matrix (rows are ordered cluster 0, 1, 2, 3)
dat = np.concatenate((cluster0, cluster1, cluster2, cluster3), axis=0)

print('shape of raw data:', dat.shape)

## Create a cluster color scheme

In [None]:
# one RGB color per cluster, in the same order as the clusters were stacked into 'dat'
palette = [
    (1, 0, 0),        # cluster 0 is red
    (0, 1, 0),        # cluster 1 is green
    (0, 0, 1),        # cluster 2 is blue
    (0.7, 0.7, 0.7),  # cluster 3 is gray
]

# one row of RGB values per data row; each cluster occupies a contiguous run of rows
cluster_colors = np.zeros((dat.shape[0], 3))
for cluster_idx, rgb in enumerate(palette):
    first_row = cluster_idx * num_points_per_cluster
    cluster_colors[first_row:first_row + num_points_per_cluster, :] = rgb
# end for each cluster

# plot the first 3 columns of the 'dat' matrix
plot_frames(dat, color=cluster_colors)

## Create a dqm instance and store the raw data

In [None]:
# NOTE: the name 'dqm' refers to the imported class until this cell runs, and to the instance
# afterward. Recover the class from the instance when needed, so that re-running this cell
# creates a fresh instance instead of failing with "'dqm' object is not callable".
dqm_class = dqm if callable(dqm) else type(dqm)
dqm = dqm_class()
dqm.verbose = True  # default True
dqm.raw_data = dat

## Run PCA

explain what's interesting about the results (see user guide for more detail)
* difference between left and middle plots

In [None]:
# run PCA on the dqm instance -- presumably on the raw data stored in 'dqm.raw_data' above
# (see the user guide for details of the output)
dqm.run_pca()

## Choose how many PCA dimensions to use

Choose the number of PCA dimensions either explicitly (`pca_num_dims`) or by setting a threshold for the cumulative variance (`pca_var_threshold`).

In [None]:
## choose a cumulative-variance threshold and create frame 0

# NOTE(review): clear_pca() presumably resets any earlier PCA dimension choice so that this cell
# can be re-run with different settings -- confirm in the user guide
dqm.clear_pca()

dqm.pca_var_threshold = 0.9
# OR, to choose the number of PCA dimensions explicitly instead:
# dqm.pca_num_dims = <number of PCA dimensions to use>

dqm.create_frame_0()

In [None]:
# plot the first 3 columns of frame 0 (index 0 on the last axis of 'dqm.frames').
# 'plot_frames_ipv' is not among this notebook's imports, so calling it raises a NameError;
# use the imported 'plot_frames' instead, called the same way as for the raw-data plot.
plot_frames(dqm.frames[:, :3, 0], color=cluster_colors)

In [None]:
## (optionally) choose a limited basis
# integer division: the basis size is a number of rows, so it must stay an int
# (true division with '/' would produce a float such as 200.0)
dqm.basis_size = dqm.raw_data.shape[0] // 2  # use half the points as a basis
dqm.choose_basis_by_distance()

In [None]:
## build a color/size scheme that makes the basis rows stand out

num_rows = dat.shape[0]
basis_rows = dqm.basis_row_nums

# colors: everything light gray, basis rows in orange
basis_colors = np.full((num_rows, 3), 0.8)
basis_colors[basis_rows, :] = (1, 0.6, 0)

# sizes: basis-row points are drawn bigger than the rest
sizes = np.full(num_rows, 1.5)
sizes[basis_rows] = 2

#fig = plot_frames_ipv(dqm.frames[:, :3, 0], size=sizes, color=basis_colors, labels=['1', '2', '3'])

In [None]:
# let the dqm instance choose a value of sigma for the selected basis
# NOTE(review): presumably this sets 'dqm.sigma', which is printed further down -- confirm
dqm.choose_sigma_for_basis()

In [None]:
### inspect the overlaps of the non-basis rows

print('building overlaps...')
overlaps = dqm.build_overlaps()

# summary statistics: current sigma first, then min/mean/median/max of the overlaps
summary_template = 'for sigma {:.4f}, non-basis overlaps have min {:.3f}, mean {:.3f}, median {:.3f}, max {:.3f}'
summary_values = (dqm.sigma, np.min(overlaps), np.mean(overlaps), np.median(overlaps), np.max(overlaps))
print(summary_template.format(*summary_values))

# histogram of the overlaps (only possible when matplotlib was imported successfully)
if not HAVE_PLT:
    print('ooops -- need a plotting package...')
else:
    plt.hist(overlaps, bins=50)
    plt.show()

In [None]:
### set mass -- TODO: this step still needs to be fixed/explained

# use the default mass supplied by the dqm instance
# NOTE(review): judging by the method name, the default depends on the number of dimensions in use -- confirm
dqm.mass = dqm.default_mass_for_num_dims()


In [None]:
# build the operators on the dqm instance
# NOTE(review): presumably these depend on the basis, sigma and mass chosen above -- confirm in the user guide
dqm.build_operators()

In [None]:
# build frames automatically
# NOTE(review): the meaning of the argument (100) is not documented in this notebook -- confirm in the user guide
dqm.build_frames_auto(100)

In [None]:
# display the shape of the frames array (it is indexed with 3 indices elsewhere in this notebook;
# presumably rows x dimensions x frames -- confirm)
dqm.frames.shape

In [None]:
# display the instance's 'mean_row_distance' attribute
# NOTE(review): how and when this value is computed is not shown in this notebook -- confirm
dqm.mean_row_distance

In [None]:
#fig = plot_frames_ipv(dqm.frames, size=2, color=cluster_colors)

In [None]:
### show clean separation of 4 clusters

In [None]:
### show clean separation of 2 superclusters with higher sigma

In [None]:
###### THIS IS OUTDATED...

# write three attributes of the dqm instance (simt, xops, exph) to the file 'operators.npz'
# (relative path, so it lands in the current working directory) as an uncompressed NumPy archive
# NOTE(review): presumably these are the operators created by build_operators() -- confirm
print('saving operators to disk...')
np.savez('operators.npz', simt=dqm.simt, xops=dqm.xops, exph=dqm.exph)