# Manifold Learning

<h3><span style="color:blue">
NOTE:
In this code, all inputs and outputs are <i>column vectors</i>.<br/>
</span></h3>

In [None]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

## Part 1.  Multidimensional Scaling

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

# Helper functions & Datasets
from Code.optimization import gradient_descent
from Code.dataset import ten_city, synthetic_spiral

## * Classical MDS

### <span style="color:red">============= Assignment 1 =============</span>

In [None]:
def cmds(X, n_dim, input_type='raw'):
    """
    Classical(linear) multidimensional scaling (MDS)
    
    Parameters
    ----------
    X: (d, n) array or (n,n) array
        input data. The data are placed in column-major order. 
        That is, samples are placed in the matrix (X) as column vectors
        d: dimension of points
        n: number of points
        
    n_dim: dimension of target space
    
    input_type: it indicates whether data are raw or distance
        - raw: raw data. (n,d) array. 
        - distance: precomputed distances between the data. (n,n) array.
    Returns
    -------
    Y: (n_dim, n) array. projected embeddings.
    evals: (n_dim) eigen values
    evecs: corresponding eigen vectors in column vectors
    """

    if input_type == 'distance':
        D = X
    elif input_type == 'raw':
        Xt = X.T
        D = euclidean_distances(Xt,Xt)
        
    ???

    return Y, evals, evecs

# Sanity check for cmds: four 3-D points stored as the columns of a (3, 4) array.
test_data = np.array([[0,3,2], [1,3,5], [-6,-3,5], [1,1,1]]).T

n_dim = 3
Y_test, evals, evecs = cmds(X=test_data, n_dim=n_dim, input_type='raw')
print('%d-largest eigen values:'% n_dim)
print(evals)
# The cmds docstring says evecs holds eigenvectors as columns; transpose to print one per row.
print('Corresponding eigen vectors:\n', evecs.T)
print('Embedded coordinates:')
print(Y_test)

In [None]:
# ten city dataset
flying_dist, city = ten_city()
# NOTE(review): adding the transpose presumably expands a triangular distance
# table into a full symmetric matrix -- confirm against ten_city().
flying_dist = flying_dist + flying_dist.T

In [None]:
???

### <span style="color:red">=========== End of Assignment 1 ===========</span>

## * Stress-based MDS

### <span style="color:red">============= Assignment 2 =============</span>

In [None]:
def loss_sammon(D, y):
    """
    Loss function (stress) - Sammon

    Parameters
    ----------
    D: (n,n) array. Distance matrix in the original space.
        This is a symmetric matrix.
    y: (d,n) array
        d is the dimensionality of target space.
        n is the number of points.

    Returns
    -------
    stress: scalar. Sammon stress.
    """
    ???
    
    return stress

def grad_sammon(D, y):
    """
    Gradient function (first derivative) - Sammonn_dim
    
    Parameters
    ----------
    D: (n,n) array. distance matrix in original space
        This is a symetric matrix
    y: (d,n) array
        d is the dimensionality of target space.
        n is the number of points.
        
    Returns
    -------
    g: (k,n) array.
        Gradient matrix. 
        k is the dimensionality of target space.
        n is the number of points.
    """
    ???
    
    return 

# The following code is used to test the student's implementation during marking. Don't change!
test_data = np.array([[0,3,2], [1,3,5], [-6,-3,5], [1,1,1]]).T
# Pairwise distances between the four original points (one point per row after the transpose).
D = euclidean_distances(test_data.T, test_data.T)
# Fixed (2, 4) target coordinates, one point per column.
y = np.array([[1,0],[1,1],[0,1],[0,0]]).T
loss = loss_sammon(D, y)
gradient = grad_sammon(D, y)
print(loss)
print(gradient)

### <span style="color:red">=========== End of Assignment 2 ===========</span>

The following function is provided for students to use in Assignment 3.

In [None]:
def stress_based_mds(x, n_dim, loss_f, grad_f, input_type='raw', 
                     lr=0.5, tol=1e-9, max_iter=6000):
    """
    Stress-based MDS solved by gradient descent.

    Parameters
    ----------
    x: (d,n) array or (n,n) array
        Raw data as a (d,n) array (samples as columns), or a precomputed
        (n,n) distance matrix.
        n is the number of points
        d is the dimensionality of original space
    n_dim: dimensionality of target space
    loss_f: loss function, forwarded to gradient_descent
    grad_f: gradient function, forwarded to gradient_descent
    input_type: 'raw' or 'distance'
    lr, tol, max_iter: forwarded unchanged to gradient_descent
        (presumably learning rate, stopping tolerance and iteration cap --
        confirm against Code.optimization.gradient_descent)

    Returns
    -------
    y: embedded coordinates returned by gradient_descent; the initial
        guess has shape (n_dim,n)
    losses: history of stress returned by gradient_descent

    Raises
    ------
    ValueError: if input_type is neither 'raw' nor 'distance'.

    Notes
    -----
    The global NumPy random state is re-seeded with 10 on every call so the
    random initial embedding is reproducible.
    """
    # Build the distance matrix in the original space.
    if input_type == 'distance':
        D = x
    elif input_type == 'raw':
        samples_as_rows = x.T
        D = euclidean_distances(samples_as_rows, samples_as_rows)
    else:
        raise ValueError('inappropriate input_type')

    # Both layouts carry the number of points on the second axis.
    n_points = x.shape[1]

    # Reproducible random starting embedding, one column per point.
    np.random.seed(10)
    initial_embedding = np.random.normal(0.0, 1.0, (n_dim, n_points))

    # Minimise the stress starting from the random embedding.
    embedding, stress_history = gradient_descent(
        D, initial_embedding, loss_f, grad_f, lr, tol, max_iter)

    return embedding, stress_history

### <span style="color:red">=========== Assignment 3 ===========</span>

In [None]:
# Load the synthetic spiral and draw it in 3-D as black markers joined by a black line.
X_spiral = synthetic_spiral()

fig_spiral = plt.figure()
fig_spiral.suptitle('Spiral Data')

# Rows 0, 1 and 2 of X_spiral supply the x, y and z coordinates.
ax = fig_spiral.add_subplot(projection='3d')
ax.scatter(*X_spiral[:3], c='k')
ax.plot(*X_spiral[:3], c='k');

### <span style="color:red">=========== End of Assignment 3 ===========</span>

## Part 2. Isometric Feature Mapping (ISOMAP)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
from Code.helpers import VIS_Shortest_path_2d, ImageViewer

### <span style="color:red">============= Assignment 4 =============</span>

In [None]:
def fixed_radius_distance(X, epsilon):
    """
    Calculate epsilon-NN (fixed-radius neighbours)

    Parameters
    ----------
    X: (d,n) array, where n is the number of points and d is its dimension
    epsilon: criterion for selecting neighbours
        A point is selected as a neighbour if its distance < epsilon

    Returns
    -------
    nbrs_dist: (n,k*) array
        Distances to the neighbours.
        k* varies from row to row according to the number of neighbours.
        Each row corresponds to a specific point (row-major order).
    nbrs_idx: (n,k*) array
        Indices of the neighbours.
        k* varies from row to row according to the number of neighbours.
        Each row corresponds to a specific point (row-major order).
    """
    ???

    return nbrs_dist, nbrs_idx

def nearest_neighbor_distance(X, n_neighbors):
    """
    Calculate K-NN

    Parameters
    ----------
    X: (d,n) array, where n is the number of points and d is its dimension
    n_neighbors: number of neighbours
        Select the n_neighbors (k) nearest neighbours

    Returns
    -------
    nbrs_dist: (n,k) array
        Distances to the neighbours, with k = n_neighbors.
        Each row corresponds to a specific point (row-major order).
    nbrs_idx: (n,k) array
        Indices of the neighbours, with k = n_neighbors.
        Each row corresponds to a specific point (row-major order).
    """
    ???

    return nbrs_dist, nbrs_idx

# The following code is used to test the student's implementation during marking. Don't change!
test_data = np.array([[0,3,2], [1,3,5], [-6,-3,5], [1,1,1]]).T
# Fixed-radius neighbours with epsilon = 9.1.
dist, idx = fixed_radius_distance(test_data, 9.1)
print(dist)
print(idx)

# k-nearest neighbours with k = 2.
dist, idx = nearest_neighbor_distance(test_data, 2)
print(dist)
print(idx)

In [None]:
def isomap(x, n_components, n_neighbors=None, epsilon=None, dist_func=None, cmds_func=None):
    """
    ISOMAP

    Parameters
    ----------
    x: (d,n) array, where n is the number of points and d is its dimensionality.
    n_components: dimensionality of target space
    n_neighbors: the number of neighbours
    epsilon: fixed radius
        At least one of n_neighbors and epsilon must be given.
    dist_func: function for calculating distance matrix
    cmds_func: classical MDS function applied in Step 3. Must not be None.

    Returns
    -------
    Y: embedded coordinates from cmds in Step 3
        (presumably an (n_components,n) array -- confirm against cmds_func).
    dist_mat: (n,n) array. Distance matrix made in Step 1.
    predecessors: predecessors from "shortest_path" function in Step 2.
    """
    # Argument checks: a cMDS function and a neighbourhood criterion are mandatory.
    assert(cmds_func is not None)
    assert((epsilon is not None) or (n_neighbors is not None))

    # Points are stored as columns, so the second axis counts them.
    n_points = x.shape[1]

    # Step 1.
    # find nearest neighbors to each sample with the given condition
    ???
    
    # Step 2.
    # Find shortest paths
    from scipy.sparse import csr_matrix
    from scipy.sparse.csgraph import shortest_path
    ???

    # Step 3.
    # Apply cMDS
    ???

    return Y, dist_mat, predecessors

# The following code is used to test the student's implementation during marking. Don't change!
test_data = np.array([[0,3,2], [1,3,5], [-6,-3,5], [1,1,1]]).T
n_components = 2
n_neighbors = 2
# ISOMAP on the four test points with a 2-nearest-neighbour graph and the cmds defined in this notebook.
Y_nn, dist_nn, predecessors_nn = isomap(test_data, 
                                            n_components, 
                                            n_neighbors=n_neighbors, 
                                            dist_func=nearest_neighbor_distance, 
                                            cmds_func=cmds)
print(Y_nn)
print(dist_nn)
print(predecessors_nn)

### Example with Swiss Roll

In [None]:
from sklearn.datasets import make_swiss_roll

n_points = 1000
# NOTE(review): no random_state is passed here, so the sample differs between runs
# (the S-curve cell later in this notebook passes random_state=0).
data_s_roll, color = make_swiss_roll(n_points)
# make_swiss_roll returns one sample per row; transpose to this notebook's
# column-vector layout.
data_s_roll = data_s_roll.T

In [None]:
# 3-D scatter of the Swiss roll, coloured by the values returned alongside the samples.
fig_swiss_roll = plt.figure()
fig_swiss_roll.suptitle("Swiss roll dataset")

# Rows 0, 1 and 2 of data_s_roll supply the x, y and z coordinates.
ax = fig_swiss_roll.add_subplot(projection='3d')
ax.scatter(*data_s_roll[:3], c=color, cmap=plt.cm.Spectral)
# Fixed camera: elevation 4 degrees, azimuth -72 degrees.
ax.view_init(elev=4, azim=-72);

### <span style="color:red">=========== End of Assignment 4 ===========</span>
    
    
The following code is provided for use in Assignment 5.
    

### Example with face data

In [None]:
from Code.dataset import face_tenenbaum

# Load the face dataset and report its array shape.
data_face = face_tenenbaum()
print(data_face.shape)
# NOTE(review): presumably each face is a 64x64 image -- confirm against face_tenenbaum().
image_size = [64,64]

In [None]:
# Embed the face data into 2-D with ISOMAP over a 6-nearest-neighbour graph.
n_components = 2
n_neighbors = 6
Y_face, dist_face, predecessors_face = isomap(data_face, n_components, 
                                              n_neighbors=n_neighbors, 
                                              dist_func=nearest_neighbor_distance, 
                                              cmds_func=cmds)

### <span style="color:red">============= Assignment 5 =============</span>

#### Shortest path of specific two points

In [None]:
def get_shortest_path(predecessors, start_idx, end_idx):
    """
    Rebuild the node sequence of a shortest path from a predecessor matrix.

    Parameters
    ----------
    predecessors: 2-D array indexed as [start, node]; each entry is the node
        visited just before `node` on the path from `start`, or -9999 when
        there is no predecessor.
    start_idx: index of the source point (selects the row that is read).
    end_idx: index of the destination point.

    Returns
    -------
    path: list of node indices ending at end_idx. The walk stops at the
        first node whose predecessor entry is -9999, so the list is just
        [end_idx] when end_idx has no predecessor from start_idx.
    """
    backwards = [end_idx]
    node = end_idx
    # Walk backwards from the destination until the sentinel is reached.
    while True:
        previous = predecessors[start_idx, node]
        if previous == -9999:
            break
        backwards.append(previous)
        node = previous
    backwards.reverse()
    return backwards

#### Path 1

In [None]:
# Path 1: hand the ISOMAP outputs for the face data to the shortest-path
# visualiser, drawing into a fresh figure.
fig_face_index_h = plt.figure()
vis_face_index_h = VIS_Shortest_path_2d(Y_face, dist_face, predecessors_face, 
                                        fig_face_index_h)

#### Path 2

In [None]:
# Path 2: a second, independent visualiser on its own figure, fed the same
# ISOMAP outputs.
fig_face_index_v = plt.figure()
vis_face_index_v = VIS_Shortest_path_2d(Y_face, dist_face, predecessors_face, 
                                        fig_face_index_v)

### <span style="color:red">=========== End of Assignment 5 ===========</span>



## Part 3. Locally Linear Embedding (LLE) 

In [None]:
from scipy.stats import pearsonr, spearmanr
from Code.helpers import VIS_Bars
from Code.lle import lle

### <span style="color:red">============ Assignment 6 =========== </span>

In [None]:
from sklearn.datasets import make_s_curve
n_points = 1000
# random_state=0 makes the generated sample reproducible.
X, angle = make_s_curve(n_points, random_state=0)
# make_s_curve returns one sample per row; transpose to this notebook's
# column-vector layout.
X = X.T

In [None]:
# 3-D scatter of the S-curve, coloured by the per-point value returned by make_s_curve.
fig_s_curve = plt.figure()
fig_s_curve.suptitle('S-Curve')

# Shared colouring keyword arguments for scatter calls.
colorize = {'c': angle, 'cmap': plt.cm.Spectral}
# Rows 0, 1 and 2 of X supply the x, y and z coordinates.
ax = fig_s_curve.add_subplot(projection='3d')
ax.scatter(*X[:3], **colorize)
# Fixed camera: elevation 4 degrees, azimuth -72 degrees.
ax.view_init(elev=4, azim=-72)

### <span style="color:red">============ End of Assignment 6 =========== </span>

### <span style="color:red">============ Assignment 7 =========== </span>

In [None]:
from Code.dataset import bars

data_bar, centers = bars()
# NOTE(review): both arrays are transposed, presumably to this notebook's
# column-vector layout -- confirm against bars().
data_bar = data_bar.T
centers = centers.T
# NOTE(review): presumably each bar image is 40x40 pixels -- confirm against bars().
image_size = [40,40]

In [None]:
def reg_func(C, K):
    """
    Build a scaled identity matrix: 1e-3 * trace(C) * K * I_K.

    Parameters
    ----------
    C: 2-D array whose trace sets the scale.
    K: size of the returned identity matrix; it also multiplies the scale.

    Returns
    -------
    (K,K) array with 1e-3 * trace(C) * K on the diagonal and zeros elsewhere.
    """
    # NOTE(review): presumably used as the regularisation term for lle -- confirm.
    diagonal_value = 1e-3 * np.trace(C) * K
    return diagonal_value * np.eye(K)

### <span style="color:red">============ End of Assignment 7 =========== </span>