In [2]:
import numpy as np
from numpy.linalg import svd, matrix_rank
import pandas as pd
import matplotlib.pyplot as plt
from IPython import get_ipython
from util import (
    svdcomp,
    nextplot,
    plot_matrix,
    plot_xy,
    plot_cov,
    match_categories,
)  # see util.py
from sklearn.cluster import KMeans

%matplotlib notebook

# 1 Intuition on SVD

In [5]:
# Six small example matrices used to build intuition about rank and the SVD.
# The rank noted for each one is the number of linearly independent rows.

# rank 1: every non-zero row is the same vector [1, 1, 1, 0, 0]
M1 = np.array([
    [1, 1, 1, 0, 0],
    [1, 1, 1, 0, 0],
    [1, 1, 1, 0, 0],
    [0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0],
])

# rank 1: every non-zero row is the same vector [0, 2, 1, 2, 0]
M2 = np.array([
    [0, 0, 0, 0, 0],
    [0, 2, 1, 2, 0],
    [0, 2, 1, 2, 0],
    [0, 2, 1, 2, 0],
    [0, 0, 0, 0, 0],
])

# rank 1: a non-square (5 x 4) matrix whose non-zero rows are all [0, 1, 1, 1]
M3 = np.array([
    [0, 0, 0, 0],
    [0, 1, 1, 1],
    [0, 1, 1, 1],
    [0, 1, 1, 1],
    [0, 1, 1, 1],
])

# rank 2: two distinct rows whose non-zero entries do not overlap
M4 = np.array([
    [1, 1, 1, 0, 0],
    [1, 1, 1, 0, 0],
    [1, 1, 1, 0, 0],
    [0, 0, 0, 1, 1],
    [0, 0, 0, 1, 1],
])

# rank 3: three distinct rows, and none is a combination of the other two
M5 = np.array([
    [1, 1, 1, 0, 0],
    [1, 1, 1, 0, 0],
    [1, 1, 1, 1, 1],
    [0, 0, 1, 1, 1],
    [0, 0, 1, 1, 1],
])

# rank 2: the all-ones row plus the single row with a 0 in the centre
M6 = np.array([
    [1, 1, 1, 1, 1],
    [1, 1, 1, 1, 1],
    [1, 1, 0, 1, 1],
    [1, 1, 1, 1, 1],
    [1, 1, 1, 1, 1],
])

## 1a

In [14]:
# YOUR PART
# Numerical check of the rank noted next to the definition of M6.
np.linalg.matrix_rank(M6)

2

## 1b

In [16]:
# YOUR PART
# Decompose every example matrix with the same call; each svd_Mi holds whatever
# svd returns for Mi (the U, s, Vt factors).
svd_M1, svd_M2, svd_M3, svd_M4, svd_M5, svd_M6 = (
    svd(matrix) for matrix in (M1, M2, M3, M4, M5, M6)
)

## 1c

In [None]:
# You can use the functions svdcomp and plot_matrix from util.py
# YOUR PART

## 1d

In [None]:
# Another method to compute the rank is matrix_rank.
# YOUR PART

# 2 The SVD on Weather Data

In [17]:
# Read the climate table and the coordinate table from the data/ directory,
# then keep the longitude and latitude columns as separate Series for plotting.
climate = pd.read_csv("data/worldclim.csv")
coord = pd.read_csv("data/worldclim_coordinates.csv")
lon, lat = coord["lon"], coord["lat"]

In [18]:
# Plot the coordinates.
# plot_xy is a helper imported from util.py; here it receives the longitude
# Series first and the latitude Series second (presumably x and y of a scatter
# plot -- confirm against util.py).
plot_xy(lon, lat)

<IPython.core.display.Javascript object>

## 2a

In [21]:
# YOUR PART
# Normalize the data to z-scores: subtract the column mean from each column and
# divide by the column standard deviation. Store the result in X.
#
# The axis is spelled out on purpose. np.mean(DataFrame) / np.std(DataFrame)
# forward axis=None to pandas, and pandas >= 2.0 treats DataFrame.mean(axis=None)
# as a reduction over BOTH axes (one scalar for the whole table), which would
# silently stop normalizing per column. ddof=0 keeps the population standard
# deviation that np.std used.
X = (climate - climate.mean(axis=0)) / climate.std(axis=0, ddof=0)

In [22]:
# Plot one histogram per attribute (column) of X.
# nextplot() comes from util.py and presumably prepares a fresh figure -- confirm there.
nextplot()
hist_axes = plt.gca()
X.hist(ax=hist_axes)

<IPython.core.display.Javascript object>

  X.hist(ax=plt.gca())


array([[<AxesSubplot:title={'center':'min1'}>,
        <AxesSubplot:title={'center':'min2'}>,
        <AxesSubplot:title={'center':'min3'}>,
        <AxesSubplot:title={'center':'min4'}>,
        <AxesSubplot:title={'center':'min5'}>,
        <AxesSubplot:title={'center':'min6'}>,
        <AxesSubplot:title={'center':'min7'}>],
       [<AxesSubplot:title={'center':'min8'}>,
        <AxesSubplot:title={'center':'min9'}>,
        <AxesSubplot:title={'center':'min10'}>,
        <AxesSubplot:title={'center':'min11'}>,
        <AxesSubplot:title={'center':'min12'}>,
        <AxesSubplot:title={'center':'max1'}>,
        <AxesSubplot:title={'center':'max2'}>],
       [<AxesSubplot:title={'center':'max3'}>,
        <AxesSubplot:title={'center':'max4'}>,
        <AxesSubplot:title={'center':'max5'}>,
        <AxesSubplot:title={'center':'max6'}>,
        <AxesSubplot:title={'center':'max7'}>,
        <AxesSubplot:title={'center':'max8'}>,
        <AxesSubplot:title={'center':'max9'}>],
       

## 2b

In [33]:
# Compute the SVD of the normalized climate data and store it in variables U,s,Vt. What
# is the rank of the data?
# YOUR PART
# NOTE(review): svd runs with NumPy's defaults here, i.e. the full decomposition, so U
# has as many columns as X has rows. Pass full_matrices=False if only the leading
# singular vectors are needed.
U, s, Vt = np.linalg.svd(X)
rank_X = np.linalg.matrix_rank(X)
print(rank_X)

48


## 2c

In [31]:
# Example plot: draw the locations and pass the first column of U (the first
# left singular vector) as the third argument of plot_xy -- presumably used as
# the per-point color; confirm against util.py.
first_left_vector = U[:, 0]
plot_xy(lon, lat, first_left_vector)

<IPython.core.display.Javascript object>

In [None]:
# For interpretation, it may also help to look at the other component matrices and
# perhaps use other plot functions (e.g., plot_matrix).
# YOUR PART

## 2d

In [None]:
# Example: plot the first two columns of U against each other and pass the
# mean-centered latitude as the third argument of plot_xy (presumably the
# per-point color; confirm against util.py).
lat_centered = lat - lat.mean()
plot_xy(U[:, 0], U[:, 1], lat_centered)

## 2e

In [None]:
# 2e(i) Guttman-Kaiser
# YOUR PART

In [None]:
# 2e(ii) 90% squared Frobenius norm
# YOUR PART

In [None]:
# 2e(iv) entropy
# YOUR PART

In [None]:
# 2e(v) random flips
# Random sign matrix: np.random.choice([-1,1], X.shape)
# YOUR PART

In [None]:
# 2e What, if any, of these would be your choice?
# YOUR PART

## 2f

In [None]:
# Empty plot to be filled in: one line per choice of k, showing the RMSE between
# the original X and its reconstruction from the size-k SVD of noisy versions of X.
# YOUR PART
nextplot()
rmse_axes = plt.gca()
rmse_axes.plot()
rmse_axes.set_xlabel(r"Noise level ($\epsilon$)")
rmse_axes.set_ylabel("Reconstruction RMSE vs. original data")

# 3 SVD and k-means

In [None]:
# Cluster the normalized climate data into 5 clusters using k-means and store
# the vector giving the cluster labels for each location.
#
# random_state pins the centroid initialisation, so the labels (and every plot
# or comparison built on them) are identical after Restart & Run All.
# n_init is stated explicitly because scikit-learn's default for it changed
# between releases; 10 restarts is the long-standing former default.
X_clusters = KMeans(n_clusters=5, n_init=10, random_state=0).fit(X).labels_

## 3a

In [None]:
# Plot the results to the map: use the cluster labels to give the color to each
# point.
# X_clusters (the k-means labels fitted on X) is passed as the third positional
# argument of the util.py helper plot_xy.
plot_xy(lon, lat, X_clusters)

## 3b

In [None]:
# YOUR PART HERE

## 3c

In [None]:
# Compute the PCA scores, store in Z (of shape N x k)
k = 2
# The scores are the coordinates of every row of X along the first k right
# singular vectors: X @ Vt[:k].T. Because X = U diag(s) Vt, this equals the
# first k columns of U scaled by their singular values, which avoids another
# matrix product. U and s come from the SVD of X computed in section 2b.
Z = U[:, :k] * s[:k]

In [None]:
# cluster and visualize
# Same k-means settings as for X_clusters: a fixed random_state makes the labels
# reproducible and n_init is explicit because scikit-learn's default changed
# between releases.
Z_clusters_raw = KMeans(n_clusters=5, n_init=10, random_state=0).fit(Z).labels_
# match clusters as well as possible (try without: plot Z_clusters_raw instead).
# k-means numbers its clusters arbitrarily; match_categories (util.py) relabels
# the second labeling to agree with the first.
Z_clusters = match_categories(X_clusters, Z_clusters_raw)
# Two maps in one figure: clusters found on the full data (left) next to the
# clusters found on the k-dimensional PCA scores (right).
nextplot()
axs = plt.gcf().subplots(1, 2)
plot_xy(lon, lat, X_clusters, axis=axs[0])
plot_xy(lon, lat, Z_clusters, axis=axs[1])