# Principal Component Analysis

In [78]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [79]:
# Implement PCA for pandas DataFrame
def pca(X, n_components):
    """
    :param X: pandas DataFrame
    :param n_components: int
    :return: pandas DataFrame
    """
    # Center the data
    X = X - X.mean()
    # Compute the covariance matrix
    cov_mat = np.cov(X.T)
    # Compute eigenvalues and eigenvectors
    eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)
    # Sort eigenvalues and eigenvectors
    eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i]) for i in range(len(eigen_vals))]
    eigen_pairs.sort(key=lambda k: k[0], reverse=True)
    # Select the first n_components eigenvectors
    w = np.hstack((eigen_pairs[i][1][:, np.newaxis] for i in range(n_components)))
    # Transform the data
    X_pca = X.dot(w)
    return X_pca

In [80]:
# Plot PCA results
def plot_pca(X, X_pca):
    """
    :param X: pandas DataFrame
    :param X_pca: pandas DataFrame
    :return: None
    """
    # Plot the original data
    plt.figure(figsize=(8, 8))
    plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c='blue', marker='o', label='True')
    plt.xlabel('PC 1')
    plt.ylabel('PC 2')
    plt.legend(loc='lower left')
    plt.show()
    # Plot the PCA results
    plt.figure(figsize=(8, 8))
    plt.scatter(X_pca.iloc[:, 0], X_pca.iloc[:, 1], c='red', marker='o', label='PCA')
    plt.xlabel('PC 1')
    plt.ylabel('PC 2')
    plt.legend(loc='lower left')
    plt.show()
    return None

## Real-World Usage

In [81]:
# Load data
X = pd.read_csv('data/spotify.csv')
X_features = X[['track_popularity', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']]
X_features

Unnamed: 0,track_popularity,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,66,-2.634,0.0583,0.102000,0.000000,0.0653,0.5180,122.036,194754
1,67,-4.969,0.0373,0.072400,0.004210,0.3570,0.6930,99.972,162600
2,70,-3.432,0.0742,0.079400,0.000023,0.1100,0.6130,124.008,176616
3,60,-3.778,0.1020,0.028700,0.000009,0.2040,0.2770,121.956,169093
4,69,-4.672,0.0359,0.080300,0.000000,0.0833,0.7250,123.976,189052
...,...,...,...,...,...,...,...,...,...
32828,42,-1.814,0.0936,0.076600,0.000000,0.0668,0.2100,128.170,204375
32829,20,-4.462,0.0420,0.001710,0.004270,0.3750,0.4000,128.041,353120
32830,14,-4.899,0.0481,0.108000,0.000001,0.1500,0.4360,127.989,210112
32831,15,-3.361,0.1090,0.007920,0.127000,0.3430,0.3080,128.008,367432


In [84]:
# Perform PCA
X_pca = pca(X_features, 3)
print(X_pca.shape)

(32833, 3)


  w = np.hstack((eigen_pairs[i][1][:, np.newaxis] for i in range(n_components)))


In [1]:
# Plot PCA results
print(X_features.shape)
print(X_pca.shape)
plot_pca(X_features, X_pca)


NameError: name 'X_features' is not defined