In [271]:
print("hello world")

hello world


In [272]:
import pandas as pd

# Specify the path to the u.data file
file_path = '/Users/tarunvallabhaneni/MPCS Classes/Math of ML/Final Project/data/ml-100k/u.data'

# Load the dataset into a DataFrame
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv(file_path, sep='\t', names=column_names)
ratings.drop('timestamp', axis=1, inplace=True)

# Display the first few rows
print(ratings.head())


   user_id  item_id  rating
0      196      242       3
1      186      302       3
2       22      377       1
3      244       51       2
4      166      346       1


In [273]:

# convert to matrix R
user_item_matrix = ratings.pivot(index='user_id', columns='item_id', values='rating')

user_item_matrix = user_item_matrix.fillna(0)

# print info
print(user_item_matrix.info())


total_elements = user_item_matrix.shape[0] * user_item_matrix.shape[1]
non_zero_elems = (user_item_matrix != 0).sum().sum()
sparsity = 1 - (non_zero_elems / total_elements)
print(f"\nSparsity of the matrix is {sparsity:.2%}")



<class 'pandas.core.frame.DataFrame'>
Index: 943 entries, 1 to 943
Columns: 1682 entries, 1 to 1682
dtypes: float64(1682)
memory usage: 12.1 MB
None

Sparsity of the matrix is 93.70%


In [274]:
import numpy as np
matrix_np = user_item_matrix.to_numpy()
print(matrix_np)

nonzero_indices = matrix_np.nonzero()
nonzero_count = np.count_nonzero(matrix_np)
print(nonzero_indices)
print(nonzero_count)


[[5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]
(array([  0,   0,   0, ..., 942, 942, 942]), array([   0,    1,    2, ..., 1187, 1227, 1329]))
100000


# Training Class

In [None]:
from tqdm import tqdm
class GD:
    def __init__(self, R, k, learning_rate=0.001, reg_param=0.01, epochs=10, batch_size=100):
        self.R = R
        self.num_users, self.num_items = R.shape
        self.k = k
        self.learning_rate = learning_rate
        self.reg_param = reg_param  
        self.num_epochs = epochs
        self.rows, self.cols = self.R.nonzero()
        self.batch_size = batch_size
        
        # Calculate rating mean and std for normalization
        self.rating_mean = np.mean(R[R.nonzero()])
        self.rating_std = np.std(R[R.nonzero()])
        
        # Initialize 
        self.U = np.random.normal(0, 0.1, size=(self.num_users, self.k))
        self.V = np.random.normal(0, 0.1, size=(self.num_items, self.k))
        
    def normalize_rating(self, rating):
        """Normalize rating to zero mean and unit variance"""
        return (rating - self.rating_mean) / self.rating_std
        
    def denormalize_rating(self, normalized_rating):
        """Convert normalized rating back to original scale"""
        return (normalized_rating * self.rating_std) + self.rating_mean


    
    def calculate_rmse(self, R):
        """Calculate RMSE"""
        rows, cols = R.nonzero()
        if len(rows) == 0:
            return float('inf')
            
        r_true = R[rows, cols]
        r_true_norm = self.normalize_rating(r_true)
        r_pred = np.sum(self.U[rows] * self.V[cols], axis=1)
        
        errors = r_true_norm - r_pred
        mse = np.mean(errors ** 2)
        rmse = np.sqrt(mse)
        return mse, rmse

        
    def train(self, val_matrix=None):
        indices = list(zip(self.rows, self.cols))
        
        for epoch in range(self.num_epochs):
            np.random.shuffle(indices)
            
            for batch_start in range(0, len(indices), self.batch_size):
                batch_indices = indices[batch_start:batch_start+self.batch_size]
                
                for i, j in batch_indices:
                    r_true = self.R[i, j]
                    r_true_norm = self.normalize_rating(r_true)
                    r_pred = np.dot(self.U[i], self.V[j])  
                    error = r_true_norm - r_pred
                    
                    # Gradient calculation
                    u_grad = -2 * error * self.V[j] + 2 * self.reg_param * self.U[i]
                    v_grad = -2 * error * self.U[i] + 2 * self.reg_param * self.V[j]
                    
                    
                    # Update parameters
                    self.U[i] -= self.learning_rate * u_grad
                    self.V[j] -= self.learning_rate * v_grad
            
            # Calculate losses
            _, rmse_train = self.calculate_rmse(self.R)
            
            
            # Print progress
            # if val_matrix is not None:
            #     _, rmse_val = self.calculate_rmse(val_matrix)
            #     print(f"Epoch {epoch+1}/{self.num_epochs}, train RMSE: {rmse_train:.4f}, val RMSE: {rmse_val:.4f}")
            # else:
            #     print(f"Epoch {epoch+1}/{self.num_epochs}, train RMSE: {rmse_train:.4f}")
            

# K Fold Cross Validation

In [276]:
from sklearn.model_selection import train_test_split

def prepare_cross_validation(R, n_folds=5, test_size=0.2, random_state=42):
    # Get observed (i, j) pairs from the matrix
    rows, cols = R.nonzero()
    data = list(zip(rows, cols))
    
    # Split data into train/val and test sets
    train_val_data, test_data = train_test_split(data, test_size=test_size, random_state=random_state)
    
    # Create k folds
    def k_folds(data, k):
        np.random.shuffle(data)
        fold_size = len(data) // k
        leftover = len(data) % k
        
        folds = []
        start = 0
        for i in range(k):
            extra = 1 if i < leftover else 0
            end = start + fold_size + extra
            folds.append(data[start:end])
            start = end
        return folds
    
    folds = k_folds(train_val_data, n_folds)
    
    # Create test matrix
    test_matrix = np.zeros(R.shape)
    for i, j in test_data:
        test_matrix[i, j] = R[i, j]
        
    return folds, test_matrix

def train_with_cross_validation(R, folds, model_params):
    all_val_rmse = []
    
    for fold_idx, fold in enumerate(folds):
        # print(f"\nTraining on fold {fold_idx + 1}")
        
        # Prepare validation data
        val_data = fold
        train_data = [point for idx, f in enumerate(folds) if idx != fold_idx for point in f]
        
        # Create validation matrix
        val_matrix = np.zeros(R.shape)
        for i, j in val_data:
            val_matrix[i, j] = R[i, j]
            
        # Create training matrix
        train_matrix = np.zeros(R.shape)
        for i, j in train_data:
            train_matrix[i, j] = R[i, j]
            
        # Train model
        model = GD(train_matrix, **model_params)
        model.train(val_matrix)
        
        # Store validation RMSE
        _, val_rmse = model.calculate_rmse(val_matrix)
        all_val_rmse.append(val_rmse)
        
    # Compute average RMSE across all folds
    average_val_rmse = np.mean(all_val_rmse)
    # print(f"\nAverage Validation RMSE across all folds: {average_val_rmse:.4f}")
    
    return average_val_rmse, model


In [277]:
folds, test_matrix = prepare_cross_validation(matrix_np, n_folds=5)
model_params = {
    'k': 30,
    'learning_rate': 0.05,  
    'reg_param': 0.1,       
    'epochs': 20,
    'batch_size': 100
}
mean_val_loss, final_model = train_with_cross_validation(matrix_np, folds, model_params)
print(f"\nMean validation loss across folds: {mean_val_loss:.4f}")





Mean validation loss across folds: 0.8538


# Test Error

In [278]:
test_mse, test_rmse = final_model.calculate_rmse(test_matrix)
print(f"Test MSE: {test_mse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")

Test MSE: 0.7418
Test RMSE: 0.8613


# Hyperparameter Tuning

In [279]:
import itertools
def hyperparameter_search(R, folds, hyperparams):
    best_val_loss = float('inf')
    best_hyperparams = None
    
    for param in hyperparams:
        model_params = {
            'k': param[0],
            'learning_rate': param[1],
            'reg_param': param[2],
            'epochs': param[3],
            'batch_size': 100
        }

        print(f"\nTraining with hyperparameter: {param}")
        
        
        val_loss, _ = train_with_cross_validation(R, folds, model_params)
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_hyperparams = param
            
    return best_hyperparams, best_val_loss


k_values = [10, 20, 30]
learning_rates = [0.001, 0.005, 0.01]
reg_params = [0.01, 0.05, 0.1]
epochs = [10, 20, 30]

hyperparam_combinations = list(itertools.product(k_values, learning_rates, reg_params, epochs))
len(hyperparam_combinations)

81

In [280]:
best_hyperparams, best_val_loss = hyperparameter_search(matrix_np, folds, hyperparam_combinations)


Training with hyperparameter: (10, 0.001, 0.01, 10)

Training with hyperparameter: (10, 0.001, 0.01, 20)

Training with hyperparameter: (10, 0.001, 0.01, 30)

Training with hyperparameter: (10, 0.001, 0.05, 10)

Training with hyperparameter: (10, 0.001, 0.05, 20)

Training with hyperparameter: (10, 0.001, 0.05, 30)

Training with hyperparameter: (10, 0.001, 0.1, 10)

Training with hyperparameter: (10, 0.001, 0.1, 20)

Training with hyperparameter: (10, 0.001, 0.1, 30)

Training with hyperparameter: (10, 0.005, 0.01, 10)

Training with hyperparameter: (10, 0.005, 0.01, 20)

Training with hyperparameter: (10, 0.005, 0.01, 30)

Training with hyperparameter: (10, 0.005, 0.05, 10)

Training with hyperparameter: (10, 0.005, 0.05, 20)

Training with hyperparameter: (10, 0.005, 0.05, 30)

Training with hyperparameter: (10, 0.005, 0.1, 10)

Training with hyperparameter: (10, 0.005, 0.1, 20)

Training with hyperparameter: (10, 0.005, 0.1, 30)

Training with hyperparameter: (10, 0.01, 0.01, 10)


In [281]:
best_hyperparams, best_val_loss

((20, 0.01, 0.1, 30), 0.8394406083907624)