# This notebook
* Try three different collaborative filterings: Memory, SVD, NMF
* Speed: SVD - Memory - NMF (fast to slow)
* RMSE: SVD - NMF - Memory (low to high)


* Winner: SVD

## Note
* Parameter tuning (SVD: num_factors)

In [1]:
import pandas as pd
import numpy as np

## Load the data and convert into sparse matrix

In [2]:
def data_to_csr(file_path):
    from scipy.sparse import csr_matrix
    
    # Read pickle
    stu = pd.read_pickle(file_path)
    
    # Num of dictionaries in the file
    n = len(stu)
    
    # Initialize df
    df = pd.DataFrame()
    
    # Loop through dicts and concat
    for i in range(n):
        stu[i]['student_id'] = i
        df = pd.concat([df,stu[i]])
    
    # Fix .0 issues -- Some courses has .0 at the end (e.g. AAS 101.0), remove .0
    df.loc[df['Subject/Catalog'].str.contains('\.') == True, 'Subject/Catalog'] = df.loc[df['Subject/Catalog'].str.contains('\.') == True, 'Subject/Catalog'].str.split('\.').str[0]
    
    # Pivot table to stu - course
    df = pd.pivot_table(df, values = 'Rating', index = 'student_id', columns = 'Subject/Catalog')
    
    # Fillna
    df = df.fillna(0)
    
    # df to sparse matrix
    csr = csr_matrix(df)
    
    # Store course names
    courses = list(df.columns)
    
    return csr, courses

In [3]:
csr, courses = data_to_csr('student_profiles.pickle')

## Memory-based collaborative filtering

In [4]:
import scipy.sparse
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
uc_mat = csr
user_idx = 0

def memory_based(uc_mat, user_idx, top_n):
    # Memory-based collaborative filtering
    
    # Normalize user-course matrix
    norm_uc_mat = uc_mat - uc_mat.mean(axis = 1).reshape(-1,1)
    
    # Cosine similarity of raw matrix
    cossim = cosine_similarity(uc_mat)
    
    # Calculate the numerator of V-hat(aj)
    v_num = np.dot(cossim[user_idx,:], norm_uc_mat)
    
    # Sum of V-hat(aj) numerator
    v_num_sum = v_num.sum(axis = 0)
    
    # V_hat(aj)
    v_hat = v_num_sum/(cossim[user_idx,:].sum())
    
    #Predicted rating of each course
    pred_rating = np.add(v_hat, uc_mat.mean(axis = 1).reshape(-1,1)[user_idx])
    
    # User vector
    user_vec = uc_mat[user_idx].toarray().flatten()
    
    # User's favorite course indices
    user_fav_idx = np.argsort(-user_vec)
    
    # User's favorite courses
    user_fav = [courses[i] for i in user_fav_idx[:top_n]]
    print(f"Top {top_n} favorite courses:")
    print(user_fav)
    
    # Flatten pred_rating
    pred_rating = np.asarray(pred_rating).flatten()
    
    
    # Argsort the rating except the courses taken by the user
    recommend_index = np.argsort(-pred_rating[user_vec == 0])
    
    # Recommend top courses
    top_courses = [courses[i] for i in recommend_index[:top_n]]
    
    print(f"Top {top_n} recommended courses:")
    print(top_courses)

    print()

    # RMSE between true & predicted
    rmse = np.sqrt(np.sum((user_vec[user_vec != 0] - pred_rating[user_vec != 0]) ** 2))
    print(f"RMSE = {rmse}")
    
    return top_courses



In [12]:
%%time
pred = memory_based(csr, 0, 10)

Top 10 favorite courses:
['EECS 370', 'ENSCE 105', 'EECS 443', 'EECS 280', 'EECS 203', 'SPANIS 373', 'ECO 497', 'CHE 211', 'RCAS 202', 'ENGLIS 387']
Top 10 recommended courses:
['MAT 175', 'MAT 205', 'MAT 186', 'MAT 174', 'MAT 176', 'MAT 185', 'EECS 367', 'EECS 280', 'EECS 385', 'EECS 481']

RMSE = 18.227582054884557
CPU times: user 822 ms, sys: 374 ms, total: 1.2 s
Wall time: 1.25 s


## Matrix-based collaborative filtering

In [6]:
from sklearn.decomposition import TruncatedSVD

def svd_based(uc_mat, user_idx, num_factors, top_n = 10, random_state=42):
    # SVD based collaborative filtering
    
    # SVD
    svd = TruncatedSVD(n_components = num_factors, random_state = random_state)
    
    # Transform X
    X_transformed = svd.fit_transform(uc_mat)
    
    # Calculate r
    r = np.matmul(X_transformed, svd.components_)
    
    # Pred rating
    pred_rating = r[user_idx,:]#.reshape(1,-1)
    
    
    # User vector
    user_vec = uc_mat[user_idx].toarray().flatten()
    
    # User's favorite course indices
    user_fav_idx = np.argsort(-user_vec)
    
    # User's favorite courses
    user_fav = [courses[i] for i in user_fav_idx[:top_n]]
    print(f"Top {top_n} favorite courses:")
    print(user_fav)
    
    # Flatten pred_rating
    pred_rating = np.asarray(pred_rating).flatten()
    
    # Argsort the rating except the courses taken by the user
    recommend_index = np.argsort(-pred_rating[user_vec == 0])
    
    # Recommend top courses
    top_courses = [courses[i] for i in recommend_index[:top_n]]
    
    print(f"Top {top_n} recommended courses:")
    print(top_courses)

    print()

    # RMSE between true & predicted
    rmse = np.sqrt(np.sum((user_vec[user_vec != 0] - pred_rating[user_vec != 0]) ** 2))
    print(f"RMSE = {rmse}")
    
    return top_courses



In [16]:
%%time
pred = svd_based(csr, 0, 80, )

Top 10 favorite courses:
['EECS 370', 'ENSCE 105', 'EECS 443', 'EECS 280', 'EECS 203', 'SPANIS 373', 'ECO 497', 'CHE 211', 'RCAS 202', 'ENGLIS 387']
Top 10 recommended courses:
['RELIGIO 231', 'EEC 481', 'EECS 582', 'SA 140', 'EECS 571', 'EECS 489', 'EECS 203', 'HJC 335', 'EECS 492', 'LIN 347']

RMSE = 15.867938151779919
CPU times: user 938 ms, sys: 161 ms, total: 1.1 s
Wall time: 553 ms


## NMF-based collaborative filtering

In [8]:
from sklearn.decomposition import NMF

def nmf_based(uc_mat, user_idx, num_factors, top_n = 10, random_state=42):
    # NMF based collaborative filtering
    
    # NMF
    nmf = NMF(n_components = num_factors, init = 'nndsvd', max_iter = 300, random_state = random_state)
    
    # Transform X
    X_transformed = nmf.fit_transform(uc_mat)
    
    # Calculate r
    r = np.matmul(X_transformed, nmf.components_)
    
    # Predict rating
    pred_rating = r[user_idx,:].reshape(1,-1)
    
    
    # User vector
    user_vec = uc_mat[user_idx].toarray().flatten()
    
    # User's favorite course indices
    user_fav_idx = np.argsort(-user_vec)
    
    # User's favorite courses
    user_fav = [courses[i] for i in user_fav_idx[:top_n]]
    print(f"Top {top_n} favorite courses:")
    print(user_fav)
    
    # Flatten pred_rating
    pred_rating = np.asarray(pred_rating).flatten()
    
    
    # Argsort the rating except the courses taken by the user
    recommend_index = np.argsort(-pred_rating[user_vec == 0])
    
    # Recommend top courses
    top_courses = [courses[i] for i in recommend_index[:top_n]]
    
    print(f"Top {top_n} recommended courses:")
    print(top_courses)

    print()

    # RMSE between true & predicted
    rmse = np.sqrt(np.sum((user_vec[user_vec != 0] - pred_rating[user_vec != 0]) ** 2))
    print(f"RMSE = {rmse}")
    
    return top_courses



In [14]:
%%time
pred = nmf_based(csr, 0, 80, )

Top 10 favorite courses:
['EECS 370', 'ENSCE 105', 'EECS 443', 'EECS 280', 'EECS 203', 'SPANIS 373', 'ECO 497', 'CHE 211', 'RCAS 202', 'ENGLIS 387']
Top 10 recommended courses:
['MAT 175', 'EECS 281', 'SPANIS 473', 'SPANIS 485', 'TCHNCLC 380', 'EEC 203', 'SPANIS 476', 'ECO 494', 'EECS 582', 'SA 140']

RMSE = 16.13685123447898
CPU times: user 32.4 s, sys: 705 ms, total: 33.1 s
Wall time: 36 s


