# Pipeline

### read_data(file_path)
- read data to df

### concat_user_input_and_convert_to_csr(df, user_input): 
- df: output df from read_data
- user_input: list of courses the user provide
- Append user input to the df and convert the df into csr. 
- Returns csr matrix, and course list.

### svd_based(uc_mat, courses, user_idx = -1, num_factors=80, top_n = 10, random_state=42):
- uc_mat: csr matrix (1st output of concat_user_input_and_convert_to_csr)
- courses: course list (2nd output of concat_user_input_and_convert_to_csr)
- num_factors: hyperparam of svd
- top_n: num of recommendation
- Returns top_n recommended courses

---------

# This notebook
* Try three different collaborative filterings: Memory, SVD, NMF
* Speed: SVD - Memory - NMF (fast to slow)
* RMSE: SVD - NMF - Memory (low to high)


* Winner: SVD

## Note
* Parameter tuning (SVD: num_factors)

In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

## Load the data and convert into sparse matrix

In [12]:
def read_data(file_path):
    from scipy.sparse import csr_matrix
    
    # Read pickle
    stu = pd.read_pickle(file_path)
    
    # Num of dictionaries in the file
    n = len(stu)
    
    # Initialize df
    df = pd.DataFrame()
    
    # Loop through dicts and concat
    for i in range(n):
        stu[i]['student_id'] = i
        df = pd.concat([df,stu[i]])
    
    return df

def concat_user_input_and_convert_to_csr(df, user_input):
    # df = user dataset we have
    # user_input: courses that user provide

    # User input to df and concat
    user_df = pd.DataFrame()
    user_df['Subject/Catalog'] = user_input
    user_df['Rating'] = 1
    user_df['student_id'] = df['student_id'].max()+1

    df = pd.concat([df, user_df])
    # Fix .0 issues -- Some courses has .0 at the end (e.g. AAS 101.0), remove .0
    df.loc[df['Subject/Catalog'].str.contains('\.') == True, 'Subject/Catalog'] = df.loc[df['Subject/Catalog'].str.contains('\.') == True, 'Subject/Catalog'].str.split('\.').str[0]
    
    # Pivot table to stu - course
    df = pd.pivot_table(df, values = 'Rating', index = 'student_id', columns = 'Subject/Catalog')
    
    # Fillna
    df = df.fillna(0)
    
    # df to sparse matrix
    csr = csr_matrix(df)
    
    # Store course names
    courses = list(df.columns)
    
    # Get matched user by cos sim -- uncomment this if we use this information
    #matched_user_idx = np.argsort(cosine_similarity(csr)[-1])[-2] #Best match of stu_idx -1 
    
    return csr, courses #matched_user_idx

## Matrix-based collaborative filtering

In [38]:
from sklearn.decomposition import TruncatedSVD

def svd_based(uc_mat, courses, num_factors=80, top_n = 10, random_state=42):
    # SVD based collaborative filtering
    
    # user_idx = -1
    user_idx = -1
    
    # SVD
    svd = TruncatedSVD(n_components = num_factors, random_state = random_state)
    
    # Transform X
    X_transformed = svd.fit_transform(uc_mat)
    
    # Calculate r
    r = np.matmul(X_transformed, svd.components_)
    
    # Pred rating
    pred_rating = r[user_idx,:]#.reshape(1,-1)
    
    
    # Matched user vector
    user_vec = uc_mat[user_idx].toarray().flatten()
    
    
    # User's favorite course indices
    user_fav_idx = np.argsort(-user_vec)
    
    # User's favorite courses
    #user_fav = [courses[i] for i in user_fav_idx[:top_n]]
    #print(f"Top {top_n} favorite courses:")
    #print(user_fav)

    
    # Flatten pred_rating
    pred_rating = np.asarray(pred_rating).flatten()
    
    # Argsort the rating except the courses taken by the user
    recommend_index = np.argsort(-pred_rating[user_vec == 0])
    
    # Recommend top courses
    top_courses = [courses[i] for i in recommend_index[:top_n]]
    
    print(f"Top {top_n} recommended courses:")
    print(top_courses)

    print()

    # RMSE between true & predicted. Uncomment the next two lines to see RMSE
    rmse = np.sqrt(np.sum((user_vec[user_vec != 0] - pred_rating[user_vec != 0]) ** 2))
    print(f"RMSE = {rmse}")
    
    return top_courses

In [39]:
%%time
pred = svd_based(csr, courses)

Top 10 recommended courses:
['EECS 581', 'EECS 385', 'EECS 482', 'RELIGIO 310', 'EECS 473', 'EECS 487', 'ENVIRO 167', 'EECS 490', 'EECS 280', 'SA 315']

RMSE = 4.320117968276276
CPU times: user 793 ms, sys: 117 ms, total: 911 ms
Wall time: 418 ms


## Memory-based collaborative filtering

In [32]:
import scipy.sparse
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
uc_mat = csr
user_idx = 0

def memory_based(uc_mat, user_idx, top_n):
    # Memory-based collaborative filtering
    
    # Normalize user-course matrix
    norm_uc_mat = uc_mat - uc_mat.mean(axis = 1).reshape(-1,1)
    
    # Cosine similarity of raw matrix
    cossim = cosine_similarity(uc_mat)
    
    # Calculate the numerator of V-hat(aj)
    v_num = np.dot(cossim[user_idx,:], norm_uc_mat)
    
    # Sum of V-hat(aj) numerator
    v_num_sum = v_num.sum(axis = 0)
    
    # V_hat(aj)
    v_hat = v_num_sum/(cossim[user_idx,:].sum())
    
    #Predicted rating of each course
    pred_rating = np.add(v_hat, uc_mat.mean(axis = 1).reshape(-1,1)[user_idx])
    
    # User vector
    user_vec = uc_mat[user_idx].toarray().flatten()
    
    # User's favorite course indices
    user_fav_idx = np.argsort(-user_vec)
    
    # User's favorite courses
    user_fav = [courses[i] for i in user_fav_idx[:top_n]]
    print(f"Top {top_n} favorite courses:")
    print(user_fav)
    
    # Flatten pred_rating
    pred_rating = np.asarray(pred_rating).flatten()
    
    
    # Argsort the rating except the courses taken by the user
    recommend_index = np.argsort(-pred_rating[user_vec == 0])
    
    # Recommend top courses
    top_courses = [courses[i] for i in recommend_index[:top_n]]
    
    print(f"Top {top_n} recommended courses:")
    print(top_courses)

    print()

    # RMSE between true & predicted
    rmse = np.sqrt(np.sum((user_vec[user_vec != 0] - pred_rating[user_vec != 0]) ** 2))
    print(f"RMSE = {rmse}")
    
    return top_courses



In [33]:
%%time
pred = memory_based(csr, -1, 10)

Top 10 favorite courses:
['ENGLIS 220', 'EART 156', 'NATIVEA 310', 'EART 255', 'WG 211', 'ASIA 243', 'ASIA 244', 'MAT 216', 'ASIANLA 266', 'NATIVEA 422']
Top 10 recommended courses:
['EEC 183', 'EE 484', 'EEC 280', 'EE 483', 'MAT 295', 'MAT 403', 'MAT 296', 'MAT 286', 'MAT 327', 'MAT 297']

RMSE = 4.873693740931147
CPU times: user 1.02 s, sys: 444 ms, total: 1.46 s
Wall time: 1.68 s


## NMF-based collaborative filtering

In [36]:
from sklearn.decomposition import NMF

def nmf_based(uc_mat, user_idx, num_factors, top_n = 10, random_state=42):
    # NMF based collaborative filtering
    
    # NMF
    nmf = NMF(n_components = num_factors, init = 'nndsvd', max_iter = 300, random_state = random_state)
    
    # Transform X
    X_transformed = nmf.fit_transform(uc_mat)
    
    # Calculate r
    r = np.matmul(X_transformed, nmf.components_)
    
    # Predict rating
    pred_rating = r[user_idx,:].reshape(1,-1)
    
    
    # User vector
    user_vec = uc_mat[user_idx].toarray().flatten()
    
    # User's favorite course indices
    user_fav_idx = np.argsort(-user_vec)
    
    # User's favorite courses
    user_fav = [courses[i] for i in user_fav_idx[:top_n]]
    print(f"Top {top_n} favorite courses:")
    print(user_fav)
    
    # Flatten pred_rating
    pred_rating = np.asarray(pred_rating).flatten()
    
    
    # Argsort the rating except the courses taken by the user
    recommend_index = np.argsort(-pred_rating[user_vec == 0])
    
    # Recommend top courses
    top_courses = [courses[i] for i in recommend_index[:top_n]]
    
    print(f"Top {top_n} recommended courses:")
    print(top_courses)

    print()

    # RMSE between true & predicted
    rmse = np.sqrt(np.sum((user_vec[user_vec != 0] - pred_rating[user_vec != 0]) ** 2))
    print(f"RMSE = {rmse}")
    
    return top_courses



In [37]:
%%time
pred = nmf_based(csr, -1, 80, )

Top 10 favorite courses:
['ENGLIS 220', 'EART 156', 'NATIVEA 310', 'EART 255', 'WG 211', 'ASIA 243', 'ASIA 244', 'MAT 216', 'ASIANLA 266', 'NATIVEA 422']
Top 10 recommended courses:
['EEC 183', 'LIN 393', 'EECS 487', 'EECS 482', 'EECS 581', 'EECS 491', 'EECS 574', 'RELIGIO 310', 'EECS 543', 'EECS 478']

RMSE = 4.346884211155051
CPU times: user 31.4 s, sys: 614 ms, total: 32.1 s
Wall time: 35.9 s


