# Update (12.10)
### read_date(file_path)
* input: file_path
* output: 
     - csr: csr matrix of user profile data
     - courses: list of courses
     
### svd_based(csr, courses, user_input, num_factors = 100, top_n = 10, random_state=42)
* input: 
    - csr
    - courses
    - user_input
    - other inputs can be assigned inside the function. 
    
* output:
    - recommended courses
    
--------


### Pipeline

#### read_data(file_path)
- read data to df

#### concat_user_input_and_convert_to_csr(df, user_input): 
- df: output df from read_data
- user_input: list of courses the user provide
- Append user input to the df and convert the df into csr. 
- Returns csr matrix, and course list.

#### svd_based(uc_mat, courses, user_idx = -1, num_factors=80, top_n = 10, random_state=42):
- uc_mat: csr matrix (1st output of concat_user_input_and_convert_to_csr)
- courses: course list (2nd output of concat_user_input_and_convert_to_csr)
- num_factors: hyperparam of svd
- top_n: num of recommendation
- Returns top_n recommended courses

---------


#### This notebook
* Try three different collaborative filterings: Memory, SVD, NMF
* Speed: SVD - Memory - NMF (fast to slow)
* RMSE: SVD - NMF - Memory (low to high)


* Winner: SVD

##### Note
* Parameter tuning (SVD: num_factors)

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix, vstack
import pickle

## Load the data and convert into sparse matrix

### Update: read_data will take a file_path and output a csr and course list. 

In [2]:
def read_data(file_path):
    from scipy.sparse import csr_matrix
    
    # Read pickle
    stu = pd.read_pickle(file_path)
    
    # Num of dictionaries in the file
    n = len(stu)
    
    # Initialize df
    df = pd.DataFrame()
    
    # Loop through dicts and concat
    for i in range(n):
        stu[i]['student_id'] = i
        df = pd.concat([df,stu[i]])
    
    # df = user dataset we have
    # user_input: courses that user provide

    # Fix .0 issues -- Some courses has .0 at the end (e.g. AAS 101.0), remove .0
    df.loc[df['Subject/Catalog'].str.contains('\.') == True, 'Subject/Catalog'] = df.loc[df['Subject/Catalog'].str.contains('\.') == True, 'Subject/Catalog'].str.split('\.').str[0]
    
    # Pivot table to stu - course
    df = pd.pivot_table(df, values = 'Rating', index = 'student_id', columns = 'Subject/Catalog')
    
    # Fillna
    df = df.fillna(0)
    
    # df to sparse matrix
    csr = csr_matrix(df)
    
    # Store course names
    courses = list(df.columns)
    

    return csr, courses 

In [8]:
csr, courses = read_data('student_profiles.pickle')

### Store user profile csr and the list of courses

In [9]:
with open('csr.pkl', "wb") as fOut:
    pickle.dump(csr, fOut, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('courses.pkl', "wb") as fOut:
    pickle.dump(courses, fOut, protocol=pickle.HIGHEST_PROTOCOL)

### Load csr and courses

In [10]:
with open('csr.pkl', "rb") as fIn:
    csr = pickle.load(fIn)
with open('courses.pkl', "rb") as fIn:
     courses = pickle.load(fIn)

In [11]:
# random user input
user_input = np.random.choice(courses, size = 10, replace = False)

## Matrix-based collaborative filtering

In [12]:
from sklearn.decomposition import TruncatedSVD

def svd_based(csr, courses, user_input, num_factors = 100, top_n = 10, random_state=42):
    # SVD based collaborative filtering
    user_row = np.isin(courses, user_input)
    uc_mat = vstack([csr, user_row]).tocsr()
    
    user_idx = -1

    
    # SVD
    svd = TruncatedSVD(n_components = num_factors, random_state = random_state)
    
    # Transform X
    X_transformed = svd.fit_transform(uc_mat)
    
    # Calculate r
    r = np.matmul(X_transformed, svd.components_)
    
    # Pred rating
    pred_rating = r[user_idx,:]#.reshape(1,-1)
    
    
    # Matched user vector
    user_vec = uc_mat[user_idx].toarray().flatten()
    
    
    # User's favorite course indices
    user_fav_idx = np.argsort(-user_vec)

    
    # Flatten pred_rating
    pred_rating = np.asarray(pred_rating).flatten()
    
    # Argsort the rating except the courses taken by the user
    recommend_index = np.argsort(-pred_rating[user_vec == 0])
    
    # Recommend top courses
    top_courses = [courses[i] for i in recommend_index[:top_n]]

    # RMSE between true & predicted. Uncomment the next two lines to see RMSE
    #rmse = np.sqrt(np.sum((user_vec[user_vec != 0] - pred_rating[user_vec != 0]) ** 2))
    #print(f"RMSE = {rmse}")
    
    return top_courses#, rmse

In [13]:
user_input = np.random.choice(courses, size = 10, replace = False)
svd_based(csr, courses, user_input)

['CHE 105',
 'ENVIRO 229',
 'MCD 397',
 'ENGLIS 215',
 'COM 487',
 'ASTR 104',
 'EECS 203',
 'EECS 376',
 'EECS 570',
 'EECS 590']