# Update (12.13)
### read_date(file_path)
* input: file_path
* output: 
     - csr: csr matrix of user profile data
     - courses: list of courses
     
### memory_based(csr, courses, user_input, top_n = 10, random_state=42)
* input: 
    - csr
    - courses
    - user_input
    - csr, courses and the other inputs can be assigned inside the function. 
    
* output:
    - recommended courses
    
--------


In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix, vstack
import pickle

## Load the data and convert into sparse matrix

### Update: read_data will take a file_path and output a csr and course list. 

In [14]:
def read_data(file_path):
    from scipy.sparse import csr_matrix
    
    # Read pickle
    stu = pd.read_pickle(file_path)
    
    # Num of dictionaries in the file
    n = len(stu)
    
    # Initialize df
    df = pd.DataFrame()
    
    # Loop through dicts and concat
    for i in range(n):
        stu[i]['student_id'] = i
        df = pd.concat([df,stu[i]])
    
    # df = user dataset we have
    # user_input: courses that user provide

    # Fix .0 issues -- Some courses has .0 at the end (e.g. AAS 101.0), remove .0
    df.loc[df['Subject/Catalog'].str.contains('\.') == True, 'Subject/Catalog'] = df.loc[df['Subject/Catalog'].str.contains('\.') == True, 'Subject/Catalog'].str.split('\.').str[0]
    
    # Pivot table to stu - course
    df = pd.pivot_table(df, values = 'Rating', index = 'student_id', columns = 'Subject/Catalog')
    
    # Fillna
    df = df.fillna(0)
    
    # df to sparse matrix
    csr = csr_matrix(df)
    
    # Store course names
    courses = list(df.columns)
    

    return csr, courses 

In [8]:
#csr, courses = read_data('student_profiles.pickle')

### Store user profile csr and the list of courses

In [9]:
with open('csr.pkl', "wb") as fOut:
    pickle.dump(csr, fOut, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('courses.pkl', "wb") as fOut:
    pickle.dump(courses, fOut, protocol=pickle.HIGHEST_PROTOCOL)

### Load csr and courses

In [10]:
with open('csr.pkl', "rb") as fIn:
    csr = pickle.load(fIn)
with open('courses.pkl', "rb") as fIn:
     courses = pickle.load(fIn)

In [11]:
# random user input
user_input = np.random.choice(courses, size = 10, replace = False)

## Matrix-based collaborative filtering

In [28]:
import scipy.sparse
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def memory_based(csr, courses, user_input, top_n = 10):
    # Memory-based collaborative filtering
    user_idx = -1
    
    user_row = np.isin(courses, user_input)
    uc_mat = vstack([csr, user_row]).tocsr()
    # Normalize user-course matrix
    norm_uc_mat = uc_mat - uc_mat.mean(axis = 1).reshape(-1,1)
    
    # Cosine similarity of raw matrix
    cossim = cosine_similarity(uc_mat)
    
    # Calculate the numerator of V-hat(aj)
    v_num = np.dot(cossim[user_idx,:], norm_uc_mat)
    
    # Sum of V-hat(aj) numerator
    v_num_sum = v_num.sum(axis = 0)
    
    # V_hat(aj)
    v_hat = v_num_sum/(cossim[user_idx,:].sum())
    
    #Predicted rating of each course
    pred_rating = np.add(v_hat, uc_mat.mean(axis = 1).reshape(-1,1)[user_idx])
    
    # User vector
    user_vec = uc_mat[user_idx].toarray().flatten()
    
    # User's favorite course indices
    user_fav_idx = np.argsort(-user_vec)
    
    # User's favorite courses
    user_fav = [courses[i] for i in user_fav_idx[:top_n]]
    #print(f"Top {top_n} favorite courses:")
    #print(user_fav)
    
    # Flatten pred_rating
    pred_rating = np.asarray(pred_rating).flatten()
    
    
    # Argsort the rating except the courses taken by the user
    recommend_index = np.argsort(-pred_rating[user_vec == 0])
    
    # Recommend top courses
    top_courses = [courses[i] for i in recommend_index[:top_n]]
    
    #print(f"Top {top_n} recommended courses:")
    #print(top_courses)

    #print()

    # RMSE between true & predicted
    rmse = np.sqrt(np.sum((user_vec[user_vec != 0] - pred_rating[user_vec != 0]) ** 2))
    #print(f"RMSE = {rmse}")
    
    return top_courses, rmse



In [29]:
user_input = np.random.choice(courses, size = 10, replace = False)
memory_based(csr, courses, user_input)

(['EECS 280',
  'EECS 301',
  'EEC 270',
  'EEC 280',
  'EEC 203',
  'MAT 451',
  'MAT 403',
  'MAT 327',
  'MAT 427',
  'MAT 422'],
 2.194925444737049)