# Update (12.13)
### read_data(file_path)
* input: file_path
* output: 
     - csr: csr matrix of user profile data
     - courses: list of courses
     
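### memory_based(csr, courses, user_input, df, top_n = 10)
* input: 
    - csr: csr matrix of user profile data (output of read_data)
    - courses: list of courses (output of read_data)
    - user_input: courses provided by the user
    - df: DataFrame of offered courses; must contain a `course` column
    - top_n: maximum number of courses to recommend (default 10)
    - csr, courses and the other inputs can be assigned inside the function. 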
    
* output:
    - recommended courses
    
--------


In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix, vstack
import pickle

## Load the data and convert into sparse matrix

### Update: read_data will take a file_path and output a csr and course list. 

In [14]:
def read_data(file_path):
    """Build the student-by-course rating matrix from a pickled profile file.

    Parameters
    ----------
    file_path : str or path-like
        Pickle file readable by ``pd.read_pickle``. It must unpickle to a
        container ``stu`` such that ``stu[i]`` for ``i in range(len(stu))``
        is a DataFrame with ``'Subject/Catalog'`` and ``'Rating'`` columns
        (one frame per student).

    Returns
    -------
    csr : scipy.sparse.csr_matrix
        One row per student, one column per course; entries are the mean
        rating the student gave the course, 0 where there is no rating.
    courses : list
        Course names, in the same order as the columns of ``csr``.

    Raises
    ------
    ValueError
        If the file contains no student profiles.
    """
    from scipy.sparse import csr_matrix

    course_col = 'Subject/Catalog'

    # NOTE: unpickling can execute arbitrary code -- only read trusted files.
    stu = pd.read_pickle(file_path)

    # Number of student profiles in the file
    n = len(stu)
    if n == 0:
        raise ValueError(f"no student profiles found in {file_path!r}")

    # Tag each profile with its position and concatenate once. `assign`
    # returns a copy, so the unpickled frames are left untouched, and a
    # single concat avoids re-copying the accumulated frame on every step.
    frames = [stu[i].assign(student_id=i) for i in range(n)]
    df = pd.concat(frames, ignore_index=True)

    # Some course codes carry a spurious decimal part (e.g. "AAS 101.0");
    # keep only the text before the first '.'. Rows without a '.' (or with
    # a missing value) are left as they are.
    has_dot = df[course_col].str.contains('.', regex=False, na=False)
    df.loc[has_dot, course_col] = (
        df.loc[has_dot, course_col].str.split('.', n=1).str[0]
    )

    # Pivot to a student x course table; unrated courses become 0
    table = pd.pivot_table(
        df, values='Rating', index='student_id', columns=course_col
    ).fillna(0)

    return csr_matrix(table), list(table.columns)

In [8]:
#csr, courses = read_data('student_profiles.pickle')

### Store user profile csr and the list of courses

In [9]:
# Cache the rating matrix and the course list on disk so later runs can skip
# rebuilding them. `csr` and `courses` must already exist in the session
# (they are the two values returned by read_data).
with open('csr.pkl', "wb") as fOut:
    pickle.dump(csr, fOut, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('courses.pkl', "wb") as fOut:
    pickle.dump(courses, fOut, protocol=pickle.HIGHEST_PROTOCOL)

### Load csr and courses

In [36]:
# Reload the cached rating matrix (csr.pkl) and course list (courses.pkl).
# NOTE(review): pickle.load can execute arbitrary code -- only load pickle
# files that were produced by a trusted source.
with open('csr.pkl', "rb") as fIn:
    csr = pickle.load(fIn)
with open('courses.pkl', "rb") as fIn:
    courses = pickle.load(fIn)

In [37]:
# Simulate a user: draw 10 distinct courses at random from the course list
user_input = np.random.choice(courses, size = 10, replace = False)

In [38]:
# Load fw.csv into a DataFrame. It is later passed to memory_based as `df`,
# which looks courses up through its 'course' column.
fw = pd.read_csv('fw.csv')

## Memory-based collaborative filtering

In [71]:
import scipy.sparse
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def _similarity_to_last_row(uc_mat):
    """Return the cosine similarity of every row of ``uc_mat`` to its last row.

    Only the single similarity row that is needed is computed, with sparse
    operations, instead of the full dense row-by-row similarity matrix.
    Rows with zero norm get similarity 0.
    """
    target = uc_mat[-1]
    dots = (uc_mat @ target.T).toarray().ravel()
    norms = np.sqrt(np.asarray(uc_mat.multiply(uc_mat).sum(axis=1)).ravel())
    scale = norms * norms[-1]
    sims = np.zeros(dots.shape, dtype=float)
    # A zero-norm row has no direction; leave its similarity at 0 rather
    # than dividing by zero.
    np.divide(dots, scale, out=sims, where=scale > 0)
    return sims


def memory_based(csr, courses, user_input, df, top_n = 10):
    """Recommend offered courses with memory-based collaborative filtering.

    The new user is appended to the rating matrix as a binary row (1 for
    every course in ``user_input``). Each course's predicted rating is the
    similarity-weighted average of all rows' mean-centred ratings, plus the
    new user's own mean. Courses the user has not taken are ranked by that
    prediction and the best ones that appear in ``df`` are returned.

    Parameters
    ----------
    csr : scipy.sparse matrix
        Existing user x course rating matrix; column ``j`` corresponds to
        ``courses[j]``.
    courses : sequence
        Course names, one per column of ``csr``.
    user_input : array-like
        Courses the new user has taken. Names not in ``courses`` are ignored.
    df : pandas.DataFrame
        Offered courses; must contain a ``'course'`` column.
    top_n : int, default 10
        Maximum number of distinct courses to recommend.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` for the recommended courses, best first, with
        ``'course'`` as a regular column.
    """
    # Binary profile of the new user, appended as the last row
    user_vec = np.isin(courses, user_input).astype(float)
    uc_mat = vstack([csr, csr_matrix(user_vec.reshape(1, -1))]).tocsr()

    # Per-row mean over all courses (unrated courses count as 0)
    row_means = np.asarray(uc_mat.mean(axis=1)).ravel()

    sims = _similarity_to_last_row(uc_mat)
    if sims.sum() == 0:
        # None of the input courses is known, so every similarity is 0 and
        # the weighted average would be 0/0. Fall back to equal weights,
        # which ranks courses by their overall rating.
        sims = np.ones_like(sims)

    # sum_u sim_u * (r_uj - mean_u), expanded so that the centred matrix is
    # never materialised as a dense array
    v_num = uc_mat.T @ sims - sims @ row_means
    pred_rating = v_num / sims.sum() + row_means[-1]

    # Rank only the courses the user has not taken. Sorting the filtered
    # predictions yields positions inside `candidates`, so map them back to
    # column indices before looking up course names.
    candidates = np.flatnonzero(user_vec == 0)
    ranked = candidates[np.argsort(-pred_rating[candidates], kind='stable')]

    # Keep the best-ranked courses that are actually offered
    offered_courses = set(df['course'])
    offered = []
    for idx in ranked:
        if len(offered) >= top_n:
            break
        name = courses[idx]
        if name in offered_courses:
            offered.append(name)

    rec_df = df.set_index('course', drop = True)
    return rec_df.loc[offered].reset_index()



In [72]:
# Draw a fresh random set of 10 distinct courses and show the default
# (top_n = 10) recommendations for it, looked up in the fw DataFrame.
user_input = np.random.choice(courses, size = 10, replace = False)
memory_based(csr, courses, user_input, fw)

Unnamed: 0.1,course,Unnamed: 0,Class Nbr,Term,Session,Acad Group,Subject,Course Title,description,Component,...,Units,sub_title,credits,requirements_distribution,consent,advisory_prerequisites,other_course_info,repeatability,semester,text
0,EECS 280,1431,23753,Fall 2021,Regular Academic Session,Engineering,Electrical Engineering And Computer Science (E...,Prog&Data Struct,Techniques and algorithm development and effec...,LEC,...,4.0,,4.0,"BS, MSA",,MATH 115.,"F, W. Students may attempt this class a maximu...",May not be repeated for credit.,fall,Electrical Engineering And Computer Science (E...
1,EECS 301,1435,21370,Fall 2021,Regular Academic Session,Engineering,Electrical Engineering And Computer Science (E...,Probabil Mthd in Eng,Basic concepts of probability theory. Random ...,DIS,...,4.0,,4.0,BS,,Preceded or Accompanied by EECS 216.,,May not be repeated for credit.,fall,Electrical Engineering And Computer Science (E...
2,MATH 115,2650,21585,Fall 2021,Regular Academic Session,"Literature, Sci, and the Arts",Mathematics (MATH) Open Sections,Calculus I,Background and Goals: The sequence Math 115-11...,LEC,...,4.0,,4.0,"BS, MSA, QR/1",,Four years of high school mathematics.,,May not be repeated for credit.,fall,Mathematics (MATH) Open Sections Calculus I B...
3,EECS 491,1478,32072,Fall 2021,Regular Academic Session,Engineering,Electrical Engineering And Computer Science (E...,Intro Distrib Sys,"Distributed systems offer higher performance, ...",DIS,...,4.0,,4.0,BS,,,,May not be repeated for credit.,fall,Electrical Engineering And Computer Science (E...
4,MATH 116,2651,22260,Fall 2021,Regular Academic Session,"Literature, Sci, and the Arts",Mathematics (MATH) Open Sections,Calculus II,Background and Goals: The sequence Math 115-11...,LEC,...,4.0,,4.0,"BS, MSA, QR/1",,MATH 115.,,May not be repeated for credit.,fall,Mathematics (MATH) Open Sections Calculus II ...
5,EECS 443,1460,28738,Fall 2021,Regular Academic Session,Engineering,Electrical Engineering And Computer Science (E...,Senior Thesis,Students develop and carry out a research plan...,IND,...,3.0,,3.0,BS,With permission of instructor.,,,May not be repeated for credit.,fall,Electrical Engineering And Computer Science (E...
6,EECS 376,1445,26349,Fall 2021,Regular Academic Session,Engineering,Electrical Engineering And Computer Science (E...,Foundatns Comp Sci,Introduction to theory of computation. Models ...,DIS,...,4.0,,4.0,BS,,,"F, W.",May not be repeated for credit.,fall,Electrical Engineering And Computer Science (E...
7,EECS 388,1446,23875,Fall 2021,Regular Academic Session,Engineering,Electrical Engineering And Computer Science (E...,Intro Comp Security,This course introduces the principles and prac...,LAB,...,4.0,,4.0,BS,,EECS 370 (C or better) or equivalent.,,May not be repeated for credit.,fall,Electrical Engineering And Computer Science (E...
8,EECS 592,6112,19024,Winter 2022,Regular Academic Session,Engineering,Electrical Engineering And Computer Science (E...,AI Foundations,,LEC,...,4.0,,,,,,,,winter,Electrical Engineering And Computer Science (E...
9,EECS 482,1474,32868,Fall 2021,Regular Academic Session,Engineering,Electrical Engineering And Computer Science (E...,Intro Oper System,Operating system design and implementation: mu...,LEC,...,4.0,,4.0,BS,,,"F, W.",May not be repeated for credit.,fall,Electrical Engineering And Computer Science (E...
