##### <p> Samuel Wolfe <br> November 25, 2023 <br> MSBA 207 <br> Chapter 14 Part 2 </p>

In [1]:
# need to run "pip install mlxtend" first
# need to run "pip install surprise" first
# "conda install -c conda-forge scikit-surprise" in "terminal" (Mac) or "Anaconda Prompt" (Windows)
%matplotlib inline
from IPython.display import clear_output
import math
from sklearn.metrics.pairwise import cosine_similarity

from pathlib import Path

import heapq
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
pd.set_option('display.width', 250)

  from pkg_resources import get_distribution


In [2]:
def get_top_n(predictions, NumberOfItems):
    # First map the predictions to each user.
    byUser = defaultdict(list)
    for p in predictions:
        byUser[p.uid].append(p)
    
    # For each user, reduce predictions to top-n
    for uid, userPredictions in byUser.items():
        byUser[uid] = heapq.nlargest(NumberOfItems, userPredictions, key=lambda p: p.est)
    return byUser

In [3]:
# Working directory:
#
# We assume that data are kept in the same directory as the notebook. If you keep your 
# data in a different folder, replace the argument of the `Path`
DATA = Path('E:/Aliit/School/MSBA/206/MSBA-206/dmba')
#DATA = Path('C:/Users/Min Li/OneDrive/teaching/DS110/dmba')
# and then load data using 
#
# pd.read_csv(DATA / ‘filename.csv’)
# Load and preprocess data set 

## Chapter 14.3

We again consider the data in CourseTopics.csv describing course purchases at Statistics.com (see Problem 14.2 and data sample in Table).<br>
We want to provide a course recommendation to a student who purchased the Regression and Forecast courses.<br>
Apply user-based and item-based collaborative filtering to the data, using both Pearson correlation and Cosine similarity.<br>
Pandas.melt() unpivots a DataFrame from wide format to long format and we can use this method to turn the data into triplets such as (student, course, rating).<br>
Also review the pandas code in Table 2.3 for data processing.<br>
Note you need to create a Student_ID variable for "id_vars=" in the Pandas.melt() function as such a variable is not in the data.

In [4]:
df_topics = pd.read_csv(DATA / 'CourseTopics.csv',dtype='bool')
#Removing spaces from column names
names = df_topics.columns.to_list()
for x in names:
    df_topics = df_topics.rename(columns={x:x.replace(' ','_')})
df_topics['student_id'] = range(len(df_topics))
df_topics

Unnamed: 0,Intro,DataMining,Survey,Cat_Data,Regression,Forecast,DOE,SW,student_id
0,True,True,False,False,False,False,False,False,0
1,False,False,True,False,False,False,False,False,1
2,False,True,False,True,True,False,False,True,2
3,True,False,False,False,False,False,False,False,3
4,True,True,False,False,False,False,False,False,4
...,...,...,...,...,...,...,...,...,...
360,False,False,False,True,False,False,False,False,360
361,False,True,False,True,False,False,False,True,361
362,False,False,False,False,False,False,False,True,362
363,False,False,False,True,False,False,False,False,363


In [29]:
df_melt = pd.melt(df_topics,id_vars=['student_id'], value_vars=['Regression','Forecast'])
#df_melt['variable'] = np.where(df_melt['variable'] == 'Regression', 0,1).astype(dtype='bool')
df_melt

Unnamed: 0,student_id,variable,value
0,0,Regression,False
1,1,Regression,False
2,2,Regression,True
3,3,Regression,False
4,4,Regression,False
...,...,...,...
725,360,Forecast,False
726,361,Forecast,False
727,362,Forecast,False
728,363,Forecast,False


In [26]:
# https://surprise.readthedocs.io/en/stable/reader.html
# The Reader class is used to parse a file containing ratings.
# Such a file is assumed to specify only one rating per line, and each line needs to respect the following structure:
# user ; item ; rating ; [timestamp]
# rating_scale=(minimum_rating, maximum_rating)
reader = Reader(rating_scale=(0, 1))
reader

<surprise.reader.Reader at 0x291fbdda570>

In [32]:
data = Dataset.load_from_df(df_melt[['student_id', 'variable', 'value']], reader)
trainset = data.build_full_trainset() # https://surprise.readthedocs.io/en/stable/index.html
sim_options = {'name': 'cosine', 'user_based': False}  # compute cosine similarities between items
algo = KNNBasic(sim_options=sim_options) # https://surprise.readthedocs.io/en/stable/knn_inspired.html
algo.fit(trainset)
pred = algo.predict(0, str('Regression'), verbose=True) # r_ui=5 is the true rating

Computing the cosine similarity matrix...
Done computing similarity matrix.
user: 0          item: Regression r_ui = None   est = 0.00   {'actual_k': 2, 'was_impossible': False}


In [44]:
data = Dataset.load_from_df(df_melt[['student_id', 'variable', 'value']], reader)
trainset = data.build_full_trainset() # https://surprise.readthedocs.io/en/stable/index.html
sim_options = {'name': 'pearson', 'user_based': False}  # compute cosine similarities between items
algo = KNNBasic(sim_options=sim_options) # https://surprise.readthedocs.io/en/stable/knn_inspired.html
algo.fit(trainset)
pred = algo.predict(0, str('Regression'), verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
user: 0          item: Regression r_ui = None   est = 0.00   {'actual_k': 2, 'was_impossible': False}


In [45]:
# Then predict ratings for all pairs (u, i) that are NOT in the training set.
predictions = algo.test(testset)
NumberOfItems = 4
top_n = get_top_n(predictions, NumberOfItems)

In [46]:
NumberOfUsers=5
# Print the recommended items for each user
print()
print('Top-4 recommended items for each user')
for uid, user_ratings in list(top_n.items())[:NumberOfUsers]:
    print('User {}'.format(uid))
    for prediction in user_ratings:
        print('  Item {0.iid} ({0.est:.2f})'.format(prediction), end='')
    print()
print()


Top-4 recommended items for each user
User 230
  Item Forecast (0.00)
User 318
  Item Forecast (0.00)
User 155
  Item Regression (0.00)
User 356
  Item Regression (0.00)
User 16
  Item Forecast (0.06)



In [47]:
## Item-based filtering
# compute cosine similarity between users 
sim_options = {'name': 'cosine', 'user_based': False}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
predictions = algo.test(testset)
NumberOfItems = 4
NumberOfUsers = 5

top_n = get_top_n(predictions, NumberOfItems)

# Print the recommended items for each user
print()
print('Top-n recommended items for each user')
for uid, user_ratings in list(top_n.items())[:NumberOfUsers]:
    print('User {}'.format(uid))
    for prediction in user_ratings:
        print('  Item {0.iid} ({0.est:.2f})'.format(prediction), end='')
    print()

Computing the cosine similarity matrix...
Done computing similarity matrix.

Top-n recommended items for each user
User 230
  Item Forecast (0.00)
User 318
  Item Forecast (0.00)
User 155
  Item Regression (0.00)
User 356
  Item Regression (0.00)
User 16
  Item Forecast (0.18)


In [48]:
## Build a model using the full dataset
trainset = data.build_full_trainset()
sim_options = {'name': 'cosine', 'user_based': False}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# Predict rating for user 383 and item 7
algo.predict(0, 'Regression')

Computing the cosine similarity matrix...
Done computing similarity matrix.


Prediction(uid=0, iid='Regression', r_ui=None, est=0, details={'actual_k': 2, 'was_impossible': False})

## Chapter 14.5

Course Ratings. The Institute for Statistics Education at Statistics.com asks students to rate a<br>
variety of aspects of a course as soon as the student completes it. The Institute is contemplating<br>
instituting a recommendation system that would provide students with recommendations for<br>
additional courses as soon as they submit their rating for a completed course. Consider the excerpt<br>
from student ratings of online statistics courses shown in Table 14.17, and the problem of what to<br>
recommend to student E.N.<br>

14.5.a First consider a user-based collaborative filter. This requires computing correlations between<br>all student pairs. For which students is it possible to compute correlations with E.N.? Compute<br>them.

Looking at `EN` we see its values as `EN: 4,-,-,4,-,-,4,-,3`. <br>`SQL`: `LN`, `MH`, `JH`, `DU`, `DS`<br>`DM in R`: `DS`<br>`R Prog`: `LN`, `DS`<br> `Regression`: `LN`<br>Adding all of these up we can calculate it out for `LN`, `MH`, `JH`, `DU`, `DS`.

I am assuming we are supposed to do the first method without the cosine similarity.

In [7]:
#calculating averages
EN_avg = (4+4+4+3)/4
LN_avg = (4+3+2+4+2)/5
MH_avg = (3+4+4)/3
JH_avg = (2+2)/2
DU_avg = (4+4)/2
DS_avg = (4+2+4)/3

In [8]:
#EN Compared to the others
ENtoLN = round((((4-LN_avg)*(4-EN_avg))+((4-LN_avg)*(4-EN_avg))+((2-LN_avg)*(3-EN_avg)))/(math.sqrt(pow(4-LN_avg,2)+pow(4-LN_avg,2)+pow(2-LN_avg,2))*math.sqrt(pow(4-EN_avg,2)+pow(4-EN_avg,2)+pow(3-EN_avg,2))),2)
ENtoMH = round((((3-MH_avg)*(4-EN_avg)))/(math.sqrt(pow(3-MH_avg,2))*math.sqrt(pow(4-EN_avg,2))),2)
ENtoJH = 0 #The average of JH results in a divide by zero situation.
ENtoDU = 0 #The average of DU results in a divide by zero situation.
ENtoDS = round((((4-DS_avg)*(4-EN_avg))+((2-DS_avg)*(4-EN_avg))+((4-DS_avg)*(3-EN_avg)))/(math.sqrt(pow(4-DS_avg,2)+pow(2-DS_avg,2)+pow(4-DS_avg,2))*math.sqrt(pow(4-EN_avg,2)+pow(4-EN_avg,2)+pow(3-EN_avg,2))),2)
print('EN to LN: ',ENtoLN)
print('EN to MH: ',ENtoMH)
print('EN to JH: ',ENtoJH)
print('EN to DU: ',ENtoDU)
print('EN to DS: ',ENtoDS)

EN to LN:  0.87
EN to MH:  -1.0
EN to JH:  0
EN to DU:  0
EN to DS:  -0.49


14.5.b. Based on the single nearest student to E.N., which single course should we recommend to E.N.?<br>
Explain why.<br>

Based on the nearest student to `E.N.`, `LN`, I would recommend `Python` to the student.<br>This recommendation comes from the second to last paragraph of the `User-Based Collaborative Filtering: “People Like You”` section. <br> Per the book when looking to recommend you take the nearest neighbor, which in this case is `LN`, <br>and you recommend the highest non shared item to the intended user, `EN`. This results in <br> the recommended course being `Python`

14.5.c. Use scikit-learn function sklearn.metrics.pairwise.cosine_similarity() to compute the cosine<br>
similarity between users.<br>

In [9]:
df_rating = pd.read_csv(DATA / 'courserating.csv',index_col='Unnamed: 0')
#Removing spaces from column names
names = df_rating.columns.to_list()
for x in names:
    df_rating = df_rating.rename(columns={x:x.replace(' ','_')})
df_rating

Unnamed: 0,SQL,Spatial,PA1,DM_in_R,Python,Forecast,R_Prog,Hadoop,Regression
LN,4.0,,,,3.0,2.0,4.0,,2.0
MH,3.0,4.0,,,4.0,,,,
JH,2.0,2.0,,,,,,,
EN,4.0,,,4.0,,,4.0,,3.0
DU,4.0,4.0,,,,,,,
FL,,4.0,,,,,,,
GL,,4.0,,,,,,,
AH,,3.0,,,,,,,
SA,,,4.0,,,,,,
RW,,,2.0,,,,,4.0,


In [10]:
cosine_similarity(df_rating.loc[['LN', 'EN'], ['SQL', 'R_Prog', 'Regression']])

array([[1.        , 0.98910049],
       [0.98910049, 1.        ]])

14.5.d. Based on the cosine similarities of the nearest students to E.N., which course should be<br>
recommended to E.N.?<br>

Given what the book says in the second to last paragraph of the `User-Based Collaborative Filtering: “People Like You”` section, <br> it would still be best to recommend the student `Python`, given it has the highest score.

14.5.e. What is the conceptual difference between using the correlation as opposed to cosine<br>
similarities? (Hint: How are the missing values in the matrix handled in each case?)<br>

With `correlation` we are taking into account the non similar values by calculating the average of each user.<br>With `cosine similarities` we are not, but this is accounted for when we take the Cos Sim of the resulting value.

14.5.f. With large datasets, it is computationally difficult to compute user-based recommendations in<br>
real time, and an item-based approach is used instead. Returning to the rating data (not the<br>
binary matrix), let’s now take that approach.<br>

14.5.f.i. If the goal is still to find a recommendation for E.N., for which course pairs is it possible<br>
and useful to calculate correlations?<br>

If were still looking at courses to recommend we can look at the courses `EN` has not taken.<br>
The best of these are `Spatial` and `PA 1`. This is because multiple people have taken these course.<br>
Then when we apply the stipulation of `useful` we finish with `Spacial` as it has the only cross over<br>
with the students we listed earlier, through `MH`, `JH`, and `DU`.

14.5.f.ii. Just looking at the data, and without yet calculating course pair correlations, which course<br>
would you recommend to E.N., relying on item-based filtering? Calculate two course pair<br>
correlations involving your guess and report the results.<br>

Just looking at the courses, `Python` is still my best recommendation. If we apply the logic from <br>
to this though, my recommendation would be `Spatial`. Using `SQL` as my base for the equation.

In [11]:
df_rating.describe()[1:2]

Unnamed: 0,SQL,Spatial,PA1,DM_in_R,Python,Forecast,R_Prog,Hadoop,Regression
mean,3.5,3.5,3.5,3.0,3.5,3.0,4.0,4.0,2.5


In [12]:
SQ_avg = df_rating.describe()[1:2]['SQL'].iloc[0]
SP_avg = df_rating.describe()[1:2]['Spatial'].iloc[0]
MH_SQ = 3
JH_SQ = 2
DU_SQ = 4
MH_SP = 4
JH_SP = 2
DU_SP = 4

In [13]:
top = ((MH_SQ-SQ_avg)*(MH_SQ-SP_avg)+(JH_SQ-SQ_avg)*(JH_SQ-SP_avg)+(DU_SQ-SQ_avg)*(DU_SQ-SP_avg))
top

2.75

In [14]:
bottomL = math.sqrt(pow(MH_SQ-SQ_avg,2)+pow(JH_SQ-SQ_avg,2)+pow(DU_SQ-SQ_avg,2))
bottomL

1.6583123951777

In [15]:
bottomR = math.sqrt(pow(MH_SP-SP_avg,2)+pow(JH_SP-SP_avg,2)+pow(DU_SP-SP_avg,2))
bottomR

1.6583123951777

In [16]:
ComSim = round(top/(bottomL*bottomR),2)
ComSim

1.0

14.5.g. Apply item-based collaborative filtering to this dataset (using Python) and based on the results,<br>
recommend a course to E.N.<br>

In [17]:
df_rating = pd.read_csv(DATA / 'courserating.csv')
#Removing spaces from column names
names = df_rating.columns.to_list()
for x in names:
    df_rating = df_rating.rename(columns={x:x.replace(' ','_')})
df_rating

Unnamed: 0,Unnamed:_0,SQL,Spatial,PA1,DM_in_R,Python,Forecast,R_Prog,Hadoop,Regression
0,LN,4.0,,,,3.0,2.0,4.0,,2.0
1,MH,3.0,4.0,,,4.0,,,,
2,JH,2.0,2.0,,,,,,,
3,EN,4.0,,,4.0,,,4.0,,3.0
4,DU,4.0,4.0,,,,,,,
5,FL,,4.0,,,,,,,
6,GL,,4.0,,,,,,,
7,AH,,3.0,,,,,,,
8,SA,,,4.0,,,,,,
9,RW,,,2.0,,,,,4.0,


In [18]:
df_redux = pd.melt(df_rating,id_vars=['Unnamed:_0'], value_vars=['SQL','Spatial'])
df_redux

Unnamed: 0,Unnamed:_0,variable,value
0,LN,SQL,4.0
1,MH,SQL,3.0
2,JH,SQL,2.0
3,EN,SQL,4.0
4,DU,SQL,4.0
5,FL,SQL,
6,GL,SQL,
7,AH,SQL,
8,SA,SQL,
9,RW,SQL,


In [19]:
df_redux = pd.DataFrame(columns=['userID','itemID','rating'])
for y in df_rating.columns.to_list():
    for x in df_rating.T.columns.to_list():
        if df_rating.loc[x,y] == 0.0:
            continue
        else:
            #print(y,",",x,',',df_rating.loc[x,y])
            df_redux.loc[len(df_redux.index)] = [x,y,df_rating.loc[x,y]]
df_redux

Unnamed: 0,userID,itemID,rating
0,0,Unnamed:_0,LN
1,1,Unnamed:_0,MH
2,2,Unnamed:_0,JH
3,3,Unnamed:_0,EN
4,4,Unnamed:_0,DU
...,...,...,...
145,10,Regression,
146,11,Regression,
147,12,Regression,
148,13,Regression,


In [20]:
# Convert thes data set into the format required by the surprise package
# The columns must correspond to user id, item id and ratings (in that order)
reader = Reader(rating_scale=(1, 4))
data = Dataset.load_from_df(df_redux[['Unnamed:_0', 'variable', 'value']], reader)

# Split into training and test set
trainset, testset = train_test_split(data, test_size=.25, random_state=1)

## User-based filtering
# compute cosine similarity between users 
sim_options = {'name': 'cosine', 'user_based': True}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

KeyError: "None of [Index(['Unnamed:_0', 'variable', 'value'], dtype='object')] are in the [columns]"

In [None]:
# Then predict ratings for all pairs (u, i) that are NOT in the training set.
predictions = algo.test(testset)
NumberOfItems = 9
top_n = get_top_n(predictions, NumberOfItems)

In [None]:
NumberOfUsers=10
# Print the recommended items for each user
print()
print('Top-4 recommended items for each user')
for uid, user_ratings in list(top_n.items())[:NumberOfUsers]:
    print('User {}'.format(uid))
    for prediction in user_ratings:
        print('  Item {0.iid} ({0.est:.2f})'.format(prediction), end='')
    print()
print()

In [None]:
## Build a model using the full dataset
trainset = data.build_full_trainset()
sim_options = {'name': 'cosine', 'user_based': False}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)


In [None]:
# Then predict ratings for all pairs (u, i) that are NOT in the training set.
predictions = algo.test(testset)
NumberOfItems = 4
top_n = get_top_n(predictions, NumberOfItems)

In [None]:
NumberOfUsers=5
# Print the recommended items for each user
print()
print('Top-4 recommended items for each user')
for uid, user_ratings in list(top_n.items())[:NumberOfUsers]:
    print('User {}'.format(uid))
    for prediction in user_ratings:
        print('  Item {0.iid} ({0.est:.2f})'.format(prediction), end='')
    print()
print()

For what ever reason my code is recommending Regression to EN, even though EN took it already. I have changed a number of the settings to get a different result. As well I have tried removing `Regression` from the list and it results in `EN` getting no recommendations.

14.5.h. Convert all numeric ratings to 1 and all blank (missing values) to 0. Apply user-based<br>
and item-based collaborative filtering to this dataset using both Pearson correlation and<br>Cosine similarity and based on the results, recommend a course to E.N.

In [None]:
df_rating = pd.read_csv(DATA / 'courserating.csv',index_col='Unnamed: 0').fillna(0)
#Removing spaces from column names
names = df_rating.columns.to_list()
for x in names:
    df_rating = df_rating.rename(columns={x:x.replace(' ','_')})
df_rating

In [None]:
df_redux = pd.DataFrame(columns=['userID','itemID','rating'])
for y in df_rating.columns.to_list():
    for x in df_rating.T.columns.to_list():
            #print(y,",",x,',',df_rating.loc[x,y])
            df_redux.loc[len(df_redux.index)] = [x,y,df_rating.loc[x,y]]
df_redux

In [None]:
## Build a model using the full dataset
trainset = data.build_full_trainset()
sim_options = {'name': 'cosine', 'user_based': False}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

In [None]:
# Then predict ratings for all pairs (u, i) that are NOT in the training set.
predictions = algo.test(testset)
NumberOfItems = 4
top_n = get_top_n(predictions, NumberOfItems)

In [None]:
NumberOfUsers=5
# Print the recommended items for each user
print()
print('Top-4 recommended items for each user')
for uid, user_ratings in list(top_n.items())[:NumberOfUsers]:
    print('User {}'.format(uid))
    for prediction in user_ratings:
        print('  Item {0.iid} ({0.est:.2f})'.format(prediction), end='')
    print()
print()