In [1]:
import os
os.chdir('C:\\Users\songyifn\Desktop\Recommender-System-for-MathOverflow')

import numpy as np
import pandas as pd

import load_mathoverflow_data as lmd #The method for loading the mathoverflow data is stord in the Python script load_mathoverflow_data.py
#load the MathOverflow data set and perform training/testing splitting
data = lmd.load_mathoverflow_data(test_set_fraction=0.1, #training/testing split: 90% training, 10% testing
                                  indicator_features=False,
                                  tag_features=True)

train = data['train']
test = data['test']

In [2]:
#find the indices of the questions which should be kept in the reduced question pool:
#the criterion is that the question must be answered at least once
to_include=np.where(np.squeeze(train.getnnz(axis=0))>0)[0]

In [3]:
#use the array of question indices (question ids) to be kept to get the subset of training and testing set through slicing
train_new = train.tocsc()[:,to_include].tocoo()
test_new = test.tocsc()[:,to_include].tocoo()

In [4]:
print('The reduced question pool for the case study has %s users and %s questions, '
      'with %s interactions in the testing and %s interactions in the training set.'
      % (train_new.shape[0], train_new.shape[1], test_new.getnnz(), train_new.getnnz()))

The reduced question pool for the case study has 4513 users and 49773 questions, with 653 interactions in the testing and 96667 interactions in the training set.


### (1) Model 1: Baseline Model (non-personalized Popularity-based recommendation)

In [6]:
### order the question id according to their popularity (the counts of times each question has been answered) in descending order
order_by_popularity=np.argsort(np.squeeze(train_new.getnnz(axis=0)))[::-1]
len(order_by_popularity)

49773

In [7]:
from sklearn import metrics
train_new_csr=train_new.tocsr()
test_new_csr=test_new.tocsr()

all_auc=[]
for i in range(train.shape[0]):
    #for each user, first find all question ids that the user did not answer during the time period of the training set,
    #then when evaluating the model performance on the testing set, the model will only rank those questions (avoid re-recommending)
    indices_kept=np.where(train_new_csr[i,:].toarray()[0]==0)[0]
    #the ROC curve is created by comparing the predicted ranking (based on popularity and is the same for every user) and
    #the the list of indicators which show whether the user actually answered the question or not at different thresholds
    fpr, tpr, thresholds = metrics.roc_curve(pd.Series(test_new_csr[i,:].toarray()[0][indices_kept].astype(int)).apply(lambda x: 1 if x!=0 else 0),
                                             order_by_popularity[indices_kept])
    auc=metrics.auc(fpr,tpr)
    #if a user has not answered any questions during the time period of the testing set, he/she should not be included in the evaluation
    if pd.isnull(auc)==False:
        all_auc.append(auc)



In [8]:
#calculate the mean AUC score of all users
test_auc=round(sum(all_auc)/len(all_auc),6)
print('Popularity-based model testing set AUC: %s' % test_auc)

Popularity-based model testing set AUC: 0.467945


### (2) Model 2: Pure collaborative filtering model, without any item/user features

In [13]:
from lightfm import LightFM # Import the model
from lightfm.evaluation import auc_score # Import the evaluation routines

# Set the common parameters for the recommender system models used in the case study
NUM_THREADS = 2 #Number of parallel threads used in the computation
NUM_COMPONENTS = 30 #dimension of the latent factor vectors for users/questions
NUM_EPOCHS = 3 #number of training epochs
ITEM_ALPHA = 1e-6 #regularization strength parameter for the item features

# Fit the collaborative filtering model with WARP loss function
model2 = LightFM(loss='warp',
                 item_alpha=ITEM_ALPHA,
                 no_components=NUM_COMPONENTS)

model2 = model2.fit(train_new, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

In [10]:
# Compute and print the AUC score of the training data
train_auc = auc_score(model2, train_new, num_threads=NUM_THREADS).mean()
print('Pure collaborative filtering model training set AUC: %s' % train_auc)

Pure collaborative filtering model training set AUC: 0.817615


In [12]:
# Pass in the interactions in the training set to exclude them from being re-recommended to users.
test_auc = auc_score(model2, test_new, train_interactions=train_new, num_threads=NUM_THREADS).mean()
print('Pure collaborative filtering model testing set AUC: %s' % test_auc)

# Set biases to zero and test the model again
model2.item_biases *= 0.0
test_auc = auc_score(model2, test_new, train_interactions=train_new, num_threads=NUM_THREADS).mean()
print('Pure collaborative filtering testing set AUC (with biases corrected): %s' % test_auc)

Pure collaborative filtering model testing set AUC: 0.676976
Pure collaborative filtering testing set AUC (with biases corrected): 0.642542


### (3) Model 3: Hybrid model which only uses the tags information as item features

In [17]:
#extract the subset of the item feature matrix by only including the questions ids that belongs to the reduced question pool
item_features_1 = data['item_features']
item_features_new_1 = item_features_1[to_include,:]
item_features_new_1

<49773x1380 sparse matrix of type '<type 'numpy.float32'>'
	with 125037 stored elements in Compressed Sparse Row format>

In [18]:
model3 = LightFM(loss='warp',
                 item_alpha=ITEM_ALPHA,
                 no_components=NUM_COMPONENTS,
                 max_sampled=30)

model3 = model3.fit(train_new,
                    item_features=item_features_new_1,
                    epochs=15,
                    num_threads=NUM_THREADS)

train_auc = auc_score(model3,
                      train_new,
                      item_features=item_features_new_1,
                      num_threads=NUM_THREADS).mean()
print('Hybrid model 3 training set AUC: %s' % train_auc)

test_auc = auc_score(model3,
                     test_new,
                     train_interactions=train_new,
                     item_features=item_features_new_1,
                     num_threads=NUM_THREADS).mean()
print('Hybrid model 3 testing set AUC: %s' % test_auc)

Hybrid model 3 training set AUC: 0.964593
Hybrid model 3 testing set AUC: 0.871383


### (4) Model 4: Hybrid model which include the 50 additional item features from topic modeling of question texts (tags + topics)

In [21]:
import scipy.sparse as sp

#create the new item feature matrices for tags+topics and for topics only based on the reduced question pool
#load the document-topics matrix obtained from exploring the recommender system using the full question pool 
item_features_topics=sp.coo_matrix(pd.read_csv('question_topics.csv',header=None),dtype=np.float32).tocsr() 
item_features_enhanced=sp.hstack([item_features_1,item_features_topics]).tocsr() #concatenate the tags feature and topics feature matrices horizontally
item_features_new_enhanced=item_features_enhanced[to_include,:]
item_features_new_topics=item_features_topics[to_include,:]

In [22]:
item_features_new_enhanced

<49773x1430 sparse matrix of type '<type 'numpy.float32'>'
	with 632721 stored elements in Compressed Sparse Row format>

In [23]:
item_features_new_topics

<49773x50 sparse matrix of type '<type 'numpy.float32'>'
	with 507684 stored elements in Compressed Sparse Row format>

In [24]:
model4 = LightFM(loss='warp',
                 item_alpha=ITEM_ALPHA,
                 no_components=NUM_COMPONENTS,
                 max_sampled=30)

# Fit the hybrid model. The "enhanced" item features matrix is passed as an additional argument
model4 = model4.fit(train_new,
                    item_features=item_features_new_enhanced,
                    epochs=15,
                    num_threads=NUM_THREADS)

train_auc = auc_score(model4,
                      train_new,
                      item_features=item_features_new_enhanced,
                      num_threads=NUM_THREADS).mean()
print('Hybrid model 4 training set AUC: %s' % train_auc)

test_auc = auc_score(model4,
                     test_new,
                     train_interactions=train_new,
                     item_features=item_features_new_enhanced,
                     num_threads=NUM_THREADS).mean()
print('Hybrid model 4 testing set AUC: %s' % test_auc)

Hybrid model 4 training set AUC: 0.968554
Hybrid model 4 testing set AUC: 0.894619


### (5) Model 5: Hybrid model which only uses the 50 topical features of items

In [26]:
model5 = LightFM(loss='warp',
                 item_alpha=ITEM_ALPHA,
                 no_components=NUM_COMPONENTS,
                 max_sampled=30)

# Fit the hybrid model. The topical item features matrix is passed as an additional argument
model5 = model5.fit(train_new,
                    item_features=item_features_new_topics,
                    epochs=15,
                    num_threads=NUM_THREADS)

train_auc = auc_score(model5,
                      train_new,
                      item_features=item_features_new_topics,
                      num_threads=NUM_THREADS).mean()
print('Hybrid model 5 training set AUC: %s' % train_auc)

test_auc = auc_score(model5,
                     test_new,
                     train_interactions=train_new,
                     item_features=item_features_new_topics,
                     num_threads=NUM_THREADS).mean()
print('Hybrid model 5 testing set AUC: %s' % test_auc)

Hybrid model 5 training set AUC: 0.918341
Hybrid model 5 testing set AUC: 0.8182
