Run the Python script <strong>MathOverflow_data_extraction.py</strong> first to extract all relevant data

In [2]:
import os

# Switch the working directory to the project root before the rest of the notebook runs.
# NOTE(review): the default below is a machine-specific absolute path; set the
# MATHOVERFLOW_PROJECT_DIR environment variable to point at your own checkout
# instead of editing this cell.
# A raw string is used so that every backslash of the Windows path is taken literally
# (the previous literal mixed an escaped '\\' with unescaped '\s', '\D' and '\R').
PROJECT_DIR = os.environ.get('MATHOVERFLOW_PROJECT_DIR',
                             r'C:\Users\songyifn\Desktop\Recommender-System-for-MathOverflow')
os.chdir(PROJECT_DIR)

import numpy as np
import pandas as pd

# The loader for the MathOverflow data is stored in the local Python script load_mathoverflow_data.py
import load_mathoverflow_data as lmd

# Load the MathOverflow data set and split it: 90% training, 10% testing
data = lmd.load_mathoverflow_data(
    test_set_fraction=0.1,
    indicator_features=False,
    tag_features=True,
)

# 'entire_data' is the entire users/questions pool; 'train' and 'test' are the two splits
entire, train, test = data['entire_data'], data['train'], data['test']

In [5]:
# Summarise the full users/questions pool: rows are users, columns are questions,
# and getnnz() counts the stored user-question interactions.
n_pool_users, n_pool_questions = entire.shape
print('The original users/questions pool of MathOverflow has %s users and %s questions, '
      'with a total of %s interactions.'
      % (n_pool_users, n_pool_questions, entire.getnnz()))

The original users/questions pool of MathOverflow has 11166 users and 78048 questions, with a tota of 116198 interactions.


In [3]:
# Summarise the train/test split used in the case study:
# matrix dimensions first, then the number of stored interactions in each split.
split_summary = (train.shape[0], train.shape[1], test.getnnz(), train.getnnz())
print('The dataset for the case study has %s users and %s questions, '
      'with %s interactions in the testing and %s interactions in the training set.'
      % split_summary)

The dataset for the case study has 4513 users and 78048 questions, with 9129 interactions in the test and 96667 interactions in the training set.


#### Total number of questions in the pool which have never been answered:

In [13]:
# Questions (columns) with no stored interaction in either the testing or the training matrix
int(np.count_nonzero((test.getnnz(axis=0) == 0) & (train.getnnz(axis=0) == 0)))

21817

#### Number of 'new' questions in the testing set:

In [8]:
# Questions (columns) that have at least one testing interaction but none in the training matrix
int(np.count_nonzero((test.getnnz(axis=0) > 0) & (train.getnnz(axis=0) == 0)))

6458

#### Total number of questions in the testing set that have been answered at least once:

In [11]:
# Questions (columns) with at least one stored interaction in the testing matrix
int(np.count_nonzero(test.getnnz(axis=0) > 0))

6996

### (1) Model 1: Pure collaborative filtering model, without any item/user features

In [20]:
from lightfm import LightFM # Import the model
from lightfm.evaluation import auc_score, precision_at_k # Import the evaluation routines

# Parameters shared by the recommender system models of this case study
NUM_THREADS = 2      # number of parallel threads used in the computation
NUM_COMPONENTS = 30  # dimension of the latent factor vectors for users/questions
NUM_EPOCHS = 3       # number of training epochs (used for model1 below)
ITEM_ALPHA = 1e-6    # regularization strength parameter for the item features

# Build the collaborative filtering model with the WARP loss and train it
# on the interaction matrix alone (no item/user features are passed).
model1 = LightFM(no_components=NUM_COMPONENTS,
                 item_alpha=ITEM_ALPHA,
                 loss='warp').fit(train,
                                  num_threads=NUM_THREADS,
                                  epochs=NUM_EPOCHS)

In [21]:
# Average the scores returned by auc_score over the training interactions, then report the result
train_auc_scores = auc_score(model1, train, num_threads=NUM_THREADS)
train_auc = train_auc_scores.mean()
print('Pure collaborative filtering model train AUC: %s' % train_auc)

Pure collaborative filtering model train AUC: 0.890514


In [22]:
# The training interactions are passed in so that they are excluded from being re-recommended to users.
test_auc_scores = auc_score(model1, test, num_threads=NUM_THREADS, train_interactions=train)
test_auc = test_auc_scores.mean()
print('Pure collaborative filtering model test AUC: %s' % test_auc)

# Zero the item biases in place and evaluate again.
# NOTE: this mutates model1, so after this cell has run the model keeps its zeroed biases.
np.multiply(model1.item_biases, 0.0, out=model1.item_biases)
test_auc_scores = auc_score(model1, test, num_threads=NUM_THREADS, train_interactions=train)
test_auc = test_auc_scores.mean()
print('Pure collaborative filtering test AUC (with biases corrected): %s' % test_auc)

Pure collaborative filtering model test AUC: 0.340132
Pure collaborative filtering test AUC (with biases corrected): 0.517832


### (2) Model 2: Hybrid model which only uses the tags information as item features

In [26]:
item_features_1 = data['item_features']
tag_labels = data['item_feature_labels']

# Stored entries per column of the item feature matrix (one column per tag)
tag_counts = np.squeeze(item_features_1.getnnz(axis=0))
# Column indices of the 20 largest counts, largest first
top_20_idx = np.argsort(tag_counts)[::-1][:20]
top_20_labels = tag_labels[top_20_idx]

print('There are %s distinct tags, with the 20 most popular tags being %s.'
      % (item_features_1.shape[1], top_20_labels.tolist()))

There are 1380 distinct tags, with the 20 most popular tags being [u'ag.algebraic-geometry', u'nt.number-theory', u'reference-request', u'co.combinatorics', u'at.algebraic-topology', u'dg.differential-geometry', u'gr.group-theory', u'pr.probability', u'fa.functional-analysis', u'rt.representation-theory', u'ct.category-theory', u'linear-algebra', u'ac.commutative-algebra', u'graph-theory', u'lo.logic', u'set-theory', u'gt.geometric-topology', u'gn.general-topology', u'ca.analysis-and-odes', u'mg.metric-geometry'].


In [29]:
model2 = LightFM(loss='warp',
                 item_alpha=ITEM_ALPHA,
                 no_components=NUM_COMPONENTS,
                 max_sampled=30) #max_sampled hyperparameter is set as 30 in order to boost the model predictive performance

# Fit the hybrid model. In this case, an additional argument is passed into the training call, which is item_features.

# Track how the training and testing AUC scores change with the number of training epochs.
# fit_partial() resumes training from the current model state, so each pass through the loop
# adds exactly one epoch. Calling fit(epochs=i) instead would discard the model and retrain it
# from scratch for i epochs on every iteration (1 + 2 + ... + 15 = 120 epochs instead of 15).
for i in range(1,16,1):
    model2 = model2.fit_partial(train,
                                item_features=item_features_1,
                                epochs=1,
                                num_threads=NUM_THREADS)

    train_auc = auc_score(model2,
                          train,
                          item_features=item_features_1,
                          num_threads=NUM_THREADS).mean()

    test_auc = auc_score(model2,
                         test,
                         train_interactions=train,
                         item_features=item_features_1,
                         num_threads=NUM_THREADS).mean()
    # Single-argument print(...) emits the same text as before and matches the other cells
    print('Epoch  %s :  training set AUC: %s ;  testing set AUC: %s' % (i, train_auc, test_auc))

Epoch  1 :  training set AUC: 0.889251 ;  testing set AUC: 0.83669
Epoch  2 :  training set AUC: 0.916412 ;  testing set AUC: 0.848959
Epoch  3 :  training set AUC: 0.929504 ;  testing set AUC: 0.854283
Epoch  4 :  training set AUC: 0.936114 ;  testing set AUC: 0.860525
Epoch  5 :  training set AUC: 0.942955 ;  testing set AUC: 0.861771
Epoch  6 :  training set AUC: 0.948833 ;  testing set AUC: 0.865766
Epoch  7 :  training set AUC: 0.951697 ;  testing set AUC: 0.865448
Epoch  8 :  training set AUC: 0.954122 ;  testing set AUC: 0.870026
Epoch  9 :  training set AUC: 0.956439 ;  testing set AUC: 0.866621
Epoch  10 :  training set AUC: 0.958418 ;  testing set AUC: 0.868727
Epoch  11 :  training set AUC: 0.960141 ;  testing set AUC: 0.868023
Epoch  12 :  training set AUC: 0.961838 ;  testing set AUC: 0.868795
Epoch  13 :  training set AUC: 0.963949 ;  testing set AUC: 0.867322
Epoch  14 :  training set AUC: 0.96521 ;  testing set AUC: 0.865486
Epoch  15 :  training set AUC: 0.965867 ;  te

The model training phase needs a number of epochs to achieve improved accuracy. Having fewer than 5 training epochs does not seem to be sufficient.

In [106]:
# Vary item_alpha (regularization strength) over 10^-6 ... 10^0 and track how the
# training and testing AUC scores change. A fresh model is built and trained for each value.
for a in range(-6,1,1):
    item_alpha=10**a
    model2 = LightFM(loss='warp',
                     item_alpha=item_alpha,
                     no_components=NUM_COMPONENTS,
                     max_sampled=30)

    model2 = model2.fit(train,
                        item_features=item_features_1,
                        epochs=15,
                        num_threads=NUM_THREADS)

    train_auc = auc_score(model2,
                          train,
                          item_features=item_features_1,
                          num_threads=NUM_THREADS).mean()

    test_auc = auc_score(model2,
                         test,
                         train_interactions=train,
                         item_features=item_features_1,
                         num_threads=NUM_THREADS).mean()
    # Single-argument print(...) emits the same text as before and matches the other cells
    print('item_alpha=10^%s: training set AUC: %s ;  testing set AUC: %s' % (a, train_auc, test_auc))

item_alpha=10^-6: training set AUC: 0.965978 ;  testing set AUC: 0.873216
item_alpha=10^-5: training set AUC: 0.966108 ;  testing set AUC: 0.869432
item_alpha=10^-4: training set AUC: 0.96535 ;  testing set AUC: 0.870432
item_alpha=10^-3: training set AUC: 0.963127 ;  testing set AUC: 0.871575
item_alpha=10^-2: training set AUC: 0.929659 ;  testing set AUC: 0.858616
item_alpha=10^-1: training set AUC: 0.609422 ;  testing set AUC: 0.539822
item_alpha=10^0: training set AUC: 0.50668 ;  testing set AUC: 0.493985


Clearly, when the regularization strength is quite small (<= 10^-3), the testing set AUC score is close to its maximum. When the regularization strength hyperparameter goes beyond 10^-2, the bias of the model gets too high and the resulting testing set AUC score decreases significantly.

In [31]:
# Hybrid model 2: WARP loss with the tag item features, trained for 15 epochs.
# max_sampled is set to 30 in order to boost the model predictive performance.
model2 = LightFM(no_components=NUM_COMPONENTS,
                 max_sampled=30,
                 item_alpha=ITEM_ALPHA,
                 loss='warp').fit(train,
                                  num_threads=NUM_THREADS,
                                  epochs=15,
                                  item_features=item_features_1)

# Average score on the training interactions
train_auc = np.mean(auc_score(model2, train,
                              num_threads=NUM_THREADS,
                              item_features=item_features_1))
print('Hybrid model 2 training set AUC: %s' % train_auc)

# Average score on the testing interactions; the training interactions are passed in as well
test_auc = np.mean(auc_score(model2, test,
                             num_threads=NUM_THREADS,
                             item_features=item_features_1,
                             train_interactions=train))
print('Hybrid model 2 testing set AUC: %s' % test_auc)

Hybrid model 2 training set AUC: 0.966156
Hybrid model 2 testing set AUC: 0.868726


### (3) Model 3: Hybrid model which includes the 50 additional item features from topic modeling of question texts (tags + topics)

Run the Python script <strong>build_item_features_question_topics.py</strong> first to obtain the collection of question texts by question id (Questions_text.csv)

#### Topic Modeling for Question Texts

In [34]:
# Read the question texts (one row per question, no header row).
# Raw string: the Windows path contains backslashes that must not be read as escape sequences.
questions_text = pd.read_csv(r"..\mathoverflow_Recommender_System\Questions_text.csv",header=None)
questions_text = questions_text.fillna('a') #for empty entry, replace it by the word 'a'
questions_text.shape

(78048, 1)

In [35]:
# Text of one question (row 23) as it looks before any data cleaning
questions_text.iat[23, 0]

' Can the valuative criteria for separatedness/properness be checked "formally"? <p>Suppose f:X&rarr;Y is a morphism of finite type between locally noetherian schemes. The valuative criterion for separatedness (resp. properness) says roughly that f is a separated (resp. proper) morphism if and only if the following condition holds:</p>\r\n\r\n<blockquote>\r\n  <p>For any curve C in Y and for any lift\r\n  of C-{p} to X, there is at most one\r\n  (resp. exactly one) way to extend this\r\n  to a lift of C to X.</p>\r\n</blockquote>\r\n\r\n<p>More precisely,</p>\r\n\r\n<blockquote>\r\n  <p>If C is the spectrum of a DVR with\r\n  closed point p (a <em>very local</em> version\r\n  of a curve: the intersection of all\r\n  open neighborhoods of p on an "honest"\r\n  curve), C&rarr;Y is a morphism, and\r\n  C-{p}&rarr;X is a lift of that\r\n  morphism along f, there is at most one\r\n  (resp. exactly one) way to complete it to\r\n  a lift C&rarr;X.</p>\r\n</blockquote>\r\n\r\n<p>Does it suffic

In [37]:
import re #load Python regular expressions module
# Matches one HTML/XML tag: '<', then anything up to the nearest '>'.
# '[^>]' also matches line breaks, so a tag whose attributes wrap onto the next line
# is matched too ('.' would stop at the end of the line and leave such a tag behind).
# Compiled once here instead of on every call.
_HTML_TAG_RE = re.compile(r'<[^>]*>')

def cleanhtml(raw_html):
    """Replace every HTML/XML tag in the raw text body with a single space.

    Only the tags themselves are replaced. HTML entities (e.g. '&amp;', '&rarr;')
    are left unchanged.

    raw_html -- the raw text, as a string
    Returns the text with each tag replaced by one space.
    """
    return _HTML_TAG_RE.sub(' ', raw_html)

In [38]:
# First cleaning pass: run cleanhtml over every question text (overwriting the column),
# then display the same example question to verify that the HTML tags are gone
questions_text.iloc[:, 0] = questions_text.iloc[:, 0].map(cleanhtml)
questions_text.iat[23, 0]

' Can the valuative criteria for separatedness/properness be checked "formally"?  Suppose f:X&rarr;Y is a morphism of finite type between locally noetherian schemes. The valuative criterion for separatedness (resp. properness) says roughly that f is a separated (resp. proper) morphism if and only if the following condition holds: \r\n\r\n \r\n   For any curve C in Y and for any lift\r\n  of C-{p} to X, there is at most one\r\n  (resp. exactly one) way to extend this\r\n  to a lift of C to X. \r\n \r\n\r\n More precisely, \r\n\r\n \r\n   If C is the spectrum of a DVR with\r\n  closed point p (a  very local  version\r\n  of a curve: the intersection of all\r\n  open neighborhoods of p on an "honest"\r\n  curve), C&rarr;Y is a morphism, and\r\n  C-{p}&rarr;X is a lift of that\r\n  morphism along f, there is at most one\r\n  (resp. exactly one) way to complete it to\r\n  a lift C&rarr;X. \r\n \r\n\r\n Does it suffice to check the valuative criteria on an even more local kind of object: the

The HTML tags have been successfully removed! Note that HTML entities such as `&rarr;` are still present in the text.

In [39]:
from nltk.corpus import stopwords #stop word list offered by Python Natural Language Toolkit
from nltk.stem.wordnet import WordNetLemmatizer #tool for lemmatizing the words
import string
stop = set(stopwords.words('english'))
#an expanded list of stop words, read from the first column of a text file
#(raw string: the backslashes of the Windows path must not be read as escape sequences)
stop2 = set(pd.read_csv(r"..\mathoverflow_Recommender_System\stopword_list.txt",header=None).iloc[:,0])
exclude = set(string.punctuation) #set of punctuation characters
lemma = WordNetLemmatizer()

In [71]:
def clean_texts(doc):
    """Further clean one raw text and return it as a space-separated string of words.

    Steps, in order:
      1. lower-case the text, split it on whitespace and drop the tokens found in
         the stop word sets `stop` / `stop2`;
      2. delete every punctuation character listed in `exclude`;
      3. lemmatize each remaining word with `lemma` and keep it only if the result
         is longer than 2 characters.
    A word whose lemmatization raises UnicodeDecodeError is skipped.
    """
    kept_tokens = [tok for tok in doc.lower().split()
                   if not (tok in stop or tok in stop2)]
    without_punct = ''.join([c for c in " ".join(kept_tokens) if c not in exclude])

    lemmas = []
    for token in without_punct.split():
        try:
            lemmatized = lemma.lemmatize(token)
        except UnicodeDecodeError:
            # best effort: a word that cannot be decoded is dropped
            continue
        if len(lemmatized) > 2:  # any 'word' shorter than 3 characters is removed
            lemmas.append(lemmatized)
    return " ".join(lemmas)

In [41]:
# Lemmatization can turn a word into a stop word, so after clean_texts the word lists
# are checked against both stop word sets once more and any match is removed
questions_text_clean_1 = [clean_texts(raw_text).split() for raw_text in questions_text.iloc[:, 0]]
questions_text_clean_2 = [[word for word in words if not (word in stop2 or word in stop)]
                          for words in questions_text_clean_1]

In [42]:
#find the 300 most frequent words in the entire question texts collection
from collections import Counter
term_frequency = Counter(word for doc_words in questions_text_clean_2 for word in doc_words)
# Counter.most_common returns (word, count) pairs ordered from the most to the least
# frequent, so no detour through a DataFrame is needed to rank the words
[word for word, count in term_frequency.most_common(300)]

[u'question',
 u'set',
 u'space',
 'function',
 u'example',
 u'finite',
 u'theorem',
 u'amp',
 u'map',
 u'theory',
 u'field',
 u'following',
 u'algebra',
 u'category',
 u'proof',
 u'form',
 u'matrix',
 u'answer',
 u'time',
 u'result',
 u'reference',
 u'true',
 u'complex',
 u'graph',
 u'class',
 u'polynomial',
 u'element',
 u'subset',
 u'vector',
 u'manifold',
 u'curve',
 u'consider',
 u'condition',
 u'sequence',
 u'paper',
 u'smooth',
 u'property',
 u'ring',
 u'structure',
 u'algebraic',
 'prime',
 u'integer',
 u'representation',
 u'product',
 u'suppose',
 u'closed',
 u'equation',
 u'definition',
 u'mathbb',
 'defined',
 u'real',
 u'solution',
 u'using',
 u'positive',
 'variety',
 u'mean',
 u'prove',
 u'sum',
 u'compact',
 u'bundle',
 u'dimension',
 u'exists',
 u'define',
 u'dont',
 u'lie',
 u'value',
 u'thanks',
 u'simple',
 'subgroup',
 u'model',
 u'term',
 u'measure',
 u'operator',
 'linear',
 u'bound',
 u'assume',
 u'surface',
 u'formula',
 u'natural',
 u'degree',
 u'zero',
 u'righ

In [43]:
# Many of the most frequent words in the question corpus are either not real words
# (e.g. leftovers of LaTeX commands and HTML entities) or carry no meaning that helps
# to separate topics, so they were picked out by hand as a 3rd stop word list.
# The list holds one repeated entry (u'omega'), which is harmless for membership tests.
# NOTE: the name stop3 is rebound further down in this notebook to a different list for
# the user answers corpus, so the cells have to be run in order.
stop3=[u'question',u'set',u'space','function',u'example',u'finite',u'theorem',u'amp',u'map',u'theory',u'following',u'form',
       u'answer',u'time',u'result',u'reference',u'true',u'class',u'consider',u'condition',u'paper',u'property',u'structure',
       u'suppose',u'definition',u'mathbb','defined',u'using','variety',u'exists',u'define',u'dont',u'lie',u'value',u'thanks',
       u'simple',u'term',u'assume',u'natural',u'rightarrow',u'leq',u'object',u'type',u'note',u'particular',u'book',u'exist',
       u'constant',u'scheme',u'system',u'extension','statement',u'sense',u'mathcal','edit',u'help',u'omega',u'looking',u'ideal',
       u'action',u'course',u'standard',u'denote','alpha',u'omega',u'rational',u'similar',u'ive',u'idea',u'found',u'gamma',u'easy',
       u'limit',u'follows',u'look',u'functor',u'lambda',u'nice',u'word',u'pair',u'geq',u'corresponding',u'delta',u'section',
       u'trying',u'otimes',u'version',u'sigma',u'phi',u'actually',u'mathbbr',u'notion',u'called',u'thank',u'maybe',u'special',
       u'comment',u'infty',u'literature',u'cant',u'cdot',u'exact',u'choice',u'implies',u'associated',u'respect',u'hence',u'able',
       u'fiber',u'process',u'approach',u'page',u'usual',u'wondering',u'ldots',u'reason',u'write',u'obvious',u'contains',
       u'mathematics',u'exactly',u'doesnt',u'cover',u'advance',u'proper',u'call',u'classical',u'necessarily',u'read',
       u'application',u'please',u'etc',u'langle',u'via',u'simplicial',u'math',u'guess',u'correct',u'seen',u'mathbbc',
       u'information',u'hand',u'mathematical',u'article',u'motivation',u'appreciated',u'probably',u'forall',u'cdots',u'name',
       u'simply',u'taking','mathbbrn',u'instead',u'mathbbz',u'text',u'basic',u'believe',u'situation',u'change',u'choose',u'step',
       u'obtain',u'reading',u'lot',u'claim',u'remark',u'people',u'hope',u'act',u'finding',u'wellknown',u'nbsp',u'theta',u'yes',
       u'torus',u'written',u'precisely',u'description',u'tell',u'fix',u'study',u'imply',u'beta',u'denotes',u'useful',u'language',
       u'start',u'context',u'obtained']

In [44]:
#remove the words present in the 3rd stop words list
# stop3 is a list, so every `in` test against it scans the whole list. It is converted
# to a set once, which makes each of the per-word membership tests a hash lookup.
stop3_lookup = set(stop3)
questions_text_clean_3 = [[i for i in doc if i not in stop3_lookup] for doc in questions_text_clean_2]

In [45]:
import gensim #import the gensim library, which offers the implementation of Latent Dirichlet Allocation model
from gensim import corpora
from gensim.models.ldamodel import LdaModel as Lda # the LDA model class, under a short alias

# Term dictionary of the corpus: every unique term is assigned an index.
dictionary = corpora.Dictionary(questions_text_clean_3)
# Document-term matrix: each document (list of words) is converted to its bag-of-words
# form with the dictionary above; this is the input for topic modeling.
# NOTE: `dictionary` and `doc_term_matrix` are rebound further down in this notebook
# for the user answers corpus, so the cells have to be run in order.
doc_term_matrix = list(map(dictionary.doc2bow, questions_text_clean_3))



In [46]:
# Train the 50-topic LDA model on the document-term matrix of the question texts
ldamodel1 = Lda(corpus=doc_term_matrix,
                id2word=dictionary,
                num_topics=50,
                alpha='auto',
                random_state=1)

In [53]:
# Print out the top 10 words for each extracted topic:
topics_1=ldamodel1.print_topics(num_topics=50, num_words=10)
for t in topics_1:
    # t[0] is the topic id and t[1] its formatted word list; single-argument print(...)
    # emits the same text as before and matches the other cells
    print('Topic %s: %s' % (t[0], t[1]))

Topic 0: 0.104*"mbox" + 0.061*"family" + 0.032*"modulus" + 0.025*"symplectic" + 0.017*"mathfrakg" + 0.015*"mathscrs0" + 0.015*"homogeneous" + 0.010*"ultrafilter" + 0.010*"relative" + 0.009*"gin"
Topic 1: 0.042*"orbit" + 0.033*"mathfrak" + 0.023*"operatornamead" + 0.021*"oplus" + 0.020*"nilpotent" + 0.012*"expectation" + 0.012*"subalgebra" + 0.011*"cartan" + 0.011*"conditional" + 0.011*"ast"
Topic 2: 0.049*"homotopy" + 0.036*"equivalence" + 0.031*"relation" + 0.030*"isomorphism" + 0.024*"diagram" + 0.023*"transformation" + 0.017*"loop" + 0.016*"equivalent" + 0.014*"preserve" + 0.011*"bijection"
Topic 3: 0.034*"series" + 0.028*"method" + 0.020*"sum" + 0.019*"power" + 0.011*"modular" + 0.010*"mod" + 0.010*"compute" + 0.010*"formula" + 0.009*"numerical" + 0.009*"range"
Topic 4: 0.094*"model" + 0.045*"countable" + 0.028*"mathscrs" + 0.016*"puzzle" + 0.016*"configuration" + 0.015*"transition" + 0.012*"segment" + 0.012*"mathbbr2" + 0.011*"pure" + 0.010*"particle"
Topic 5: 0.026*"board" + 0.02

In [54]:
# Build the document-topic matrix (one row per question, one column per topic) and save it
# to a .csv file, so that it is reusable for building recommender systems on the reduced question pool
m1 = np.zeros((len(doc_term_matrix), 50))
for row, bow in enumerate(doc_term_matrix):
    topic_weights = ldamodel1.get_document_topics(bow,
                                                  minimum_probability=None,
                                                  minimum_phi_value=None,
                                                  per_word_topics=False)
    # only the topics returned for this document are filled in; the other entries stay 0
    for topic_id, weight in topic_weights:
        m1[row, topic_id] = weight
pd.DataFrame(m1).to_csv("question_topics.csv", header=False, index=False)

In [57]:
import scipy.sparse as sp

# Sparse copy of the question-topic matrix; the values must be of type np.float32
# to become the item features input for the LightFM model
m1_csr = sp.csr_matrix(m1, dtype=np.float32)

# Put the tag features and the question-topic features side by side:
# the merged feature set is item_features_enhanced
feature_blocks = [item_features_1.tocoo(), m1_csr]
item_features_enhanced = sp.hstack(feature_blocks).tocsr()
item_features_enhanced

<78048x1430 sparse matrix of type '<type 'numpy.float32'>'
	with 1034394 stored elements in Compressed Sparse Row format>

#### Train the hybrid recommender system model using 'enhanced' item features

In [59]:
# Hybrid model 3: WARP loss, trained for 15 epochs with the "enhanced" item features
# matrix passed as an additional argument
model3 = LightFM(no_components=NUM_COMPONENTS,
                 max_sampled=30,
                 item_alpha=ITEM_ALPHA,
                 loss='warp').fit(train,
                                  num_threads=NUM_THREADS,
                                  epochs=15,
                                  item_features=item_features_enhanced)

# Average score on the training interactions
train_auc = np.mean(auc_score(model3, train,
                              num_threads=NUM_THREADS,
                              item_features=item_features_enhanced))
print('Hybrid model 3 training set AUC: %s' % train_auc)

# Average score on the testing interactions; the training interactions are passed in as well
test_auc = np.mean(auc_score(model3, test,
                             num_threads=NUM_THREADS,
                             item_features=item_features_enhanced,
                             train_interactions=train))
print('Hybrid model 3 testing set AUC: %s' % test_auc)

Hybrid model 3 training set AUC: 0.970575
Hybrid model 3 testing set AUC: 0.879328


#### Evaluate the performance of this recommendation model in terms of top K recommendations to every user, simulating the real use cases

In [61]:
# For K = 10, 20, ..., 200: among the users scored by precision_at_k, what fraction has at
# least one of the questions they actually answered in the testing period among their
# top K recommendations? (A precision above 0 means at least one hit in the top K.)
for k in range(10,210,10):
    test_precision_at_k_all = precision_at_k(model3,
                                             test,
                                             train_interactions=train,
                                             k=k,
                                             item_features=item_features_enhanced,
                                             num_threads=NUM_THREADS)
    # np.count_nonzero counts the users with a hit without looping over the array in Python
    users_with_hit = np.count_nonzero(test_precision_at_k_all > 0)
    hit_rate = users_with_hit * 1.0 / len(test_precision_at_k_all)
    # Single-argument print(...) emits the same text as before and matches the other cells
    print('k=%s: Accuracy is %s' % (k, round(hit_rate, 6)))

k=10: Accuracy is 0.064189
k=20: Accuracy is 0.100507
k=30: Accuracy is 0.127534
k=40: Accuracy is 0.150338
k=50: Accuracy is 0.172297
k=60: Accuracy is 0.185811
k=70: Accuracy is 0.20777
k=80: Accuracy is 0.224662
k=90: Accuracy is 0.233108
k=100: Accuracy is 0.239865
k=110: Accuracy is 0.248311
k=120: Accuracy is 0.260135
k=130: Accuracy is 0.271959
k=140: Accuracy is 0.282095
k=150: Accuracy is 0.293074
k=160: Accuracy is 0.299831
k=170: Accuracy is 0.310811
k=180: Accuracy is 0.318412
k=190: Accuracy is 0.32348
k=200: Accuracy is 0.330236


### (4) Model 4: Hybrid model which only uses the 50 topical features of items

In [64]:
# Hybrid model 4: WARP loss, trained for 15 epochs. In this case the 78,048 x 50
# document-topic matrix alone is passed in as the item features.
model4 = LightFM(no_components=NUM_COMPONENTS,
                 max_sampled=30,
                 item_alpha=ITEM_ALPHA,
                 loss='warp').fit(train,
                                  num_threads=NUM_THREADS,
                                  epochs=15,
                                  item_features=m1_csr)

# Average score on the training interactions
train_auc = np.mean(auc_score(model4, train,
                              num_threads=NUM_THREADS,
                              item_features=m1_csr))
print('Hybrid model 4 training set AUC: %s' % train_auc)

# Average score on the testing interactions; the training interactions are passed in as well
test_auc = np.mean(auc_score(model4, test,
                             num_threads=NUM_THREADS,
                             item_features=m1_csr,
                             train_interactions=train))
print('Hybrid model 4 testing set AUC: %s' % test_auc)

Hybrid model 4 training set AUC: 0.922512
Hybrid model 4 testing set AUC: 0.820131


### (5) Model 5: Hybrid model which uses both item features ('enhanced') and user features (topics of answer per user)

Execute the Python script <strong>build_user_features_answers.py</strong> first to obtain the collection of answer texts by user id (User_answers.csv)

In [68]:
# Read the answer texts (one row per user, no header row).
# Raw string: in a normal string literal the '\U' of '\User_answers.csv' starts a unicode
# escape on Python 3 and makes the whole cell a SyntaxError.
user_answers=pd.read_csv(r"..\mathoverflow_Recommender_System\User_answers.csv",header=None)
user_answers.shape

(4513, 1)

In [69]:
# Answer text of one user (row 6) as it looks before any data cleaning
user_answers.iat[6, 0]

'<p>Here\'s a reduction to the finite dimensional case.  Let $F$ be a finite set of subspaces of $X$.  For each finite dimensional subspace $Y$ of $X$, let $u(Y)$ be the set of elements $Z$ of $F$ such that $Y$ is contained in $Z$.  By assumption, $u(Y)$ is non-empty for every $Y$.  Since any two finite dimensional subspaces are contained in a third, the intersection of the sets $u(Y)$, as $Y$ runs among all finite dimensional subspaces of $X$, is non-empty.  Hence there is at least one set in $F$ that contains every finite dimensional subspace of $X$, hence contains $X$.</p>\r\n\r\n<p>For the finite dimensional case, let $F$ be a finite set of subspaces of $X$.  By induction, every codimension 1 subspace of $X$ is contained in some $Y$ from $F$.  But there are infinitely many codimension $1$ subspaces, so some $Y$ in $F$ contains more than one such subspace.  Any two distinct codimension 1 subspaces $\\operatorname{span} X$ (if $\\dim X &gt; 1$) so $Y = X$.</p>\r\n <p>A variation on I

In [70]:
# Run cleanhtml over every user's answer text (overwriting the column) to remove the
# HTML/XML tags, then display the same example row to check the result
user_answers.iloc[:, 0] = user_answers.iloc[:, 0].map(cleanhtml)
user_answers.iat[6, 0]

' Here\'s a reduction to the finite dimensional case.  Let $F$ be a finite set of subspaces of $X$.  For each finite dimensional subspace $Y$ of $X$, let $u(Y)$ be the set of elements $Z$ of $F$ such that $Y$ is contained in $Z$.  By assumption, $u(Y)$ is non-empty for every $Y$.  Since any two finite dimensional subspaces are contained in a third, the intersection of the sets $u(Y)$, as $Y$ runs among all finite dimensional subspaces of $X$, is non-empty.  Hence there is at least one set in $F$ that contains every finite dimensional subspace of $X$, hence contains $X$. \r\n\r\n For the finite dimensional case, let $F$ be a finite set of subspaces of $X$.  By induction, every codimension 1 subspace of $X$ is contained in some $Y$ from $F$.  But there are infinitely many codimension $1$ subspaces, so some $Y$ in $F$ contains more than one such subspace.  Any two distinct codimension 1 subspaces $\\operatorname{span} X$ (if $\\dim X &gt; 1$) so $Y = X$. \r\n  A variation on Ishai\'s exam

In [72]:
# Lemmatization can turn a word into a stop word, so after clean_texts the word lists
# are checked against both stop word sets once more and any match is removed
user_answers_clean_1 = [clean_texts(raw_text).split() for raw_text in user_answers.iloc[:, 0]]
user_answers_clean_2 = [[word for word in words if not (word in stop2 or word in stop)]
                        for words in user_answers_clean_1]

In [73]:
#find the 300 most frequent words in the entire user answer texts collection
term_frequency = Counter(word for answer_words in user_answers_clean_2 for word in answer_words)
# Counter.most_common returns (word, count) pairs ordered from the most to the least
# frequent, so no detour through a DataFrame is needed to rank the words
[word for word, count in term_frequency.most_common(300)]

[u'set',
 u'space',
 u'function',
 u'example',
 u'question',
 u'theory',
 u'theorem',
 u'answer',
 u'map',
 u'finite',
 u'proof',
 u'paper',
 u'algebra',
 u'amp',
 u'form',
 u'category',
 u'field',
 u'time',
 u'result',
 u'element',
 u'class',
 u'mathbb',
 u'book',
 u'curve',
 u'structure',
 u'using',
 u'sequence',
 u'note',
 u'product',
 u'matrix',
 u'polynomial',
 u'model',
 u'complex',
 u'vector',
 u'algebraic',
 u'prime',
 u'dont',
 u'consider',
 u'representation',
 u'real',
 u'ring',
 u'sum',
 u'dimension',
 u'property',
 u'graph',
 u'mean',
 u'particular',
 u'subset',
 u'following',
 u'condition',
 u'equation',
 u'term',
 u'closed',
 u'degree',
 u'lie',
 u'line',
 u'zero',
 u'follows',
 u'hence',
 u'linear',
 u'solution',
 u'subgroup',
 u'bundle',
 u'manifold',
 u'value',
 u'formula',
 u'smooth',
 u'course',
 u'look',
 u'object',
 u'definition',
 u'comment',
 u'positive',
 u'argument',
 u'true',
 u'compact',
 u'type',
 u'integer',
 u'simple',
 u'measure',
 u'equivalent',
 u'surfa

In [74]:
# Many of the most frequent words in the user answers corpus are either not real words
# (e.g. leftovers of LaTeX commands and HTML entities) or carry no meaning that helps
# to separate topics, so they were picked out by hand as a 3rd stop word list.
# The list holds one repeated entry (u'alpha'), which is harmless for membership tests.
# NOTE: this rebinds the name stop3, which earlier in this notebook held the list built
# for the question texts, so the cells have to be run in order.
stop3=[u'set',u'space',u'map',u'finite',u'mathbb',u'function',u'example',u'question',u'theory',u'theorem',u'answer',u'paper',
       u'amp',u'form',u'time',u'result',u'book',u'using',u'note',u'dont',u'consider',u'following',u'this',u'term',u'follows',
       u'hence',u'case',u'look',u'comment',u'true',u'type',u'the',u'and',u'defined',u'extension',u'reference',u'point',
       u'section',u'variety',u'system',u'math',u'one',u'define',u'free',u'however',u'sense',u'exists',u'action',u'idea',
       u'actually',u'that',u'ideal',u'leq',u'sheaf',u'relation',u'alpha',u'method',u'limit',u'equal',u'edit',u'indeed',u'hold',
       u'nice',u'there',u'hand',u'found',u'exactly',u'choice',u'problem',u'called',u'write',u'functor',u'mathcal',u'which',
       u'page',u'see',u'word',u'yes',u'probably',u'now',u'way',u'reason',u'for',u'right',u'bit',u'cover',u'rightarrow',u'above',
       u'doesnt',u'lot',u'mathematical',u'check',u'detail',u'claim',u'taking',u'otimes',u'article',u'approach',u'special',
       u'start',u'exact',u'classical',u'try',u'looking',u'alpha',u'omega',u'gamma',u'phi',u'fact',u'fiber',u'not',u'path',
       u'application',u'mentioned',u'well',u'character',u'sigma',u'simply',u'useful',u'thus',u'step',u'usual',u'believe',
       u'people',u'exist',u'say',u'via',u'geq',u'cant']

In [75]:
#remove the words present in the 3rd stop words list
# stop3 is a list, so every `in` test against it scans the whole list. It is converted
# to a set once, which makes each of the per-word membership tests a hash lookup.
stop3_lookup = set(stop3)
user_answers_clean_3 = [[i for i in doc if i not in stop3_lookup] for doc in user_answers_clean_2]

In [76]:
# Term dictionary of the user answers corpus: every unique term is assigned an index.
# NOTE: `dictionary` and `doc_term_matrix` were bound to the question corpus earlier in
# this notebook and are rebound here, so the cells have to be run in order.
dictionary = corpora.Dictionary(user_answers_clean_3)
# Document-term matrix: each document (list of words) is converted to its bag-of-words
# form with the dictionary above; this is the input for topic modeling.
doc_term_matrix = list(map(dictionary.doc2bow, user_answers_clean_3))

In [77]:
# Train the 50-topic LDA model on the document-term matrix of the user answer texts
ldamodel2 = Lda(corpus=doc_term_matrix,
                id2word=dictionary,
                num_topics=50,
                alpha='auto',
                random_state=1)

In [79]:
# Print out the top 10 words for each extracted topic:
topics_2=ldamodel2.print_topics(num_topics=50, num_words=10)
for t in topics_2:
    # t[0] is the topic id and t[1] its formatted word list; single-argument print(...)
    # emits the same text as before and matches the other cells
    print('Topic %s: %s' % (t[0], t[1]))

Topic 0: 0.010*"prime" + 0.007*"log" + 0.005*"sum" + 0.005*"integer" + 0.004*"bound" + 0.003*"polynomial" + 0.003*"proof" + 0.003*"conjecture" + 0.003*"factor" + 0.003*"solution"
Topic 1: 0.008*"bundle" + 0.004*"complex" + 0.004*"manifold" + 0.004*"class" + 0.004*"field" + 0.003*"category" + 0.003*"vector" + 0.003*"smooth" + 0.003*"structure" + 0.003*"element"
Topic 2: 0.007*"model" + 0.005*"algebra" + 0.004*"kappa" + 0.004*"category" + 0.004*"structure" + 0.003*"element" + 0.003*"proof" + 0.003*"topology" + 0.003*"cardinal" + 0.003*"subset"
Topic 3: 0.005*"category" + 0.005*"ring" + 0.004*"field" + 0.003*"algebra" + 0.003*"proof" + 0.003*"sequence" + 0.002*"class" + 0.002*"module" + 0.002*"local" + 0.002*"subset"
Topic 4: 0.010*"field" + 0.004*"isomorphism" + 0.004*"class" + 0.004*"scheme" + 0.004*"lie" + 0.004*"connected" + 0.004*"smooth" + 0.004*"complex" + 0.004*"algebraic" + 0.004*"structure"
Topic 5: 0.003*"beginalign" + 0.003*"endalign" + 0.003*"proof" + 0.002*"graph" + 0.002*"e

In [82]:
# Create the document-topic matrix for topical features of user answers:
# one row per document in doc_term_matrix, one column per topic of ldamodel2.
# The number of columns is read from the trained model instead of being hard-coded, so the
# matrix cannot drift out of sync with the num_topics the model was trained with.
m2 = np.zeros(shape=(len(doc_term_matrix), ldamodel2.num_topics))
for row, bow in enumerate(doc_term_matrix):
    doc_topics = ldamodel2.get_document_topics(bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False)
    # Only the (topic id, probability) pairs returned for this document are written;
    # every other cell of the row keeps its initial value of 0.
    # NOTE(review): with minimum_probability=None the model presumably falls back to its own
    # default threshold, so low-probability topics are dropped here -- confirm against gensim docs.
    for topic_id, prob in doc_topics:
        m2[row, topic_id] = prob

In [84]:
# Convert the dense document-topic matrix (topical user features for answer texts per user)
# to sparse format: first COO with float32 values, then CSR.
# The values must be of type np.float32 to become the user features input for the LightFM model.
_m2_coo = sp.coo_matrix(m2, dtype=np.float32)
user_features_1 = _m2_coo.tocsr()
user_features_1

<4513x50 sparse matrix of type '<type 'numpy.float32'>'
	with 17984 stored elements in Compressed Sparse Row format>

#### Train the hybrid recommender system model using 'enhanced' item features plus the user features based on user answer topics

In [85]:
# Hybrid model 5: both the enhanced item features and the topical features extracted from
# user-level answer texts are passed to the model.
# The feature matrices and thread count are identical for fitting and for both evaluations,
# so they are gathered once and forwarded to each call.
model5_shared_kwargs = dict(item_features=item_features_enhanced,
                            user_features=user_features_1,
                            num_threads=NUM_THREADS)

model5 = LightFM(no_components=NUM_COMPONENTS,
                 loss='warp',
                 max_sampled=30,
                 item_alpha=ITEM_ALPHA,
                 user_alpha=1e-6)

# Fit the hybrid model 5 on the training interactions.
model5 = model5.fit(train, epochs=15, **model5_shared_kwargs)

# Mean AUC over the scores returned for the training interactions.
train_auc = auc_score(model5, train, **model5_shared_kwargs).mean()
print('Hybrid model 5 training set AUC: %s' % train_auc)

# Mean AUC over the scores returned for the testing interactions; the training interactions
# are supplied through train_interactions.
test_auc = auc_score(model5, test, train_interactions=train, **model5_shared_kwargs).mean()
print('Hybrid model 5 testing set AUC: %s' % test_auc)

Hybrid model 5 training set AUC: 0.867159
Hybrid model 5 testing set AUC: 0.838811


### (6) Model 6: Hybrid model which uses both item features ('enhanced') and user features (extracted topics from user AboutMe texts)

Execute the Python script <strong>build_user_features_aboutme.py</strong> first to obtain the collection of self-introduction texts ('AboutMe') by user id (User_aboutme.csv)

#### Topic modeling for user AboutMe texts

In [90]:
# Load the per-user AboutMe texts; the CSV is read with header=None, i.e. without a header row.
# The relative path is assembled with os.path.join instead of a literal containing bare
# backslashes: sequences such as "\U" are only accepted by accident in Python 2 string
# literals (and are a syntax error in Python 3), and a hard-coded "\" separator only works on Windows.
user_aboutme = pd.read_csv(os.path.join("..", "mathoverflow_Recommender_System", "User_aboutme.csv"), header=None)
user_aboutme.shape

(4513, 1)

In [91]:
# Apply cleanhtml (defined elsewhere in this notebook; intended to remove any HTML/XML tags
# present in the text body) to every entry of the first column of user_aboutme.
# The column is overwritten in place, so re-running this cell applies cleanhtml again to the
# already-processed texts. Nothing is displayed by this cell.
user_aboutme.iloc[:,0]=user_aboutme.iloc[:,0].apply(cleanhtml)

In [93]:
# Count the users whose AboutMe text, once stripped of surrounding whitespace, equals 'a'.
# NOTE(review): 'a' is presumably the placeholder written for an empty AboutMe field when
# User_aboutme.csv is built -- confirm against build_user_features_aboutme.py.
# The boolean Series is summed with its own vectorised .sum() rather than the builtin sum(),
# which would iterate over the Series element by element in Python.
num_of_empty_aboutme = (user_aboutme.iloc[:, 0].apply(lambda x: x.strip()) == 'a').sum()
# A single pre-formatted string keeps the printed text identical under both the Python 2
# print statement and the print function.
print('There are %s users whose AboutMe field is empty. ' % num_of_empty_aboutme)

There are 2687 users whose AboutMe field is empty. 


In [94]:
# Run every AboutMe text through clean_texts and split the result into tokens.
user_aboutme_clean_1 = [clean_texts(text).split() for text in user_aboutme.iloc[:, 0]]
# The cleaned texts are checked again for stop words that are still present after cleaning:
# any token found in either stop2 or stop is dropped.
user_aboutme_clean_2 = [
    [token for token in tokens if not (token in stop2 or token in stop)]
    for tokens in user_aboutme_clean_1
]

In [95]:
# Build the term dictionary of the AboutMe corpus: every unique term is assigned an index.
# NOTE: this rebinds `dictionary` and `doc_term_matrix`, the same names used earlier for the
# user-answer corpus; cells that read them must be run in notebook order.
dictionary = corpora.Dictionary(user_aboutme_clean_2)
# Map every document of the corpus to its bag-of-words form using the dictionary built above;
# the resulting list (the document-term matrix) is the input for topic modeling.
doc_term_matrix = list(map(dictionary.doc2bow, user_aboutme_clean_2))

In [99]:
# Train the LDA model (20 topics, fixed random_state) on the document-term matrix.
ldamodel3 = Lda(
    doc_term_matrix,
    id2word=dictionary,
    num_topics=20,
    alpha='auto',
    random_state=1
)

In [100]:
# Print out the top 10 words for each extracted topic.
# Each entry of topics_3 is indexed as (topic id, formatted word list) and is unpacked directly.
# print() receives one pre-formatted string, so the emitted text is the same whether `print`
# is the Python 2 statement or the print function (matching the print('...' % ...) style
# used by the other cells of this notebook).
topics_3 = ldamodel3.print_topics(num_topics=20, num_words=10)
for topic_id, topic_words in topics_3:
    print("Topic %s: %s" % (topic_id, topic_words))

Topic 0: 0.041*"student" + 0.026*"graduate" + 0.026*"list" + 0.016*"mathematical" + 0.015*"mathematics" + 0.011*"algebra" + 0.010*"center" + 0.008*"math" + 0.007*"sequence" + 0.007*"prefer"
Topic 1: 0.040*"university" + 0.018*"elementary" + 0.015*"mathematics" + 0.015*"postdoc" + 0.014*"faculty" + 0.013*"question" + 0.013*"website" + 0.013*"science" + 0.012*"department" + 0.011*"professor"
Topic 2: 0.035*"math" + 0.031*"student" + 0.020*"hilbert" + 0.018*"love" + 0.016*"write" + 0.015*"phd" + 0.015*"university" + 0.014*"geometry" + 0.014*"algebraic" + 0.013*"theory"
Topic 3: 0.033*"topology" + 0.026*"theory" + 0.022*"geometry" + 0.020*"mathematics" + 0.014*"university" + 0.014*"professor" + 0.012*"electrical" + 0.012*"algebraic" + 0.011*"stackexchange" + 0.011*"condition"
Topic 4: 0.035*"student" + 0.027*"phd" + 0.022*"university" + 0.016*"user" + 0.016*"algebra" + 0.012*"theory" + 0.012*"life" + 0.011*"graduate" + 0.011*"noncommutative" + 0.011*"able"
Topic 5: 0.050*"theory" + 0.023*"

In [101]:
# Create the document-topic matrix for topical features of user AboutMe texts:
# one row per document in doc_term_matrix, one column per topic of ldamodel3.
# The number of columns is read from the trained model instead of being hard-coded, so the
# matrix cannot drift out of sync with the num_topics the model was trained with.
m3 = np.zeros(shape=(len(doc_term_matrix), ldamodel3.num_topics))
for row, bow in enumerate(doc_term_matrix):
    doc_topics = ldamodel3.get_document_topics(bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False)
    # Only the (topic id, probability) pairs returned for this document are written;
    # every other cell of the row keeps its initial value of 0.
    # NOTE(review): with minimum_probability=None the model presumably falls back to its own
    # default threshold, so low-probability topics are dropped here -- confirm against gensim docs.
    for topic_id, prob in doc_topics:
        m3[row, topic_id] = prob

In [102]:
# Convert the dense document-topic matrix (topical user features for aboutme texts per user)
# to sparse format: first COO with float32 values, then CSR.
# The values must be of type np.float32 to become the user features input for the LightFM model.
_m3_coo = sp.coo_matrix(m3, dtype=np.float32)
user_features_2 = _m3_coo.tocsr()
user_features_2

<4513x20 sparse matrix of type '<type 'numpy.float32'>'
	with 66395 stored elements in Compressed Sparse Row format>

#### Train the hybrid recommender system model using 'enhanced' item features plus the user features based on 'AboutMe' topics

In [103]:
# Hybrid model 6: both the enhanced item features and the topical features extracted from
# each user's AboutMe texts are passed to the model.
# The feature matrices and thread count are identical for fitting and for both evaluations,
# so they are gathered once and forwarded to each call.
model6_shared_kwargs = dict(item_features=item_features_enhanced,
                            user_features=user_features_2,
                            num_threads=NUM_THREADS)

model6 = LightFM(no_components=NUM_COMPONENTS,
                 loss='warp',
                 max_sampled=30,
                 item_alpha=ITEM_ALPHA,
                 user_alpha=1e-6)

# Fit the hybrid model 6 on the training interactions.
model6 = model6.fit(train, epochs=15, **model6_shared_kwargs)

# Mean AUC over the scores returned for the training interactions.
train_auc = auc_score(model6, train, **model6_shared_kwargs).mean()
print('Hybrid model 6 training set AUC: %s' % train_auc)

# Mean AUC over the scores returned for the testing interactions; the training interactions
# are supplied through train_interactions.
test_auc = auc_score(model6, test, train_interactions=train, **model6_shared_kwargs).mean()
print('Hybrid model 6 testing set AUC: %s' % test_auc)

Hybrid model 6 training set AUC: 0.667128
Hybrid model 6 testing set AUC: 0.551742
