In [36]:
import os
os.chdir('C:\\Users\songyifn\Desktop\mathoverflow_Recommender_System')

import numpy as np
import pandas as pd

import fetch_mathoverflow as fm

data = fm.fetch_mathoverflow('mathoverflow',
                           test_set_fraction=0.1,
                           indicator_features=False,
                           tag_features=True)

#entire = data['entire_data']
train = data['train']
test = data['test']



In [37]:
print('The dataset has %s users and %s items, '
      'with %s interactions in the test and %s interactions in the training set.'
      % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))

The dataset has 4513 users and 78048 items, with 9129 interactions in the test and 96667 interactions in the training set.


In [38]:
np.array(pd.SparseSeries.from_coo(train).to_dense().index)

array([(0L, 1L), (0L, 2L), (0L, 7L), ..., (4511L, 65500L), (4512L, 37286L),
       (4512L, 60032L)], dtype=object)

In [115]:
train_zero=np.where(train.getnnz(axis=0)==0)[0]

In [120]:
test_non_zero=np.where(test.getnnz(axis=0)!=0)[0]

In [122]:
intersect=np.intersect1d(test_non_zero,train_zero)
intersect

array([ 1107,  3593,  4586, ..., 78036, 78037, 78040], dtype=int64)

In [124]:
sum(test.getnnz(axis=0)[intersect])

8476

#### Pure collaborative filtering, without any item features

In [39]:
# Import the model
from lightfm import LightFM

# Set the number of threads; you can increase this
# ify you have more physical cores available.
NUM_THREADS = 2
NUM_COMPONENTS = 30
NUM_EPOCHS = 3
ITEM_ALPHA = 1e-6

# Let's fit a WARP model: these generally have the best performance.
model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
               no_components=NUM_COMPONENTS)

# Run 3 epochs and time it.
%time model = model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

Wall time: 4.42 s


In [82]:
# Import the evaluation routines
from lightfm.evaluation import auc_score, precision_at_k

In [40]:


# Compute and print the AUC score
train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
print('Collaborative filtering train AUC: %s' % train_auc)

Collaborative filtering train AUC: 0.892098


In [41]:
# We pass in the train interactions to exclude them from predictions.
# This is to simulate a recommender system where we do not
# re-recommend things the user has already interacted with in the train
# set.
test_auc = auc_score(model, test, train_interactions=train, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

Collaborative filtering test AUC: 0.341801


In [95]:
len(pd.SparseSeries.from_coo(train).index)

96667

In [42]:
# Set biases to zero
model.item_biases *= 0.0

test_auc = auc_score(model, test, train_interactions=train, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

Collaborative filtering test AUC: 0.517812


In [43]:
item_features = data['item_features']
tag_labels = data['item_feature_labels']

print('There are %s distinct tags, with values like %s.' % (item_features.shape[1], tag_labels[:3].tolist()))

There are 1380 distinct tags, with values like [u'ag.algebraic-geometry', u'at.algebraic-topology', u'ca.analysis-and-odes'].


In [97]:
item_features#.shape

<78048x1380 sparse matrix of type '<type 'numpy.float32'>'
	with 199855 stored elements in Compressed Sparse Row format>

In [154]:
tag_labels[item_features[10,:].toarray()[0].astype(bool)]#.astype(bool)

array([u'population', u'forecasting', u'census'], 
      dtype='<U50')

In [155]:
train_csc=train.tocsc()

In [156]:
sum(train_csc[4,:].toarray()[0])

109.0

#### Only use the tags information as the item features

In [44]:
# Define a new model instance
model = LightFM(loss='warp',
                #item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS,
                max_sampled=30,
                item_alpha=1e-6)

# Fit the hybrid model. Note that this time, we pass
# in the item features matrix.
model = model.fit(train,
                item_features=item_features,
                #user_features=user_features,
                epochs=15,
                num_threads=NUM_THREADS)

SyntaxError: keyword argument repeated (<ipython-input-44-e76f0a3bce36>, line 6)

In [117]:
# Don't forget the pass in the item features again!
train_auc = auc_score(model,
                      train,
                      item_features=item_features,
                      #user_features=user_features,
                      num_threads=NUM_THREADS).mean()
print('Hybrid training set AUC: %s' % train_auc)

Hybrid training set AUC: 0.966631


In [118]:
test_auc = auc_score(model,
                    test,
                    train_interactions=train,
                    item_features=item_features,
                    #user_features=user_features,
                    num_threads=NUM_THREADS).mean()
print('Hybrid test set AUC: %s' % test_auc)

Hybrid test set AUC: 0.868726


#### Include the 50 new item features from topic modeling (tags + topics)

In [80]:
# Define a new model instance
model1 = LightFM(loss='warp',
                #item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS,
                max_sampled=30,
                item_alpha=1e-6)

# Fit the hybrid model. Note that this time, we pass
# in the item features matrix.
model1 = model1.fit(train,
                item_features=item_features_enhanced,
                #user_features=user_features,
                epochs=15,
                num_threads=NUM_THREADS)


# Don't forget the pass in the item features again!
train_auc = auc_score(model1,
                      train,
                      item_features=item_features_enhanced,
                      #user_features=user_features,
                      num_threads=NUM_THREADS).mean()
print('Hybrid training set AUC: %s' % train_auc)


test_auc = auc_score(model1,
                    test,
                    train_interactions=train,
                    item_features=item_features_enhanced,
                    #user_features=user_features,
                    num_threads=NUM_THREADS).mean()
print('Hybrid test set AUC: %s' % test_auc)

Hybrid training set AUC: 0.97054
Hybrid test set AUC: 0.879372


In [83]:
test_precision_at_k = precision_at_k(model1,
                                    test,
                                    train_interactions=train,
                                    k=10,
                                    item_features=item_features_enhanced,
                                    #user_features=user_features,
                                    num_threads=NUM_THREADS).mean()
print('Hybrid test set AUC: %s' % test_precision_at_k)

Hybrid test set AUC: 0.00819257


In [110]:
for k in range(10,110,10):
    test_precision_at_k_all = precision_at_k(model1,
                                        test,
                                        train_interactions=train,
                                        k=k,
                                        item_features=item_features_enhanced,
                                        #user_features=user_features,
                                        num_threads=NUM_THREADS)
    print('k=',k,': Accuracy is ',sum(test_precision_at_k_all>0)*1.0/len(test_precision_at_k_all))

('k=', 10, ': Accuracy is ', 0.074324324324324328)
('k=', 20, ': Accuracy is ', 0.10557432432432433)
('k=', 30, ': Accuracy is ', 0.13175675675675674)
('k=', 40, ': Accuracy is ', 0.1545608108108108)
('k=', 50, ': Accuracy is ', 0.18158783783783783)
('k=', 60, ': Accuracy is ', 0.19510135135135134)
('k=', 70, ': Accuracy is ', 0.21452702702702703)
('k=', 80, ': Accuracy is ', 0.23226351351351351)
('k=', 90, ': Accuracy is ', 0.24577702702702703)
('k=', 100, ': Accuracy is ', 0.25760135135135137)


In [90]:
sum(test_precision_at_k_all>0)*1.0/len(test_precision_at_k_all)

0.074324324324324328

In [84]:
test_precision_at_k = precision_at_k(model1,
                                    test,
                                    train_interactions=train,
                                    k=50,
                                    item_features=item_features_enhanced,
                                    #user_features=user_features,
                                    num_threads=NUM_THREADS).mean()
print('Hybrid test set AUC: %s' % test_precision_at_k)

Hybrid test set AUC: 0.0052027


#### Only use the 50 new item features from topic modeling

In [50]:
model2 = LightFM(loss='warp',
                #item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS,
                max_sampled=30,
                item_alpha=1e-6)

# Fit the hybrid model. Note that this time, we pass
# in the item features matrix.
model2 = model2.fit(train,
                item_features=m1_csr,
                #user_features=user_features,
                epochs=15,
                num_threads=NUM_THREADS)


# Don't forget the pass in the item features again!
train_auc = auc_score(model2,
                      train,
                      item_features=m1_csr,
                      #user_features=user_features,
                      num_threads=NUM_THREADS).mean()
print('Hybrid training set AUC: %s' % train_auc)

test_auc = auc_score(model2,
                    test,
                    train_interactions=train,
                    item_features=m1_csr,
                    #user_features=user_features,
                    num_threads=NUM_THREADS).mean()
print('Hybrid test set AUC: %s' % test_auc)

Hybrid training set AUC: 0.921795
Hybrid test set AUC: 0.820131


#### Use both item features (enhanced) and user features (topics)

In [79]:
# Define a new model instance
model1 = LightFM(loss='warp',
                #item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS,
                max_sampled=30,
                item_alpha=1e-6)

# Fit the hybrid model. Note that this time, we pass
# in the item features matrix.
model1 = model1.fit(train,
                item_features=item_features_enhanced,
                user_features=user_features_1,
                epochs=15,
                num_threads=NUM_THREADS)


# Don't forget the pass in the item features again!
train_auc = auc_score(model1,
                      train,
                      item_features=item_features_enhanced,
                      user_features=user_features_1,
                      num_threads=NUM_THREADS).mean()
print('Hybrid training set AUC: %s' % train_auc)


test_auc = auc_score(model1,
                    test,
                    train_interactions=train,
                    item_features=item_features_enhanced,
                    user_features=user_features_1,
                    num_threads=NUM_THREADS).mean()
print('Hybrid test set AUC: %s' % test_auc)

Hybrid training set AUC: 0.866863
Hybrid test set AUC: 0.838811


#### Use both item features (enhanced) and user features (aboutme)

In [109]:
# Define a new model instance
model3 = LightFM(loss='warp',
                #item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS,
                max_sampled=30,
                item_alpha=1e-6)

# Fit the hybrid model. Note that this time, we pass
# in the item features matrix.
model3 = model3.fit(train,
                item_features=item_features_enhanced,
                user_features=user_features_2,
                epochs=15,
                num_threads=NUM_THREADS)


# Don't forget the pass in the item features again!
train_auc = auc_score(model3,
                      train,
                      item_features=item_features_enhanced,
                      user_features=user_features_2,
                      num_threads=NUM_THREADS).mean()
print('Hybrid training set AUC: %s' % train_auc)


test_auc = auc_score(model3,
                    test,
                    train_interactions=train,
                    item_features=item_features_enhanced,
                    user_features=user_features_2,
                    num_threads=NUM_THREADS).mean()
print('Hybrid test set AUC: %s' % test_auc)

Hybrid training set AUC: 0.666054
Hybrid test set AUC: 0.551742


In [71]:
import scipy.io
mat = scipy.io.loadmat('kaggle77b_trainset.mat')

In [74]:
mat['trainset'].shape

(21983L, 100L)

In [1]:
import os
os.chdir('C:\\Users\\songyifn\\Desktop')

import pandas as pd
import numpy as np

In [2]:
users=pd.read_csv("kaggle_users.txt",header=None)

In [84]:
triplets=pd.read_csv('kaggle_visible_evaluation_triplets.txt', header=None, sep='\t')

In [135]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve([0,1,1,0], [8,6,9,3])

In [136]:
metrics.auc(fpr,tpr)

0.75

In [138]:
tpr

array([ 0.5,  0.5,  1. ,  1. ])

In [139]:
fpr

array([ 0. ,  0.5,  0.5,  1. ])

In [140]:
train

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [142]:
data['item_features']

<72360x1246 sparse matrix of type '<type 'numpy.float32'>'
	with 198963 stored elements in Compressed Sparse Row format>

In [185]:
user_id=pd.read_csv("map_question.csv",header=None)

In [187]:
len(np.unique(user_id.iloc[:,0]))

78047

In [184]:
x = np.array([[[0], [1], [2]]])
np.squeeze(x)

array([0, 1, 2])

### Topic Modeling for User Answers

In [57]:
import os
os.getcwd()

'C:\\Users\\songyifn\\Desktop\\mathoverflow_Recommender_System'

In [58]:
user_answers=pd.read_csv("User_answers.csv",header=None)

In [60]:
user_answers.iloc[6,0]

'<p>Here\'s a reduction to the finite dimensional case.  Let $F$ be a finite set of subspaces of $X$.  For each finite dimensional subspace $Y$ of $X$, let $u(Y)$ be the set of elements $Z$ of $F$ such that $Y$ is contained in $Z$.  By assumption, $u(Y)$ is non-empty for every $Y$.  Since any two finite dimensional subspaces are contained in a third, the intersection of the sets $u(Y)$, as $Y$ runs among all finite dimensional subspaces of $X$, is non-empty.  Hence there is at least one set in $F$ that contains every finite dimensional subspace of $X$, hence contains $X$.</p>\r\n\r\n<p>For the finite dimensional case, let $F$ be a finite set of subspaces of $X$.  By induction, every codimension 1 subspace of $X$ is contained in some $Y$ from $F$.  But there are infinitely many codimension $1$ subspaces, so some $Y$ in $F$ contains more than one such subspace.  Any two distinct codimension 1 subspaces $\\operatorname{span} X$ (if $\\dim X &gt; 1$) so $Y = X$.</p>\r\n <p>A variation on I

In [61]:
import re
def cleanhtml(raw_html):
    #clean HTML tags
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', raw_html)
    return cleantext

In [62]:
user_answers.iloc[:,0]=user_answers.iloc[:,0].apply(cleanhtml)

In [63]:
user_answers.iloc[6,0]

' Here\'s a reduction to the finite dimensional case.  Let $F$ be a finite set of subspaces of $X$.  For each finite dimensional subspace $Y$ of $X$, let $u(Y)$ be the set of elements $Z$ of $F$ such that $Y$ is contained in $Z$.  By assumption, $u(Y)$ is non-empty for every $Y$.  Since any two finite dimensional subspaces are contained in a third, the intersection of the sets $u(Y)$, as $Y$ runs among all finite dimensional subspaces of $X$, is non-empty.  Hence there is at least one set in $F$ that contains every finite dimensional subspace of $X$, hence contains $X$. \r\n\r\n For the finite dimensional case, let $F$ be a finite set of subspaces of $X$.  By induction, every codimension 1 subspace of $X$ is contained in some $Y$ from $F$.  But there are infinitely many codimension $1$ subspaces, so some $Y$ in $F$ contains more than one such subspace.  Any two distinct codimension 1 subspaces $\\operatorname{span} X$ (if $\\dim X &gt; 1$) so $Y = X$. \r\n  A variation on Ishai\'s exam

In [64]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
stop2 = set(pd.read_csv("stopword_list.txt",header=None).iloc[:,0])
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop and i not in stop2])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    to_be_normalized=[]
    for word in punc_free.split():
        try:
            cleaned_word=lemma.lemmatize(word)
            if len(cleaned_word)>2:
                to_be_normalized.append(cleaned_word)
        except UnicodeDecodeError:
            pass
    normalized = " ".join(to_be_normalized)
    return normalized

In [65]:
stop3=[u'set',
 u'space',
 u'map',
 u'finite',
 u'mathbb',
 u'function',
 u'example',
 u'question',
 u'theory',
 u'theorem',
 u'answer',
 u'paper',
 u'amp',
 u'form',
 u'time',
 u'result',
 u'book',
 u'using',
 u'note',
 u'dont',
 u'consider',
 u'following',
 u'this',
 u'term',
 u'follows',
 u'hence',
 u'case',
 u'look',
 u'comment',
 u'true',
 u'type',
 u'the',
 u'and',
 u'defined',
 u'extension',
 u'reference',
 u'point',
 u'section',
 u'variety',
 u'system',
 u'math',
 u'one',
 u'define',
 u'free',
 u'however',
 u'sense',
 u'exists',
 u'action',
 u'idea',
 u'actually',
 u'that',
 u'ideal',
 u'leq',
 u'sheaf',
 u'relation',
 u'alpha',
 u'method',
 u'limit',
 u'equal',
 u'edit',
 u'indeed',
 u'hold',
 u'nice',
 u'there',
 u'hand',
 u'found',
 u'exactly',
 u'choice',
 u'problem',
 u'called',
 u'write',
 u'functor',
 u'mathcal',
 u'which',
 u'page',
 u'see',
 u'word',
 u'yes',
 u'probably',
 u'now',
 u'way',
 u'reason',
 u'for',
 u'right',
 u'bit',
 u'cover',
 u'rightarrow',
 u'above',
 u'doesnt',
 u'lot',
 u'mathematical',
 u'check',
 u'detail',
 u'claim',
 u'taking',
 u'otimes',
 u'article',
 u'approach',
 u'special',
 u'start',
 u'exact',
 u'classical',
 u'try',
 u'looking',
 u'alpha',
 u'omega',
 u'gamma',
 u'phi',
 u'fact',
 u'fiber',
 u'not',
 u'path',
 u'application',
 u'mentioned',
 u'well',
 u'character',
 u'sigma',
 u'simply',
 u'useful',
 u'thus',
 u'step',
 u'usual',
 u'believe',
 u'people',
 u'exist',
 u'say',
 u'via',
 u'geq',
 u'cant']

In [66]:
user_answers_clean_1 = [clean(doc).split() for doc in user_answers.iloc[:,0]]
user_answers_clean_2 = [[i for i in doc if i not in stop3 and i not in stop2 and i not in stop] for doc in user_answers_clean_1]
#

In [67]:
from collections import Counter
term_frequency=Counter(x for sublist in user_answers_clean_2 for x in sublist)

In [68]:
list(pd.DataFrame.from_dict(term_frequency,orient='index').sort_values([0],ascending=False).head(300).index)

[u'proof',
 u'algebra',
 u'category',
 u'field',
 u'element',
 u'class',
 u'curve',
 u'structure',
 u'sequence',
 u'product',
 u'matrix',
 u'polynomial',
 u'model',
 u'complex',
 u'vector',
 u'algebraic',
 u'prime',
 u'representation',
 u'real',
 u'ring',
 u'sum',
 u'dimension',
 u'property',
 u'graph',
 u'mean',
 u'particular',
 u'subset',
 u'condition',
 u'equation',
 u'closed',
 u'degree',
 u'lie',
 u'line',
 u'zero',
 u'linear',
 u'solution',
 u'subgroup',
 u'manifold',
 u'bundle',
 u'value',
 u'formula',
 u'smooth',
 u'course',
 u'object',
 u'definition',
 u'positive',
 u'argument',
 u'compact',
 u'integer',
 u'simple',
 u'measure',
 u'equivalent',
 u'surface',
 u'natural',
 u'prove',
 u'assume',
 u'left',
 u'topology',
 u'vertex',
 u'cohomology',
 u'homotopy',
 u'bound',
 u'isomorphism',
 u'geometry',
 u'constant',
 u'construction',
 u'connected',
 u'suppose',
 u'generated',
 u'infinite',
 u'easy',
 u'operator',
 u'metric',
 u'conjecture',
 u'rational',
 u'standard',
 u'statement

In [69]:
import gensim
from gensim import corpora

# Creating the term dictionary of our courpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(user_answers_clean_2)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in user_answers_clean_2]

In [70]:
len(doc_term_matrix)

4513

In [37]:
from gensim.models.tfidfmodel import *
tfidf = TfidfModel(doc_term_matrix, id2word=dictionary)

#idf=gensim.models.tfidfmodel.df2idf(doc_term_matrix, 4513, log_base=2.0, add=0.0)
'''
low_value = 0.1
low_value_words = []
tfidf_sum
for bow in corpus:
    low_value_words += [id for id, value in tfidf[bow]]#]
'''
    

'\nlow_value = 0.1\nlow_value_words = []\ntfidf_sum\nfor bow in corpus:\n    low_value_words += [id for id, value in tfidf[bow]]#]\n'

In [71]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel2 = Lda(doc_term_matrix, num_topics=50, alpha='auto', id2word = dictionary, random_state=1)

In [73]:
print(ldamodel2.print_topics(num_topics=50, num_words=5))

[(0, u'0.010*"prime" + 0.007*"log" + 0.005*"sum" + 0.005*"integer" + 0.004*"bound"'), (1, u'0.008*"bundle" + 0.004*"complex" + 0.004*"manifold" + 0.004*"class" + 0.004*"field"'), (2, u'0.007*"model" + 0.005*"algebra" + 0.004*"kappa" + 0.004*"category" + 0.004*"structure"'), (3, u'0.005*"category" + 0.005*"ring" + 0.004*"field" + 0.003*"algebra" + 0.003*"proof"'), (4, u'0.010*"field" + 0.004*"isomorphism" + 0.004*"class" + 0.004*"scheme" + 0.004*"lie"'), (5, u'0.003*"beginalign" + 0.003*"endalign" + 0.003*"proof" + 0.002*"graph" + 0.002*"equation"'), (6, u'0.008*"manifold" + 0.007*"lagrangian" + 0.007*"bundle" + 0.006*"structure" + 0.006*"symplectic"'), (7, u'0.005*"polynomial" + 0.004*"graph" + 0.004*"element" + 0.003*"field" + 0.003*"class"'), (8, u'0.003*"field" + 0.003*"algebra" + 0.003*"matrix" + 0.003*"proof" + 0.003*"category"'), (9, u'0.004*"process" + 0.004*"equation" + 0.003*"field" + 0.003*"vector" + 0.003*"matrix"'), (10, u'0.010*"representation" + 0.005*"subgroup" + 0.005*"

In [84]:
ldamodel.get_document_topics(doc_term_matrix[3], minimum_probability=None, minimum_phi_value=None, per_word_topics=False)

[(1, 0.043863385540827429),
 (2, 0.026832315282950325),
 (5, 0.13882563107762458),
 (6, 0.15454777239606571),
 (8, 0.010341998968203056),
 (18, 0.018354492552330771),
 (19, 0.6034687050593075)]

In [74]:
m=np.zeros(shape=(4513,50))

In [75]:
for i in range(4513):
    topic_v=ldamodel2.get_document_topics(doc_term_matrix[i], minimum_probability=None, minimum_phi_value=None, per_word_topics=False)
    for t in topic_v:
        m[i,t[0]]=t[1]

In [76]:
import scipy.sparse as sp
m_sp=sp.coo_matrix(m,dtype=np.float32)

In [77]:
user_features_1=m_sp.tocsr()

In [78]:
user_features_1

<4513x50 sparse matrix of type '<type 'numpy.float32'>'
	with 17984 stored elements in Compressed Sparse Row format>

In [72]:
item_features_enhanced

<78048x1430 sparse matrix of type '<type 'numpy.float32'>'
	with 1034394 stored elements in Compressed Sparse Row format>

### Topic modeling for AboutMe

In [93]:
user_aboutme=pd.read_csv("User_aboutme.csv",header=None)
user_aboutme

Unnamed: 0,0
0,a <p>You can get in touch with me at geraschen...
1,a <p>Senior Lecturer at the Australian Nationa...
2,a <p>Dabbler in things.</p>\r\n
3,"a Assistant Professor at Indiana University, w..."
4,a I'm an arithmetic geometer at Emory University.
5,a
6,a
7,a
8,a I am a professor at UC Davis. <br><br>\r\r\n
9,a


In [94]:
user_aboutme.iloc[:,0]=user_aboutme.iloc[:,0].apply(cleanhtml)

In [96]:
user_aboutme_clean_1 = [clean(doc).split() for doc in user_aboutme.iloc[:,0]]
user_aboutme_clean_2 = [[i for i in doc if i not in stop2 and i not in stop] for doc in user_aboutme_clean_1]
#

In [101]:
# Creating the term dictionary of our courpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(user_aboutme_clean_2)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in user_aboutme_clean_2]

In [102]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel3 = Lda(doc_term_matrix, num_topics=20, alpha='auto', id2word = dictionary, random_state=1)

In [103]:
print(ldamodel3.print_topics(num_topics=20, num_words=5))

[(0, u'0.041*"student" + 0.026*"graduate" + 0.026*"list" + 0.016*"mathematical" + 0.015*"mathematics"'), (1, u'0.040*"university" + 0.018*"elementary" + 0.015*"mathematics" + 0.015*"postdoc" + 0.014*"faculty"'), (2, u'0.035*"math" + 0.031*"student" + 0.020*"hilbert" + 0.018*"love" + 0.016*"write"'), (3, u'0.033*"topology" + 0.026*"theory" + 0.022*"geometry" + 0.020*"mathematics" + 0.014*"university"'), (4, u'0.035*"student" + 0.027*"phd" + 0.022*"university" + 0.016*"user" + 0.016*"algebra"'), (5, u'0.050*"theory" + 0.023*"algebraic" + 0.022*"algebra" + 0.022*"university" + 0.015*"category"'), (6, u'0.025*"variety" + 0.019*"theory" + 0.018*"mathematical" + 0.012*"algebra" + 0.012*"physic"'), (7, u'0.019*"thanks" + 0.019*"equivalence" + 0.014*"string" + 0.013*"vision" + 0.011*"mathematics"'), (8, u'0.031*"site" + 0.018*"logic" + 0.014*"mathematical" + 0.014*"enthusiast" + 0.013*"participation"'), (9, u'0.065*"geometry" + 0.032*"algebraic" + 0.028*"theory" + 0.024*"differential" + 0.019*

In [104]:
m2=np.zeros(shape=(4513,20))

In [105]:
for i in range(4513):
    topic_v=ldamodel3.get_document_topics(doc_term_matrix[i], minimum_probability=None, minimum_phi_value=None, per_word_topics=False)
    for t in topic_v:
        m2[i,t[0]]=t[1]

In [106]:
m2_sp=sp.coo_matrix(m2,dtype=np.float32)

In [107]:
user_features_2=m2_sp.tocsr()

In [108]:
user_features_2

<4513x20 sparse matrix of type '<type 'numpy.float32'>'
	with 66395 stored elements in Compressed Sparse Row format>

#### Topic modeling for questions text

In [3]:
import os

import numpy as np
import pandas as pd
os.getcwd()

'C:\\Users\\songyifn\\Desktop'

In [4]:
questions_text=pd.read_csv("mathoverflow_Recommender_System\Answers_text.csv",header=None)

In [5]:
questions_text.iloc[23,0]

' Can the valuative criteria for separatedness/properness be checked "formally"? <p>Suppose f:X&rarr;Y is a morphism of finite type between locally noetherian schemes. The valuative criterion for separatedness (resp. properness) says roughly that f is a separated (resp. proper) morphism if and only if the following condition holds:</p>\r\n\r\n<blockquote>\r\n  <p>For any curve C in Y and for any lift\r\n  of C-{p} to X, there is at most one\r\n  (resp. exactly one) way to extend this\r\n  to a lift of C to X.</p>\r\n</blockquote>\r\n\r\n<p>More precisely,</p>\r\n\r\n<blockquote>\r\n  <p>If C is the spectrum of a DVR with\r\n  closed point p (a <em>very local</em> version\r\n  of a curve: the intersection of all\r\n  open neighborhoods of p on an "honest"\r\n  curve), C&rarr;Y is a morphism, and\r\n  C-{p}&rarr;X is a lift of that\r\n  morphism along f, there is at most one\r\n  (resp. exactly one) way to complete it to\r\n  a lift C&rarr;X.</p>\r\n</blockquote>\r\n\r\n<p>Does it suffic

In [8]:
questions_text=questions_text.fillna('a')

In [9]:
import re
def cleanhtml(raw_html):
    #clean HTML tags
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', raw_html)
    return cleantext

In [10]:
questions_text.iloc[:,0]=questions_text.iloc[:,0].apply(cleanhtml)

In [11]:
questions_text.iloc[23,0]

' Can the valuative criteria for separatedness/properness be checked "formally"?  Suppose f:X&rarr;Y is a morphism of finite type between locally noetherian schemes. The valuative criterion for separatedness (resp. properness) says roughly that f is a separated (resp. proper) morphism if and only if the following condition holds: \r\n\r\n \r\n   For any curve C in Y and for any lift\r\n  of C-{p} to X, there is at most one\r\n  (resp. exactly one) way to extend this\r\n  to a lift of C to X. \r\n \r\n\r\n More precisely, \r\n\r\n \r\n   If C is the spectrum of a DVR with\r\n  closed point p (a  very local  version\r\n  of a curve: the intersection of all\r\n  open neighborhoods of p on an "honest"\r\n  curve), C&rarr;Y is a morphism, and\r\n  C-{p}&rarr;X is a lift of that\r\n  morphism along f, there is at most one\r\n  (resp. exactly one) way to complete it to\r\n  a lift C&rarr;X. \r\n \r\n\r\n Does it suffice to check the valuative criteria on an even more local kind of object: the

In [12]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
stop2 = set(pd.read_csv("mathoverflow_Recommender_System\stopword_list.txt",header=None).iloc[:,0])
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop and i not in stop2])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    to_be_normalized=[]
    for word in punc_free.split():
        try:
            cleaned_word=lemma.lemmatize(word)
            if len(cleaned_word)>2:
                to_be_normalized.append(cleaned_word)
        except UnicodeDecodeError:
            pass
    normalized = " ".join(to_be_normalized)
    return normalized

In [14]:
questions_text_clean_1 = [clean(doc).split() for doc in questions_text.iloc[:,0]]
questions_text_clean_2 = [[i for i in doc if i not in stop2 and i not in stop] for doc in questions_text_clean_1]
#i not in stop3 and

In [17]:
from collections import Counter
term_frequency=Counter(x for sublist in questions_text_clean_2 for x in sublist)
list(pd.DataFrame.from_dict(term_frequency,orient='index').sort_values([0],ascending=False).head(300).index)

[u'question',
 u'set',
 u'space',
 'function',
 u'example',
 u'finite',
 u'theorem',
 u'amp',
 u'map',
 u'theory',
 u'field',
 u'following',
 u'algebra',
 u'category',
 u'proof',
 u'form',
 u'matrix',
 u'answer',
 u'time',
 u'result',
 u'reference',
 u'true',
 u'complex',
 u'graph',
 u'class',
 u'polynomial',
 u'element',
 u'subset',
 u'vector',
 u'manifold',
 u'curve',
 u'consider',
 u'condition',
 u'sequence',
 u'paper',
 u'smooth',
 u'property',
 u'ring',
 u'structure',
 u'algebraic',
 'prime',
 u'integer',
 u'representation',
 u'product',
 u'suppose',
 u'closed',
 u'equation',
 u'definition',
 u'mathbb',
 'defined',
 u'real',
 u'solution',
 u'using',
 u'positive',
 'variety',
 u'mean',
 u'prove',
 u'sum',
 u'compact',
 u'bundle',
 u'dimension',
 u'exists',
 u'define',
 u'dont',
 u'lie',
 u'value',
 u'thanks',
 u'simple',
 'subgroup',
 u'model',
 u'term',
 u'measure',
 u'operator',
 'linear',
 u'bound',
 u'assume',
 u'surface',
 u'formula',
 u'natural',
 u'degree',
 u'zero',
 u'righ

In [21]:
stop3=[u'question',
 u'set',
 u'space',
 'function',
 u'example',
 u'finite',
 u'theorem',
 u'amp',
 u'map',
 u'theory',
 u'following',
 u'form',
 u'answer',
 u'time',
 u'result',
 u'reference',
 u'true',
 u'class',
 u'consider',
 u'condition',
 u'paper',
 u'property',
 u'structure',
 u'suppose',
 u'definition',
 u'mathbb',
 'defined',
 u'using',
 'variety',
 u'exists',
 u'define',
 u'dont',
 u'lie',
 u'value',
 u'thanks',
 u'simple',
 u'term',
 u'assume',
 u'natural',
 u'rightarrow',
 u'leq',
 u'object',
 u'type',
 u'note',
 u'particular',
 u'book',
 u'exist',
 u'constant',
 u'scheme',
 u'system',
 u'extension',
 'statement',
 u'sense',
 u'mathcal',
 'edit',
 u'help',
 u'omega',
 u'looking',
 u'ideal',
 u'action',
 u'course',
 u'standard',
 u'denote',
 'alpha',
 u'omega',
 u'rational',
 u'similar',
 u'ive',
 u'idea',
 u'found',
 u'gamma',
 u'easy',
 u'limit',
 u'follows',
 u'look',
 u'functor',
 u'lambda',
 u'nice',
 u'word',
 u'pair',
 u'geq',
 u'corresponding',
 u'delta',
 u'section',
 u'trying',
 u'otimes',
 u'version',
 u'sigma',
 u'phi',
 u'actually',
 u'mathbbr',
 u'notion',
 u'called',
 u'thank',
 u'maybe',
 u'special',
 u'comment',
 u'infty',
 u'literature',
 u'cant',
 u'cdot',
 u'exact',
 u'choice',
 u'implies',
 u'associated',
 u'respect',
 u'hence',
 u'able',
 u'fiber',
 u'process',
 u'approach',
 u'page',
 u'usual',
 u'wondering',
 u'ldots',
 u'reason',
 u'write',
 u'obvious',
 u'contains',
 u'mathematics',
 u'exactly',
 u'doesnt',
 u'cover',
 u'advance',
 u'proper',
 u'call',
 u'classical',
 u'necessarily',
 u'read',
 u'application',
 u'please',
 u'etc',
 u'langle',
 u'via',
 u'simplicial',
 u'math',
 u'guess',
 u'correct',
 u'seen',
 u'mathbbc',
 u'information',
 u'hand',
 u'mathematical',
 u'article',
 u'motivation',
 u'appreciated',
 u'probably',
 u'forall',
 u'cdots',
 u'name',
 u'simply',
 u'taking',
 'mathbbrn',
 u'instead',
 u'mathbbz',
 u'text',
 u'basic',
 u'believe',
 u'situation',
 u'change',
 u'choose',
 u'step',
 u'obtain',
 u'reading',
 u'lot',
 u'claim',
 u'remark',
 u'people',
 u'hope',
 u'act',
 u'finding',
 u'wellknown',
 u'nbsp',
 u'theta',
 u'yes',
 u'torus',
 u'written',
 u'precisely',
 u'description',
 u'tell',
 u'fix',
 u'study',
 u'imply',
 u'beta',
 u'denotes',
 u'useful',
 u'language',
 u'start',
 u'context',
 u'obtained']

In [22]:
questions_text_clean_3 = [[i for i in doc if i not in stop3] for doc in questions_text_clean_2]

In [23]:
from collections import Counter
term_frequency=Counter(x for sublist in questions_text_clean_3 for x in sublist)
list(pd.DataFrame.from_dict(term_frequency,orient='index').sort_values([0],ascending=False).head(300).index)

[u'field',
 u'algebra',
 u'category',
 u'proof',
 u'matrix',
 u'complex',
 u'graph',
 u'polynomial',
 u'element',
 u'subset',
 u'vector',
 u'manifold',
 u'curve',
 u'sequence',
 u'smooth',
 u'ring',
 u'algebraic',
 'prime',
 u'integer',
 u'representation',
 u'product',
 u'closed',
 u'equation',
 u'real',
 u'solution',
 u'positive',
 u'mean',
 u'prove',
 u'sum',
 u'compact',
 u'bundle',
 u'dimension',
 'subgroup',
 u'model',
 u'measure',
 u'operator',
 'linear',
 u'bound',
 u'surface',
 u'formula',
 u'degree',
 u'zero',
 u'cohomology',
 u'line',
 u'metric',
 u'vertex',
 u'random',
 u'conjecture',
 u'probability',
 u'projective',
 u'left',
 u'integral',
 u'topology',
 u'sheaf',
 u'continuous',
 u'generated',
 u'connected',
 u'equivalent',
 u'variable',
 u'free',
 u'relation',
 u'distribution',
 u'hold',
 u'infinite',
 u'understand',
 u'related',
 u'bounded',
 u'abelian',
 u'topological',
 u'symmetric',
 u'homotopy',
 u'isomorphism',
 u'geometry',
 u'coefficient',
 u'local',
 u'fixed',
 '

In [24]:
import gensim
from gensim import corpora

# Creating the term dictionary of our courpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(questions_text_clean_3)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in questions_text_clean_3]



In [28]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=50, alpha='auto', id2word = dictionary, random_state=1)

In [54]:
print(ldamodel.print_topics(num_topics=50, num_words=10))

[(0, u'0.104*"mbox" + 0.061*"family" + 0.032*"modulus" + 0.025*"symplectic" + 0.017*"mathfrakg" + 0.015*"mathscrs0" + 0.015*"homogeneous" + 0.010*"ultrafilter" + 0.010*"relative" + 0.009*"gin"'), (1, u'0.042*"orbit" + 0.033*"mathfrak" + 0.023*"operatornamead" + 0.021*"oplus" + 0.020*"nilpotent" + 0.012*"expectation" + 0.012*"subalgebra" + 0.011*"cartan" + 0.011*"conditional" + 0.011*"ast"'), (2, u'0.049*"homotopy" + 0.036*"equivalence" + 0.031*"relation" + 0.030*"isomorphism" + 0.024*"diagram" + 0.023*"transformation" + 0.017*"loop" + 0.016*"equivalent" + 0.014*"preserve" + 0.011*"bijection"'), (3, u'0.034*"series" + 0.028*"method" + 0.020*"sum" + 0.019*"power" + 0.011*"modular" + 0.010*"mod" + 0.010*"compute" + 0.010*"formula" + 0.009*"numerical" + 0.009*"range"'), (4, u'0.094*"model" + 0.045*"countable" + 0.028*"mathscrs" + 0.016*"puzzle" + 0.016*"configuration" + 0.015*"transition" + 0.012*"segment" + 0.012*"mathbbr2" + 0.011*"pure" + 0.010*"particle"'), (5, u'0.026*"board" + 0.020*

In [31]:
m1=np.zeros(shape=(len(doc_term_matrix),50))
for i in range(len(doc_term_matrix)):
    topic_v=ldamodel.get_document_topics(doc_term_matrix[i], minimum_probability=None, minimum_phi_value=None, per_word_topics=False)
    for t in topic_v:
        m1[i,t[0]]=t[1]

In [None]:
pd.DataFrame(m1)

In [34]:
import scipy.sparse as sp
m1_csr=sp.coo_matrix(m1,dtype=np.float32).tocsr()

In [35]:
m1_csr

<78048x50 sparse matrix of type '<type 'numpy.float32'>'
	with 834539 stored elements in Compressed Sparse Row format>

In [46]:
from scipy.sparse import coo_matrix, hstack
item_features_enhanced=hstack([item_features.tocoo(),m1_csr]).tocsr()

In [47]:
item_features_enhanced

<78048x1430 sparse matrix of type '<type 'numpy.float32'>'
	with 1034394 stored elements in Compressed Sparse Row format>

In [51]:
pd.DataFrame([[1,2,3],[4,5,6]])

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6


In [52]:
pd.DataFrame(m1).to_csv("question_topics.csv",index=False,header=False)

In [53]:
os.getcwd()

'C:\\Users\\songyifn\\Desktop\\mathoverflow_Recommender_System'