In [113]:
import os
os.chdir('C:\\Users\songyifn\Desktop\mathoverflow_Recommender_System')

import numpy as np
import pandas as pd

import fetch_mathoverflow as fm

data = fm.fetch_mathoverflow('mathoverflow',
                           test_set_fraction=0.1,
                           indicator_features=False,
                           tag_features=True)

entire = data['entire_data']
train = data['train']
test = data['test']

In [3]:
print('The dataset has %s users and %s questions, '
      'with %s interactions in the test and %s interactions in the training set.'
      % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))

The dataset has 4513 users and 78048 questions, with 9129 interactions in the test and 96667 interactions in the training set.


In [114]:
print('The dataset has %s users and %s questions, '
      'with %s interactions in the test and %s interactions in the training set.'
      % (entire.shape[0], entire.shape[1], test.getnnz(), train.getnnz()))

The dataset has 11166 users and 78048 questions, with 9129 interactions in the test and 96667 interactions in the training set.


In [8]:
to_include=np.where(np.squeeze(train.getnnz(axis=0))>0)[0]

In [11]:
to_include

array([    1,     2,     3, ..., 65622, 65788, 78047], dtype=int64)

In [17]:
train.tocsc()[:,to_include]

<4513x49773 sparse matrix of type '<type 'numpy.float32'>'
	with 96667 stored elements in Compressed Sparse Column format>

In [18]:
train_new = train.tocsc()[:,to_include].tocoo()
test_new = test.tocsc()[:,to_include].tocoo()

In [24]:
len(np.where(test_new.getnnz(axis=0)==0)[0]) #only 538 questions in the new test set have been answered

49235

In [146]:
np.array(pd.SparseSeries.from_coo(train).to_dense().index)

array([(0L, 2L), (0L, 4L), (1L, 5L), ..., (2835L, 16165L), (2836L, 2714L),
       (2836L, 39810L)], dtype=object)

In [26]:
# Import the model
from lightfm import LightFM

# Set the number of threads; you can increase this
# ify you have more physical cores available.
NUM_THREADS = 2
NUM_COMPONENTS = 30
NUM_EPOCHS = 3
ITEM_ALPHA = 1e-6

# Let's fit a WARP model: these generally have the best performance.
model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS)

# Run 3 epochs and time it.
%time model = model.fit(train_new, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

Wall time: 5.1 s


In [28]:
# Import the evaluation routines
from lightfm.evaluation import auc_score

# Compute and print the AUC score
train_auc = auc_score(model, train_new, num_threads=NUM_THREADS).mean()
print('Collaborative filtering train AUC: %s' % train_auc)

Collaborative filtering train AUC: 0.813704


In [29]:
# We pass in the train interactions to exclude them from predictions.
# This is to simulate a recommender system where we do not
# re-recommend things the user has already interacted with in the train
# set.
test_auc = auc_score(model, test_new, train_interactions=train_new, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

Collaborative filtering test AUC: 0.676976


In [150]:
len(pd.SparseSeries.from_coo(train).index)

51472

In [30]:
# Set biases to zero
model.item_biases *= 0.0

test_auc = auc_score(model, test_new, train_interactions=train_new, num_threads=NUM_THREADS).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

Collaborative filtering test AUC: 0.650039


In [117]:
item_features = data['item_features']
tag_labels = data['item_feature_labels']
item_features_new = item_features[to_include,:]
print('There are %s distinct tags, with values like %s.' % (item_features_new.shape[1], tag_labels[:10].tolist()))

There are 1380 distinct tags, with values like [u'ag.algebraic-geometry', u'at.algebraic-topology', u'ca.analysis-and-odes', u'ct.category-theory', u'co.combinatorics', u'gr.group-theory', u'lie-groups', u'lie-algebras', u'rt.representation-theory', u'ac.commutative-algebra'].


In [34]:
item_features_new.shape

(49773, 1380)

In [154]:
tag_labels[item_features[10,:].toarray()[0].astype(bool)]#.astype(bool)

array([u'population', u'forecasting', u'census'], 
      dtype='<U50')

In [155]:
train_csc=train.tocsc()

In [156]:
sum(train_csc[4,:].toarray()[0])

109.0

In [35]:
# Define a new model instance
model = LightFM(loss='warp',
                #item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS,
                max_sampled=30,
                item_alpha=1e-6)

# Fit the hybrid model. Note that this time, we pass
# in the item features matrix.
model = model.fit(train_new,
                item_features=item_features_new,
                epochs=15,
                num_threads=NUM_THREADS)

In [36]:
# Don't forget the pass in the item features again!
train_auc = auc_score(model,
                      train_new,
                      item_features=item_features_new,
                      num_threads=NUM_THREADS).mean()
print('Hybrid training set AUC: %s' % train_auc)

Hybrid training set AUC: 0.964387


In [37]:
test_auc = auc_score(model,
                    test_new,
                    train_interactions=train_new,
                    item_features=item_features_new,
                    num_threads=NUM_THREADS).mean()
print('Hybrid test set AUC: %s' % test_auc)

Hybrid test set AUC: 0.878862


### Baseline Model (Popularity-based recommendation)

In [56]:
### Baseline Model: Popularity-based (non-personalized) Recommender:
order_by_popularity=np.argsort(np.squeeze(train_new.getnnz(axis=0)))[::-1]

In [81]:
from sklearn import metrics
train_new_csr=train_new.tocsr()
test_new_csr=test_new.tocsr()

all_auc=[]
for i in range(4513):
    indices_kept=np.where(train_new_csr[i,:].toarray()[0]==0)[0]
    fpr, tpr, thresholds = metrics.roc_curve(pd.Series(test_new_csr[i,:].toarray()[0][indices_kept].astype(int)).apply(lambda x: 1 if x!=0 else 0), order_by_popularity[indices_kept])
    auc=metrics.auc(fpr,tpr)
    if pd.isnull(auc)==False:
        all_auc.append(auc)

In [86]:
sum(all_auc)/len(all_auc)

0.4679451456015028

### Topics + Tags

In [109]:
import scipy
topics_csr=scipy.sparse.coo_matrix(pd.read_csv('question_topics.csv',header=None),dtype=np.float32).tocsr()
item_features_enhanced=scipy.sparse.hstack([item_features,topics_csr])
item_features_new_enhanced=item_features_enhanced.tocsr()[to_include,:]
topics_csr_new=topics_csr[to_include,:]

In [111]:
# Define a new model instance
model1 = LightFM(loss='warp',
                #item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS,
                max_sampled=30,
                item_alpha=1e-6)

# Fit the hybrid model. Note that this time, we pass
# in the item features matrix.
model1 = model1.fit(train_new,
                item_features=item_features_new_enhanced,
                #user_features=user_features,
                epochs=15,
                num_threads=NUM_THREADS)


# Don't forget the pass in the item features again!
train_auc = auc_score(model1,
                      train_new,
                      item_features=item_features_new_enhanced,
                      #user_features=user_features,
                      num_threads=NUM_THREADS).mean()
print('Hybrid training set AUC: %s' % train_auc)


test_auc = auc_score(model1,
                    test_new,
                    train_interactions=train_new,
                    item_features=item_features_new_enhanced,
                    #user_features=user_features,
                    num_threads=NUM_THREADS).mean()
print('Hybrid test set AUC: %s' % test_auc)

Hybrid training set AUC: 0.968705
Hybrid test set AUC: 0.894619


### Topics only

In [112]:
model2 = LightFM(loss='warp',
                #item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS,
                max_sampled=30,
                item_alpha=1e-6)

# Fit the hybrid model. Note that this time, we pass
# in the item features matrix.
model2 = model2.fit(train_new,
                item_features=topics_csr_new,
                #user_features=user_features,
                epochs=15,
                num_threads=NUM_THREADS)


# Don't forget the pass in the item features again!
train_auc = auc_score(model2,
                      train_new,
                      item_features=topics_csr_new,
                      #user_features=user_features,
                      num_threads=NUM_THREADS).mean()
print('Hybrid training set AUC: %s' % train_auc)

test_auc = auc_score(model2,
                    test_new,
                    train_interactions=train_new,
                    item_features=topics_csr_new,
                    #user_features=user_features,
                    num_threads=NUM_THREADS).mean()
print('Hybrid test set AUC: %s' % test_auc)

Hybrid training set AUC: 0.918423
Hybrid test set AUC: 0.8182


In [119]:
# Define a new model instance
model3 = LightFM(loss='warp',
                #item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS,
                max_sampled=30,
                item_alpha=1e-6)

# Fit the hybrid model. Note that this time, we pass
# in the item features matrix.
model3 = model3.fit(train_new,
                item_features=item_features_new,
                #user_features=user_features,
                epochs=15,
                num_threads=NUM_THREADS)


# Don't forget the pass in the item features again!
train_auc = auc_score(model3,
                      train_new,
                      item_features=item_features_new,
                      #user_features=user_features,
                      num_threads=NUM_THREADS).mean()
print('Hybrid training set AUC: %s' % train_auc)


test_auc = auc_score(model3,
                    test_new,
                    train_interactions=train_new,
                    item_features=item_features_new,
                    #user_features=user_features,
                    num_threads=NUM_THREADS).mean()
print('Hybrid test set AUC: %s' % test_auc)

Hybrid training set AUC: 0.963928
Hybrid test set AUC: 0.871383


In [78]:
sum(test_new_csr[i,:].toarray()[0][indices_kept].astype(int))

4

In [79]:
np.where(test_new_csr[i,:].toarray()[0][indices_kept].astype(int)!=0)[0]

array([ 1552,  9035, 46476], dtype=int64)

In [71]:
import scipy.io
mat = scipy.io.loadmat('kaggle77b_trainset.mat')

In [74]:
mat['trainset'].shape

(21983L, 100L)

In [77]:
import os
os.chdir('C:\\Users\\songyifn\\Desktop')

In [78]:
users=pd.read_csv("kaggle_users.txt",header=None)

Unnamed: 0,0
0,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d
1,d7083f5e1d50c264277d624340edaaf3dc16095b
2,d68dc6fc25248234590d7668a11e3335534ae4b4
3,9be82340a8b5ef32357fe5af957ccd54736ece95
4,841b2394ae3a9febbd6b06497b4a8ee8eb24b7f8
5,91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62
6,458833ce4418010e61304b34b2c992e1cce63435
7,c34670d9c1718361feb93068a853cead3c95b76a
8,0f40e074aab2c5f47b7ddc2277fb0295b5b3a058
9,ef0d21935a2f8ae90571dbfab800f87fa5b38769


In [135]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve([0,1,1,0], [8,6,9,3])

In [136]:
metrics.auc(fpr,tpr)

0.75

In [138]:
tpr

array([ 0.5,  0.5,  1. ,  1. ])

In [139]:
fpr

array([ 0. ,  0.5,  0.5,  1. ])

In [61]:
train_new.shape

(4513, 49773)

In [142]:
data['item_features']

<72360x1246 sparse matrix of type '<type 'numpy.float32'>'
	with 198963 stored elements in Compressed Sparse Row format>

In [185]:
user_id=pd.read_csv("map_question.csv",header=None)

In [187]:
len(np.unique(user_id.iloc[:,0]))

78047

In [184]:
x = np.array([[[0], [1], [2]]])
np.squeeze(x)

array([0, 1, 2])

In [193]:
user_answers=pd.read_csv("mathoverflow_Recommender_System\User_answers.csv",header=None)

In [194]:
user_answers.iloc[6,0]

'<p>Here\'s a reduction to the finite dimensional case.  Let $F$ be a finite set of subspaces of $X$.  For each finite dimensional subspace $Y$ of $X$, let $u(Y)$ be the set of elements $Z$ of $F$ such that $Y$ is contained in $Z$.  By assumption, $u(Y)$ is non-empty for every $Y$.  Since any two finite dimensional subspaces are contained in a third, the intersection of the sets $u(Y)$, as $Y$ runs among all finite dimensional subspaces of $X$, is non-empty.  Hence there is at least one set in $F$ that contains every finite dimensional subspace of $X$, hence contains $X$.</p>\r\n\r\n<p>For the finite dimensional case, let $F$ be a finite set of subspaces of $X$.  By induction, every codimension 1 subspace of $X$ is contained in some $Y$ from $F$.  But there are infinitely many codimension $1$ subspaces, so some $Y$ in $F$ contains more than one such subspace.  Any two distinct codimension 1 subspaces $\\operatorname{span} X$ (if $\\dim X &gt; 1$) so $Y = X$.</p>\r\n <p>A variation on I

In [195]:
import re
def cleanhtml(raw_html):
    #clean HTML tags
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', raw_html)
    return cleantext

In [196]:
user_answers.iloc[:,0]=user_answers.iloc[:,0].apply(cleanhtml)

In [197]:
user_answers.iloc[6,0]

' Here\'s a reduction to the finite dimensional case.  Let $F$ be a finite set of subspaces of $X$.  For each finite dimensional subspace $Y$ of $X$, let $u(Y)$ be the set of elements $Z$ of $F$ such that $Y$ is contained in $Z$.  By assumption, $u(Y)$ is non-empty for every $Y$.  Since any two finite dimensional subspaces are contained in a third, the intersection of the sets $u(Y)$, as $Y$ runs among all finite dimensional subspaces of $X$, is non-empty.  Hence there is at least one set in $F$ that contains every finite dimensional subspace of $X$, hence contains $X$. \r\n\r\n For the finite dimensional case, let $F$ be a finite set of subspaces of $X$.  By induction, every codimension 1 subspace of $X$ is contained in some $Y$ from $F$.  But there are infinitely many codimension $1$ subspaces, so some $Y$ in $F$ contains more than one such subspace.  Any two distinct codimension 1 subspaces $\\operatorname{span} X$ (if $\\dim X &gt; 1$) so $Y = X$. \r\n  A variation on Ishai\'s exam

In [198]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

LookupError: 
**********************************************************************
  Resource u'corpora/stopwords' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
  Searched in:
    - 'C:\\Users\\songyifn/nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'C:\\Users\\songyifn\\Anaconda2\\nltk_data'
    - 'C:\\Users\\songyifn\\Anaconda2\\lib\\nltk_data'
    - 'C:\\Users\\songyifn\\AppData\\Roaming\\nltk_data'
**********************************************************************

In [200]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [38]:
pd.DataFrame([[1,2,3],[4,5,6]])

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6


In [None]:
doc_clean = [clean(doc).split() for doc in doc_complete]