## Experiment-05
### Amirreza Fosoul and Bithiah Yuan

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import *
import string
import csv
import re
import time
spark = SparkSession.builder.appName('ex5').getOrCreate()

from pyspark.sql.functions import *
from pyspark.sql.functions import split, udf, desc, concat, col, lit
import pyspark.sql.functions as f
from pyspark.sql.types import ArrayType, FloatType, StringType, IntegerType, DoubleType, StructType, StructField
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.window import Window
from pyspark.ml.linalg import SparseVector, VectorUDT, DenseVector
import scipy.sparse
from pyspark.ml.linalg import Vectors, _convert_to_vector, VectorUDT
import numpy as np
from pyspark.sql import SQLContext
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import math
import re
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
#from pyspark.mllib.feature import Word2Vec
from pyspark.ml.feature import Word2Vec

sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [5]:
# Read user ratings into Dataframe
#user_df = spark.read.option("delimiter", ";").csv('./users_libraries.txt')
user_df = spark.read.option("delimiter", ";").csv('./example0.txt')
user_df = user_df.select(col("_c0").alias("userID"), col("_c1").alias("paperID"))

sampled = user_df

user_df_pre = user_df
user_df_pre = user_df_pre.withColumn("paperID", split(col("paperID"), ",").cast(ArrayType(IntegerType())).alias("paperID"))
user_df = user_df.select("userID", f.split("paperID", ",").alias("papers"), f.explode(f.split("paperID", ",")).alias("paperID"))
user_df = user_df.drop("papers")

# Get a dataframe of the distinct papers
d_paper = user_df.select("paperID").distinct()

# Read in the stopwords as a list
with open('./stopwords_en.txt') as file:
    stopwordList = file.read().splitlines()

# Read in records of paper information
#w_df = spark.read.csv('./papers.csv')
w_df = spark.read.csv('./paper0.csv')
w_df = w_df.select("_c0", "_c13", "_c14")
w_df = w_df.select(col("_c0").alias("paperID"), col("_c13").alias("title"), col("_c14").alias("abstract"))
w_df = w_df.na.fill({'title': '', 'abstract': ''}) # to replace null values with empty string
# Get text from title and abstract
w_df = w_df.select(col("paperID"), concat(col("title"), lit(" "), col("abstract")).alias("words"))
#w_df.show()

# Transform the distinct paperIDs dataframe to a list
paper_list = list(d_paper.select('paperID').toPandas()['paperID'])
# Map each distinct paper into int
paper_list = list(map(int, paper_list))

### Exercise 5. 1 (Pre-processing Text for word2vec)

In [6]:
################################### Conservative pre-processing ###################################################

# Extracting words from the papers and keeping "-" and "_"
tokenizer = RegexTokenizer(inputCol="words", outputCol="tokens", pattern="[a-zA-Z-_]+", gaps=False) 
# Built-in tokenizer
tokenized = tokenizer.transform(w_df)
tokenized = tokenized.select("paperID", "tokens")

# udf to remove "-" and "_" from the tokens
remove_hyphen_udf = udf(lambda x: [re.sub('[-|_]', '', word) for word in x], ArrayType(StringType()))
# Apply udf to the tokens
df = tokenized.withColumn('tokens', remove_hyphen_udf(col('tokens')))
# udf to remove words less than 3 letters
remove_short_words = udf(lambda x: [item for item in x if len(item) >= 3], ArrayType(StringType()))
# Apply udf to the tokens
df = df.withColumn('tokens', remove_short_words(col('tokens')))

conservative_df = df
conservative_df = conservative_df.withColumn("paperID", conservative_df["paperID"].cast(IntegerType()))

word2Vec = Word2Vec(vectorSize=100, inputCol="tokens", outputCol="result")
model = word2Vec.fit(conservative_df)


print("top10 most similar words to “science” using conservative pre-processing")
print()
synonyms = model.findSynonyms('science', 10)
synonyms.show()

#################################### Intensive pre-processing #####################################################
# Built-in function to remove stopwords from our custom list
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered" , stopWords=stopwordList)
df = remover.transform(df)
df = df.select("paperID", "filtered")

# Apply stemming with NLTK
# Built-in class from NLTK
ps = PorterStemmer()
# udf to apply stemming
stemming = udf(lambda x: [ps.stem(item) for item in x], ArrayType(StringType()))
# apply udf to tokens
df = df.withColumn('tokens', stemming(col('filtered')))
df = df.select("paperID", "tokens")

intensive_df = df

word2Vec = Word2Vec(vectorSize=100, inputCol="tokens", outputCol="result")
model2 = word2Vec.fit(intensive_df)

print("top10 most similar words to “science” using intensive pre-processing")
print()
synonyms2 = model2.findSynonyms(ps.stem('science'), 10)
synonyms2.show()

top10 most similar words to “science” using conservative pre-processing

+------------+------------------+
|        word|        similarity|
+------------+------------------+
|       among| 0.929947018623352|
|        they|0.9288403391838074|
|     analyze|     0.91845703125|
|  principles|0.9139120578765869|
|    elements|0.9135629534721375|
|  integrated|0.9131134748458862|
|organization|0.9103100299835205|
| recommender|0.9093801379203796|
|   represent| 0.907145082950592|
|  processing|0.9065096974372864|
+------------+------------------+

top10 most similar words to “science” using intensive pre-processing

+--------+------------------+
|    word|        similarity|
+--------+------------------+
| qualiti|0.9525512456893921|
|   evolv|0.9520372152328491|
| practic|0.9485098123550415|
| overlap|0.9436403512954712|
|research|0.9434059262275696|
|  reveal|0.9401988983154297|
|    idea|0.9399848580360413|
|   motiv|0.9399530291557312|
|  absenc|0.9376935362815857|
| topolog|0.93735170

### The results for intensive pre-processing are better (the similarity scores are higher) because unimportant words are removed and words with the same stems are shown in the same token.

### Exercise 5. 2 (Analogies)

In [7]:
def getAverageVector(query, model):
    df = sqlContext.createDataFrame([[query]], ['tokens'])
    return model.transform(df)

def analogy(word1, word2, word3, model, stemming=False):        
    vec = model.getVectors()
    words = [word1, word2, word3]
    keywords = []
    vectors = []
    for i in words:
        keywords.append([x.lower().strip() for x in re.split("[^A-Za-z]+", i)])
        
    if stemming:
        for i, query in enumerate(keywords):
            for j, word in enumerate(query):
                keywords[i][j] = ps.stem(word)
    
    for query in keywords:
        if len(query) > 1:
            vectors.append(getAverageVector(query, model).head()[1])
        else:
            vectors.append(vec.where(vec.word==query[0]).head()[1])
            
    w = vectors[0] - vectors[1] + vectors[2]
    result = model.findSynonyms((-1)*w,5)
    return result

print("analogy for the conservative pre-processing model:")
print()
analogy("machine learning", "prediction", "recommender system", model, stemming=False).show()

print("analogy for the intensive pre-processing model:")
print()
analogy("machine learning", "prediction", "recommender systems", model2, stemming=True).show()


analogy for the conservative pre-processing model:

+--------------+------------------+
|          word|        similarity|
+--------------+------------------+
|           dna|0.9437289834022522|
|           rna|0.9341936111450195|
|highthroughput|0.9332094788551331|
|      projects|0.9273788928985596|
|       protein|0.9262586236000061|
+--------------+------------------+

analogy for the intensive pre-processing model:

+--------+------------------+
|    word|        similarity|
+--------+------------------+
|contrast|0.9072569608688354|
|  materi|0.8987579941749573|
|    mass|0.8907632827758789|
|    wiki|0.8852062821388245|
| sequenc|0.8777000308036804|
+--------+------------------+



### Exercise 5. 3 (From Embeddings to Paper Recommendation)

In [8]:
paper_w2v_cp = model.transform(conservative_df)
paper_w2v_cp = paper_w2v_cp.select("paperID", col("result").alias("paper_profile"))

paper_w2v_cp.show()

+--------+--------------------+
| paperID|       paper_profile|
+--------+--------------------+
|12832332|[-0.0047933138315...|
| 1305474|[-0.0058180570508...|
| 1001231|[-0.0077452716323...|
|  352713|[-0.0148051936151...|
|  956315|[-0.0140641083614...|
|  945604|[-0.0126434465753...|
|10294999|[-0.0168046080972...|
|  967275|[-0.0096054069311...|
|  115945|[-0.0151569036622...|
|11733005|[-0.0146136129983...|
| 9045137|[-0.0162341385197...|
| 3728173|[-0.0073623910933...|
| 8310458|[-0.0058715931459...|
|   80546|[-0.0089703239850...|
| 5842862|[-0.0147599245198...|
| 1242600|[-0.0181218879103...|
| 3467077|[-0.0114230531488...|
|  309395|[-0.0178380076933...|
|  305755|[0.00962374638766...|
| 6603134|[-0.0135982233764...|
+--------+--------------------+
only showing top 20 rows



In [9]:
user_doc = user_df.join(conservative_df, ['paperID']).orderBy("userID")
user_doc = user_doc.drop("paperID")

# Concatenate the words of the user library
user_doc = user_doc.rdd.map(lambda user_doc: (user_doc.userID, user_doc.tokens)).reduceByKey(lambda x,y: x + y).toDF(['userID','tokens'])

model_user = word2Vec.fit(user_doc)

user_w2v_cp = model_user.transform(user_doc)

user_w2v_cp = user_w2v_cp.select("userID", col("result").alias("user_profile"))

user_w2v_cp.show()

+--------------------+--------------------+
|              userID|        user_profile|
+--------------------+--------------------+
|90f1a3e6fcdbf9bc5...|[-0.0025672873174...|
|bbcd9dae3160ddcb9...|[-0.0028135964860...|
|9cb1df6a39b70d88f...|[-0.0023582438061...|
|22ede8ed38ebcbaf8...|[-0.0042628118548...|
|1eac022a97d683eac...|[-0.0032774288954...|
|589b870a611c25fa9...|[-0.0034229244691...|
|a0bbf6bb9b1c818f3...|[-0.0031705740899...|
|8d898a2171f552b3d...|[-0.0010895416652...|
|d1d41a15201915503...|[-0.0022795030562...|
|f1e1cd4ff25018273...|[-0.0045282421986...|
|3b715ebaf1f8f81a1...|[-0.0036120140960...|
|cbd4a69e4b3ed3472...|[-0.0038574325744...|
|f05bcffe7951de9e5...|[-0.0032565576942...|
|cf9c7f356092c34be...|[-0.0028873409683...|
|ee1dfee93ebeadade...|[-0.0034400954917...|
|b36c3189bb1457cd0...|[-0.0040417211316...|
|3c9a5b5e6448a0119...|[-0.0010189504721...|
|a7fe408c548c6112d...|[-0.0047462778076...|
|b656009a6efdc8b1a...|[-0.0036004181851...|
|4c8912d1b04471cf5...|[-0.003591

In [11]:
# Function to call in udf
def unrated(papers):
    # Transform the list of distinct papers and the list of rated papers of each user to a set
    # Substract the two sets to get the list of unrated papers for each user
    # Transform back to list
    unrated = list(set(paper_list) - set(papers))
    
    return unrated


# udf to get a list of unrated papers with the length of rated papers for each user
get_unrated = udf(lambda x: unrated(x), ArrayType(IntegerType()))

# Add a new column of unrated papers for each user
unrated_df = user_df_pre.withColumn("unrated", get_unrated(user_df_pre.paperID))

unrated_df = unrated_df.drop("paperID")

unrated_df = unrated_df.withColumn("paperID", explode(unrated_df.unrated))

unrated_df = unrated_df.drop("unrated")


In [15]:
new_df = unrated_df.join(user_w2v_cp, ["userID"]).join(paper_w2v_cp, ["paperID"])
new_df.show()

+-------+--------------------+--------------------+--------------------+
|paperID|              userID|        user_profile|       paper_profile|
+-------+--------------------+--------------------+--------------------+
| 212874|90f1a3e6fcdbf9bc5...|[-0.0025672873174...|[-0.0131514674861...|
|1326856|90f1a3e6fcdbf9bc5...|[-0.0025672873174...|[-0.0102063186265...|
|  81501|90f1a3e6fcdbf9bc5...|[-0.0025672873174...|[-0.0232182141199...|
|  65083|90f1a3e6fcdbf9bc5...|[-0.0025672873174...|[-0.0306622119581...|
| 523772|90f1a3e6fcdbf9bc5...|[-0.0025672873174...|[-0.0243687142307...|
| 105906|90f1a3e6fcdbf9bc5...|[-0.0025672873174...|[-0.0094669061247...|
|1121661|90f1a3e6fcdbf9bc5...|[-0.0025672873174...|[-0.0072813053467...|
|7355647|90f1a3e6fcdbf9bc5...|[-0.0025672873174...|[-0.0194716824508...|
|  72879|90f1a3e6fcdbf9bc5...|[-0.0025672873174...|[-0.0139008522373...|
| 244827|90f1a3e6fcdbf9bc5...|[-0.0025672873174...|[-0.0163729196105...|
| 244827|90f1a3e6fcdbf9bc5...|[-0.0025672873174...|

In [13]:
import numpy as np 

def cos_sim(u, p):
    result = (np.dot(u, p))/(np.linalg.norm(u) * np.linalg.norm(p))
    result = result.item()
    return result

compute_sim = udf(cos_sim, FloatType())

In [16]:
def w2vRS(userID, df, k):
    df = df.where(df.userID==userID)
    # Apply similarity metric to the user_profile and paper_profile
    sim_df = df.withColumn('Similarity', compute_sim(df.user_profile, df.paper_profile))
    # Partition by userID and order by the similarity in descending order
    window = Window.partitionBy(col("userID")).orderBy((col("Similarity")).desc())
    # Add row numbers to the rows and get the top-k rows
    sim_df = sim_df.select(col('*'), row_number().over(window).alias('row_number')).where(col('row_number') <= k)
    # Renaming
    get_r = sim_df.select("userID", "paperID", col("row_number").alias("rank"))
    w2vRS_df = get_r.select("userID", "paperID")
    # un-explode, concatenate the recommended papers for each user
    w2vRS_df = w2vRS_df.groupby("userID").agg(f.concat_ws(", ", f.collect_list(w2vRS_df.paperID)).alias("top_papers"))
    
    return w2vRS_df

k = 10

user = "1eac022a97d683eace8815545ce3153f"

user_rec = w2vRS(user, new_df, k)

user_rec.show(truncate=False)

+--------------------------------+---------------------------------------------------------------------------+
|userID                          |top_papers                                                                 |
+--------------------------------+---------------------------------------------------------------------------+
|1eac022a97d683eace8815545ce3153f|361498, 154, 5307378, 6434100, 4511, 197260, 484851, 820297, 5662136, 90413|
+--------------------------------+---------------------------------------------------------------------------+



## Exercise 4. 3 (Content-based recommendations)

In [11]:
def cbrs(u, k):
    # Apply similarity metric to the user_profile and paper_profile
    sim_df = u.withColumn('Similarity', compute_sim(u.user_profile, u.paper_profile))
    # Partition by userID and order by the similarity in descending order
    window = Window.partitionBy(col("userID")).orderBy((col("Similarity")).desc())
    # Add row numbers to the rows and get the top-k rows
    sim_df = sim_df.select(col('*'), row_number().over(window).alias('row_number')).where(col('row_number') <= k)
    # Renaming
    get_r = sim_df.select("userID", "paperID", col("row_number").alias("rank"))
    cbrs_df = get_r.select("userID", "paperID")
    # un-explode, concatenate the recommended papers for each user
    cbrs_df = cbrs_df.groupby("userID").agg(f.concat_ws(", ", f.collect_list(cbrs_df.paperID)).alias("top_papers"))
    
    return cbrs_df

k = 5

user_rec = cbrs(df_selected, k).show(truncate=False)

#print("The top " + str(k) + " papers for " + str(user_rec.head()[0]) + " are " + str(user_rec.head()[1]))

+--------------------------------+-------------------------------------------+
|userID                          |top_papers                                 |
+--------------------------------+-------------------------------------------+
|1eac022a97d683eace8815545ce3153f|11733005, 8336239, 7010764, 2887105, 115945|
+--------------------------------+-------------------------------------------+



In [12]:
### LDA ###
from pyspark.ml.clustering import LDA

# just rename the column to use it with the built-in methods
termFrequencyVector = vector_df.select('paperId', col('term_frequency_sparse').alias('features'))
# Trains a LDA model
# set k=40 to have 40 different topics
lda = LDA(k=40)
model = lda.fit(termFrequencyVector)

# each topic is described by 5 terms
topics = model.describeTopics(5)

# print("The topics described by their top-weighted terms:")
#topics.show(truncate=False)

# Shows the result
# it shows the probabilty of each topic for each paper
transformed = model.transform(termFrequencyVector)
#transformed.show()

In [13]:
# creating a user profile based on the LDA results

lda_user_profile = transformed.join(user_df, ["paperID"]).orderBy("userID").select('userId', col('topicDistribution').alias('features'))

lda_user_profile = lda_user_profile.rdd.mapValues(lambda v: v.toArray()).reduceByKey(lambda x, y: x + y).mapValues(lambda x: DenseVector(x))\
.toDF(["userId", "features"])
lda_user_profile = lda_user_profile.select("userId", col("features").alias("lda_user_profile"))

lda_user_profile = lda_user_profile.withColumn("user_profile", to_sparse_udf(col("lda_user_profile")))

#lda_user_profile.show()

df_lda = unrated_df.join(transformed, ["paperID"]).join(lda_user_profile, ["userID"])

df_lda = df_lda.select("userID", "paperID", "user_profile", col("features").alias("paper_profile"))

#df_lda.show()

df_selected_lda = df_lda.where(df_lda.userID=="1eac022a97d683eace8815545ce3153f")

#df_selected_lda.show()

In [14]:
cbrs(df_selected_lda, 5).show(truncate=False)

+--------------------------------+--------------------------------------------+
|userID                          |top_papers                                  |
+--------------------------------+--------------------------------------------+
|1eac022a97d683eace8815545ce3153f|7164691, 9563857, 1363828, 11733005, 8336239|
+--------------------------------+--------------------------------------------+



## Exercise 4. 4 Sampling and data preparation

In [15]:
from pyspark.sql.functions import rand 

num_user = 2

# Order the users randomly and get n sampled users
sampled_users = user_df_pre.orderBy(rand()).limit(num_user)

# Get length of each list in a column
get_len_udf = udf(lambda x: len(x), IntegerType())

# Get library size for each user
sampled_users = sampled_users.withColumn("libSize", get_len_udf("paperID"))

# Get the size of the training set
get_train = udf(lambda x: int(x*0.8), IntegerType())

# Get size of the training set for each user
sampled_users = sampled_users.withColumn("trainSize", get_train("libSize"))

sampled_users.show()

+--------------------+--------------------+-------+---------+
|              userID|             paperID|libSize|trainSize|
+--------------------+--------------------+-------+---------+
|1eac022a97d683eac...|[3973229, 322433,...|    321|      256|
|589b870a611c25fa9...|[1283233, 1305474...|      8|        6|
+--------------------+--------------------+-------+---------+



In [16]:
# explode the paperIDs for each user
sampled_exploded = sampled_users.withColumn("paperID", explode(col("paperID")))

# Partion by userID and order them randomly
window = Window.partitionBy(col("userID")).orderBy(rand())

# Get row numbers
sampled_exploded = sampled_exploded.select(col('*'), row_number().over(window).alias('row_number'))

# Get the rows less than or equal to the training set size
# The rows will be different each time because of .orderBy(rand()) in the window function
training_df = sampled_exploded.where(col('row_number') <= col("trainSize"))
training_df = training_df.select("userID", "paperID").orderBy("userID")
#training_df.show()

# Get the test set by selecting the rows greater than the training size
test_df = sampled_exploded.where(col('row_number') > col("trainSize"))
test_df = test_df.select("userID", "paperID").orderBy("userID")
#test_df.show()

In [17]:
# create the user profile using the tf_idf dataframe and the users' library dataframe
training_user_profile = tf_idf_built_in.join(training_df, ["paperID"]).orderBy("userID").select('userId', 'features')

# convert the dataframe to RDD to sum up the tf_idf vector of each user and then convert back to dataframe
training_user_profile = training_user_profile.rdd.mapValues(lambda v: v.toArray()).reduceByKey(lambda x, y: x + y).mapValues(lambda x: DenseVector(x))\
.toDF(["userId", "features"])

# Same steps as above
training_user_profile = training_user_profile.select("userId", col("features").alias("user_profile"))

training_user_profile = training_user_profile.withColumn("user_profile", to_sparse_udf(col("user_profile")))

train_df = unrated_df.join(tf_idf_built_in, ["paperID"]).join(training_user_profile, ["userID"])

train_df = train_df.select("userID", "paperID", "user_profile", col("features").alias("paper_profile"))

train_df = unrated_df.join(tf_idf_built_in, ["paperID"]).join(training_user_profile, ["userID"])

train_df = train_df.select("userID", "paperID", "user_profile", col("features").alias("paper_profile")).orderBy("userID")

#train_df.show()

In [18]:
# creating a user profile based on the LDA results
training_lda_user_profile = transformed.join(training_df, ["paperID"]).orderBy("userID").select('userId', col('topicDistribution').alias('features'))

training_lda_user_profile = training_lda_user_profile.rdd.mapValues(lambda v: v.toArray()).reduceByKey(lambda x, y: x + y).mapValues(lambda x: DenseVector(x))\
.toDF(["userId", "features"])
training_lda_user_profile = training_lda_user_profile.select("userId", col("features").alias("lda_user_profile"))

training_lda_user_profile = training_lda_user_profile.withColumn("user_profile", to_sparse_udf(col("lda_user_profile")))

train_df_lda = unrated_df.join(transformed, ["paperID"]).join(training_lda_user_profile, ["userID"])

train_df_lda = train_df_lda.select("userID", "paperID", "user_profile", col("features").alias("paper_profile")).orderBy("userID")

#train_df_lda.show()

## Exercise 4. 5 (Off-line evaluation)

In [19]:
# a) Generate 10 recommendations

def castToArray(df, colName):
    dff = df.withColumn(colName, split(col(colName), ", ").cast(ArrayType(IntegerType())))
    return dff

k = 10

# TF-IDF
print("TF-IDF recommender")
tf_rec = cbrs(train_df, k).orderBy("userID")
# Cast the recommendations to a list of integers
tf_rec  = castToArray(tf_rec, "top_papers")
tf_rec.show(truncate=False)

print("LDA recommender")
# LDA
lda_rec  = cbrs(train_df_lda, k).orderBy("userID")
# Cast the recommendations to a list of integers
lda_rec  = castToArray(lda_rec, "top_papers")
lda_rec.show(truncate=False)

TF-IDF recommender
+--------------------------------+-----------------------------------------------------------------------------------------+
|userID                          |top_papers                                                                               |
+--------------------------------+-----------------------------------------------------------------------------------------+
|1eac022a97d683eace8815545ce3153f|[11733005, 7010764, 8336239, 115945, 7496675, 2887105, 8310458, 945604, 1305474, 9045137]|
|589b870a611c25fa99bd3d7295ac0622|[2887105, 7496675, 8336239, 9563857, 7010764, 1042553, 7164691, 9045137, 255030, 3010240]|
+--------------------------------+-----------------------------------------------------------------------------------------+

LDA recommender
+--------------------------------+------------------------------------------------------------------------------------------+
|userID                          |top_papers                                            

In [20]:
# Concatenate the test set into a list of integers
test_df_collected = test_df.groupby("userID").agg(f.concat_ws(", ", f.collect_list(test_df.paperID)).alias("paperID"))
test_df_collected = test_df_collected.withColumn("paperID", split(col("paperID"), ",\s*").cast(ArrayType(IntegerType())).alias("paperID")).orderBy("userID")

test_df_collected.show()

+--------------------+--------------------+
|              userID|             paperID|
+--------------------+--------------------+
|1eac022a97d683eac...|[3469193, 600359,...|
|589b870a611c25fa9...|   [1283233, 956315]|
+--------------------+--------------------+



In [21]:
joined_tf_test = test_df_collected.join(tf_rec, "userID")
joined_tf_test = joined_tf_test.select("userID", col("paperID").alias("test_set"), col("top_papers").alias("train_set"))

joined_lda_test = test_df_collected.join(lda_rec, "userID")
joined_lda_test = joined_lda_test.select("userID", col("paperID").alias("test_set"), col("top_papers").alias("train_set"))

In [22]:
def getHits(train, test):
    return list(set(train).intersection(test))

getHits_udf = udf(getHits, ArrayType(IntegerType()))
# TF-IDF
print("TF-IDF recommender")
tf_hits = joined_tf_test.withColumn('Hits', getHits_udf(joined_tf_test.train_set, joined_tf_test.test_set))
tf_hits = tf_hits.select("userID", "Hits")
tf_hits.show(truncate=False)

print("LDA recommender")
# LDA
lda_hits = joined_lda_test.withColumn('Hits', getHits_udf(joined_lda_test.train_set, joined_lda_test.test_set))
lda_hits = lda_hits.select("userID", "Hits")
lda_hits.show(truncate=False)

TF-IDF recommender
+--------------------------------+----+
|userID                          |Hits|
+--------------------------------+----+
|1eac022a97d683eace8815545ce3153f|[]  |
|589b870a611c25fa99bd3d7295ac0622|[]  |
+--------------------------------+----+

LDA recommender
+--------------------------------+----+
|userID                          |Hits|
+--------------------------------+----+
|1eac022a97d683eace8815545ce3153f|[]  |
|589b870a611c25fa99bd3d7295ac0622|[]  |
+--------------------------------+----+



### Since we could not get results for the hits, we modified the results to show our computations for the evaluation

In [23]:
########## example #####################

columns = ['userID', 'Hits']
vals = [("user0", "1, 2, 3"), ("user1", "4, 5, 6")]

ex = sqlContext.createDataFrame(vals, columns)
ex = castToArray(ex, "Hits")

ex_hits = ex.union(tf_hits)
ex_hits.show()

columns2 = ['userID', 'paperID']
vals2 = [("user0", "2, 3, 4, 1, 5"), ("user1", "5, 4, 6, 7, 8, 9, 10")]
ex2 = sqlContext.createDataFrame(vals2, columns2)
ex2 = castToArray(ex2, "paperID")

ex_test = ex2.union(test_df_collected)
ex_test.show()

+--------------------+---------+
|              userID|     Hits|
+--------------------+---------+
|               user0|[1, 2, 3]|
|               user1|[4, 5, 6]|
|1eac022a97d683eac...|       []|
|589b870a611c25fa9...|       []|
+--------------------+---------+

+--------------------+--------------------+
|              userID|             paperID|
+--------------------+--------------------+
|               user0|     [2, 3, 4, 1, 5]|
|               user1|[5, 4, 6, 7, 8, 9...|
|1eac022a97d683eac...|[3469193, 600359,...|
|589b870a611c25fa9...|   [1283233, 956315]|
+--------------------+--------------------+



In [24]:
joined_test_hits = ex_test.join(ex_hits, "userID")
joined_test_hits.show()

+--------------------+--------------------+---------+
|              userID|             paperID|     Hits|
+--------------------+--------------------+---------+
|               user1|[5, 4, 6, 7, 8, 9...|[4, 5, 6]|
|1eac022a97d683eac...|[3469193, 600359,...|       []|
|589b870a611c25fa9...|   [1283233, 956315]|       []|
|               user0|     [2, 3, 4, 1, 5]|[1, 2, 3]|
+--------------------+--------------------+---------+



In [29]:
num_user = 4

def hitSize_k(hits):
    return len(hits)/k

hitSize_k_udf = udf(lambda x: hitSize_k(x), FloatType())

def precisionK(df):
    df = df.withColumn("hitSize_k", hitSize_k_udf("Hits"))
    sumHits_k = df.select(f.sum("hitSize_k")).collect()[0][0]
    precision = (1/num_user)*sumHits_k
    return precision

print("The precision@" + str(k) + " for TF-IDF is: " + ("%.2f" % precisionK(ex_hits)))

The precision@10 for TF-IDF is: 0.15


In [30]:
def hitSize_testSize(hits, testSize):
    return len(hits)/len(testSize)

hitSize_testSize_udf = udf(hitSize_testSize, FloatType())

def recallK(df):
    df = df.withColumn("hitSize_testSize", hitSize_testSize_udf(df.Hits, df.paperID))
    sumHits_test = df.select(f.sum("hitSize_testSize")).collect()[0][0]
    recall = (1/num_user)*sumHits_test
    return recall

print("The Recall@" + str(k) + " for TF-IDF is: " + ("%.2f" % recallK(joined_test_hits)))

The Recall@10 for TF-IDF is: 0.26


In [54]:
def getPositionU(hits, test):
    if not hits:
        return 0.0
    else:
        return 1/test.index(hits[0])

getPositionU_udf = udf(getPositionU, FloatType())

def mrrK(df):
    df = df.withColumn("P_u", getPositionU_udf(df.Hits, df.paperID))
    sumP_u = df.select(f.sum("P_u")).collect()[0][0]
    mrr = (1/num_user)*sumP_u
    return mrr

print("The MRR@" + str(k) + " for TF-IDF is: " + ("%.2f" % mrrK(joined_test_hits)))

The MRR@10 for TF-IDF is: 0.33


### Since we could not get good results for the hits (empty), we computed the precision for k in {10, 20}
#### We could change the 21 in the range to 101 to compute the precision up till k = 100

In [61]:
num_user = 2

for k in range(10, 21):
    # TF-IDF
    tf_rec = cbrs(train_df, k).orderBy("userID")
    tf_rec  = castToArray(tf_rec, "top_papers")
    joined_tf_test = test_df_collected.join(tf_rec, "userID")
    joined_tf_test = joined_tf_test.select("userID", col("paperID").alias("test_set"), col("top_papers").alias("train_set"))
    tf_hits = joined_tf_test.withColumn('Hits', getHits_udf(joined_tf_test.train_set, joined_tf_test.test_set))
    tf_hits = tf_hits.select("userID", "Hits")
    test_hits =  test_df_collected.join(tf_hits, "userID")
    
    print("The precision@" + str(k) + " for TF-IDF is: " + ("%.2f" % precisionK(tf_hits)))
    print("The Recall@" + str(k) + " for TF-IDF is: " + ("%.2f" % recallK(test_hits)) + "\n")
   

The precision@10 for TF-IDF is: 0.00
The Recall@10 for TF-IDF is: 0.00

The precision@11 for TF-IDF is: 0.00
The Recall@11 for TF-IDF is: 0.00

The precision@12 for TF-IDF is: 0.00
The Recall@12 for TF-IDF is: 0.00

The precision@13 for TF-IDF is: 0.00
The Recall@13 for TF-IDF is: 0.00

The precision@14 for TF-IDF is: 0.00
The Recall@14 for TF-IDF is: 0.00

The precision@15 for TF-IDF is: 0.00
The Recall@15 for TF-IDF is: 0.00

The precision@16 for TF-IDF is: 0.00


KeyboardInterrupt: 

In [None]:
for k in range(10, 21):
    # LDA
    lda_rec  = cbrs(train_df_lda, k).orderBy("userID")
    lda_rec  = castToArray(lda_rec, "top_papers")
    joined_lda_test = test_df_collected.join(lda_rec, "userID")
    joined_lda_test = joined_lda_test.select("userID", col("paperID").alias("test_set"), col("top_papers").alias("train_set"))
    lda_hits = joined_lda_test.withColumn('Hits', getHits_udf(joined_lda_test.train_set, joined_lda_test.test_set))
    lda_hits = lda_hits.select("userID", "Hits")
    test_hits_lda =  test_df_collected.join(lda_hits, "userID")
    
    print("The precision@" + str(k) + " for LDA is: " + ("%.2f" % precisionK(tf_hits)))
    print("The Recall@" + str(k) + " for LDA is: " + ("%.2f" % recallK(test_hits)) + "\n")
