## Experiment-05
### Amirreza Fosoul and Bithiah Yuan

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import *
import string
import csv
import re
import time
spark = SparkSession.builder.appName('ex5').getOrCreate()

from pyspark.sql.functions import *
from pyspark.sql.functions import split, udf, desc, concat, col, lit
import pyspark.sql.functions as f
from pyspark.sql.types import ArrayType, FloatType, StringType, IntegerType, DoubleType, StructType, StructField
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.window import Window
from pyspark.ml.linalg import SparseVector, VectorUDT, DenseVector
import scipy.sparse
from pyspark.ml.linalg import Vectors, _convert_to_vector, VectorUDT
import numpy as np
from pyspark.sql import SQLContext
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import math
import re
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
#from pyspark.mllib.feature import Word2Vec
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import rand 

sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [2]:
# Read user ratings into Dataframe
#user_df = spark.read.option("delimiter", ";").csv('./users_libraries.txt')
user_df = spark.read.option("delimiter", ";").csv('./example0.txt')
user_df = user_df.select(col("_c0").alias("userID"), col("_c1").alias("paperID"))

user_df_pre = user_df
user_df_pre = user_df_pre.withColumn("paperID", split(col("paperID"), ",").cast(ArrayType(IntegerType())).alias("paperID"))

user_df = user_df.select("userID", f.split("paperID", ",").alias("papers"), f.explode(f.split("paperID", ",")).alias("paperID"))
user_df = user_df.drop("papers")

# Get a dataframe of the distinct papers
d_paper = user_df.select("paperID").distinct()

# Read in the stopwords as a list
with open('./stopwords_en.txt') as file:
    stopwordList = file.read().splitlines()

# Read in records of paper information
w_df = spark.read.csv('./papers.csv')
#w_df = spark.read.csv('./paper0.csv')
w_df = w_df.select("_c0", "_c13", "_c14")
w_df = w_df.select(col("_c0").alias("paperID"), col("_c13").alias("title"), col("_c14").alias("abstract"))
w_df = w_df.na.fill({'title': '', 'abstract': ''}) # to replace null values with empty string
# Get text from title and abstract
w_df = w_df.select(col("paperID"), concat(col("title"), lit(" "), col("abstract")).alias("words"))
#w_df.show()

# Transform the distinct paperIDs dataframe to a list
paper_list = list(d_paper.select('paperID').toPandas()['paperID'])
# Map each distinct paper into int
paper_list = list(map(int, paper_list))

### Exercise 5. 1 (Pre-processing Text for word2vec)

In [3]:
################################### Conservative pre-processing ###################################################

# Extracting words from the papers and keeping "-" and "_"
tokenizer = RegexTokenizer(inputCol="words", outputCol="tokens", pattern="[a-zA-Z-_]+", gaps=False) 
# Built-in tokenizer
tokenized = tokenizer.transform(w_df)
tokenized = tokenized.select("paperID", "tokens")

# udf to remove "-" and "_" from the tokens
remove_hyphen_udf = udf(lambda x: [re.sub('[-|_]', '', word) for word in x], ArrayType(StringType()))
# Apply udf to the tokens
df = tokenized.withColumn('tokens', remove_hyphen_udf(col('tokens')))
# udf to remove words less than 3 letters
remove_short_words = udf(lambda x: [item for item in x if len(item) >= 3], ArrayType(StringType()))
# Apply udf to the tokens
df = df.withColumn('tokens', remove_short_words(col('tokens')))

conservative_df = df
conservative_df = conservative_df.withColumn("paperID", conservative_df["paperID"].cast(IntegerType()))

word2Vec = Word2Vec(vectorSize=100, inputCol="tokens", outputCol="result")
model = word2Vec.fit(conservative_df)


# print("top10 most similar words to “science” using conservative pre-processing")
# print()
# synonyms = model.findSynonyms('science', 10)
# synonyms.show()

# #################################### Intensive pre-processing #####################################################
# # Built-in function to remove stopwords from our custom list
# remover = StopWordsRemover(inputCol="tokens", outputCol="filtered" , stopWords=stopwordList)
# df = remover.transform(df)
# df = df.select("paperID", "filtered")

# # Apply stemming with NLTK
# # Built-in class from NLTK
# ps = PorterStemmer()
# # udf to apply stemming
# stemming = udf(lambda x: [ps.stem(item) for item in x], ArrayType(StringType()))
# # apply udf to tokens
# df = df.withColumn('tokens', stemming(col('filtered')))
# df = df.select("paperID", "tokens")

# intensive_df = df

# word2Vec = Word2Vec(vectorSize=100, inputCol="tokens", outputCol="result")
# model2 = word2Vec.fit(intensive_df)

# print("top10 most similar words to “science” using intensive pre-processing")
# print()
# synonyms2 = model2.findSynonyms(ps.stem('science'), 10)
# synonyms2.show()

### The results for intensive pre-processing are better (the similarity scores are higher) because unimportant words are removed and words with the same stems are shown in the same token.

### Exercise 5. 2 (Analogies)

In [4]:
def getAverageVector(query, model):
    df = sqlContext.createDataFrame([[query]], ['tokens'])
    return model.transform(df)

def analogy(word1, word2, word3, model, stemming=False):        
    vec = model.getVectors()
    words = [word1, word2, word3]
    keywords = []
    vectors = []
    for i in words:
        keywords.append([x.lower().strip() for x in re.split("[^A-Za-z]+", i)])
        
    if stemming:
        for i, query in enumerate(keywords):
            for j, word in enumerate(query):
                keywords[i][j] = ps.stem(word)
    
    for query in keywords:
        if len(query) > 1:
            vectors.append(getAverageVector(query, model).head()[1])
        else:
            vectors.append(vec.where(vec.word==query[0]).head()[1])
            
    w = vectors[0] - vectors[1] + vectors[2]
    result = model.findSynonyms((-1)*w,5)
    return result

print("analogy for the conservative pre-processing model:")
print()
analogy("machine learning", "prediction", "recommender system", model, stemming=False).show()

print("analogy for the intensive pre-processing model:")
print()
analogy("machine learning", "prediction", "recommender systems", model2, stemming=True).show()


analogy for the conservative pre-processing model:

+----------+------------------+
|      word|        similarity|
+----------+------------------+
|      gene|0.9341816902160645|
|largescale|0.9305847883224487|
|sequencing| 0.929004967212677|
|   several|0.9272071123123169|
|       seq|0.9258444905281067|
+----------+------------------+

analogy for the intensive pre-processing model:

+------------+------------------+
|        word|        similarity|
+------------+------------------+
|     graphen|0.9251900315284729|
|        mass|0.9182692766189575|
|        test|0.9107112884521484|
|spectrometri|0.9089006185531616|
|       point|0.9021362662315369|
+------------+------------------+



### Exercise 5. 3 (From Embeddings to Paper Recommendation)

In [4]:
paper_w2v_cp = model.transform(conservative_df)
paper_w2v_cp = paper_w2v_cp.select("paperID", col("result").alias("paper_profile"))

#paper_w2v_cp.show()

In [5]:
def getUserProfile(user_df, preprocessed_df):
    user_doc = user_df.join(preprocessed_df, ['paperID']).orderBy("userID")
    user_doc = user_doc.drop("paperID")
    # Concatenate the words of the user library
    user_doc = user_doc.rdd.map(lambda user_doc: (user_doc.userID, user_doc.tokens)).reduceByKey(lambda x,y: x + y).toDF(['userID','tokens'])
    
    # Use word2Vec model fitting
    model_user = word2Vec.fit(user_doc)
    user_w2v_cp = model_user.transform(user_doc)
    user_w2v_cp = user_w2v_cp.select("userID", col("result").alias("user_profile"))
    return user_w2v_cp

#getUserProfile(user_df, conservative_df).show()

In [6]:
# Function to call in udf
def unrated(papers):
    # Transform the list of distinct papers and the list of rated papers of each user to a set
    # Substract the two sets to get the list of unrated papers for each user
    # Transform back to list
    unrated = list(set(paper_list) - set(papers))
    
    return unrated


# udf to get a list of unrated papers with the length of rated papers for each user
get_unrated = udf(lambda x: unrated(x), ArrayType(IntegerType()))

# Add a new column of unrated papers for each user
unrated_df = user_df_pre.withColumn("unrated", get_unrated(user_df_pre.paperID))

unrated_df = unrated_df.drop("paperID")

unrated_df = unrated_df.withColumn("paperID", explode(unrated_df.unrated))

unrated_df = unrated_df.drop("unrated")


In [8]:
new_df = unrated_df.join(user_w2v_cp, ["userID"]).join(paper_w2v_cp, ["paperID"])
new_df.show()

+-------+--------------------+--------------------+--------------------+
|paperID|              userID|        user_profile|       paper_profile|
+-------+--------------------+--------------------+--------------------+
| 212874|90f1a3e6fcdbf9bc5...|[-0.0123414472889...|[0.00687372803361...|
|1326856|90f1a3e6fcdbf9bc5...|[-0.0123414472889...|[0.01563527342708...|
|  81501|90f1a3e6fcdbf9bc5...|[-0.0123414472889...|[0.01630953084111...|
|  65083|90f1a3e6fcdbf9bc5...|[-0.0123414472889...|[-0.0018720912533...|
| 523772|90f1a3e6fcdbf9bc5...|[-0.0123414472889...|[0.00310046652642...|
| 105906|90f1a3e6fcdbf9bc5...|[-0.0123414472889...|[-0.0282111878506...|
|1121661|90f1a3e6fcdbf9bc5...|[-0.0123414472889...|[0.02301386825811...|
|7355647|90f1a3e6fcdbf9bc5...|[-0.0123414472889...|[0.00381538080857...|
|  72879|90f1a3e6fcdbf9bc5...|[-0.0123414472889...|[0.00342003746935...|
| 244827|90f1a3e6fcdbf9bc5...|[-0.0123414472889...|[0.03173502150324...|
| 244827|90f1a3e6fcdbf9bc5...|[-0.0123414472889...|

In [7]:
import numpy as np 

def cos_sim(u, p):
    result = (np.dot(u, p))/(np.linalg.norm(u) * np.linalg.norm(p))
    result = result.item()
    return result

compute_sim = udf(cos_sim, FloatType())

In [8]:
def w2vRS(df, k, passUserID=False, userID=None):
    if passUserID:
        df = df.where(df.userID==userID)
    # Apply similarity metric to the user_profile and paper_profile
    sim_df = df.withColumn('Similarity', compute_sim(df.user_profile, df.paper_profile))
    # Partition by userID and order by the similarity in descending order
    window = Window.partitionBy(col("userID")).orderBy((col("Similarity")).desc())
    # Add row numbers to the rows and get the top-k rows
    sim_df = sim_df.select(col('*'), row_number().over(window).alias('row_number')).where(col('row_number') <= k)

    # Renaming
    get_r = sim_df.select("userID", "paperID", col("row_number").alias("rank"))
    w2vRS_df = get_r.select("userID", "paperID")
    # un-explode, concatenate the recommended papers for each user
    w2vRS_df = w2vRS_df.groupby("userID").agg(f.concat_ws(", ", f.collect_list(w2vRS_df.paperID)).alias("top_papers"))
    
    return w2vRS_df

# k = 10

# user = "1eac022a97d683eace8815545ce3153f"

# user_rec = w2vRS(new_df, k, passUserID=True, userID=user)

# user_rec.show(truncate=False)

### Exercise 5. 4 (Evaluation of Recommender System)

In [9]:
get_lib_size_udf = udf(lambda x:len(x), IntegerType())

def sampling(num_users, df, trainingSize=0.8):
    # first we sample users and then we sample each user's library
    
    get_training_size_udf = udf(lambda x:int(x*trainingSize), IntegerType())
    sampled_users = df.orderBy(rand()).limit(num_users)
    sampled_users = sampled_users.withColumn('libSize', get_lib_size_udf('paperID'))
    sampled_users = sampled_users.withColumn('trainingSize', get_training_size_udf('libSize'))
    
    # explode the paperIDs for each user
    sampled_exploded = sampled_users.withColumn('paperID', explode(col('paperID')))

    # Partion by userID and order them randomly
    window = Window.partitionBy(col('userID')).orderBy(rand())

    # Get row numbers
    sampled_exploded = sampled_exploded.select(col('*'), row_number().over(window).alias('row_number'))

    # Get the rows less than or equal to the training set size
    # The rows will be different each time because of .orderBy(rand()) in the window function
    training_df = sampled_exploded.where(col('row_number') <= col('trainingSize'))
    training_df = training_df.select('userID', 'paperID').orderBy('userID')
    #training_df.show()

    # Get the test set by selecting the rows greater than the training size
    test_df = sampled_exploded.where(col('row_number') > col('trainingSize'))
    test_df = test_df.select('userID', 'paperID').orderBy('userID')
    
    return (training_df, test_df)

(training_df, test_df) = sampling(50, user_df_pre, trainingSize=0.8)

In [None]:
def getProfile(train_df, preprocessed_df, unrated_df, paper_profile):
    
    userProfile = getUserProfile(user_df, preprocessed_df)
    train_user_w2v = userProfile.join(train_df, ["userID"])
    train_user_w2v = train_user_w2v.drop("paperID")
    profile_df = unrated_df.join(train_user_w2v, ["userID"]).join(paper_profile, ["paperID"])
    
    return profile_df

sampledProfiles = getProfile(training_df, conservative_df, unrated_df, paper_w2v_cp)

In [None]:
k = 10

sampledRec = w2vRS(sampledProfiles, k)

sampledRec.show(truncate=False)

In [41]:
############################# Sample users with library size greater than 20 ######################################

sampled_20 = user_df_pre.withColumn("libSize", get_lib_size_udf("paperID")).filter(col("libSize") > 20)

(training20_df, test20_df) = sampling(50, sampled_20, trainingSize=0.8)

sampledProfiles20 = getProfile(training20_df, conservative_df, unrated_df, paper_w2v_cp)

sampledRec20 = w2vRS(sampledProfiles20, k)

sampledRec20.show(truncate=False)

+--------------------+--------------------+-------+
|              userID|             paperID|libSize|
+--------------------+--------------------+-------+
|d0c9aaa788153daea...|[2080631, 6343346...|     45|
|f05bcffe7951de9e5...|[1158654, 478707,...|    170|
|d1d41a15201915503...|[6610569, 6493797...|     42|
|b656009a6efdc8b1a...|[771870, 181369, ...|    104|
|d85f7d83f27b3f533...|[7610843, 3633347...|     29|
|3b715ebaf1f8f81a1...|[4119394, 3378798...|    216|
|f3c28e50db4ce8ad8...|[2856540, 2994495...|     24|
|38fe6373389d12b5b...|[7276116, 255799,...|     32|
|1eac022a97d683eac...|[3973229, 322433,...|    334|
|7c0081293b3988065...|[3453059, 3007833...|    227|
|4c8912d1b04471cf5...|[3579579, 1931121...|     32|
|51656fa1c9a7e0412...|[927126, 967059, ...|     41|
|f1e1cd4ff25018273...|[106227, 813152, ...|    168|
|a0bbf6bb9b1c818f3...|[1202499, 1215706...|     35|
|11740197cbfd484fb...|[4110917, 5138376...|     36|
|6ceb80c354731c63e...|[2074791, 154213,...|     21|
|cbd4a69e4b3

In [None]:
def castToArray(df, colName):
    dff = df.withColumn(colName, split(col(colName), ", ").cast(ArrayType(IntegerType())))
    return dff

sampledRec = castToArray(sampledRec, "top_papers")
sampledRec20 = castToArray(sampledRec20, "top_papers")

In [None]:
# Concatenate the test set into a list of integers
test_df_collected = test_df.groupby("userID").agg(f.concat_ws(", ", f.collect_list(test_df.paperID)).alias("paperID"))
test_df_collected = test_df_collected.withColumn("paperID", split(col("paperID"), ",\s*").cast(ArrayType(IntegerType())).alias("paperID")).orderBy("userID")

test20_df_collected = test20_df.groupby("userID").agg(f.concat_ws(", ", f.collect_list(test_df.paperID)).alias("paperID"))
test20_df_collected = test20_df_collected.withColumn("paperID", split(col("paperID"), ",\s*").cast(ArrayType(IntegerType())).alias("paperID")).orderBy("userID")

In [21]:
joined_test = test_df_collected.join(sampledRec, "userID")
joined_test = joined_test.select("userID", col("paperID").alias("test_set"), col("top_papers").alias("train_set"))

joined20_test = test20_df_collected.join(sampledRec20, "userID")
joined20_test = joined20_test.select("userID", col("paperID").alias("test_set"), col("top_papers").alias("train_set"))

In [22]:
def getHits(train, test):
    return list(set(train).intersection(test))

getHits_udf = udf(getHits, ArrayType(IntegerType()))
# TF-IDF
print("TF-IDF recommender")
tf_hits = joined_tf_test.withColumn('Hits', getHits_udf(joined_tf_test.train_set, joined_tf_test.test_set))
tf_hits = tf_hits.select("userID", "Hits")
tf_hits.show(truncate=False)

print("LDA recommender")
# LDA
lda_hits = joined_lda_test.withColumn('Hits', getHits_udf(joined_lda_test.train_set, joined_lda_test.test_set))
lda_hits = lda_hits.select("userID", "Hits")
lda_hits.show(truncate=False)

TF-IDF recommender
+--------------------------------+----+
|userID                          |Hits|
+--------------------------------+----+
|1eac022a97d683eace8815545ce3153f|[]  |
|589b870a611c25fa99bd3d7295ac0622|[]  |
+--------------------------------+----+

LDA recommender
+--------------------------------+----+
|userID                          |Hits|
+--------------------------------+----+
|1eac022a97d683eace8815545ce3153f|[]  |
|589b870a611c25fa99bd3d7295ac0622|[]  |
+--------------------------------+----+



### Since we could not get results for the hits, we modified the results to show our computations for the evaluation

In [23]:
########## example #####################

columns = ['userID', 'Hits']
vals = [("user0", "1, 2, 3"), ("user1", "4, 5, 6")]

ex = sqlContext.createDataFrame(vals, columns)
ex = castToArray(ex, "Hits")

ex_hits = ex.union(tf_hits)
ex_hits.show()

columns2 = ['userID', 'paperID']
vals2 = [("user0", "2, 3, 4, 1, 5"), ("user1", "5, 4, 6, 7, 8, 9, 10")]
ex2 = sqlContext.createDataFrame(vals2, columns2)
ex2 = castToArray(ex2, "paperID")

ex_test = ex2.union(test_df_collected)
ex_test.show()

+--------------------+---------+
|              userID|     Hits|
+--------------------+---------+
|               user0|[1, 2, 3]|
|               user1|[4, 5, 6]|
|1eac022a97d683eac...|       []|
|589b870a611c25fa9...|       []|
+--------------------+---------+

+--------------------+--------------------+
|              userID|             paperID|
+--------------------+--------------------+
|               user0|     [2, 3, 4, 1, 5]|
|               user1|[5, 4, 6, 7, 8, 9...|
|1eac022a97d683eac...|[3469193, 600359,...|
|589b870a611c25fa9...|   [1283233, 956315]|
+--------------------+--------------------+



In [24]:
joined_test_hits = ex_test.join(ex_hits, "userID")
joined_test_hits.show()

+--------------------+--------------------+---------+
|              userID|             paperID|     Hits|
+--------------------+--------------------+---------+
|               user1|[5, 4, 6, 7, 8, 9...|[4, 5, 6]|
|1eac022a97d683eac...|[3469193, 600359,...|       []|
|589b870a611c25fa9...|   [1283233, 956315]|       []|
|               user0|     [2, 3, 4, 1, 5]|[1, 2, 3]|
+--------------------+--------------------+---------+



In [29]:
num_user = 4

def hitSize_k(hits):
    return len(hits)/k

hitSize_k_udf = udf(lambda x: hitSize_k(x), FloatType())

def precisionK(df):
    df = df.withColumn("hitSize_k", hitSize_k_udf("Hits"))
    sumHits_k = df.select(f.sum("hitSize_k")).collect()[0][0]
    precision = (1/num_user)*sumHits_k
    return precision

print("The precision@" + str(k) + " for TF-IDF is: " + ("%.2f" % precisionK(ex_hits)))

The precision@10 for TF-IDF is: 0.15


In [30]:
def hitSize_testSize(hits, testSize):
    return len(hits)/len(testSize)

hitSize_testSize_udf = udf(hitSize_testSize, FloatType())

def recallK(df):
    df = df.withColumn("hitSize_testSize", hitSize_testSize_udf(df.Hits, df.paperID))
    sumHits_test = df.select(f.sum("hitSize_testSize")).collect()[0][0]
    recall = (1/num_user)*sumHits_test
    return recall

print("The Recall@" + str(k) + " for TF-IDF is: " + ("%.2f" % recallK(joined_test_hits)))

The Recall@10 for TF-IDF is: 0.26


In [54]:
def getPositionU(hits, test):
    if not hits:
        return 0.0
    else:
        return 1/test.index(hits[0])

getPositionU_udf = udf(getPositionU, FloatType())

def mrrK(df):
    df = df.withColumn("P_u", getPositionU_udf(df.Hits, df.paperID))
    sumP_u = df.select(f.sum("P_u")).collect()[0][0]
    mrr = (1/num_user)*sumP_u
    return mrr

print("The MRR@" + str(k) + " for TF-IDF is: " + ("%.2f" % mrrK(joined_test_hits)))

The MRR@10 for TF-IDF is: 0.33


### Since we could not get good results for the hits (empty), we computed the precision for k in {10, 20}
#### We could change the 21 in the range to 101 to compute the precision up till k = 100

In [61]:
num_user = 2

for k in range(10, 21):
    # TF-IDF
    tf_rec = cbrs(train_df, k).orderBy("userID")
    tf_rec  = castToArray(tf_rec, "top_papers")
    joined_tf_test = test_df_collected.join(tf_rec, "userID")
    joined_tf_test = joined_tf_test.select("userID", col("paperID").alias("test_set"), col("top_papers").alias("train_set"))
    tf_hits = joined_tf_test.withColumn('Hits', getHits_udf(joined_tf_test.train_set, joined_tf_test.test_set))
    tf_hits = tf_hits.select("userID", "Hits")
    test_hits =  test_df_collected.join(tf_hits, "userID")
    
    print("The precision@" + str(k) + " for TF-IDF is: " + ("%.2f" % precisionK(tf_hits)))
    print("The Recall@" + str(k) + " for TF-IDF is: " + ("%.2f" % recallK(test_hits)) + "\n")
   

The precision@10 for TF-IDF is: 0.00
The Recall@10 for TF-IDF is: 0.00

The precision@11 for TF-IDF is: 0.00
The Recall@11 for TF-IDF is: 0.00

The precision@12 for TF-IDF is: 0.00
The Recall@12 for TF-IDF is: 0.00

The precision@13 for TF-IDF is: 0.00
The Recall@13 for TF-IDF is: 0.00

The precision@14 for TF-IDF is: 0.00
The Recall@14 for TF-IDF is: 0.00

The precision@15 for TF-IDF is: 0.00
The Recall@15 for TF-IDF is: 0.00

The precision@16 for TF-IDF is: 0.00


KeyboardInterrupt: 

In [None]:
for k in range(10, 21):
    # LDA
    lda_rec  = cbrs(train_df_lda, k).orderBy("userID")
    lda_rec  = castToArray(lda_rec, "top_papers")
    joined_lda_test = test_df_collected.join(lda_rec, "userID")
    joined_lda_test = joined_lda_test.select("userID", col("paperID").alias("test_set"), col("top_papers").alias("train_set"))
    lda_hits = joined_lda_test.withColumn('Hits', getHits_udf(joined_lda_test.train_set, joined_lda_test.test_set))
    lda_hits = lda_hits.select("userID", "Hits")
    test_hits_lda =  test_df_collected.join(lda_hits, "userID")
    
    print("The precision@" + str(k) + " for LDA is: " + ("%.2f" % precisionK(tf_hits)))
    print("The Recall@" + str(k) + " for LDA is: " + ("%.2f" % recallK(test_hits)) + "\n")
