## Experiment-05
### Amirreza Fosoul and Bithiah Yuan

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import *
import string
import csv
import re
import time
spark = SparkSession.builder.appName('ex5').getOrCreate()

from pyspark.sql.functions import *
from pyspark.sql.functions import split, udf, desc, concat, col, lit
import pyspark.sql.functions as f
from pyspark.sql.types import ArrayType, FloatType, StringType, IntegerType, DoubleType, StructType, StructField
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.window import Window
from pyspark.ml.linalg import SparseVector, VectorUDT, DenseVector
import scipy.sparse
from pyspark.ml.linalg import Vectors, _convert_to_vector, VectorUDT
import numpy as np
from pyspark.sql import SQLContext
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import math
import re
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import rand 
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [76]:
# Unexplode a column to an array of given datatype
def unExplode(df, groupByColName, collectColName, colType):
    types = {'string': StringType(), 'integer': IntegerType()}
    df_collected = df.groupby(groupByColName).agg(f.concat_ws(", ", f.collect_list(df[collectColName])).alias(collectColName))
    result = df_collected.withColumn(collectColName, split(col(collectColName), ",\s*").cast(ArrayType(types[colType])).alias(collectColName)).orderBy(groupByColName)
    return result

In [79]:
# Read user ratings into Dataframe
#user_df = spark.read.option("delimiter", ";").csv('./users_libraries.txt')
user_df = spark.read.option("delimiter", ";").csv('./example0.txt')
user_df = user_df.select(col("_c0").alias("userID"), col("_c1").alias("paperID"))

# Pre-explode user_df
user_df_pre = user_df
user_df_pre = user_df_pre.withColumn("paperID", split(col("paperID"), ",").cast(ArrayType(IntegerType())).alias("paperID"))
user_df = user_df.select("userID", f.split("paperID", ",").alias("papers"), f.explode(f.split("paperID", ",")).alias("paperID"))
user_df = user_df.drop("papers")

# Get a dataframe of the distinct papers
d_paper = user_df.select("paperID").distinct()

# Read in the stopwords as a list
with open('./stopwords_en.txt') as file:
    stopwordList = file.read().splitlines()

# Read in records of paper information
#w_df = spark.read.csv('./papers.csv')
w_df = spark.read.csv('./paper0.csv')
w_df = w_df.select("_c0", "_c13", "_c14")
w_df = w_df.select(col("_c0").alias("paperID"), col("_c13").alias("title"), col("_c14").alias("abstract"))
w_df = w_df.na.fill({'title': '', 'abstract': ''}) # to replace null values with empty string
# Get text from title and abstract
w_df = w_df.select(col("paperID"), concat(col("title"), lit(" "), col("abstract")).alias("words"))
#w_df.show()

paper_df = w_df

# Transform the distinct paperIDs dataframe to a list
paper_list = list(d_paper.select('paperID').toPandas()['paperID'])
# Map each distinct paper into int
paper_list = list(map(int, paper_list))

### Exercise 5. 1 (Pre-processing Text for word2vec)

In [80]:
################################### Conservative pre-processing ###################################################
# Extracting words from the papers and keeping "-" and "_"
tokenizer = RegexTokenizer(inputCol="words", outputCol="tokens", pattern="[a-zA-Z-_]+", gaps=False) 
# Built-in tokenizer
tokenized = tokenizer.transform(w_df)
tokenized = tokenized.select("paperID", "tokens")

# udf to remove "-" and "_" from the tokens
remove_hyphen_udf = udf(lambda x: [re.sub('[-|_]', '', word) for word in x], ArrayType(StringType()))
# Apply udf to the tokens
df = tokenized.withColumn('tokens', remove_hyphen_udf(col('tokens')))
# udf to remove words less than 3 letters
remove_short_words = udf(lambda x: [item for item in x if len(item) >= 3], ArrayType(StringType()))
# Apply udf to the tokens
df = df.withColumn('tokens', remove_short_words(col('tokens')))

# Conservative pre-processing df
conservative_df = df
conservative_df = conservative_df.withColumn("paperID", conservative_df["paperID"].cast(IntegerType()))

# Define word2vec parameters
word2Vec = Word2Vec(vectorSize=100, inputCol="tokens", outputCol="result")
# Fit the w2v_cp model
model = word2Vec.fit(conservative_df)

print("top10 most similar words to “science” using conservative pre-processing")
print()
# Find the words most similar to science (top 10)
synonyms = model.findSynonyms('science', 10)
synonyms.show()

# #################################### Intensive pre-processing #####################################################
# Built-in function to remove stopwords from our custom list
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered" , stopWords=stopwordList)
df = remover.transform(df)
df = df.select("paperID", "filtered")

# Apply stemming with NLTK
# Built-in class from NLTK
ps = PorterStemmer()
# udf to apply stemming
stemming = udf(lambda x: [ps.stem(item) for item in x], ArrayType(StringType()))
# apply udf to tokens
df = df.withColumn('tokens', stemming(col('filtered')))
df = df.select("paperID", "tokens")

# Intensive pre-processing df
intensive_df = df
paper_terms = df

# w2v_ip model
model2 = word2Vec.fit(intensive_df)

print("top10 most similar words to “science” using intensive pre-processing")
print()
# Find the words most similar to science (top 10)
# We stem science to have token matches
synonyms2 = model2.findSynonyms(ps.stem('science'), 10)
synonyms2.show()

top10 most similar words to “science” using conservative pre-processing

+-------------+------------------+
|         word|        similarity|
+-------------+------------------+
|         wide|0.9303041100502014|
|      between|0.9300709962844849|
|     emerging|0.9270402789115906|
|      discuss|0.9263132810592651|
|     citation|0.9259647130966187|
|        speed|0.9251464605331421|
|        blast|0.9241999983787537|
|technological|0.9241734147071838|
|       matrix|0.9241451025009155|
|     features|0.9241414070129395|
+-------------+------------------+

top10 most similar words to “science” using intensive pre-processing

+-------+------------------+
|   word|        similarity|
+-------+------------------+
| commun|0.9815123677253723|
| includ|0.9749888777732849|
|network|0.9698975682258606|
|overlap|0.9698217511177063|
| social|0.9681050181388855|
|modular|0.9649649858474731|
|topolog|0.9592804312705994|
| physic|0.9573175311088562|
|  natur| 0.952199399471283|
|connect|0.9514593

### The results for intensive pre-processing are better (the similarity scores are higher) because unimportant words are removed and words with the same stems are shown in the same token.

### Exercise 5. 2 (Analogies)

In [81]:
# For queries with multiple words
# Store the words in an array of strings in a dataframe
# Get the average vector using word2vec
def getAverageVector(query, model):
    df = sqlContext.createDataFrame([[query]], ['tokens'])
    return model.transform(df)

def analogy(word1, word2, word3, model, stemming=False):
    # Get the vectors from the word2vec model
    vec = model.getVectors()
    # Store the query words into a list
    words = [word1, word2, word3]
    keywords = []
    vectors = []
    for i in words:
        # Clean the query and store them in a list
        # Note: multiple words in one query will result in a nested list
        keywords.append([x.lower().strip() for x in re.split("[^A-Za-z]+", i)])
        
    # When using intensive preprocessing, stemming needs to be applied to the queries
    if stemming:
        for i, query in enumerate(keywords):
            for j, word in enumerate(query):
                # Apply stemming to each word in query
                keywords[i][j] = ps.stem(word)
    
    for query in keywords:
        # If multiple words are in one query then get the average of their vectors
        if len(query) > 1:
            vectors.append(getAverageVector(query, model).head()[1])
        else:
            # Otherwise append the vector of the word in a list
            vectors.append(vec.where(vec.word==query[0]).head()[1])
    # Perform analogy effect from word2vec
    w = vectors[0] - vectors[1] + vectors[2]
    # Call the word2vec model to find the top 5 similar words
    result = model.findSynonyms((-1)*w,5)
    return result

print("analogy for the conservative pre-processing model:")
print()
analogy("machine learning", "prediction", "recommender system", model, stemming=False).show()

print("analogy for the intensive pre-processing model:")
print()
analogy("machine learning", "prediction", "recommender systems", model2, stemming=True).show()


analogy for the conservative pre-processing model:

+------------+------------------+
|        word|        similarity|
+------------+------------------+
|increasingly|0.9157098531723022|
|     studied|0.9040472507476807|
|     created|0.9021034240722656|
|    proposed|0.8921966552734375|
|folksonomies|0.8918269872665405|
+------------+------------------+

analogy for the intensive pre-processing model:

+--------+------------------+
|    word|        similarity|
+--------+------------------+
|  precis|0.7692493200302124|
|   regul|0.7513030171394348|
|consider|0.7002859711647034|
|   excit|0.6967695355415344|
|   innov|0.6751542687416077|
+--------+------------------+



### Exercise 5. 3 (From Embeddings to Paper Recommendation)

In [82]:
# average of the embeddings of all words which appear in the paper’s text
paper_w2v_cp = model.transform(conservative_df)
paper_w2v_cp = paper_w2v_cp.select("paperID", col("result").alias("paper_profile"))

# compute the user profile using word2vec
def getUserProfile(user_df, preprocessed_df):
    # concatenate the papers in user libraries with their tokens inside w2v_cp df
    user_doc = user_df.join(preprocessed_df, ['paperID']).orderBy("userID")
    user_doc = user_doc.drop("paperID")
    # Concatenate all words of the user library
    user_doc = user_doc.rdd.map(lambda user_doc: (user_doc.userID, user_doc.tokens)).reduceByKey(lambda x,y: x + y).toDF(['userID','tokens'])
    
    # Use word2Vec model fitting
    model_user = word2Vec.fit(user_doc)
    # Get average of the embeddings of all words in users' libraries
    user_w2v_cp = model_user.transform(user_doc)
    user_w2v_cp = user_w2v_cp.select("userID", col("result").alias("user_profile"))
    return user_w2v_cp

user_w2v_cp = getUserProfile(user_df, conservative_df)
user_w2v_cp.show(10)

+--------------------+--------------------+
|              userID|        user_profile|
+--------------------+--------------------+
|4c8912d1b04471cf5...|[-0.0062182044786...|
|90f1a3e6fcdbf9bc5...|[-0.0057416195997...|
|d503571e44a0373eb...|[-0.0048411928860...|
|b36c3189bb1457cd0...|[-0.0076820166222...|
|bbcd9dae3160ddcb9...|[-0.0060672587091...|
|589b870a611c25fa9...|[-0.0059061539388...|
|f1e1cd4ff25018273...|[-0.0082436984605...|
|a0bbf6bb9b1c818f3...|[-0.0067649564573...|
|1eac022a97d683eac...|[-0.0070713948695...|
|3b715ebaf1f8f81a1...|[-0.0077834039900...|
+--------------------+--------------------+
only showing top 10 rows



In [96]:
# Function to call in udf
def unrated(papers):
    # Transform the list of distinct papers and the list of rated papers of each user to a set
    # Substract the two sets to get the list of unrated papers for each user
    # Transform back to list
    unrated = list(set(paper_list) - set(papers))
    
    return unrated

# udf to get a list of unrated papers with the length of rated papers for each user
get_unrated = udf(lambda x: unrated(x), ArrayType(IntegerType()))

def cos_sim(u, p):
    result = (np.dot(u, p))/(np.linalg.norm(u) * np.linalg.norm(p))
    result = result.item()
    return result

compute_sim = udf(cos_sim, FloatType())

def getUnrated(user_df):
    # Add a new column of unrated papers for each user
    unrated_df = user_df.withColumn("unrated", get_unrated(user_df.paperID))
    unrated_df = unrated_df.drop("paperID")
    unrated_df = unrated_df.withColumn("paperID", explode(unrated_df.unrated))
    unrated_df = unrated_df.drop("unrated")
    
    return unrated_df

unrated_df = getUnrated(user_df_pre)

conservative_df.show()

+--------+--------------------+
| paperID|              tokens|
+--------+--------------------+
|12832332|[deepening, democ...|
| 1305474|[the, tesla, broa...|
| 1001231|[anonymous, commu...|
|  352713|[survey, sensor, ...|
|  956315|[wormhole, attack...|
|  945604|[packet, leashes,...|
|10294999|[distance, recons...|
|  967275|[the, survivabili...|
|  115945|[twelvestep, prog...|
|11733005|[evidence, for, w...|
| 9045137|[nuclear, dna, co...|
| 3728173|[evolution, indiv...|
| 8310458|[evolution, compl...|
|   80546|[the, arbitrarine...|
| 5842862|[how, choose, goo...|
| 1242600|[how, write, cons...|
| 3467077|[defrosting, the,...|
|  309395|[why, most, publi...|
|  305755|[the, structure, ...|
| 6603134|[how, build, moti...|
+--------+--------------------+
only showing top 20 rows



In [101]:
# a recommender system based on word2vector
def w2vRS(user_df, unrated_df, preprocessed_df, k, passUserID=False, userID=None):
    
    # compute the user profile and paper prfiles
    model = word2Vec.fit(preprocessed_df)
    paper_w2v_cp = model.transform(preprocessed_df)
    paper_w2v_cp = paper_w2v_cp.select("paperID", col("result").alias("paper_profile"))
    user_w2v_cp = getUserProfile(user_df, preprocessed_df)
    
    df = unrated_df.join(user_w2v_cp, ["userID"]).join(paper_w2v_cp, ["paperID"])
        
    # if the userID passed, filter the dataframe
    if passUserID:
        df = df.where(df.userID==userID)
        
    # Apply similarity metric to the user_profile and paper_profile
    sim_df = df.withColumn('Similarity', compute_sim(df.user_profile, df.paper_profile))
    # Partition by userID and order by the similarity in descending order

    window = Window.partitionBy(col("userID")).orderBy((col("Similarity")).desc())
    # Add row numbers to the rows and get the top-k rows
    sim_df = sim_df.select(col('*'), row_number().over(window).alias('row_number')).where(col('row_number') <= k)

    # Renaming
    get_r = sim_df.select("userID", "paperID", col("row_number").alias("rank"))
    w2vRS_df = get_r.select("userID", "paperID")
    # un-explode, concatenate the recommended papers for each user
    w2vRS_df = unExplode(w2vRS_df, 'userID', 'paperID', 'integer')
    w2vRS_df = w2vRS_df.select('userID', col('paperID').alias('top_papers'))
    
    return w2vRS_df

# sample user
user = "1eac022a97d683eace8815545ce3153f"

# get the recommendations for the sample user
user_rec = w2vRS(user_df, unrated_df, conservative_df, 10, passUserID=True, userID=user)

user_rec.show(truncate=False) 

+--------------------------------+------------------------------------------------------------------------------------+
|userID                          |top_papers                                                                          |
+--------------------------------+------------------------------------------------------------------------------------+
|1eac022a97d683eace8815545ce3153f|[4302361, 105906, 3281478, 816066, 6603134, 3614773, 658201, 688160, 920055, 965334]|
+--------------------------------+------------------------------------------------------------------------------------+



### Exercise 5. 4 (Evaluation of Recommender System)

In [49]:
get_lib_size_udf = udf(lambda x:len(x), IntegerType())

def sampling(num_users, df, trainingSize=0.8):
    # first we sample users and then we sample each user's library
    
    get_training_size_udf = udf(lambda x:int(x*trainingSize), IntegerType())
    sampled_users = df.orderBy(rand()).limit(num_users)
    sampled_users = sampled_users.withColumn('libSize', get_lib_size_udf('paperID'))
    sampled_users = sampled_users.withColumn('trainingSize', get_training_size_udf('libSize'))
    
    # explode the paperIDs for each user
    sampled_exploded = sampled_users.withColumn('paperID', explode(col('paperID')))

    # Partion by userID and order them randomly
    window = Window.partitionBy(col('userID')).orderBy(rand())

    # Get row numbers
    sampled_exploded = sampled_exploded.select(col('*'), row_number().over(window).alias('row_number'))

    # Get the rows less than or equal to the training set size
    # The rows will be different each time because of .orderBy(rand()) in the window function
    training_df = sampled_exploded.where(col('row_number') <= col('trainingSize'))
    training_df = training_df.select('userID', 'paperID').orderBy('userID')
    #training_df.show()

    # Get the test set by selecting the rows greater than the training size
    test_df = sampled_exploded.where(col('row_number') > col('trainingSize'))
    test_df = test_df.select('userID', 'paperID').orderBy('userID')
    
    return (training_df, test_df)

(training_df, test_df) = sampling(50, user_df_pre, trainingSize=0.8)

In [109]:
train_unrated_df = getUnrated(training_df_collected)

k = 10

# Get recommendations for the sampled users
user_rec = w2vRS(training_df, train_unrated_df, conservative_df, k)
#user_rec.show()

In [122]:
# cast the column to array
def castToArray(df, colName):
    dff = df.withColumn(colName, split(col(colName), ", ").cast(ArrayType(IntegerType())))
    return dff


def getHits(train, test):
    return list(set(train).intersection(test))

getHits_udf = udf(getHits, ArrayType(IntegerType()))

num_user = 50

def hitSize_k(hits):
    if len(hits) == 0:
        return 0.0
    else:
        return len(hits)/k

hitSize_k_udf = udf(lambda x: hitSize_k(x), FloatType())

def precisionK(df):
    df = df.withColumn("hitSize_k", hitSize_k_udf("Hits"))
    sumHits_k = df.select(f.sum("hitSize_k")).collect()[0][0]
    precision = (1/num_user)*sumHits_k
    return precision

def hitSize_testSize(hits, testSize):
    return len(hits)/len(testSize)

hitSize_testSize_udf = udf(hitSize_testSize, FloatType())

def recallK(df):
    df = df.withColumn("hitSize_testSize", hitSize_testSize_udf(df.Hits, df.paperID))
    sumHits_test = df.select(f.sum("hitSize_testSize")).collect()[0][0]
    recall = (1/num_user)*sumHits_test
    return recall

# Get the positions of the hits in the test set
# find the minimum hit index that appeared in the test set
def getPositionU(hits, test):
    if not hits:
        return 0.0
    else:
        hit_index = []
        for i in range(0, len(hits)):
            # Index starting at 1 to avoid division by 0
            hit_index.append((test.index(hits[i]))+1)
    
        return 1/int((np.min(hit_index)))

getPositionU_udf = udf(getPositionU, FloatType())

def mrrK(df):
    df = df.withColumn("P_u", getPositionU_udf(df.Hits, df.paperID))
    sumP_u = df.select(f.sum("P_u")).collect()[0][0]
    mrr = (1/num_user)*sumP_u
    return mrr

In [108]:
# evaluates the precision@k, recall@k, mrr@k
def  evaluateK(k, rec_df, test_df):
    
    # Concatenate the papers for test_df
    test_df_collected = unExplode(test_df, 'userID', 'paperID', 'integer')
    
    # Join recommended papers with test_df
    joined_test = test_df_collected.join(rec_df, "userID")
    joined_test = joined_test.select("userID", col("paperID").alias("test_set"), col("top_papers").alias("train_set"))
    
    # find the hits by calling udf to make the intersection of test set and hits
    hits = joined_test.withColumn('Hits', getHits_udf(joined_test.train_set, joined_test.test_set))
    hits = hits.select("userID", "Hits")
    
    # Join the test set and the hits
    test_hits =  test_df_collected.join(hits, "userID")
    
    return (precisionK(hits), recallK(test_hits), mrrK(test_hits))
    
K = [10, 100]

for i in range(0, len(K)):
    prec, recall, mrr = evaluateK(K[i], user_rec, test_df)  
    print("The precision@" + str(K[i]) + " for TF-IDF is: " + ("%.3f" % prec))
    print("The Recall@" + str(K[i]) + " for TF-IDF is: " + ("%.3f" % recall))
    print("The MRR@" + str(K[i]) + " for TF-IDF is: " + ("%.3f" % mrr) + "\n")

The precision@10 for TF-IDF is: 0.006
The Recall@10 for TF-IDF is: 0.002
The MRR@10 for TF-IDF is: 0.003

The precision@100 for TF-IDF is: 0.006
The Recall@100 for TF-IDF is: 0.002
The MRR@100 for TF-IDF is: 0.003



In [112]:
############################# Sample users with library size greater than 20 ######################################

sampled_20 = user_df_pre.withColumn("libSize", get_lib_size_udf("paperID")).filter(col("libSize") > 20)

(training20_df, test20_df) = sampling(50, sampled_20, trainingSize=0.8)

train20_unrated_df = getUnrated(training20_df_collected)

k = 10

# Get recommendations for the sampled users with more than 20 papers in their libraries
user_rec20 = w2vRS(training20_df, train20_unrated_df, conservative_df, k)
#user_rec20.show()

In [113]:
k = [10, 100]

for i in range(0, len(k)):
    prec, recall, mrr = evaluateK(k[i], user_rec20, test20_df)  
    print("The precision@" + str(k[i]) + " for TF-IDF is: " + ("%.3f" % prec))
    print("The Recall@" + str(k[i]) + " for TF-IDF is: " + ("%.3f" % recall))
    print("The MRR@" + str(k[i]) + " for TF-IDF is: " + ("%.3f" % mrr) + "\n")

The precision@10 for TF-IDF is: 0.002
The Recall@10 for TF-IDF is: 0.001
The MRR@10 for TF-IDF is: 0.007

The precision@100 for TF-IDF is: 0.002
The Recall@100 for TF-IDF is: 0.001
The MRR@100 for TF-IDF is: 0.007



### After repeating the experiment by picking only users which have more than 20 papers in their libraries, we noticed that the performance of the RS improves. This makes sense because there are more papers in the users' libraries, so the similarity scores from the RS will be more accurate.

### Exercise 5. 5 (Improving the Recommender System with TF-IDF)

In [115]:
# Explode/Split the tokens in the list for each paperID and get the distinct tokens
ip_df = intensive_df.select("paperID", f.explode("tokens").alias("tokens")).distinct().orderBy("paperID")

# Assign count of 1 to each token w.r.t. the paperID since the tokens are distinct
ip_df = ip_df.groupBy("paperID","tokens").count()

# Get the number of distinct papers
num_papers = w_df.select("paperID").distinct().count()

# Get the value of ten percent of the number of papers
ten_percent = math.ceil(num_papers*.1)

# Create a new df with the tokens and count (without paperID)
df2 = ip_df.select("tokens", "count")
# Count the number of papers containing the tokens
df2 = df2.groupBy("tokens").agg(f.collect_list("tokens").alias("duplicated_values"), f.sum("count").alias("count"))
# Filter out tokens that appeared in more than 10 percent of the papers
df2 = df2.drop("duplicated_values").orderBy((col("count")).desc()).filter(col("count") < ten_percent)
# Filter out tokens that appeared in less than 20 papers
# Limit the df to 1000 tokens
df2 = df2.filter(col("count") >= 20).limit(1000)
# Create a new df with terms and count
important_words = df2.select(col("tokens").alias("terms"), col("count"))

# Create a new df where each term is replaced by a unique index that takes a value from the range between 0 and |T| − 1
df = important_words.withColumn("row_num", row_number().over(Window.orderBy("count"))-1)
# Create a df to store the indices and the corresponding terms
terms_index_hash = df.select(col("row_num").alias("index"), "terms")
#terms_index_hash.show()

num_terms = terms_index_hash.select("terms").distinct().count()
#print(num_terms)

p_terms = paper_terms.select("paperID", f.explode("tokens").alias("terms"))

# Join p_terms with the terms_index_hash to replace the terms with the indices
joined_df = terms_index_hash.join(p_terms, ["terms"])
joined_df = joined_df.drop("index")

# Create a new df to compute the term frequency vectors
tf_df = joined_df
tf_df = tf_df.groupby("paperID").agg(f.concat_ws(", ", f.collect_list(tf_df.terms)).alias("terms"))
tf_df = tf_df.withColumn("terms_", split(col("terms"), ",\s*").cast(ArrayType(StringType())).alias("terms"))
tf_df = tf_df.drop("terms")
# tf_df is now a df with a column of paperID and a column of lists of the terms (unexploded)
#tf_df.show()

cv = CountVectorizer(inputCol="terms_", outputCol="vectors")
model = cv.fit(tf_df)
vector_df = model.transform(tf_df)
vector_df = vector_df.select("paperID", col("vectors").alias("term_frequency_sparse"))

idf = IDF(inputCol="term_frequency_sparse", outputCol="features")
idfModel = idf.fit(vector_df)
rescaledData = idfModel.transform(vector_df)
tf_idf_built_in = rescaledData.select("paperID", "features")

In [116]:
def getTop10Terms(sparsevector):
    
    # Sort indices of SparseVector by the scores in reverse order
    # Get top 10 indices of the score
    return sorted(range(len(sparsevector)), key=lambda k: sparsevector[k], reverse=True)[:10]

getTop10TermsUdf = udf(lambda x: getTop10Terms(x), ArrayType(IntegerType()))

# Get a column of the top10 words of each paper represented by the hashed index
tf_idf_filtered = tf_idf_built_in.withColumn("index", getTop10TermsUdf(col("features")))
tf_idf_filtered = tf_idf_filtered.drop("features")

# Explode top10 terms for each paper
tf_idf_filtered = tf_idf_filtered.withColumn("index", explode(tf_idf_filtered.index))

# Papers now have only the 10 most important words
tf_idf_filtered.show(5)

+-------+-----+
|paperID|index|
+-------+-----+
| 159967|    0|
| 159967|   18|
| 159967|   21|
| 159967|   79|
| 159967|   61|
+-------+-----+
only showing top 5 rows



In [120]:
# Join the dataframes to get the terms from the hash
hash_joined = terms_index_hash.join(tf_idf_filtered, ["index"])
hash_joined = hash_joined.drop("index")

# The dataframe now has a column of paperID and their top10 important terms
hash_joined_collected = unExplode(hash_joined, "paperID", "terms", "string")
hash_joined_collected = hash_joined_collected.select("paperID", col("terms").alias("tokens"))

#hash_joined_collected.show()

In [119]:
k = 10

user = "1eac022a97d683eace8815545ce3153f"

# Get recommendations using tf_idf
user_rec_tf = w2vRS(user_df, unrated_df, hash_joined_collected, k, passUserID=True, userID=user)

user_rec_tf.show(truncate=False) 

+--------------------------------+-----------------------------------------------------------------------------------+
|userID                          |top_papers                                                                         |
+--------------------------------+-----------------------------------------------------------------------------------+
|1eac022a97d683eace8815545ce3153f|[5394760, 166220, 3281478, 967275, 469428, 1326856, 820297, 3614773, 11191048, 249]|
+--------------------------------+-----------------------------------------------------------------------------------+



In [124]:
# Get user profile for the sampled users using intensive preprocessing
sampled_user_w2v_ip = getUserProfile(training_df, intensive_df)

k = 10

# Get recommendations for the sampled users
user_rec_tf = w2vRS(training_df, train_unrated_df, hash_joined_collected, k)

K = [10, 100]

for i in range(0, len(K)):
    prec, recall, mrr = evaluateK(K[i], user_rec_tf, test_df)  
    print("The precision@" + str(K[i]) + " for TF-IDF is: " + ("%.3f" % prec))
    print("The Recall@" + str(K[i]) + " for TF-IDF is: " + ("%.3f" % recall))
    print("The MRR@" + str(K[i]) + " for TF-IDF is: " + ("%.3f" % mrr) + "\n")

KeyboardInterrupt: 

In [None]:
# Get user profile for the sampled users with libraries greater than 20
sampled20_user_w2v_ip = getUserProfile(training20_df, intensive_dfsive_df)

k = 10

# Get recommendations for the sampled users
user_rec20_tf = w2vRS(training20_df, train20_unrated_df, hash_joined_collected, k)

k = [10, 100]

for i in range(0, len(k)):
    prec, recall, mrr = evaluateK(k[i], user_rec20_tf, test20_df)  
    print("The precision@" + str(k[i]) + " for TF-IDF is: " + ("%.3f" % prec))
    print("The Recall@" + str(k[i]) + " for TF-IDF is: " + ("%.3f" % recall))
    print("The MRR@" + str(k[i]) + " for TF-IDF is: " + ("%.3f" % mrr) + "\n")