# DeepWalk

In [24]:
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.mllib.feature import Word2Vec
# from pyspark.ml.feature import Word2Vec
from pyspark.ml.linalg import Vectors
import random
from collections import defaultdict
import numpy as np
from pyspark.sql import functions as F


In [26]:
def getItemSeqs(spark, samplesRating):
    """
    extract one time-ordered item sequence per user from the rating dataframe
    1. keep only the ratings strictly greater than 3
    2. for each user, collect the rated movies and their timestamps into two lists
    3. use a UDF to order the movie list by timestamp
    4. join the ordered movie list into a space separated string, then split it
       back into a list of tokens on the RDD side

    input:
        - spark: not used by this function; kept so existing callers keep working
        - samplesRating: dataframe with "userId", "movieId", "rating" and
          "timestamp" columns
    return:
        RDD with one element per user: the list of that user's movie ids
        ordered by rating time
    """
    def sortF(movie_list, timestamp_list):
        """
        sort by time and return the corresponding movie sequence
        eg:
            input: movie_list:[1,2,3]
                   timestamp_list:[1112486027,1212546032,1012486033]
            return [3,1,2]
        """
        # The main block of this file reads the CSV without a schema, so the
        # timestamps arrive as strings. Compared as text, "999999999" sorts
        # after "1000000000", so the order has to be decided on numbers.
        try:
            sort_keys = [float(t) for t in timestamp_list]
        except (TypeError, ValueError):
            # not numeric (e.g. date strings): keep the raw values as keys
            sort_keys = list(timestamp_list)
        # sorted() is stable, so equal timestamps keep their collected order
        pairs = sorted(zip(movie_list, sort_keys), key=lambda x: x[1])
        return [x[0] for x in pairs]

    sortUDF = udf(sortF, ArrayType(StringType()))

    userSequence = samplesRating.where(F.col("rating") > 3) \
                    .groupBy("userId")\
                    .agg(sortUDF(F.collect_list("movieId"), F.collect_list("timestamp")).alias("movieIds"))\
                    .withColumn("movieIdStr", F.array_join(F.col("movieIds"), " "))
    seq = userSequence.select("movieIdStr").rdd.map(lambda x : x[0].split(" "))

    return seq



def embeddingLSH(spark, movieEmbMap):
    """
    Locality-sensitive hashing of the movie embeddings using BucketedRandomProjectionLSH.

    input:
        - spark: SparkSession used to build the embedding dataframe
        - movieEmbMap: mapping movie id -> iterable of embedding values
    The function only prints: the schema and first rows of the bucketed
    dataframe, then the 5 approximate nearest neighbours of a fixed
    10-dimensional sample vector. Nothing is returned.
    """
    # every component is converted to np.float64 before building the dense vector
    movieEmbSeq = [
        (movieId, Vectors.dense([np.float64(component) for component in rawEmb]))
        for movieId, rawEmb in movieEmbMap.items()
    ]
    movieEmbDF = spark.createDataFrame(movieEmbSeq).toDF("movieId", "emb")

    lsh = BucketedRandomProjectionLSH(
        inputCol="emb",
        outputCol="bucketId",
        bucketLength=0.1,
        numHashTables=3,
    )
    lshModel = lsh.fit(movieEmbDF)
    bucketedDF = lshModel.transform(movieEmbDF)

    print("movieId, emb, bucketId schema:")
    bucketedDF.printSchema()
    print("movieId, emb, bucketId data result:")
    bucketedDF.show(10, truncate=False)

    print("Approximately searching for 5 nearest neighbors of the sample embedding:")
    queryEmb = Vectors.dense(0.795, 0.583, 1.120, 0.850, 0.174, -0.839, -0.0633, 0.249, 0.673, -0.237)
    neighbours = lshModel.approxNearestNeighbors(movieEmbDF, queryEmb, 5)
    neighbours.show(truncate=False)



def getTransitionMatrix(item_seq):
    """
    build graph and transition matrix based on input item sequences

    Every pair of consecutive items in a sequence counts as one observed
    transition. The counts of each source item are then normalised so that
    its outgoing probabilities sum to 1.

    input:
        - item_seq: RDD of item sequences (any object exposing flatMap), or a
          plain Python iterable of sequences
    return:
        dict mapping item -> {next item: transition probability}; an item
        that never appears as the first element of a pair has no entry
    """
    def generate_pair(ls):
        """
        use a sliding window with size of 2 to generate item pairs
        input: ls =  list of items
        output: list of pair
        example:
            input: [86, 90, 11, 100]
            output: [(86, 90), (90, 11), (11, 100)]
        """
        # tuples rather than lists: the pairs must be hashable to be counted
        return [(ls[i - 1], ls[i]) for i in range(1, len(ls))]

    if hasattr(item_seq, "flatMap"):
        # RDD input: count the distinct pairs and bring the counts to the driver
        pair_counts = item_seq.flatMap(generate_pair).countByValue()
    else:
        # local input: same counting without Spark
        pair_counts = defaultdict(int)
        for seq in item_seq:
            for pair in generate_pair(seq):
                pair_counts[pair] += 1

    count_mat = defaultdict(dict)
    out_total = defaultdict(int)
    for (src, dst), cnt in pair_counts.items():
        count_mat[src][dst] = cnt
        out_total[src] += cnt

    # normalise each row by the total number of transitions leaving its item
    return {
        src: {dst: cnt / out_total[src] for dst, cnt in dst_counts.items()}
        for src, dst_counts in count_mat.items()
    }

def randomWalk(trans_mat, walk_length):
    """
    generate one random walk sequence based on transition matrix

    The start item is drawn uniformly from the items that have outgoing
    transitions. Every following item is drawn from the current item's
    transition probabilities. The walk ends early when it reaches an item
    without outgoing transitions.

    input:
        - trans_mat: dict mapping item -> {next item: probability}
        - walk_length: maximum number of items in the walk
    return:
        list of visited items; [] when trans_mat is empty or walk_length < 1
    """
    if walk_length < 1 or not trans_mat:
        return []

    cur = random.choice(list(trans_mat))
    walk = [cur]
    while len(walk) < walk_length:
        # .get() so that a defaultdict is not grown by the lookup
        next_probs = trans_mat.get(cur)
        if not next_probs:
            break
        cur = random.choices(list(next_probs), weights=list(next_probs.values()), k=1)[0]
        walk.append(cur)
    return walk

def generateItemSeqs(trans_mat, num_seq=20000, emb_length=10):
    """
    use random walk to generate multiple item sequences

    input:
        - trans_mat: dict mapping item -> {next item: probability}
        - num_seq: number of walks to generate
        - emb_length: maximum number of items per walk (despite the name, it
          bounds the walk length here, not an embedding size)
    return:
        list of num_seq walks, each a list of items; [] when trans_mat is
        empty or emb_length < 1
    """
    if not trans_mat or emb_length < 1:
        return []

    # start candidates are materialised once instead of once per walk
    start_items = list(trans_mat)

    # The walk is implemented locally instead of calling randomWalk because
    # that name is defined twice in this file with different signatures.
    def walk_once():
        cur = random.choice(start_items)
        walk = [cur]
        while len(walk) < emb_length:
            next_probs = trans_mat.get(cur)
            if not next_probs:
                # reached an item without outgoing transitions
                break
            cur = random.choices(list(next_probs), weights=list(next_probs.values()), k=1)[0]
            walk.append(cur)
        return walk

    return [walk_once() for _ in range(num_seq)]


# def trainItem2vec(spark, samples, embLength, embOutputPath, saveToRedis, redisKeyPrefix):
#     word2vec = Word2Vec().setVectorSize(embLength).setWindowSize(5).setNumIterations(10)
#     model = word2vec.fit(samples)
#     synonyms = model.findSynonyms("158", 20)
#     for synonym, cosineSimilarity in synonyms:
#         print(synonym, cosineSimilarity)
#     embOutputDir = '/'.join(embOutputPath.split('/')[:-1])
#     if not os.path.exists(embOutputDir):
#         os.makedirs(embOutputDir)
#     with open(embOutputPath, 'w') as f:
#         for movie_id in model.getVectors():
#             vectors = " ".join([str(emb) for emb in model.getVectors()[movie_id]])
#             f.write(movie_id + ":" + vectors + "\n")
#     embeddingLSH(spark, model.getVectors())
#     return model


def trainItem2Vec(item_seqs, emb_length, output_path, save_to_redis, redis_keyprefix):
    """
    use Word2Vec to train item embedding and write the vectors to a text file
    input:
        - item_seqs: RDD pipeline instance, rather than dataframe
        - emb_length: size of the embedding vectors
        - output_path: file that receives one "<item id>:<v1> <v2> ..." line per item
        - save_to_redis, redis_keyprefix: accepted but not used by this function
    return:
        the fitted Word2Vec model
    Note:
    - Word2Vec from mllib is a function that take RDD pipeline as input.
    - Word2Vec from ml is a function that take Dataframe as input

    """
    # train word2Vec; the vector size comes from the parameter, not from a
    # module level global
    w2v = Word2Vec().setVectorSize(emb_length).setWindowSize(5).setNumIterations(10)
    model = w2v.fit(item_seqs)

    # test word2vec by printing the 20 nearest items of a fixed item id
    # NOTE(review): presumably this fails when "157" is not in the trained
    # vocabulary - confirm against the data set in use
    synonyms = model.findSynonyms("157", 20)
    for synonym, cos_similarity in synonyms:
        print(synonym, cos_similarity)

    # output_path is a file, so only its parent directory may be created;
    # creating output_path itself as a directory would make open() fail
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # fetch the vectors once instead of once per item
    vectors = model.getVectors()
    with open(output_path, "w") as fp:
        for movie_id in vectors:
            # convert vector to string type and store it
            vector = " ".join([str(emb) for emb in vectors[movie_id]])
            fp.write(movie_id + ":" + vector + "\n")
    return model


def DeepWalk(spark, item_seq, walk_length, num_walk, output_file, save_to_redis=False, redis_key_prefix=None,
             emb_length=10):
    """
    use DeepWalk to generate graph embeddings
    input:
        - spark: SparkSession, used to turn locally generated walks into an RDD
        - item_seq: RDD based sequence of item visited by a user
        - walk_length: maximum number of items in each random walk
        - num_walk: number of random walks to generate
        - output_file: file the trained embeddings are written to
        - save_to_redis, redis_key_prefix: forwarded to trainItem2Vec
        - emb_length: size of the embedding vectors (default 10)
    return:
        whatever trainItem2Vec returns for the generated walks
    """
    # construct probability graph
    trans_mat = getTransitionMatrix(item_seq)

    # generate sequence samples randomly; generateItemSeqs names its walk
    # length parameter emb_length
    samples = generateItemSeqs(trans_mat, num_seq=num_walk, emb_length=walk_length)

    # trainItem2Vec documents an RDD input, so local walks are distributed first
    if isinstance(samples, (list, tuple)):
        samples = spark.sparkContext.parallelize(samples)

    # train item2Vec on the walks
    model = trainItem2Vec(samples, emb_length, output_file, save_to_redis, redis_key_prefix)

    return model

def getUserEmb():
    """
    Placeholder for generating user embeddings from item embeddings.

    Not implemented: takes no arguments and returns None.
    """
    return None


if __name__ == '__main__':
    # One local Spark session for the whole run.
    conf = SparkConf().setAppName('ctrModel').setMaster('local')
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    # Change to your own filepath
    file_path = '../../data/'
    rawSampleDataPath = file_path + "ratings.csv"
    embLength = 10

    # Step 1: per-user item sequences ordered by rating time.
    print("Process ItemSquence...")
    samplesRating = spark.read.csv(rawSampleDataPath, header=True)
    item_seqs = getItemSeqs(spark, samplesRating)

    # Step 2: train the item embedding on those sequences and save it.
    # The item2vec / graph embedding / user embedding calls
    # (trainItem2vec, graphEmb, generateUserEmb) are not run from this block.
    graphEmbPath = file_path + "modeldata/itemGraphEmb.csv"
    trainItem2Vec(
        item_seqs,
        emb_length=embLength,
        output_path=graphEmbPath,
        save_to_redis=False,
        redis_keyprefix=None,
    )
    print("Done!")
    

Process ItemSquence...
325 0.942238450050354
255 0.9304288029670715
330 0.930080771446228
606 0.9175326228141785
93 0.9100475311279297
267 0.9068793058395386
415 0.9025557637214661
437 0.9016014337539673
275 0.9014120101928711
366 0.8982865214347839
505 0.8923134207725525
413 0.890252947807312
433 0.8874485492706299
177 0.8846246600151062
240 0.8830882906913757
174 0.8828160762786865
386 0.8783508539199829
328 0.8764989376068115
312 0.8761969208717346
476 0.8732856512069702
Done!


In [3]:
class UdfFunction:
    """Namespace for the plain Python functions that are wrapped as Spark UDFs."""

    @staticmethod
    def sortF(movie_list, timestamp_list):
        """
        sort by time and return the corresponding movie sequence
        eg:
            input: movie_list:[1,2,3]
                   timestamp_list:[1112486027,1212546032,1012486033]
            return [3,1,2]

        Timestamps that can all be read as numbers (including numeric strings
        such as "1112486027") are ordered numerically. Otherwise the raw
        values are compared as they are. If the two lists differ in length,
        the extra elements of the longer one are ignored.
        """
        # Numeric strings must not be compared as text: "999999999" would sort
        # after "1000000000".
        try:
            sort_keys = [float(t) for t in timestamp_list]
        except (TypeError, ValueError):
            sort_keys = list(timestamp_list)
        # sorted() is stable, so equal timestamps keep their input order
        pairs = sorted(zip(movie_list, sort_keys), key=lambda x: x[1])
        return [x[0] for x in pairs]


def processItemSequence(spark, rawSampleDataPath):
    """
    Load the rating CSV and build one time-ordered movie sequence per user.

    input:
        - spark: SparkSession used to read the file
        - rawSampleDataPath: path of the rating CSV (read with a header row)
    return:
        RDD with one element per user: the list of tokens obtained by joining
        the ordered movie ids with spaces and splitting them again
    """
    ratings = spark.read.format("csv").option("header", "true").load(rawSampleDataPath)

    # UDF that orders the collected movie ids by their timestamps
    orderByTime = udf(UdfFunction.sortF, ArrayType(StringType()))
    orderedMovies = orderByTime(F.collect_list("movieId"), F.collect_list("timestamp")).alias('movieIds')

    # only ratings of 3.5 and above contribute to a sequence
    likedRatings = ratings.where(F.col("rating") >= 3.5)
    perUser = likedRatings.groupBy("userId").agg(orderedMovies)
    perUser = perUser.withColumn("movieIdStr", array_join(F.col("movieIds"), " "))

    return perUser.select('movieIdStr').rdd.map(lambda row: row[0].split(' '))


def embeddingLSH(spark, movieEmbMap):
    """
    Bucket the movie embeddings with BucketedRandomProjectionLSH and print the result.

    input:
        - spark: SparkSession used to build the embedding dataframe
        - movieEmbMap: mapping movie id -> iterable of embedding values
    Prints the schema and the first 10 rows of the bucketed dataframe, then
    the 5 approximate nearest neighbours of a fixed 10-dimensional sample
    embedding. Returns nothing.
    """
    embRows = []
    for movieId, rawEmb in movieEmbMap.items():
        # each component is converted to np.float64 before building the vector
        denseEmb = Vectors.dense([np.float64(component) for component in rawEmb])
        embRows.append((movieId, denseEmb))
    movieEmbDF = spark.createDataFrame(embRows).toDF("movieId", "emb")

    lshModel = BucketedRandomProjectionLSH(
        inputCol="emb", outputCol="bucketId", bucketLength=0.1, numHashTables=3
    ).fit(movieEmbDF)
    withBuckets = lshModel.transform(movieEmbDF)

    print("movieId, emb, bucketId schema:")
    withBuckets.printSchema()
    print("movieId, emb, bucketId data result:")
    withBuckets.show(10, truncate=False)

    print("Approximately searching for 5 nearest neighbors of the sample embedding:")
    probe = Vectors.dense(0.795, 0.583, 1.120, 0.850, 0.174, -0.839, -0.0633, 0.249, 0.673, -0.237)
    lshModel.approxNearestNeighbors(movieEmbDF, probe, 5).show(truncate=False)


def trainItem2vec(spark, samples, embLength, embOutputPath, saveToRedis, redisKeyPrefix):
    """
    Train a Word2Vec item embedding, write it to a file and run the LSH demo on it.

    input:
        - spark: SparkSession, forwarded to embeddingLSH
        - samples: RDD of item sequences to train on
        - embLength: size of the embedding vectors
        - embOutputPath: file that receives one "<item id>:<v1> <v2> ..." line per item
        - saveToRedis, redisKeyPrefix: accepted but not used by this function
    return:
        the fitted Word2Vec model
    """
    word2vec = Word2Vec().setVectorSize(embLength).setWindowSize(5).setNumIterations(10)
    model = word2vec.fit(samples)

    # print the 20 nearest items of a fixed item id as a quick sanity check
    synonyms = model.findSynonyms("158", 20)
    for synonym, cosineSimilarity in synonyms:
        print(synonym, cosineSimilarity)

    # A bare file name has no directory part; os.makedirs('') would raise.
    # exist_ok avoids failing when the directory appears between check and create.
    embOutputDir = os.path.dirname(embOutputPath)
    if embOutputDir:
        os.makedirs(embOutputDir, exist_ok=True)

    # fetch the vectors once instead of once per item
    vectorsByMovie = model.getVectors()
    with open(embOutputPath, 'w') as f:
        for movie_id in vectorsByMovie:
            vectors = " ".join([str(emb) for emb in vectorsByMovie[movie_id]])
            f.write(movie_id + ":" + vectors + "\n")

    embeddingLSH(spark, vectorsByMovie)
    return model


def generate_pair(x):
    """
    Return the consecutive item pairs of a watch sequence.

    eg:
        watch sequence: ['858', '50', '593', '457']
        return: [('858', '50'), ('50', '593'), ('593', '457')]

    Sequences with fewer than two items produce an empty list.
    """
    # Pairing by position keeps falsy items such as 0 or '' in the result; a
    # truthiness check on the previous item would silently drop their pairs.
    items = list(x)
    return list(zip(items, items[1:]))


def generateTransitionMatrix(samples):
    """
    Build the transition probabilities and the start distribution from item sequences.

    input:
        - samples: RDD of item sequences
    return:
        (transitionMatrix, itemDistribution)
        - transitionMatrix[a][b]: count of the pair (a, b) divided by the
          count of all pairs that start with a
        - itemDistribution[a]: count of all pairs that start with a divided
          by the total pair count
    """
    pairCounts = samples.flatMap(lambda seq: generate_pair(seq)).countByValue()

    # raw counts per source item, plus the number of pairs leaving each source
    countsBySource = defaultdict(dict)
    outgoingTotals = defaultdict(int)
    for (source, target), occurrences in pairCounts.items():
        countsBySource[source][target] = occurrences
        outgoingTotals[source] += occurrences
    grandTotal = sum(pairCounts.values())

    # normalise every row by the number of pairs that leave its source item
    transitionMatrix = defaultdict(dict)
    for source, targetCounts in countsBySource.items():
        rowTotal = outgoingTotals[source]
        for target, occurrences in targetCounts.items():
            transitionMatrix[source][target] = occurrences / rowTotal

    itemDistribution = defaultdict(dict)
    for source, rowTotal in outgoingTotals.items():
        itemDistribution[source] = rowTotal / grandTotal

    return transitionMatrix, itemDistribution


def _pickWeighted(distribution):
    """Draw one key of distribution by cumulative probability; None when it is empty."""
    threshold = random.random()
    cumulative = 0.0
    chosen = None
    for chosen, prob in distribution.items():
        cumulative += prob
        if cumulative >= threshold:
            break
    # If rounding keeps the cumulative sum below the threshold, the loop ends
    # without a break and the last key is returned instead of nothing.
    return chosen


def oneRandomWalk(transitionMatrix, itemDistribution, sampleLength):
    """
    Generate one random walk over the item graph.

    input:
        - transitionMatrix: mapping item -> {next item: probability}
        - itemDistribution: mapping item -> probability of starting at that item
        - sampleLength: maximum number of items in the walk
    return:
        list of visited items, beginning with the start item; [] when
        itemDistribution is empty. The walk ends early when the current item
        is missing from itemDistribution or transitionMatrix, or has no
        outgoing transitions.
    """
    # pick the first element
    firstItem = _pickWeighted(itemDistribution)
    if firstItem is None:
        # nothing to start from
        return []

    sample = [firstItem]
    curElement = firstItem
    while len(sample) < sampleLength:
        if (curElement not in itemDistribution) or (curElement not in transitionMatrix):
            break
        nextItem = _pickWeighted(transitionMatrix[curElement])
        if nextItem is None:
            break
        curElement = nextItem
        sample.append(curElement)
    return sample


def randomWalk(transitionMatrix, itemDistribution, sampleCount, sampleLength):
    """
    Generate sampleCount random walks by calling oneRandomWalk repeatedly.

    return:
        list with one walk per call, in generation order
    """
    return [
        oneRandomWalk(transitionMatrix, itemDistribution, sampleLength)
        for _ in range(sampleCount)
    ]


def graphEmb(samples, spark, embLength, embOutputFilename, saveToRedis, redisKeyPrefix,
             sampleCount=20000, sampleLength=10):
    """
    Train a graph embedding: build the transition matrix, sample random walks, train on them.

    input:
        - samples: RDD of item sequences used to build the transition matrix
        - spark: SparkSession, used to distribute the generated walks
        - embLength, embOutputFilename, saveToRedis, redisKeyPrefix: forwarded
          to trainItem2vec
        - sampleCount: number of random walks to generate (default 20000)
        - sampleLength: maximum number of items per walk (default 10)
    return:
        the model returned by trainItem2vec
    """
    transitionMatrix, itemDistribution = generateTransitionMatrix(samples)
    newSamples = randomWalk(transitionMatrix, itemDistribution, sampleCount, sampleLength)
    # the walks are generated on the driver; trainItem2vec is given an RDD
    rddSamples = spark.sparkContext.parallelize(newSamples)
    return trainItem2vec(spark, rddSamples, embLength, embOutputFilename, saveToRedis, redisKeyPrefix)


def generateUserEmb(spark, rawSampleDataPath, model, embLength, embOutputPath, saveToRedis, redisKeyPrefix):
    """
    Write one embedding per user: the element-wise sum of the embeddings of the movies they rated.

    input:
        - spark: SparkSession used to read the ratings and build the embedding dataframe
        - rawSampleDataPath: path of the rating CSV (read with a header row)
        - model: trained model whose getVectors() maps movie id -> vector
        - embOutputPath: file that receives one "<user id>:<v1> <v2> ..." line per user
        - embLength, saveToRedis, redisKeyPrefix: accepted but not used by this function
    Ratings whose movie has no embedding are dropped by the inner join.
    """
    ratings = spark.read.format("csv").option("header", "true").load(rawSampleDataPath)

    # movie embeddings as a dataframe so that they can be joined to the ratings
    embRows = [(movieId, list(emb)) for movieId, emb in model.getVectors().items()]
    embSchema = StructType([
        StructField('movieId', StringType(), False),
        StructField('emb', ArrayType(FloatType()), False),
    ])
    embDF = spark.createDataFrame(embRows, schema=embSchema)
    ratedWithEmb = ratings.join(embDF, on='movieId', how='inner')

    def addVectors(a, b):
        # element-wise sum, indexed over the length of the first vector
        return [a[i] + b[i] for i in range(len(a))]

    userEmbPairs = ratedWithEmb.select('userId', 'emb').rdd.map(lambda row: (row[0], row[1]))
    userEmbs = userEmbPairs.reduceByKey(addVectors).collect()

    with open(embOutputPath, 'w') as f:
        for userId, emb in userEmbs:
            f.write(userId + ":" + " ".join([str(value) for value in emb]) + "\n")

In [5]:
if __name__ == '__main__':
    # One local Spark session for the whole pipeline.
    conf = SparkConf().setAppName('ctrModel').setMaster('local')
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    # Change to your own filepath
    file_path = '../../data/'
    rawSampleDataPath = file_path + "ratings.csv"
    embLength = 10

    # Output files, one per stage.
    item2vecPath = file_path + "modeldata/item2vecEmb.csv"
    graphEmbPath = file_path + "modeldata/itemGraphEmb.csv"
    userEmbPath = file_path + "modeldata/userEmb.csv"

    # Stage 1: per-user movie sequences ordered by rating time.
    print("Process ItemSquence...")
    samples = processItemSequence(spark, rawSampleDataPath)

    # Stage 2: item2vec directly on the user sequences.
    print("Train Item2Vec...")
    model = trainItem2vec(
        spark, samples, embLength,
        embOutputPath=item2vecPath,
        saveToRedis=False,
        redisKeyPrefix="i2vEmb",
    )

    # Stage 3: embedding trained on random walks over the item graph.
    print("Train graph Embedding...")
    graphEmb(
        samples, spark, embLength,
        embOutputFilename=graphEmbPath,
        saveToRedis=True,
        redisKeyPrefix="graphEmb",
    )

    # Stage 4: user embeddings derived from the item2vec model.
    print("Train User Embedding...")
    generateUserEmb(
        spark, rawSampleDataPath, model, embLength,
        embOutputPath=userEmbPath,
        saveToRedis=False,
        redisKeyPrefix="uEmb",
    )
    print("Done!")

Process ItemSquence...
Train Item2Vec...
48 0.9554703831672668
256 0.9526916146278381
186 0.9149225950241089
31 0.9058868885040283
168 0.8825162053108215
355 0.8798472285270691
277 0.8489635586738586
252 0.846895158290863
432 0.838388979434967
552 0.8213717341423035
520 0.8202667236328125
276 0.8162290453910828
44 0.8061022162437439
236 0.799111008644104
2 0.778323769569397
455 0.776578426361084
435 0.753156304359436
60 0.7498870491981506
204 0.7483898401260376
169 0.747954249382019
movieId, emb, bucketId schema:
root
 |-- movieId: string (nullable = true)
 |-- emb: vector (nullable = true)
 |-- bucketId: array (nullable = true)
 |    |-- element: vector (containsNull = true)

movieId, emb, bucketId data result:
+-------+-----------------------------------------------------------------------------------------------------------------------+------------------------+
|movieId|emb                                                                                                               

# Conclusion
+ Difference between `mllib` and `ml` in PySpark
    - `pyspark.mllib` is the RDD-based API: its functions take an RDD as input
    - `pyspark.ml` is the DataFrame-based API: its functions take a Spark DataFrame as input