In [1]:
import os
import sys
# 如果当前代码文件运行测试需要加入修改路径，避免出现后导包问题
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(BASE_DIR))

PYSPARK_PYTHON = "/miniconda2/envs/reco_sys/bin/python"
# 当存在多个版本时，不指定很可能会导致出错
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON

from offline import SparkSessionBase

class UpdateRecall(SparkSessionBase):

    SPARK_APP_NAME = "updateRecall"
    ENABLE_HIVE_SUPPORT = True

    def __init__(self):
        self.spark = self._create_spark_session()

ur = UpdateRecall()

In [2]:
# - 1、数据类型转换,clicked以及用户ID与文章ID处理
ur.spark.sql('use profile')
user_article_basic = ur.spark.sql("select user_id, article_id, clicked from user_article_basic")
user_article_basic.show()




+-------------------+-------------------+-------+
|            user_id|         article_id|clicked|
+-------------------+-------------------+-------+
|1105045287866466304|              14225|  false|
|1106476833370537984|              14208|  false|
|1109980466942836736|              19233|  false|
|1109980466942836736|              44737|  false|
|1109993249109442560|              17283|  false|
|1111189494544990208|              19322|  false|
|1111524501104885760|              44161|  false|
|1112727762809913344|              18172|   true|
|1113020831425888256|1112592065390182400|  false|
|1114863735962337280|              17665|  false|
|1114863741448486912|              14208|  false|
|1114863751909081088|              13751|  false|
|1114863846486441984|              17940|  false|
|1114863941936218112|              15196|  false|
|1114863998437687296|              19233|  false|
|1114864164158832640|             141431|  false|
|1114864237131333632|              13797|  false|


In [3]:
def convert_boolean_int(row):
    return row.user_id, row.article_id, int(row.clicked)
    
    
user_article_basic = user_article_basic.rdd.map(convert_boolean_int).toDF(['user_id', 'article_id', 'clicked'])

In [4]:
user_article_basic.show()


+-------------------+-------------------+-------+
|            user_id|         article_id|clicked|
+-------------------+-------------------+-------+
|1105045287866466304|              14225|      0|
|1106476833370537984|              14208|      0|
|1109980466942836736|              19233|      0|
|1109980466942836736|              44737|      0|
|1109993249109442560|              17283|      0|
|1111189494544990208|              19322|      0|
|1111524501104885760|              44161|      0|
|1112727762809913344|              18172|      1|
|1113020831425888256|1112592065390182400|      0|
|1114863735962337280|              17665|      0|
|1114863741448486912|              14208|      0|
|1114863751909081088|              13751|      0|
|1114863846486441984|              17940|      0|
|1114863941936218112|              15196|      0|
|1114863998437687296|              19233|      0|
|1114864164158832640|             141431|      0|
|1114864237131333632|              13797|      0|


In [5]:
# 对用户ID和文章ID进行，索引建立，
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

user_indexer = StringIndexer(inputCol='user_id', outputCol='als_user_id')
article_indexer = StringIndexer(inputCol='article_id', outputCol='als_article_id')
pip = Pipeline(stages=[user_indexer, article_indexer])
pip_model = pip.fit(user_article_basic)
als_user_article = pip_model.transform(user_article_basic)

In [6]:
als_user_article.show()


+-------------------+-------------------+-------+-----------+--------------+
|            user_id|         article_id|clicked|als_user_id|als_article_id|
+-------------------+-------------------+-------+-----------+--------------+
|1105045287866466304|              14225|      0|       28.0|          13.0|
|1106476833370537984|              14208|      0|       14.0|           1.0|
|1109980466942836736|              19233|      0|       60.0|          19.0|
|1109980466942836736|              44737|      0|       60.0|          17.0|
|1109993249109442560|              17283|      0|       48.0|           7.0|
|1111189494544990208|              19322|      0|        7.0|         140.0|
|1111524501104885760|              44161|      0|       49.0|          11.0|
|1112727762809913344|              18172|      1|       45.0|          55.0|
|1113020831425888256|1112592065390182400|      0|       71.0|          29.0|
|1114863735962337280|              17665|      0|        9.0|           5.0|

In [7]:
#- 2、ALS模型训练以及推荐
from pyspark.ml.recommendation import ALS
als = ALS(userCol='als_user_id', itemCol='als_article_id', ratingCol='clicked')
als_model = als.fit(als_user_article)


In [8]:
recall_res = als_model.recommendForAllUsers(100)
recall_res.show()



+-----------+--------------------+
|als_user_id|     recommendations|
+-----------+--------------------+
|         31|[[255,0.17741075]...|
|         65|[[0,0.0], [10,0.0...|
|         53|[[0,0.0], [10,0.0...|
|         34|[[255,0.06449647]...|
|         28|[[255,0.010355139...|
|         26|[[255,0.094471075...|
|         27|[[255,0.1838476],...|
|         44|[[255,0.19231558]...|
|         12|[[255,0.42547882]...|
|         22|[[255,0.14787017]...|
|         47|[[255,0.18815793]...|
|          1|[[255,0.22982743]...|
|         52|[[255,0.24817318]...|
|         13|[[255,0.1511052],...|
|          6|[[255,0.262215], ...|
|         16|[[255,0.2500384],...|
|          3|[[207,0.91574323]...|
|         20|[[255,0.21161154]...|
|         40|[[255,0.06386929]...|
|         57|[[255,0.18380986]...|
+-----------+--------------------+
only showing top 20 rows



In [9]:
# - 3、推荐结果解析处理
# 建立用户真实ID与用户索引的映射，文章真实ID与文章索引的映射
user_real_index = als_user_article.groupBy(['user_id']).max('als_user_id').withColumnRenamed('max(als_user_id)', 'als_user_id')

# user_real_index.show()



+-------------------+-----------+
|            user_id|als_user_id|
+-------------------+-----------+
|1106473203766657024|       26.0|
|1113049054452908032|       44.0|
|1114863751909081088|       37.0|
|1115534909935452160|       42.0|
|1113100263847100416|       54.0|
|1103195673450250240|        5.0|
|1105045287866466304|       28.0|
|1114864237131333632|        4.0|
|1111524501104885760|       49.0|
|1109995264376045568|       19.0|
|1105105185656537088|       46.0|
|1110071654421102592|       64.0|
|1114863965080387584|       65.0|
|1114864128259784704|       17.0|
|1114864233264185344|       40.0|
|1115436666438287360|       29.0|
|1114863846486441984|        2.0|
|1115089292662669312|       13.0|
|1113316420155867136|       72.0|
|1114863902073552896|       16.0|
+-------------------+-----------+
only showing top 20 rows



In [10]:
article_real_index = als_user_article.groupBy(['article_id']).max('als_article_id').withColumnRenamed('max(als_article_id)', 'als_article_id')


In [11]:
recall_res = recall_res.join(user_real_index, on=['als_user_id'], how='left').select(['als_user_id', 'recommendations', 'user_id'])

In [12]:
recall_res.show()

+-----------+--------------------+-------------------+
|als_user_id|     recommendations|            user_id|
+-----------+--------------------+-------------------+
|          8|[[263,0.3481275],...|1109976363453906944|
|         67|[[255,0.4729983],...|1114096769035141120|
|         70|[[255,0.38408458]...|1115534898262704128|
|          0|[[255,0.58535063]...|1106396183141548032|
|         69|[[0,0.0], [10,0.0...|1114094806092480512|
|          7|[[255,0.18091725]...|1111189494544990208|
|         49|[[0,0.0], [10,0.0...|1111524501104885760|
|         29|[[255,0.10471068]...|1115436666438287360|
|         64|[[255,0.09094194]...|1110071654421102592|
|         47|[[255,0.18815793]...|1112995431274512384|
|         42|[[255,0.24995728]...|1115534909935452160|
|         44|[[255,0.19231558]...|1113049054452908032|
|         35|[[207,0.6278385],...|                  4|
|         62|[[255,0.10057049]...|1114863741448486912|
|         18|[[255,0.20005049]...|1114864164158832640|
|         

In [13]:
# 对于文章推荐的解析
import pyspark.sql.functions as F

recall_res = recall_res.withColumn('als_article_id', F.explode('recommendations')).drop('recommendations').select(['user_id', 'als_article_id'])



In [14]:
recall_res.show()

+-------------------+-----------------+
|            user_id|   als_article_id|
+-------------------+-----------------+
|1109976363453906944|  [263,0.3481275]|
|1109976363453906944| [181,0.20810685]|
|1109976363453906944| [255,0.20628238]|
|1109976363453906944| [307,0.20628238]|
|1109976363453906944| [323,0.20628238]|
|1109976363453906944| [293,0.20628238]|
|1109976363453906944| [336,0.19855197]|
|1109976363453906944|   [164,0.104869]|
|1109976363453906944|[207,0.104758695]|
|1109976363453906944| [224,0.10435731]|
|1109976363453906944| [210,0.09976264]|
|1109976363453906944| [204,0.09689172]|
|1109976363453906944| [184,0.09689172]|
|1109976363453906944| [125,0.08567983]|
|1109976363453906944|[149,0.081252605]|
|1109976363453906944| [327,0.07774757]|
|1109976363453906944| [341,0.07774757]|
|1109976363453906944| [299,0.07774757]|
|1109976363453906944| [305,0.07774757]|
|1109976363453906944| [275,0.07774757]|
+-------------------+-----------------+
only showing top 20 rows



In [15]:
def get_article_index(row):
    return row.user_id, row.als_article_id[0]

recall_res = recall_res.rdd.map(get_article_index).toDF(['user_id', 'als_article_id'])

In [16]:
recall_res.show()

+-------------------+--------------+
|            user_id|als_article_id|
+-------------------+--------------+
|1109976363453906944|           263|
|1109976363453906944|           181|
|1109976363453906944|           255|
|1109976363453906944|           307|
|1109976363453906944|           323|
|1109976363453906944|           293|
|1109976363453906944|           336|
|1109976363453906944|           164|
|1109976363453906944|           207|
|1109976363453906944|           224|
|1109976363453906944|           210|
|1109976363453906944|           204|
|1109976363453906944|           184|
|1109976363453906944|           125|
|1109976363453906944|           149|
|1109976363453906944|           327|
|1109976363453906944|           341|
|1109976363453906944|           299|
|1109976363453906944|           305|
|1109976363453906944|           275|
+-------------------+--------------+
only showing top 20 rows



In [17]:
recall_res = recall_res.join(article_real_index, on=['als_article_id'], how='left').select(['user_id', 'article_id'])

In [18]:
# - 4、推荐结果存储
# 获取每个文章对应的频道，推荐给用户时按照频道存储
recall_res.show()

+-------------------+----------+
|            user_id|article_id|
+-------------------+----------+
|1108264901190615040|     13890|
|1114863751909081088|     13890|
|1114865014205841408|     13890|
|                 10|     13890|
|                  5|     13890|
|1109995683777085440|     13890|
|1114864233264185344|     13890|
|1115089292662669312|     13890|
|1114864474352779264|     13890|
|1114865875103514624|     13890|
|1114863941936218112|     13890|
|1113004557979353088|     13890|
|1114863748553637888|     13890|
|1103195673450250240|     13890|
|1114865402044743680|     13890|
|1114863998437687296|     13890|
|                 33|     13890|
|1114863735962337280|     13890|
|1106473203766657024|     13890|
|1114863902073552896|     13890|
+-------------------+----------+
only showing top 20 rows



In [19]:
#找到每个文章对应的频道，然后按照频道分组
ur.spark.sql('use article')

article_data = ur.spark.sql("select article_id, channel_id from article_data")

In [20]:
recall_channel = recall_res.join(article_data, on=['article_id'], how='left')

In [21]:
recall_channel.show()

+----------+-------------------+----------+
|article_id|            user_id|channel_id|
+----------+-------------------+----------+
|     13401|1114094806092480512|        18|
|     13401|1111524501104885760|        18|
|     13401|1114866560301793280|        18|
|     13401|1113316420155867136|        18|
|     13401|1109984273839947776|        18|
|     13401|1114865682668847104|        18|
|     13401|1114863965080387584|        18|
|     14805|1105045287866466304|        18|
|     14805|1114863846486441984|        18|
|     14805|1115535317173010432|        18|
|     14805|1114864128259784704|        18|
|     14805|1114871412419461120|        18|
|     14805|1114863759672737792|        18|
|     14805|                 10|        18|
|     14805|                  5|        18|
|     14805|1109995683777085440|        18|
|     14805|1114864233264185344|        18|
|     14805|1115089292662669312|        18|
|     14805|1114864474352779264|        18|
|     14805|1114865875103514624|

In [22]:
recall_channel = recall_channel.groupBy(['user_id', 'channel_id']).agg(F.collect_list('article_id')).withColumnRenamed('collect_list(article_id)', 'article_list')





In [23]:
recall_channel.show()

+-------------------+----------+--------------------+
|            user_id|channel_id|        article_list|
+-------------------+----------+--------------------+
|                 23|        18|[14805, 14839, 17...|
|1109993249109442560|         7|            [141437]|
|1113049054452908032|         7|            [141437]|
|1113100263847100416|         5|            [141440]|
|1114863751909081088|         7|            [141437]|
|                 38|        13|    [141431, 141431]|
|1114864233264185344|        13|    [141431, 141431]|
|                 10|         5|            [141440]|
|1106473203766657024|         7|            [141437]|
|                 33|        18|[14805, 14839, 17...|
|1106473203766657024|      null|[1112593324574769...|
|                  1|         7|            [141437]|
|1105093883106164736|        13|    [141431, 141431]|
|1114866560301793280|         7|            [141469]|
|1114864434305564672|         7|            [141437]|
|1105093883106164736|       

In [2]:
# 基于内容的召回， 用户的点击行为

ur.spark.sql('use profile')
user_article_basic = ur.spark.sql("select * from user_article_basic")
user_article_basic = user_article_basic.filter('clicked=True')


In [3]:
user_article_basic.show()

+-------------------+-------------------+-------------------+----------+------+-------+---------+--------+---------+
|            user_id|        action_time|         article_id|channel_id|shared|clicked|collected|exposure|read_time|
+-------------------+-------------------+-------------------+----------+------+-------+---------+--------+---------+
|1112727762809913344|2019-04-03 12:51:57|              18172|        18| false|   true|     true|    true|    19413|
|                  1|2019-03-07 16:57:34|              44386|        18| false|   true|    false|    true|    17850|
|1109976363453906944|2019-03-25 11:52:31|              13728|        18| false|   true|    false|    true|    14218|
|1114864354622177280|2019-04-09 16:39:22|              17304|        18| false|   true|    false|    true|         |
|                 23|2019-04-03 08:10:23|              44739|        18| false|   true|    false|    true|     7013|
|                  1|2019-03-17 10:32:01|              17632|   

In [None]:
def get_clicked_article_similar(partition):
    """召回用户点击的文章当中相似的文章推荐
    """
    import happybase
    pool = happybase.ConnectionPool(size=10, host='hadoop-master')
    
    with pool.connection() as conn:
        similar_table = conn.table('article_similar')
        for row in partition:
            # 获取相似文章结果表
            similar_article = similar_table.row(str(row.article_id).encode(),
                                                columns=[b'similar'])
            # 相似文章相似度排序过滤，召回不需要太大的数据， 百个，千
            _srt = sorted(similar_article.items(), key=lambda item: item[1], reverse=True)
            if _srt:
                # 每次行为推荐10篇文章
                reco_article = [int(i[0].split(b':')[1]) for i in _srt][:10]
            
                # 过滤历史
                
                # 存储
                 # 获取历史看过的该频道文章
                history_table = conn.table('history_recall')
                # 多个版本
                data = history_table.cells('reco:his:{}'.format(row.user_id).encode(),
                                           'channel:{}'.format(row.channel_id).encode())

                history = []
                if len(data) >= 2:
                    for l in data[:-1]:
                        history.extend(eval(l))
                else:
                    history = []

                # 过滤reco_article与history
                reco_res = list(set(reco_article) - set(history))

                # 进行推荐，放入基于内容的召回表当中以及历史看过的文章表当中
                if reco_res:
                    # content_table = conn.table('cb_content_recall')
                    content_table = conn.table('cb_recall')
                    content_table.put("recall:user:{}".format(row.user_id).encode(),
                                      {'content:{}'.format(row.channel_id).encode(): str(reco_res).encode()})

                    # 放入历史推荐过文章
                    history_table.put("reco:his:{}".format(row.user_id).encode(),
                                      {'channel:{}'.format(row.channel_id).encode(): str(reco_res).encode()})


user_article_basic.foreachPartition(get_clicked_article_similar)