## 合并数据

In [1]:
import os
import sys
# 如果当前代码文件运行测试需要加入修改路径，避免出现后导包问题
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(BASE_DIR))
print(BASE_DIR)
PYSPARK_PYTHON = "/miniconda2/envs/reco_sys/bin/python"
# 当存在多个版本时，不指定很可能会导致出错
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON
from offline import SparkSessionBase

/root/toutiao_project/reco_sys


In [2]:
class OriginArticleData(SparkSessionBase):
    
    SPARK_APP_NAME = "mergeArticle"
    SPARK_URL = "spark://hadoop-master:7077"

    ENABLE_HIVE_SUPPORT = True
    
    def __init__(self):
        self.spark = self._create_spark_session()

In [3]:
oa = OriginArticleData()

In [4]:
# 进行文章 前两个表 的合并
oa.spark.sql("use toutiao")
# news_article_basic 与news_article_content, article_id
titlce_content = oa.spark.sql("select a.article_id, a.channel_id, a.title, b.content from news_article_basic a inner join news_article_content b on a.article_id=b.article_id where a.article_id=116636")


In [5]:
titlce_content.show()

+----------+----------+---------------+--------------------+
|article_id|channel_id|          title|             content|
+----------+----------+---------------+--------------------+
|    116636|        18|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|
+----------+----------+---------------+--------------------+



In [6]:
# 进行title_content 与 文章频道名称合并
titlce_content.registerTempTable('temptable')

channel_title_content = oa.spark.sql("select t.*, n.channel_name from temptable t left join news_channel n on t.channel_id=n.channel_id")




In [7]:
channel_title_content.show()

+----------+----------+---------------+--------------------+------------+
|article_id|channel_id|          title|             content|channel_name|
+----------+----------+---------------+--------------------+------------+
|    116636|        18|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|      python|
+----------+----------+---------------+--------------------+------------+



In [8]:
# 合并三个内容到一个字符串
import pyspark.sql.functions as F

sentence_df = channel_title_content.select("article_id", "channel_id", "channel_name", "title", "content", 
                            F.concat_ws(',', 
                                       channel_title_content.channel_name,
                                       channel_title_content.title,
                                       channel_title_content.content).alias('sentence'))


In [9]:
sentence_df.show()
# sentence_df.write.insertInto("article_data")

+----------+----------+------------+---------------+--------------------+--------------------+
|article_id|channel_id|channel_name|          title|             content|            sentence|
+----------+----------+------------+---------------+--------------------+--------------------+
|    116636|        18|      python|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|python,动态再平衡投资策略历...|
+----------+----------+------------+---------------+--------------------+--------------------+



## 文章分词

In [10]:
# 读取文章，进行每篇张分词
oa.spark.sql("use article")
# article_data = oa.spark.sql("select * from article_data limit 10")
article_data = oa.spark.sql("select * from article_data where channel_id = 18 limit 10")
article_data.show()


+----------+----------+------------+--------------------+--------------------+--------------------+
|article_id|channel_id|channel_name|               title|             content|            sentence|
+----------+----------+------------+--------------------+--------------------+--------------------+
|     12237|        18|      python|想学习区块链？那就用 Python...|<div id="article_...|python,想学习区块链？那就用...|
|     12238|        18|      python|鲜为人知的 Python 语法 使...|<p>所有人（好吧，不是所有人）都...|python,鲜为人知的 Pyth...|
|     12243|        18|      python|手把手教你写网络爬虫（4）：Scr...|<div id="cnblogs_...|python,手把手教你写网络爬虫...|
|     12245|        18|      python|手把手教你写网络爬虫（5）：Pha...|<div id="cnblogs_...|python,手把手教你写网络爬虫...|
|     12247|        18|      python|用 Plumbum 开发 Pyth...|<div id="article_...|python,用 Plumbum ...|
|     12249|        18|      python|手把手教你写网络爬虫（6）：分布式...|<div id="cnblogs_...|python,手把手教你写网络爬虫...|
|     12251|        18|      python|手把手教你写网络爬虫（7）：URL...|<p><a href="http:...|python,手把手教你写网络爬虫...|


In [11]:
# 文章数据进行分词处理,得到分词结果
# 分词
def segmentation(partition):
    import os
    import re

    import jieba
    import jieba.analyse
    import jieba.posseg as pseg
    import codecs

    abspath = "/root/words"

    # 结巴加载用户词典
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # 停用词文本
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords_list():
        """返回stopwords列表"""
        stopwords_list = [i.strip()
                          for i in codecs.open(stopwords_path).readlines()]
        return stopwords_list

    # 所有的停用词列表
    stopwords_list = get_stopwords_list()

    # 分词
    def cut_sentence(sentence):
        """对切割之后的词语进行过滤，去除停用词，保留名词，英文和自定义词库中的词，长度大于2的词"""
        # print(sentence,"*"*100)
        # eg:[pair('今天', 't'), pair('有', 'd'), pair('雾', 'n'), pair('霾', 'g')]
        seg_list = pseg.lcut(sentence)
        seg_list = [i for i in seg_list if i.flag not in stopwords_list]
        filtered_words_list = []
        for seg in seg_list:
            # print(seg)
            if len(seg.word) <= 1:
                continue
            elif seg.flag == "eng":
                if len(seg.word) <= 2:
                    continue
                else:
                    filtered_words_list.append(seg.word)
            elif seg.flag.startswith("n"):
                filtered_words_list.append(seg.word)
            elif seg.flag in ["x", "eng"]:  # 是自定一个词语或者是英文单词
                filtered_words_list.append(seg.word)
        return filtered_words_list

    for row in partition:
        sentence = re.sub("<.*?>", "", row.sentence)    # 替换掉标签数据
        words = cut_sentence(sentence)
        yield row.article_id, row.channel_id, words


In [12]:
words_df = article_data.rdd.mapPartitions(segmentation).toDF(['article_id', 'channel_id', 'words'])

In [13]:
words_df.show()

+----------+----------+--------------------+
|article_id|channel_id|               words|
+----------+----------+--------------------+
|     48084|        18|[python, 学会, YOLO...|
|     48097|        18|[python, Python, ...|
|     48099|        18|[python, Kaggle, ...|
|     48100|        18|[python, Python, ...|
|     48103|        18|[python, python数据...|
|     48105|        18|[python, NumPy, 数...|
|     48106|        18|[python, python, ...|
|     48110|        18|[python, Python实践...|
|     48117|        18|[python, Python, ...|
|     48143|        18|[python, Python, ...|
+----------+----------+--------------------+



## 得到词频CV模型

In [14]:
# 先计算分词之后的每篇文章的词频（词袋模型，建立词的索引，统计词频），得到CV模型
# 统计所有文章不同的词，组成一个词列表 words_list = [1,2,3,,34,4,45,56,67,78,8.......,,,,.]
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol='words', outputCol='countFeatures', vocabSize=2000, minDF=1.0)
cv_model = cv.fit(words_df)
# 然后根据词频计算IDF以及词，得到IDF模型

In [15]:
cv_model.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/test.model")

In [16]:
from pyspark.ml.feature import CountVectorizerModel
cv_m = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/test.model")

In [17]:
cv_result = cv_m.transform(words_df)

In [18]:
cv_result.show(truncate=False)
# (541,[0,2,3,4,5,6....]) 的意思是词袋中一共有541个词， 其中索引为0的词在该篇文章中出现了0词

+----------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## 得到IDF模型

In [19]:
# IDF 模型
from pyspark.ml.feature import IDF
idf = IDF(inputCol="countFeatures", outputCol="idfFeatures")
idfModel = idf.fit(cv_result)
idfModel.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/testIDF.model")

In [20]:
# 可以进行转换
cv_m.vocabulary

['pa',
 'Python',
 '__',
 '问题',
 'import',
 'name',
 'python',
 'posts',
 'models',
 'post',
 'blog',
 'class',
 '.a',
 '博客',
 '文件',
 '数据',
 '项目',
 'url',
 'ul',
 'comment',
 'self',
 '模型',
 'setup',
 'legit',
 'return',
 'div',
 '代码',
 'request',
 'Iris',
 '策略',
 'def',
 'dist',
 '.h',
 'ckages',
 'get',
 'views',
 '目录',
 'form',
 'verbose',
 'django',
 'root',
 'ckage',
 '算法',
 '机器学习',
 '收益',
 'markdown',
 'ppend',
 'Post',
 'extensions',
 'tml',
 '属性',
 '评论',
 'sys',
 '标签',
 '结果',
 '编程',
 'index',
 'None',
 'version',
 'info',
 'objects',
 'format',
 '方法',
 'detail',
 '数据集',
 'category',
 'time',
 'sklearn',
 'lib',
 'comments',
 '收益率',
 'tag',
 'widget',
 'for',
 'Language',
 'render',
 'print',
 '新手',
 'the',
 '错误',
 'urls',
 'length',
 'create',
 '文章',
 '部分',
 'tags',
 '专栏',
 'validation',
 'Programming',
 'data',
 '函数',
 'body',
 'PyCharm',
 'install',
 '环境',
 'description',
 'foo',
 '类别',
 '脚本',
 'href',
 '指数',
 'left',
 'from',
 'title',
 '时间',
 'include',
 'system',
 'value',

In [21]:
idfModel.idf.toArray()[:20]

array([0.2006707 , 0.2006707 , 0.78845736, 0.45198512, 0.45198512,
       0.6061358 , 0.        , 2.39789527, 2.39789527, 2.39789527,
       1.70474809, 0.45198512, 0.31845373, 1.70474809, 0.6061358 ,
       0.2006707 , 1.29928298, 0.31845373, 0.45198512, 1.70474809])

## 依据CV及IDF得到TF-IDF

In [22]:
# IDF对CV结果进行计算TFIDF
from pyspark.ml.feature import IDFModel
idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/testIDF.model")
tfidf_res = idf_model.transform(cv_result)

In [23]:
tfidf_res.show(truncate=False)

+----------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [24]:
# 1265词的 {索引 以及 权重}
def func(partition):
    TOPK = 20
    for row in partition:
        # 找到索引与IDF值并进行排序
        _ = list(zip(row.idfFeatures.indices, row.idfFeatures.values))
        _ = sorted(_, key=lambda x: x[1], reverse=True)
        result = _[:TOPK]
        for word_index, tfidf in result:
            yield row.article_id, row.channel_id, int(word_index), round(float(tfidf), 4)
kewords_tfidf = tfidf_res.rdd.mapPartitions(func).toDF(['article_id', 'channel_id', 'index', 'weights'])

In [25]:
kewords_tfidf.show()

+----------+----------+-----+--------+
|article_id|channel_id|index| weights|
+----------+----------+-----+--------+
|     12237|        18|  582|134.6751|
|     12237|        18|   78| 78.9049|
|     12237|        18|   20| 61.4997|
|     12237|        18|  192| 57.9614|
|     12237|        18|  397| 40.2778|
|     12237|        18|  971| 35.7997|
|     12237|        18|   56| 34.3944|
|     12237|        18|   81|  34.095|
|     12237|        18|  980| 28.5842|
|     12237|        18|  374| 22.1617|
|     12237|        18|    4| 22.1473|
|     12237|        18|  218| 22.0878|
|     12237|        18|  533| 22.0878|
|     12237|        18|   30| 21.6953|
|     12237|        18|   24| 19.1072|
|     12237|        18|    2|  18.923|
|     12237|        18|  280|   18.19|
|     12237|        18| 1152| 16.1856|
|     12237|        18|  369| 15.5914|
|     12237|        18|  260| 15.3427|
+----------+----------+-----+--------+
only showing top 20 rows



In [26]:
# 利用keywordsIndex = ktt.spark.sql("select keyword, index idx from idf_keywords_values")中标，知道索引对应的词
idf_keywords_values = oa.spark.sql("select keyword, index idx from idf_keywords_values")

In [27]:
keyword_str_tfidf = kewords_tfidf.join(idf_keywords_values, idf_keywords_values.idx==kewords_tfidf.index).select(["article_id", "channel_id", "keyword", "weights"])

keyword_str_tfidf.show()

+----------+----------+--------+--------+
|article_id|channel_id| keyword| weights|
+----------+----------+--------+--------+
|    116649|        18|     var|  7.8798|
|    116636|        18|      属性|  40.914|
|    116641|        18|      规定|  2.3979|
|    116648|        18|   https| 31.1726|
|    116651|        18|document|  9.5916|
|    116636|        18|      名字|  7.1937|
|    116648|        18|     com| 49.4377|
|    116645|        18|      图片| 21.5811|
|    116645|        18|      架构| 16.7853|
|    116649|        18|      方向|  7.1937|
|    116644|        18|      空间|  9.5916|
|    116647|        18|  master|  3.4095|
|    116645|        18|      用户| 67.1411|
|    116648|        18|      对象|105.5074|
|    116647|        18|    else| 14.3874|
|    116649|        18|    else|  7.1937|
|    116641|        18|    more|  2.3979|
|    116649|        18|    json| 11.9895|
|    116641|        18|      全球|  2.3979|
|    116650|        18|      节点| 15.5914|
+----------+----------+--------+--

## 得到TextRank

In [28]:
# texrank
# 分词
def textrank(partition):
    import os

    import jieba
    import jieba.analyse
    import jieba.posseg as pseg
    import codecs

    abspath = "/root/words"

    # 结巴加载用户词典
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # 停用词文本
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords_list():
        """返回stopwords列表"""
        stopwords_list = [i.strip()
                          for i in codecs.open(stopwords_path).readlines()]
        return stopwords_list

    # 所有的停用词列表
    stopwords_list = get_stopwords_list()

    class TextRank(jieba.analyse.TextRank):
        def __init__(self, window=20, word_min_len=2):
            super(TextRank, self).__init__()
            self.span = window  # 窗口大小
            self.word_min_len = word_min_len  # 单词的最小长度
            # 要保留的词性，根据jieba github ，具体参见https://github.com/baidu/lac
            self.pos_filt = frozenset(
                ('n', 'x', 'eng', 'f', 's', 't', 'nr', 'ns', 'nt', "nw", "nz", "PER", "LOC", "ORG"))

        def pairfilter(self, wp):
            """过滤条件，返回True或者False"""

            if wp.flag == "eng":
                if len(wp.word) <= 2:
                    return False

            if wp.flag in self.pos_filt and len(wp.word.strip()) >= self.word_min_len \
                    and wp.word.lower() not in stopwords_list:
                return True
    # TextRank过滤窗口大小为5，单词最小为2
    textrank_model = TextRank(window=5, word_min_len=2)
    allowPOS = ('n', "x", 'eng', 'nr', 'ns', 'nt', "nw", "nz", "c")

    for row in partition:
        tags = textrank_model.textrank(row.sentence, topK=20, withWeight=True, allowPOS=allowPOS, withFlag=False)
        for tag in tags:
            yield row.article_id, row.channel_id, tag[0], tag[1]

In [29]:
textrank = article_data.rdd.mapPartitions(textrank).toDF(["article_id", "channel_id", "keyword", "textrank"])

In [30]:
textrank.show()

+----------+----------+----------+-------------------+
|article_id|channel_id|   keyword|           textrank|
+----------+----------+----------+-------------------+
|    116636|        18|    strong|                1.0|
|    116636|        18|        收益| 0.6192849441619113|
|    116636|        18|        策略| 0.4334340987622379|
|    116636|        18|noreferrer| 0.3579020085676457|
|    116636|        18|        指数| 0.3532373135412326|
|    116636|        18|        基金| 0.3243784993341385|
|    116636|        18|  nofollow| 0.3194251024727278|
|    116636|        18|       收益率|0.30972169743824307|
|    116636|        18|       img| 0.2953536476557739|
|    116636|        18|     https|0.23612604888003635|
|    116636|        18|        资产| 0.1954448041748237|
|    116636|        18|        股票|0.16595421589513454|
|    116636|        18|  zhuanlan| 0.1616599991034617|
|    116636|        18|       rel| 0.1495616224868557|
|    116636|        18|        编程|0.12859125571265928|
|    11663

## 依据TF-IDF及TextRank得到词及其权重

In [31]:
# 计算关键词最后的权重，Textank  * IDF
idf = oa.spark.sql("select * from idf_keywords_values")
idf = idf.withColumnRenamed("keyword", "keyword1")
result = textrank.join(idf,textrank.keyword==idf.keyword1)
keywords_res = result.withColumn("weights", result.textrank * result.idf).select(["article_id", "channel_id", "keyword", "weights"])

In [32]:
# 20个Keyword，对应的权重，文章ID，channel_id
keywords_res.show()

+----------+----------+----------+-------------------+
|article_id|channel_id|   keyword|            weights|
+----------+----------+----------+-------------------+
|    116645|        18|setuptools| 1.2251622925259384|
|    116642|        18|    import| 0.6320905751790109|
|    116645|        18|    import| 0.4569523948721447|
|    116648|        18|    import| 1.9502451027406746|
|    116650|        18|    import|  1.065590842371233|
|    116636|        18|  zhuanlan| 0.8234521565741966|
|    116649|        18|  zhuanlan| 0.9024517965997578|
|    116642|        18|        文件|0.33193437021645833|
|    116644|        18|      编程语言| 1.6653007690912947|
|    116642|        18|        .h|0.32597910527201746|
|    116648|        18|        .h| 0.5789892737748391|
|    116651|        18|     node1| 1.0381522229268267|
|    116644|        18|        细节| 1.0725286726606469|
|    116650|        18|       鸢尾花| 1.9262083951097058|
|    116641|        18|      code| 0.7644580359441208|
|    11664

In [33]:
keywords_res.registerTempTable("temptable")

In [34]:
keyword_weights_list = oa.spark.sql("select article_id, min(channel_id) channel_id, collect_list(keyword) keywords, collect_list(weights) weights from temptable group by article_id")

In [35]:
keyword_weights_list.show(truncate=False)

+----------+----------+------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|article_id|channel_id|keywords                                                                                                                            |weights                                                                                                                                                                                                                                                                                

In [36]:
def keyword_weights_to_dict(row):
    return row.article_id, row.channel_id, dict(zip(row.keywords, row.weights))
    
keywords = keyword_weights_list.rdd.map(keyword_weights_to_dict).toDF(['article_id', 'channel_id', 'keywords'])

In [37]:
keywords.show(truncate=False)

+----------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|article_id|channel_id|keywords                                                                                                                                                                                                                                                                                                                                                                           

## 依据TF-IDF及TextRank得到主题词

In [38]:
topic_sql = """
                select t.article_id article_id2, collect_set(t.keyword) topics from tfidf_keywords_values t
                inner join 
                textrank_keywords_values r
                where t.keyword=r.keyword
                group by article_id2
                """

article_topics = oa.spark.sql(topic_sql)

In [39]:
#article_topics.show()

## 依据词权重和主题词得到文章画像

In [40]:
article_profile = keywords.join(article_topics, keywords.article_id==article_topics.article_id2).select(["article_id", "channel_id", "keywords", "topics"])


In [41]:
article_profile.show(truncate=False)

+----------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|article_id|channel_id|keywords                                                                                                                                                                                            

## 词向量模型训练

In [42]:
# 通过少量数据来演示训练
from pyspark.ml.feature import Word2Vec


w2v = Word2Vec(vectorSize=100, inputCol='words', outputCol='model', minCount=3)
w2v_model = w2v.fit(words_df)
w2v_model.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/test.word2vec")

In [43]:
article_profile.show(truncate=False)

+----------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|article_id|channel_id|keywords                                                                                                                                                                                            

In [44]:
# 求出增量文章的词向量，增量文章 一共10篇文章
# 1、加载某个频道模型，得到每个词的向量
from pyspark.ml.feature import Word2VecModel

word_vec = Word2VecModel.load("hdfs://hadoop-master:9000/headlines/models/test.word2vec")
vectors = word_vec.getVectors()

In [45]:
vectors.show(truncate=False)

+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## 得到文章的词及权重并展开

In [46]:
# 获取频道的文章画像，得到文章画像的关键词，找到这些文章关键词对应词向量
python_article_profile = article_profile.filter('channel_id=18')

In [47]:
python_article_profile.show(truncate=False)

+----------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|article_id|channel_id|keywords                                                                                                                                                                                            

In [48]:
# 将文章画像的字典， 词语与权重 进行展开
python_article_profile.registerTempTable('profile')

_articlekeywordsweight = oa.spark.sql("select article_id,  channel_id, keyword, weight from profile LATERAL VIEW explode(keywords) AS keyword, weight")


_articlekeywordsweight.show()

+----------+----------+--------------+-------------------+
|article_id|channel_id|       keyword|             weight|
+----------+----------+--------------+-------------------+
|     48110|        18|            __|0.24830044465980028|
|     48110|        18|            r2|0.29319850231010564|
|     48110|        18|        python|0.19355146079597563|
|     48110|        18|        github| 0.4098877352297189|
|     48110|        18|         blank| 0.3525965608765685|
|     48110|        18|            形式|0.17104900722711897|
|     48110|        18|      Python实践| 0.8366135131877409|
|     48110|        18|      tomorrow| 1.0043957652237685|
|     48110|        18|            h2| 0.9201527334232663|
|     48110|        18|        target| 0.3357195251804835|
|     48110|        18|            pa| 0.6651385256756351|
|     48110|        18|            代码|0.08198859488038059|
|     48110|        18|           www|0.42582821479009686|
|     48110|        18|           pip| 0.302712115914703

## 合并词向量和 文章词及权重

In [49]:
# 将文章词权重 和 词向量 模型 进行 inner 合并
article_keyword_vec_weights = _articlekeywordsweight.join(vectors, vectors.word==_articlekeywordsweight.keyword, "inner")

In [50]:
article_keyword_vec_weights.show(truncate=False)

+----------+----------+--------+-------------------+--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## 文章向量

In [51]:
# 得到文章向量
article_keyword_vec_weights.registerTempTable("temptable")

def func(row):
    x = 0
    for v in row.vectors:
        x += v
        
    return row.article_id, row.channel_id, x / len(row.vectors)

article_vector = oa.spark.sql("select article_id, min(channel_id) channel_id, collect_set(vector) vectors from temptable group by article_id").rdd.map(func).toDF(['article_id', 'channel_id', 'articlevector'])


In [52]:
article_vector.show(truncate=False)
# article_vector.write.insertInto("article_vector")

+----------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## 根据文章向量计算文章相似度

In [53]:
from pyspark.ml.linalg import Vectors
def _array_to_vector(row):
    return row.article_id, Vectors.dense(row.articlevector)

train = article_vector.rdd.map(_array_to_vector).toDF(['article_id', 'article_vector'])

In [54]:
train

DataFrame[article_id: bigint, article_vector: vector]

In [55]:
# BRP进行fit
from pyspark.ml.feature import BucketedRandomProjectionLSH

BRP = BucketedRandomProjectionLSH(inputCol='article_vector', outputCol='hashes', numHashTables=4.0, bucketLength=10.0)
model = BRP.fit(train)

In [56]:
similar = model.approxSimilarityJoin(train, train, 2.0, distCol='EuclideanDistance')

In [57]:
similar.sort(['EuclideanDistance']).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## 保存文章相似度

In [58]:
def save_hbase(partition):
    import happybase
    pool = happybase.ConnectionPool(size=3, host='hadoop-master')
    
    with pool.connection() as conn:
        # 建议表的连接
        table = conn.table('article_similar')
        for row in partition:
            if row.datasetA.article_id == row.datasetB.article_id:
                pass
            else:
                table.put(str(row.datasetA.article_id).encode(),
                         {"similar:{}".format(row.datasetB.article_id).encode(): b'%0.4f' % (row.EuclideanDistance)})
        # 手动关闭所有的连接
        conn.close()

similar.foreachPartition(save_hbase)