## 1. 合并数据

In [None]:
import os
import sys
# 如果当前代码文件运行测试需要加入修改路径，避免出现后导包问题
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(BASE_DIR))
print(BASE_DIR)
PYSPARK_PYTHON = "/miniconda2/envs/reco_sys/bin/python"
# 当存在多个版本时，不指定很可能会导致出错
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON
from offline import SparkSessionBase

In [2]:
class OriginArticleData(SparkSessionBase):
    
    SPARK_APP_NAME = "mergeArticle"
    SPARK_URL = "spark://hadoop-master:7077"

    ENABLE_HIVE_SUPPORT = True
    
    def __init__(self):
        self.spark = self._create_spark_session()

In [3]:
oa = OriginArticleData()

In [4]:
# 进行文章 前两个表 的合并
oa.spark.sql("use toutiao")
# news_article_basic 与news_article_content, article_id
titlce_content = oa.spark.sql("select a.article_id, a.channel_id, a.title, b.content from news_article_basic a inner join news_article_content b on a.article_id=b.article_id where a.article_id=116636")


In [5]:
titlce_content.show()

+----------+----------+---------------+--------------------+
|article_id|channel_id|          title|             content|
+----------+----------+---------------+--------------------+
|    116636|        18|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|
+----------+----------+---------------+--------------------+



In [6]:
# 进行title_content 与 文章频道名称合并
titlce_content.registerTempTable('temptable')

channel_title_content = oa.spark.sql("select t.*, n.channel_name from temptable t left join news_channel n on t.channel_id=n.channel_id")




In [7]:
channel_title_content.show()

+----------+----------+---------------+--------------------+------------+
|article_id|channel_id|          title|             content|channel_name|
+----------+----------+---------------+--------------------+------------+
|    116636|        18|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|      python|
+----------+----------+---------------+--------------------+------------+



In [8]:
# 合并三个内容到一个字符串
import pyspark.sql.functions as F

sentence_df = channel_title_content.select("article_id", "channel_id", "channel_name", "title", "content", 
                            F.concat_ws(',', 
                                       channel_title_content.channel_name,
                                       channel_title_content.title,
                                       channel_title_content.content).alias('sentence'))


In [9]:
sentence_df.show()
# sentence_df.write.insertInto("article_data")

+----------+----------+------------+---------------+--------------------+--------------------+
|article_id|channel_id|channel_name|          title|             content|            sentence|
+----------+----------+------------+---------------+--------------------+--------------------+
|    116636|        18|      python|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|python,动态再平衡投资策略历...|
+----------+----------+------------+---------------+--------------------+--------------------+



## 2. 文章分词

In [10]:

# 读取文章，进行每篇张分词
oa.spark.sql("use article")
article_data = oa.spark.sql("select * from article_data limit 10")
article_data.show()


+----------+----------+------------+--------------------+--------------------+--------------------+
|article_id|channel_id|channel_name|               title|             content|            sentence|
+----------+----------+------------+--------------------+--------------------+--------------------+
|         1|        17|          前端|     Vue props用法小结原荐|<p><strong>Vue pr...|前端,Vue props用法小结原...|
|         2|        17|          前端|vue.js响应式原理解析与实现—...|<p>上次我们已经分析了vue.j...|前端,vue.js响应式原理解析与...|
|         3|        17|          前端|JavaScript中浅拷贝和深拷...|<p>要理解 JavaScript...|前端,JavaScript中浅拷贝...|
|         4|        17|          前端|基于vue2.0 +vuex+ e...|<p>效果演示地址,</p><p>...|前端,基于vue2.0 +vuex...|
|         5|        17|          前端|immutability因Reac...|<p><img src="http...|前端,immutability因R...|
|         6|        17|          前端|简单了解 node npm cnp...|<span id="OSC_h1_...|前端,简单了解 node npm ...|
|         7|        17|          前端|       Web工程师以太坊入门原荐|<p>我经常构建使用以太坊的Web...|前端,Web工程师以太坊入门原荐,...|


In [11]:
# 文章数据进行分词处理,得到分词结果
# 分词
def segmentation(partition):
    import os
    import re

    import jieba
    import jieba.analyse
    import jieba.posseg as pseg
    import codecs

    abspath = "/root/words"

    # 结巴加载用户词典
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # 停用词文本
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords_list():
        """返回stopwords列表"""
        stopwords_list = [i.strip()
                          for i in codecs.open(stopwords_path).readlines()]
        return stopwords_list

    # 所有的停用词列表
    stopwords_list = get_stopwords_list()

    # 分词
    def cut_sentence(sentence):
        """对切割之后的词语进行过滤，去除停用词，保留名词，英文和自定义词库中的词，长度大于2的词"""
        # print(sentence,"*"*100)
        # eg:[pair('今天', 't'), pair('有', 'd'), pair('雾', 'n'), pair('霾', 'g')]
        seg_list = pseg.lcut(sentence)
        seg_list = [i for i in seg_list if i.flag not in stopwords_list]
        filtered_words_list = []
        for seg in seg_list:
            # print(seg)
            if len(seg.word) <= 1:
                continue
            elif seg.flag == "eng":
                if len(seg.word) <= 2:
                    continue
                else:
                    filtered_words_list.append(seg.word)
            elif seg.flag.startswith("n"):
                filtered_words_list.append(seg.word)
            elif seg.flag in ["x", "eng"]:  # 是自定一个词语或者是英文单词
                filtered_words_list.append(seg.word)
        return filtered_words_list

    for row in partition:
        sentence = re.sub("<.*?>", "", row.sentence)    # 替换掉标签数据
        words = cut_sentence(sentence)
        yield row.article_id, row.channel_id, words


In [12]:
words_df = article_data.rdd.mapPartitions(segmentation).toDF(['article_id', 'channel_id', 'words'])

In [13]:
words_df.show()

+----------+----------+--------------------+
|article_id|channel_id|               words|
+----------+----------+--------------------+
|         1|        17|[Vue, props, 用法, ...|
|         2|        17|[vue, 响应式, 原理, mo...|
|         3|        17|[JavaScript, 浅拷贝,...|
|         4|        17|[vue2, vuex, elem...|
|         5|        17|[immutability, Re...|
|         6|        17|[node, npm, cnpm,...|
|         7|        17|[Web, 工程师, 以太坊, 入...|
|         8|        17|[Web, pa, api, we...|
|         9|        17|[vue, 中用, 数据驱动, 视...|
|        10|        17|[程序, WebSocket, 长...|
+----------+----------+--------------------+



## 3. 得到词频CV模型

In [14]:
# 先计算分词之后的每篇文章的词频，得到CV模型
# 统计所有文章不同的词，组成一个词列表 words_list = [1,2,3,,34,4,45,56,67,78,8.......,,,,.]
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol='words', outputCol='countFeatures', vocabSize=2000, minDF=1.0)
cv_model = cv.fit(words_df)

# 然后根据词频计算IDF以及词，得到IDF模型


In [15]:
cv_model.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/test.model")

In [16]:
from pyspark.ml.feature import CountVectorizerModel
cv_m = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/test.model")

In [17]:
cv_result = cv_m.transform(words_df)

In [18]:
cv_result.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|               words|       countFeatures|
+----------+----------+--------------------+--------------------+
|      4273|        15|[javascript, reac...|(986,[2,4,9,10,11...|
|      4274|        19|[java, java, 笔记, ...|(986,[0,1,8,16,17...|
|      4275|        19|[java, 传统, 方式, 类继...|(986,[1,2,8,16,18...|
|      4276|        15|[javascript, Vue,...|(986,[1,2,4,6,8,1...|
|      4278|        15|[javascript, 作用域链...|(986,[2,3,10,18,2...|
|      4279|        19|[java, springboot...|(986,[1,8,11,23,2...|
|      4280|        19|[java, Jedis, 工具类...|(986,[1,2,4,5,7,8...|
|      4281|        19|[java, java, 记录, ...|(986,[2,16,23,32,...|
|      4282|        15|[javascript, VueS...|(986,[2,4,6,10,16...|
|      4283|        15|[javascript, 体积, ...|(986,[2,3,4,10,11...|
+----------+----------+--------------------+--------------------+



## 4. 得到IDF模型

In [19]:
# IDF 模型
from pyspark.ml.feature import IDF
idf = IDF(inputCol="countFeatures", outputCol="idfFeatures")
idfModel = idf.fit(cv_result)
idfModel.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/testIDF.model")

In [20]:
# 可以进行转换
cv_m.vocabulary

['&#',
 'String',
 '代码',
 '作用域',
 'pa',
 'key',
 '客户端',
 'jedis',
 'public',
 'Hooks',
 '函数',
 'ul',
 '组件',
 'scope',
 'return',
 '模块',
 '方法',
 'import',
 '时候',
 'count',
 'res',
 '.h',
 'this',
 'java',
 '问题',
 'Long',
 'Override',
 'class',
 'com',
 '声明',
 'web',
 'name',
 '线程',
 'constructor',
 'value',
 '逻辑',
 'props',
 'useEffect',
 'node',
 'start',
 '插件',
 '项目',
 'field',
 'rams',
 'vue',
 'useState',
 'arg',
 'Jedis',
 'event',
 'command',
 'jedisPool',
 '服务端',
 'action',
 '例子',
 '官方',
 'Enumeration',
 '参数',
 'state',
 'util',
 'close',
 'function',
 '情况',
 'catch',
 'title',
 'const',
 '文件',
 'new',
 'set',
 'jedisCluster',
 'void',
 'redis',
 'getResource',
 'clients',
 '标识',
 'onParseClientResp',
 'Thread',
 'enu',
 '.a',
 'from',
 'hooks',
 '页面',
 '全局',
 'get',
 '大家',
 'end',
 'fireEvent',
 'react',
 'document',
 'times',
 'clicked',
 'You',
 'listener',
 '赋值',
 'Vector',
 'server',
 '结果',
 'client',
 'var',
 'bean',
 'isOnline',
 '降级',
 'tml',
 'android',
 'toast',
 'Strin

In [21]:
idfModel.idf.toArray()[:20]

array([1.29928298, 1.70474809, 0.31845373, 1.70474809, 0.        ,
       0.78845736, 1.29928298, 2.39789527, 1.70474809, 2.39789527,
       0.6061358 , 0.2006707 , 1.01160091, 1.70474809, 0.45198512,
       1.70474809, 0.45198512, 2.39789527, 0.45198512, 2.39789527])

## 5.依据CV及IDF得到TF-IDF

In [22]:
# IDF对CV结果进行计算TFIDF
from pyspark.ml.feature import IDFModel
idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/testIDF.model")
tfidf_res = idf_model.transform(cv_result)

In [23]:
tfidf_res.show()

+----------+----------+--------------------+--------------------+--------------------+
|article_id|channel_id|               words|       countFeatures|         idfFeatures|
+----------+----------+--------------------+--------------------+--------------------+
|     22478|         4|[c++, poj, ul, ti...|(986,[4,11,69,78,...|(986,[4,11,69,78,...|
|     22479|         7|[数据库, 大众点评, 资深, 专...|(986,[4,5,11,16,1...|(986,[4,5,11,16,1...|
|     22480|         7|[数据库, 机器学习, 工程师, ...|(986,[4,10,11,16,...|(986,[4,10,11,16,...|
|     22481|         4|[c++, hdu, game, ...|(986,[4,14,19,28,...|(986,[4,14,19,28,...|
|     22482|         7|[数据库, 互联网金融, 企业, ...|(986,[18,24,61,11...|(986,[18,24,61,11...|
|     22483|         4|[c++, test, this,...|(986,[0,4,21,22,1...|(986,[0,4,21,22,1...|
|     22484|         7|[数据库, What, class...|(986,[2,4,8,11,12...|(986,[2,4,8,11,12...|
|     22485|         4|[c++, c++, 浅拷贝, c...|(986,[1,2,4,8,10,...|(986,[1,2,4,8,10,...|
|     22486|         7|[数据库, 互联网, 企业, 数据...

In [24]:
# 1265词的 {索引 以及 权重}
def func(partition):
    TOPK = 20
    for row in partition:
        # 找到索引与IDF值并进行排序
        _ = list(zip(row.idfFeatures.indices, row.idfFeatures.values))
        _ = sorted(_, key=lambda x: x[1], reverse=True)
        result = _[:TOPK]
        for word_index, tfidf in result:
            yield row.article_id, row.channel_id, int(word_index), round(float(tfidf), 4)
kewords_tfidf = tfidf_res.rdd.mapPartitions(func).toDF(['article_id', 'channel_id', 'index', 'weights'])

In [25]:
kewords_tfidf.show()

+----------+----------+-----+-------+
|article_id|channel_id|index|weights|
+----------+----------+-----+-------+
|     22478|         4|  441|24.2784|
|     22478|         4|  151|11.9895|
|     22478|         4|  268| 6.4964|
|     22478|         4|  456| 5.1142|
|     22478|         4|  358| 4.7958|
|     22478|         4|  808| 4.7958|
|     22478|         4|  130| 3.4095|
|     22478|         4|  863| 3.4095|
|     22478|         4|   69| 2.3979|
|     22478|         4|  126| 2.3979|
|     22478|         4|  259| 2.3979|
|     22478|         4|  284| 2.3979|
|     22478|         4|  300| 2.3979|
|     22478|         4|  526| 2.3979|
|     22478|         4|  536| 2.3979|
|     22478|         4|  749| 2.3979|
|     22478|         4|  798| 2.3979|
|     22478|         4|  915| 2.3979|
|     22478|         4|  137| 1.8079|
|     22478|         4|  134| 1.7047|
+----------+----------+-----+-------+
only showing top 20 rows



In [26]:
# 利用keywordsIndex = ktt.spark.sql("select keyword, index idx from idf_keywords_values")中标，知道索引对应的词
idf_keywords_values = oa.spark.sql("select keyword, index idx from idf_keywords_values")

In [27]:
keyword_str_tfidf = kewords_tfidf.join(idf_keywords_values, idf_keywords_values.idx==kewords_tfidf.index).select(["article_id", "channel_id", "keyword", "weights"])

keyword_str_tfidf.show()

+----------+----------+-------+-------+
|article_id|channel_id|keyword|weights|
+----------+----------+-------+-------+
|     22479|         7|   人工智能|  4.243|
|     22485|         4|   人工智能| 5.4552|
|     22481|         4|     标题| 4.7958|
|     22481|         4|    com| 7.1937|
|     22482|         7|  right|  6.819|
|     22483|         4|     &#|15.5914|
|     22487|         4|     图片| 2.0232|
|     22479|         7|    via| 4.7958|
|     22480|         7|    via| 2.3979|
|     22483|         4|     用户| 0.6061|
|     22478|         4|    web| 3.4095|
|     22482|         7|  cache| 2.3979|
|     22480|         7|display| 2.3979|
|     22478|         4|android| 2.3979|
|     22481|         4|android| 2.3979|
|     22483|         4|android| 2.3979|
|     22485|         4|android| 2.3979|
|     22487|         4|android| 2.3979|
|     22479|         7|     方案|  6.819|
|     22482|         7| string| 8.5237|
+----------+----------+-------+-------+
only showing top 20 rows



## 6. 得到TextRank

In [28]:
# texrank
# 分词
def textrank(partition):
    import os

    import jieba
    import jieba.analyse
    import jieba.posseg as pseg
    import codecs

    abspath = "/root/words"

    # 结巴加载用户词典
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # 停用词文本
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords_list():
        """返回stopwords列表"""
        stopwords_list = [i.strip()
                          for i in codecs.open(stopwords_path).readlines()]
        return stopwords_list

    # 所有的停用词列表
    stopwords_list = get_stopwords_list()

    class TextRank(jieba.analyse.TextRank):
        def __init__(self, window=20, word_min_len=2):
            super(TextRank, self).__init__()
            self.span = window  # 窗口大小
            self.word_min_len = word_min_len  # 单词的最小长度
            # 要保留的词性，根据jieba github ，具体参见https://github.com/baidu/lac
            self.pos_filt = frozenset(
                ('n', 'x', 'eng', 'f', 's', 't', 'nr', 'ns', 'nt', "nw", "nz", "PER", "LOC", "ORG"))

        def pairfilter(self, wp):
            """过滤条件，返回True或者False"""

            if wp.flag == "eng":
                if len(wp.word) <= 2:
                    return False

            if wp.flag in self.pos_filt and len(wp.word.strip()) >= self.word_min_len \
                    and wp.word.lower() not in stopwords_list:
                return True
    # TextRank过滤窗口大小为5，单词最小为2
    textrank_model = TextRank(window=5, word_min_len=2)
    allowPOS = ('n', "x", 'eng', 'nr', 'ns', 'nt', "nw", "nz", "c")

    for row in partition:
        tags = textrank_model.textrank(row.sentence, topK=20, withWeight=True, allowPOS=allowPOS, withFlag=False)
        for tag in tags:
            yield row.article_id, row.channel_id, tag[0], tag[1]

In [29]:
textrank = article_data.rdd.mapPartitions(textrank).toDF(["article_id", "channel_id", "keyword", "textrank"])

In [30]:
textrank.show()

+----------+----------+-----------+-------------------+
|article_id|channel_id|    keyword|           textrank|
+----------+----------+-----------+-------------------+
|     22478|         4|         pa|                1.0|
|     22478|         4|      style| 0.9370618273497324|
|     22478|         4|      color| 0.4128538576949204|
|     22478|         4|      class|  0.359932662640729|
|     22478|         4|     number|0.31912097527684746|
|     22478|         4|        div|0.28447448207465204|
|     22478|         4|     puzzle|0.23862566202092025|
|     22478|         4|     Sample| 0.2259394055065273|
|     22478|         4|         头尾|0.22559704068724792|
|     22478|         4|      takes|0.22518122419602898|
|     22478|         4|       move|0.22308488704053098|
|     22478|         4|        pre|0.21394951379063445|
|     22478|         4|        row| 0.2095762616451642|
|     22478|         4|      cards| 0.2039234341394813|
|     22478|         4|     points| 0.2036436654

## 7.依据TF-IDF及TextRank得到词及其权重

In [33]:
# 计算关键词最后的权重，Textank  * IDF
idf = oa.spark.sql("select * from idf_keywords_values")
idf = idf.withColumnRenamed("keyword", "keyword1")
result = textrank.join(idf,textrank.keyword==idf.keyword1)
keywords_res = result.withColumn("weights", result.textrank * result.idf).select(["article_id", "channel_id", "keyword", "weights"])

In [34]:
# 20个Keyword，对应的权重，文章ID，channel_id
keywords_res.show()

+----------+----------+-----------+-------------------+
|article_id|channel_id|    keyword|            weights|
+----------+----------+-----------+-------------------+
|     22481|         4|      input| 0.5346733779728793|
|     22487|         4|   positive| 0.8459416122139333|
|     22480|         7|       解决问题| 1.4141939829843333|
|     22484|         7|perspective|  1.944927442774027|
|     22479|         7| imageView2| 3.4609112073758563|
|     22480|         7| imageView2|  8.313780177141174|
|     22484|         7| imageView2|  2.990045613531207|
|     22480|         7|  Component| 1.3777735958170947|
|     22484|         7|    classes| 1.2459725235044035|
|     22483|         4|         .h|0.44986033670933734|
|     22485|         4|         .h|0.13443772594302106|
|     22487|         4|   scenario| 1.0152874859556036|
|     22486|         7|         密钥| 0.6846628204088431|
|     22482|         7|         3d| 0.8676549955088683|
|     22483|         4|        std| 1.2161264249

In [35]:
keywords_res.registerTempTable("temptable")

In [36]:
keyword_weights_list = oa.spark.sql("select article_id, min(channel_id) channel_id, collect_list(keyword) keywords, collect_list(weights) weights from temptable group by article_id")

In [37]:
keyword_weights_list.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|            keywords|             weights|
+----------+----------+--------------------+--------------------+
|         7|        17|[以太坊, code, 区块链, ...|[1.86215305525475...|
|         6|        17|[jpg, code, npm, ...|[1.42915555871433...|
|         9|        17|[icon, 思想, 关键, 对象...|[2.57576014807436...|
|         5|        17|[upload, code, he...|[0.80603375517041...|
|         1|        17|[childNode, code,...|[1.54330896515555...|
|        10|        17|[amp, .h, upload,...|[0.95265972946434...|
|         3|        17|[jpg, code, log, ...|[1.80831852158129...|
|         8|        17|[jpg, code, ul, 模...|[0.90150623804979...|
|         2|        17|[input, textReg, ...|[0.77476671719451...|
|         4|        17|[jpg, 文件, 文件夹, el...|[2.22301184125171...|
+----------+----------+--------------------+--------------------+



In [38]:
def keyword_weights_to_dict(row):
    return row.article_id, row.channel_id, dict(zip(row.keywords, row.weights))
    
keywords = keyword_weights_list.rdd.map(keyword_weights_to_dict).toDF(['article_id', 'channel_id', 'keywords'])

In [39]:
keywords.show()

+----------+----------+--------------------+
|article_id|channel_id|            keywords|
+----------+----------+--------------------+
|     22483|         4|Map(pre -> 0.6686...|
|     22480|         7|Map(机器学习 -> 2.259...|
|     22481|         4|Map(pre -> 0.7732...|
|     22484|         7|Map(imageView2 ->...|
|     22487|         4|Map(scenario -> 1...|
|     22482|         7|Map(脱敏 -> 3.85482...|
|     22479|         7|Map(InnoDB -> 0.8...|
|     22478|         4|Map(pre -> 0.7844...|
|     22486|         7|Map(脱敏 -> 1.28449...|
|     22485|         4|Map(对象 -> 0.10844...|
+----------+----------+--------------------+



## 8.依据TF-IDF及TextRank得到主题词

In [40]:
topic_sql = """
                select t.article_id article_id2, collect_set(t.keyword) topics from tfidf_keywords_values t
                inner join 
                textrank_keywords_values r
                where t.keyword=r.keyword
                group by article_id2
                """

article_topics = oa.spark.sql(topic_sql)

In [42]:
#article_topics.show()

## 9. 依据词权重和主题词得到文章画像

In [43]:
article_profile = keywords.join(article_topics, keywords.article_id==article_topics.article_id2).select(["article_id", "channel_id", "keywords", "topics"])


In [45]:
# article_profile.show()

# 10. 词向量模型训练

In [46]:
# 通过少量数据来演示训练
from pyspark.ml.feature import Word2Vec


w2v = Word2Vec(vectorSize=100, inputCol='words', outputCol='model', minCount=3)
w2v_model = w2v.fit(words_df)
w2v_model.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/test.word2vec")

In [49]:
article_profile.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|            keywords|              topics|
+----------+----------+--------------------+--------------------+
|     22483|         4|Map(pre -> 0.6686...|[#include, blog, ...|
|     22480|         7|Map(机器学习 -> 2.259...|[线性, 课程, 监督学习, PC...|
|     22481|         4|Map(pre -> 0.7732...|[the, #include, w...|
|     22484|         7|Map(imageView2 ->...|[Class2, are, the...|
|     22487|         4|Map(scenario -> 1...|[the, exist, #inc...|
|     22482|         7|Map(脱敏 -> 3.85482...|[开发环境, 互联网金融, 数据库...|
|     22478|         4|Map(pre -> 0.7844...|[the, puzzle, Sam...|
|     22479|         7|Map(InnoDB -> 0.8...|[主键, 结构, 索引, 数据结构...|
|     22486|         7|Map(脱敏 -> 1.28449...|[企业, 鉴权, 流量, 密钥, ...|
|     22485|         4|Map(对象 -> 0.10844...|[#include, 重载函数, ...|
+----------+----------+--------------------+--------------------+



## 11.文章向量

In [50]:
# 求出增量文章的词向量，增量文章 一共10篇文章
# 1、加载某个频道模型，得到每个词的向量
from pyspark.ml.feature import Word2VecModel

word_vec = Word2VecModel.load("hdfs://hadoop-master:9000/headlines/models/test.word2vec")
vectors = word_vec.getVectors()

In [51]:
vectors.show()

+--------------+--------------------+
|          word|              vector|
+--------------+--------------------+
|        assert|[5.15189778525382...|
|        plugin|[-0.0160957667976...|
|          元素节点|[0.00193600240163...|
|defineReactive|[0.00345730711705...|
|            范围|[-0.0103137595579...|
|          配置文件|[0.00372160435654...|
|            视图|[0.00392060168087...|
|            函数|[-0.0508100353181...|
|           技术栈|[0.00446692341938...|
|           for|[0.00293453829362...|
|            源码|[-0.0073109660297...|
|            对象|[-0.0189802478998...|
|           Set|[-0.0037572700530...|
|        event1|[0.00187883735634...|
|        github|[0.00680689420551...|
|            模式|[-0.0023340869229...|
|        goLink|[-0.0023906398564...|
|      computed|[-7.4787792982533...|
|            腾讯|[-0.0025746505707...|
|          test|[0.00448889844119...|
+--------------+--------------------+
only showing top 20 rows



In [55]:
# 获取频道的文章画像，得到文章画像的关键词，找到这些文章关键词对应词向量
python_article_profile = article_profile.filter('channel_id=7')

In [56]:
python_article_profile.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|            keywords|              topics|
+----------+----------+--------------------+--------------------+
|     22480|         7|Map(机器学习 -> 2.259...|[线性, 课程, 监督学习, PC...|
|     22484|         7|Map(imageView2 ->...|[the, Class2, are...|
|     22482|         7|Map(脱敏 -> 3.85482...|[开发环境, 互联网金融, 数据库...|
|     22479|         7|Map(InnoDB -> 0.8...|[主键, 索引, 结构, 数据结构...|
|     22486|         7|Map(脱敏 -> 1.28449...|[企业, 鉴权, 流量, 密钥, ...|
+----------+----------+--------------------+--------------------+



In [57]:
# 将文章画像的字典， 词语与权重 进行展开
python_article_profile.registerTempTable('profile')

_articlekeywordsweight = oa.spark.sql("select article_id,  channel_id, keyword, weight from profile LATERAL VIEW explode(keywords) AS keyword, weight")


_articlekeywordsweight.show()

+----------+----------+----------+-------------------+
|article_id|channel_id|   keyword|             weight|
+----------+----------+----------+-------------------+
|     22480|         7|      监督学习|  2.105937649590076|
|     22480|         7|      data| 0.8378037137518389|
|     22480|         7|     无监督学习|  2.486908523456845|
|     22480|         7|        模型|  1.399420809615264|
|     22480|         7|        概率| 1.2466269632225835|
|     22480|         7|      解决问题| 1.4141939829843333|
|     22480|         7|        智能| 1.1118132206713027|
|     22480|         7|       PCA|  3.212478707919032|
|     22480|         7|imageView2|  8.313780177141174|
|     22480|         7|        方法|0.45271982138359884|
|     22480|         7|        算法| 1.3551328514426744|
|     22480|         7|        数据| 0.7564744965158455|
|     22480|         7|      机器学习|  2.259546012742514|
|     22480|         7|      人工智能| 1.0557119355410218|
|     22480|         7|        标签| 0.9204897951866742|
|     2248

In [58]:
# 将文章词权重 和 词向量 模型 进行 inner 合并
article_keyword_vec_weights = _articlekeywordsweight.join(vectors, vectors.word==_articlekeywordsweight.keyword, "inner")

In [59]:
article_keyword_vec_weights.show()

+----------+----------+-------+-------------------+------+--------------------+
|article_id|channel_id|keyword|             weight|  word|              vector|
+----------+----------+-------+-------------------+------+--------------------+
|     22480|         7|   data| 0.8378037137518389|  data|[0.00160884996876...|
|     22480|         7|     智能| 1.1118132206713027|    智能|[0.00157674017827...|
|     22480|         7|     方法|0.45271982138359884|    方法|[-0.0550118535757...|
|     22480|         7|     数据| 0.7564744965158455|    数据|[-0.0378144755959...|
|     22480|         7|  https| 0.6369539173053653| https|[0.03507803753018...|
|     22480|         7|     课程|  1.452165241061537|    课程|[0.00388504215516...|
|     22484|         7|     属性| 0.7121983306475594|    属性|[0.00102691771462...|
|     22484|         7|     关系|  0.758845077140017|    关系|[-0.0104866530746...|
|     22484|         7|   data| 0.7625939542580085|  data|[0.00160884996876...|
|     22484|         7|     方法| 0.234578

In [60]:
# 得到文章向量
article_keyword_vec_weights.registerTempTable("temptable")

def func(row):
    x = 0
    for v in row.vectors:
        x += v
        
    return row.article_id, row.channel_id, x / len(row.vectors)

article_vector = oa.spark.sql("select article_id, min(channel_id) channel_id, collect_set(vector) vectors from temptable group by article_id").rdd.map(func).toDF(['article_id', 'channel_id', 'articlevector'])


In [61]:
article_vector.show()
# article_vector.write.insertInto("article_vector")

+----------+----------+--------------------+
|article_id|channel_id|       articlevector|
+----------+----------+--------------------+
|     22480|         7|[-0.0084462765565...|
|     22484|         7|[0.00893512158654...|
|     22482|         7|[-0.0085302157967...|
|     22479|         7|[-0.0075360939837...|
|     22486|         7|[0.01185075202956...|
+----------+----------+--------------------+

