In [1]:
import os
import sys
# 如果当前代码文件运行测试需要加入修改路径，避免出现后导包问题
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(BASE_DIR))
print(BASE_DIR)
PYSPARK_PYTHON = "/miniconda2/envs/reco_sys/bin/python"
# 当存在多个版本时，不指定很可能会导致出错
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON
from offline import SparkSessionBase

/root/toutiao_project/reco_sys


In [2]:
class OriginArticleData(SparkSessionBase):
    
    SPARK_APP_NAME = "mergeArticle"
    SPARK_URL = "yarn"

    ENABLE_HIVE_SUPPORT = True
    
    def __init__(self):
        self.spark = self._create_spark_session()

In [3]:
oa = OriginArticleData()

In [4]:
# 进行文章 前两个表 的合并
oa.spark.sql("use toutiao")
# news_article_basic 与news_article_content, article_id
titlce_content = oa.spark.sql("select a.article_id, a.channel_id, a.title, b.content from news_article_basic a inner join news_article_content b on a.article_id=b.article_id where a.article_id=116636")


In [5]:
titlce_content.show()

+----------+----------+---------------+--------------------+
|article_id|channel_id|          title|             content|
+----------+----------+---------------+--------------------+
|    116636|        18|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|
+----------+----------+---------------+--------------------+



In [7]:
# 进行title_content 与 文章频道名称合并
titlce_content.registerTempTable('temptable')

channel_title_content = oa.spark.sql("select t.*, n.channel_name from temptable t left join news_channel n on t.channel_id=n.channel_id")




In [8]:
channel_title_content.show()

+----------+----------+---------------+--------------------+------------+
|article_id|channel_id|          title|             content|channel_name|
+----------+----------+---------------+--------------------+------------+
|    116636|        18|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|      python|
+----------+----------+---------------+--------------------+------------+



In [9]:
# 合并三个内容到一个字符串
import pyspark.sql.functions as F

sentence_df = channel_title_content.select("article_id", "channel_id", "channel_name", "title", "content", 
                            F.concat_ws(',', 
                                       channel_title_content.channel_name,
                                       channel_title_content.title,
                                       channel_title_content.content).alias('sentence'))


In [10]:
sentence_df.show()

+----------+----------+------------+---------------+--------------------+--------------------+
|article_id|channel_id|channel_name|          title|             content|            sentence|
+----------+----------+------------+---------------+--------------------+--------------------+
|    116636|        18|      python|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|python,动态再平衡投资策略历...|
+----------+----------+------------+---------------+--------------------+--------------------+



In [4]:
# 读取文章，进行每篇张分词
oa.spark.sql("use article")
article_data = oa.spark.sql("select * from article_data limit 10")
article_data.show()


+----------+----------+------------+--------------------+--------------------+--------------------+
|article_id|channel_id|channel_name|               title|             content|            sentence|
+----------+----------+------------+--------------------+--------------------+--------------------+
|         1|        17|          前端|     Vue props用法小结原荐|<p><strong>Vue pr...|前端,Vue props用法小结原...|
|         2|        17|          前端|vue.js响应式原理解析与实现—...|<p>上次我们已经分析了vue.j...|前端,vue.js响应式原理解析与...|
|         3|        17|          前端|JavaScript中浅拷贝和深拷...|<p>要理解 JavaScript...|前端,JavaScript中浅拷贝...|
|         4|        17|          前端|基于vue2.0 +vuex+ e...|<p>效果演示地址,</p><p>...|前端,基于vue2.0 +vuex...|
|         5|        17|          前端|immutability因Reac...|<p><img src="http...|前端,immutability因R...|
|         6|        17|          前端|简单了解 node npm cnp...|<span id="OSC_h1_...|前端,简单了解 node npm ...|
|         7|        17|          前端|       Web工程师以太坊入门原荐|<p>我经常构建使用以太坊的Web...|前端,Web工程师以太坊入门原荐,...|


In [5]:
# 文章数据进行分词处理,得到分词结果
# 分词
def segmentation(partition):
    import os
    import re

    import jieba
    import jieba.analyse
    import jieba.posseg as pseg
    import codecs

    abspath = "/root/words"

    # 结巴加载用户词典
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # 停用词文本
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords_list():
        """返回stopwords列表"""
        stopwords_list = [i.strip()
                          for i in codecs.open(stopwords_path).readlines()]
        return stopwords_list

    # 所有的停用词列表
    stopwords_list = get_stopwords_list()

    # 分词
    def cut_sentence(sentence):
        """对切割之后的词语进行过滤，去除停用词，保留名词，英文和自定义词库中的词，长度大于2的词"""
        # print(sentence,"*"*100)
        # eg:[pair('今天', 't'), pair('有', 'd'), pair('雾', 'n'), pair('霾', 'g')]
        seg_list = pseg.lcut(sentence)
        seg_list = [i for i in seg_list if i.flag not in stopwords_list]
        filtered_words_list = []
        for seg in seg_list:
            # print(seg)
            if len(seg.word) <= 1:
                continue
            elif seg.flag == "eng":
                if len(seg.word) <= 2:
                    continue
                else:
                    filtered_words_list.append(seg.word)
            elif seg.flag.startswith("n"):
                filtered_words_list.append(seg.word)
            elif seg.flag in ["x", "eng"]:  # 是自定一个词语或者是英文单词
                filtered_words_list.append(seg.word)
        return filtered_words_list

    for row in partition:
        sentence = re.sub("<.*?>", "", row.sentence)    # 替换掉标签数据
        words = cut_sentence(sentence)
        yield row.article_id, row.channel_id, words


In [6]:
words_df = article_data.rdd.mapPartitions(segmentation).toDF(['article_id', 'channel_id', 'words'])

In [7]:
words_df.show()

+----------+----------+--------------------+
|article_id|channel_id|               words|
+----------+----------+--------------------+
|         1|        17|[Vue, props, 用法, ...|
|         2|        17|[vue, 响应式, 原理, mo...|
|         3|        17|[JavaScript, 浅拷贝,...|
|         4|        17|[vue2, vuex, elem...|
|         5|        17|[immutability, Re...|
|         6|        17|[node, npm, cnpm,...|
|         7|        17|[Web, 工程师, 以太坊, 入...|
|         8|        17|[Web, pa, api, we...|
|         9|        17|[vue, 中用, 数据驱动, 视...|
|        10|        17|[程序, WebSocket, 长...|
+----------+----------+--------------------+



In [8]:
# 先计算分词之后的每篇文章的词频，得到CV模型
# 统计所有文章不同的词，组成一个词列表 words_list = [1,2,3,,34,4,45,56,67,78,8.......,,,,.]
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol='words', outputCol='countFeatures', vocabSize=2000, minDF=1.0)
cv_model = cv.fit(words_df)

# 然后根据词频计算IDF以及词，得到IDF模型


In [9]:
cv_model.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/test.model")

In [10]:
from pyspark.ml.feature import CountVectorizerModel
cv_m = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/test.model")

In [11]:
cv_result = cv_m.transform(words_df)

In [12]:
cv_result.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|               words|       countFeatures|
+----------+----------+--------------------+--------------------+
|      4273|        15|[javascript, reac...|(986,[2,4,9,10,11...|
|      4274|        19|[java, java, 笔记, ...|(986,[0,1,8,16,17...|
|      4275|        19|[java, 传统, 方式, 类继...|(986,[1,2,8,16,18...|
|      4276|        15|[javascript, Vue,...|(986,[1,2,4,6,8,1...|
|      4278|        15|[javascript, 作用域链...|(986,[2,3,10,18,2...|
|      4279|        19|[java, springboot...|(986,[1,8,11,23,2...|
|      4280|        19|[java, Jedis, 工具类...|(986,[1,2,4,5,7,8...|
|      4281|        19|[java, java, 记录, ...|(986,[2,16,23,32,...|
|      4282|        15|[javascript, VueS...|(986,[2,4,6,10,16...|
|      4283|        15|[javascript, 体积, ...|(986,[2,3,4,10,11...|
+----------+----------+--------------------+--------------------+



In [13]:
# IDF 模型
from pyspark.ml.feature import IDF
idf = IDF(inputCol="countFeatures", outputCol="idfFeatures")
idfModel = idf.fit(cv_result)
idfModel.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/testIDF.model")

In [14]:
# 可以进行转换
cv_m.vocabulary

['&#',
 'String',
 '代码',
 '作用域',
 'pa',
 'key',
 '客户端',
 'jedis',
 'public',
 'Hooks',
 '函数',
 'ul',
 '组件',
 'scope',
 'return',
 '模块',
 '方法',
 'import',
 '时候',
 'count',
 'res',
 '.h',
 'this',
 'java',
 '问题',
 'Long',
 'Override',
 'class',
 'com',
 '声明',
 'web',
 'name',
 '线程',
 'constructor',
 'value',
 '逻辑',
 'props',
 'useEffect',
 'node',
 'start',
 '插件',
 '项目',
 'field',
 'rams',
 'vue',
 'useState',
 'arg',
 'Jedis',
 'event',
 'command',
 'jedisPool',
 '服务端',
 'action',
 '例子',
 '官方',
 'Enumeration',
 '参数',
 'state',
 'util',
 'close',
 'function',
 '情况',
 'catch',
 'title',
 'const',
 '文件',
 'new',
 'set',
 'jedisCluster',
 'void',
 'redis',
 'getResource',
 'clients',
 '标识',
 'onParseClientResp',
 'Thread',
 'enu',
 '.a',
 'from',
 'hooks',
 '页面',
 '全局',
 'get',
 '大家',
 'end',
 'fireEvent',
 'react',
 'document',
 'times',
 'clicked',
 'You',
 'listener',
 '赋值',
 'Vector',
 'server',
 '结果',
 'client',
 'var',
 'bean',
 'isOnline',
 '降级',
 'tml',
 'android',
 'toast',
 'Strin

In [15]:
idfModel.idf.toArray()[:20]

array([1.70474809, 0.6061358 , 0.2006707 , 1.29928298, 0.6061358 ,
       1.70474809, 1.29928298, 1.70474809, 0.6061358 , 1.70474809,
       0.6061358 , 0.6061358 , 1.29928298, 1.29928298, 0.78845736,
       1.29928298, 0.45198512, 0.6061358 , 0.2006707 , 1.70474809])

In [16]:
# IDF对CV结果进行计算TFIDF
from pyspark.ml.feature import IDFModel
idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/testIDF.model")
tfidf_res = idf_model.transform(cv_result)

In [17]:
tfidf_res.show()

+----------+----------+--------------------+--------------------+--------------------+
|article_id|channel_id|               words|       countFeatures|         idfFeatures|
+----------+----------+--------------------+--------------------+--------------------+
|      4273|        15|[javascript, reac...|(986,[2,4,9,10,11...|(986,[2,4,9,10,11...|
|      4274|        19|[java, java, 笔记, ...|(986,[0,1,8,16,17...|(986,[0,1,8,16,17...|
|      4275|        19|[java, 传统, 方式, 类继...|(986,[1,2,8,16,18...|(986,[1,2,8,16,18...|
|      4276|        15|[javascript, Vue,...|(986,[1,2,4,6,8,1...|(986,[1,2,4,6,8,1...|
|      4278|        15|[javascript, 作用域链...|(986,[2,3,10,18,2...|(986,[2,3,10,18,2...|
|      4279|        19|[java, springboot...|(986,[1,8,11,23,2...|(986,[1,8,11,23,2...|
|      4280|        19|[java, Jedis, 工具类...|(986,[1,2,4,5,7,8...|(986,[1,2,4,5,7,8...|
|      4281|        19|[java, java, 记录, ...|(986,[2,16,23,32,...|(986,[2,16,23,32,...|
|      4282|        15|[javascript, VueS...

In [18]:
# 1265词的 {索引 以及 权重}
def func(partition):
    TOPK = 20
    for row in partition:
        # 找到索引与IDF值并进行排序
        _ = list(zip(row.idfFeatures.indices, row.idfFeatures.values))
        _ = sorted(_, key=lambda x: x[1], reverse=True)
        result = _[:TOPK]
        for word_index, tfidf in result:
            yield row.article_id, row.channel_id, int(word_index), round(float(tfidf), 4)
kewords_tfidf = tfidf_res.rdd.mapPartitions(func).toDF(['article_id', 'channel_id', 'index', 'weights'])

In [19]:
kewords_tfidf.show()

+----------+----------+-----+-------+
|article_id|channel_id|index|weights|
+----------+----------+-----+-------+
|      4273|        15|    9|66.4852|
|      4273|        15|   12|44.1756|
|      4273|        15|   19|37.5045|
|      4273|        15|   36| 27.276|
|      4273|        15|   37|25.5712|
|      4273|        15|   45|23.8665|
|      4273|        15|   57|18.7522|
|      4273|        15|   35|15.5914|
|      4273|        15|   79|15.3427|
|      4273|        15|   22|14.1624|
|      4273|        15|   87| 13.638|
|      4273|        15|   88| 13.638|
|      4273|        15|   89| 13.638|
|      4273|        15|   90| 13.638|
|      4273|        15|   24|12.6153|
|      4273|        15|   99|11.9332|
|      4273|        15|   64|11.6935|
|      4273|        15|   63|10.3943|
|      4273|        15|   86|10.3943|
|      4273|        15|  123|10.2285|
+----------+----------+-----+-------+
only showing top 20 rows



In [20]:
# 利用keywordsIndex = ktt.spark.sql("select keyword, index idx from idf_keywords_values")中标，知道索引对应的词
idf_keywords_values = oa.spark.sql("select keyword, index idx from idf_keywords_values")

In [24]:
keyword_str_tfidf = kewords_tfidf.join(idf_keywords_values, idf_keywords_values.idx==kewords_tfidf.index).select(["article_id", "channel_id", "keyword", "weights"])

keyword_str_tfidf.show()




+----------+----------+-------+--------+
|article_id|channel_id|keyword| weights|
+----------+----------+-------+--------+
|         4|        17|  https|  5.1971|
|         6|        17|  https|  5.1971|
|         8|        17|  https|  2.5986|
|         1|        17|   人工智能|   9.095|
|         2|        17|   人工智能| 74.0591|
|         5|        17|   人工智能| 12.9928|
|         7|        17|   人工智能|  5.1971|
|         6|        17|     脚本|  5.1142|
|         8|        17|     价格|  8.5237|
|         4|        17|  close|  2.5986|
|         5|        17|     功能|  5.1971|
|         6|        17|     功能|   9.095|
|         8|        17|     功能|  3.8978|
|         3|        17|     &#| 49.4377|
|         6|        17|     &#| 10.2285|
|         1|        17|     阶段|  5.1971|
|         3|        17|   lang|   6.819|
|         8|        17|     架构|  1.7047|
|         6|        17|    互联网|   6.819|
|         2|        17|     用户|131.5081|
+----------+----------+-------+--------+
only showing top

In [25]:
# texrank
# 分词
def textrank(partition):
    import os

    import jieba
    import jieba.analyse
    import jieba.posseg as pseg
    import codecs

    abspath = "/root/words"

    # 结巴加载用户词典
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # 停用词文本
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords_list():
        """返回stopwords列表"""
        stopwords_list = [i.strip()
                          for i in codecs.open(stopwords_path).readlines()]
        return stopwords_list

    # 所有的停用词列表
    stopwords_list = get_stopwords_list()

    class TextRank(jieba.analyse.TextRank):
        def __init__(self, window=20, word_min_len=2):
            super(TextRank, self).__init__()
            self.span = window  # 窗口大小
            self.word_min_len = word_min_len  # 单词的最小长度
            # 要保留的词性，根据jieba github ，具体参见https://github.com/baidu/lac
            self.pos_filt = frozenset(
                ('n', 'x', 'eng', 'f', 's', 't', 'nr', 'ns', 'nt', "nw", "nz", "PER", "LOC", "ORG"))

        def pairfilter(self, wp):
            """过滤条件，返回True或者False"""

            if wp.flag == "eng":
                if len(wp.word) <= 2:
                    return False

            if wp.flag in self.pos_filt and len(wp.word.strip()) >= self.word_min_len \
                    and wp.word.lower() not in stopwords_list:
                return True
    # TextRank过滤窗口大小为5，单词最小为2
    textrank_model = TextRank(window=5, word_min_len=2)
    allowPOS = ('n', "x", 'eng', 'nr', 'ns', 'nt', "nw", "nz", "c")

    for row in partition:
        tags = textrank_model.textrank(row.sentence, topK=20, withWeight=True, allowPOS=allowPOS, withFlag=False)
        for tag in tags:
            yield row.article_id, row.channel_id, tag[0], tag[1]

In [26]:
textrank = article_data.rdd.mapPartitions(textrank).toDF(["article_id", "channel_id", "keyword", "textrank"])

In [28]:
textrank.show()

+----------+----------+----------+-------------------+
|article_id|channel_id|   keyword|           textrank|
+----------+----------+----------+-------------------+
|      4273|        15|        pa|                1.0|
|      4273|        15|     class|  0.826167020155012|
|      4273|        15|      hljs| 0.7374857053060796|
|      4273|        15|        组件|0.44124935366123624|
|      4273|        15|     Hooks|0.33877451789133467|
|      4273|        15|        函数| 0.2194471428320325|
|      4273|        15|     props|0.19891852045726613|
|      4273|        15|   keyword| 0.1905742643729634|
|      4273|        15|        逻辑| 0.1875976416246032|
|      4273|        15|    strong|0.17119509383724216|
|      4273|        15|        .h| 0.1407131304974135|
|      4273|        15|      code|0.13019484484515423|
|      4273|        15|noreferrer|0.12350871873941668|
|      4273|        15|        官方|0.11352635275320319|
|      4273|        15|     react| 0.1125565606843615|
|      427

In [31]:
# 文章画像 关键词与权重合并
# textrank * idf
idf_keywords_values = oa.spark.sql("select * from idf_keywords_values")

In [32]:
idf_keywords_values.show()

+-------+------------------+-----+
|keyword|               idf|index|
+-------+------------------+-----+
|     &#| 1.417829594344155|    0|
|     pa|0.6651385256756351|    1|
|     ul|0.8070591229443697|    2|
|     代码|0.7368239176481552|    3|
|     方法|0.7506253985501485|    4|
|     数据|0.9375297590538404|    5|
| return|1.1584986818528347|    6|
|     对象|1.2765716628665975|    7|
|   name|1.3833429138490618|    8|
|   this|1.6247297855214076|    9|
| public|1.7540399682870398|   10|
|    int|1.6612207991983439|   11|
|    new|1.3335127364488795|   12|
|     问题|0.8151384673357938|   13|
|     函数|1.4147095597213706|   14|
|     .a|1.2475641921221166|   15|
|  class|1.3562548221032567|   16|
|     文件|1.2163286406564702|   17|
|    amp|1.5313880611157102|   18|
|    com|0.9229090811983397|   19|
+-------+------------------+-----+
only showing top 20 rows



In [33]:
keywords_res = textrank.join(idf_keywords_values, on=['keyword'], how='left')

In [34]:
keywords_res.show()

+---------+----------+----------+-------------------+------------------+------+
|  keyword|article_id|channel_id|           textrank|               idf| index|
+---------+----------+----------+-------------------+------------------+------+
|    input|         2|        17|  0.298715457651815|2.5936612831652797|   139|
|childNode|         1|        17|0.19610401758526286| 7.869848788205214| 20134|
|      amp|        10|        17| 0.6220890404292903|1.5313880611157102|    18|
|      jpg|         3|        17| 0.5186416300247046| 3.486643603011888|   727|
|      jpg|         4|        17| 0.6375793153425262| 3.486643603011888|   727|
|      jpg|         6|        17|0.40989436301426946| 3.486643603011888|   727|
|      jpg|         8|        17| 0.2585599047952723| 3.486643603011888|   727|
|  textReg|         2|        17|0.29488918445113477|10.741528413089226| 95140|
| fragment|         2|        17| 0.6411543089793412| 5.377111244836666|  2540|
|       文件|         4|        17| 0.3439

In [36]:
keywords_weights = keywords_res.withColumn('weights', keywords_res.textrank * keywords_res.idf).select(["article_id", "channel_id", "keyword", "weights"])




In [37]:
keywords_weights.show()

+----------+----------+---------+-------------------+
|article_id|channel_id|  keyword|            weights|
+----------+----------+---------+-------------------+
|         2|        17|    input| 0.7747667171945103|
|         1|        17|childNode|  1.543308965155555|
|        10|        17|      amp| 0.9526597294643435|
|         3|        17|      jpg| 1.8083185215812947|
|         4|        17|      jpg|  2.223011841251718|
|         6|        17|      jpg| 1.4291555587143352|
|         8|        17|      jpg| 0.9015062380497989|
|         2|        17|  textReg| 3.1675605534945737|
|         2|        17| fragment|  3.447558044488298|
|         4|        17|       文件|0.41832490256891997|
|         4|        17|      文件夹| 1.3030789658777715|
|         9|        17|     icon| 2.5757601480743695|
|        10|        17|       .h| 0.6363244670153154|
|         4|        17|  element|  1.589684137454334|
|         4|        17|    touzi| 3.8640914293104194|
|         5|        17|   up

In [38]:
keywords_weights.registerTempTable('temp')

keywords_weights = oa.spark.sql("select article_id, min(channel_id) channel_id, collect_list(keyword) keywords, collect_list(weights) weights from temp group by article_id")






In [39]:
keywords_weights.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|            keywords|             weights|
+----------+----------+--------------------+--------------------+
|      4273|        15|[hljs, 逻辑, compon...|[3.69426241356356...|
|      4278|        15|[imageView2, code...|[7.40444917507044...|
|      4275|        19|[Thread, 类继承, cur...|[0.75086108179368...|
|      4281|        19|[调度, 运行时, 中断, 环境,...|[0.94025458334785...|
|      4279|        19|[utf, code, BINDI...|[0.48772006807450...|
|      4283|        15|[hljs, imageView2...|[2.83648176481845...|
|      4282|        15|[imageView2, 文件, ...|[8.31378017714117...|
|      4276|        15|[hljs, code, noop...|[4.01098232733758...|
|      4280|        19|[.h, beans, nnota...|[0.18233072091682...|
|      4274|        19|[Enumeration, Vec...|[2.27602837284537...|
+----------+----------+--------------------+--------------------+



In [40]:
# 合并关键词和权重到字典
def _func(row):
    return row.article_id, row.channel_id, dict(zip(row.keywords, row.weights))

article_kewords = keywords_weights.rdd.map(_func).toDF(['article_id', 'channel_id', 'keywords'])

In [41]:
article_kewords.show()

+----------+----------+--------------------+
|article_id|channel_id|            keywords|
+----------+----------+--------------------+
|      4273|        15|Map(函数 -> 0.31045...|
|      4278|        15|Map(imageView2 ->...|
|      4275|        19|Map(jvm -> 0.4719...|
|      4281|        19|Map(线程 -> 1.35536...|
|      4279|        19|Map(business -> 0...|
|      4283|        15|Map(imageView2 ->...|
|      4282|        15|Map(imageView2 ->...|
|      4276|        15|Map(viewplus -> 1...|
|      4280|        19|Map(Jedis -> 0.68...|
|      4274|        19|Map(pre -> 0.6368...|
+----------+----------+--------------------+



In [44]:
# 计算tfidf与texrank共同词作为主题词
topic_sql = "select t.article_id article_id2, collect_set(t.keyword) topics from tfidf_keywords_values t inner join textrank_keywords_values r where t.keyword=r.keyword group by article_id2"
article_topics = oa.spark.sql(topic_sql)


In [45]:
article_topics.show()

+-----------+--------------------+
|article_id2|              topics|
+-----------+--------------------+
|        148|[transform, solid...|
|        463|[clone, 按钮, 空格键, ...|
|        471|[font, DOCTYPE, m...|
|        496|[t02, lock, 线程, n...|
|        833|[modal, close, 属性...|
|       1088|[外边距, 宽度, 内边距, 像素...|
|       1238|[php, 服务员, 语言, 仓库...|
|       1342|[速度, 初速度, canvas,...|
|       1580|[vue, filename, r...|
|       1591|[内容, keywords, li...|
|       1645|[圆角, width, borde...|
|       1829|[mirrors, GCC, gn...|
|       1959|[GNU, openjdk, In...|
|       2122|[样式, CSS+DIV, 背景颜...|
|       2142|[tuple, fromkeys,...|
|       2366|[weights, 大话, &#,...|
|       2659|[pic, 定义, 绝对地址, 超...|
|       2866|[和子, 企业开发, class,...|
|       3175|[stretch, transfo...|
|       3749|[conpiler, callAs...|
+-----------+--------------------+
only showing top 20 rows



In [46]:
# 关键词与主题词结果合并，得到文章的最终完整画像
article_profile = article_kewords.join(article_topics, article_kewords.article_id==article_topics.article_id2).select(["article_id", "channel_id", "keywords", "topics"])
