## 1. 合并数据

In [1]:
import os
import sys
# 如果当前代码文件运行测试需要加入修改路径，避免出现后导包问题
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(BASE_DIR))
print(BASE_DIR)
PYSPARK_PYTHON = "/miniconda2/envs/reco_sys/bin/python"
# 当存在多个版本时，不指定很可能会导致出错
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON
from offline import SparkSessionBase

/root/toutiao_project/reco_sys


In [2]:
class OriginArticleData(SparkSessionBase):
    
    SPARK_APP_NAME = "mergeArticle"
    SPARK_URL = "spark://hadoop-master:7077"

    ENABLE_HIVE_SUPPORT = True
    
    def __init__(self):
        self.spark = self._create_spark_session()

In [3]:
oa = OriginArticleData()

In [4]:
# 进行文章 前两个表 的合并
oa.spark.sql("use toutiao")
# news_article_basic 与news_article_content, article_id
titlce_content = oa.spark.sql("select a.article_id, a.channel_id, a.title, b.content from news_article_basic a inner join news_article_content b on a.article_id=b.article_id where a.article_id=116636")


In [5]:
titlce_content.show()

+----------+----------+---------------+--------------------+
|article_id|channel_id|          title|             content|
+----------+----------+---------------+--------------------+
|    116636|        18|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|
+----------+----------+---------------+--------------------+



In [6]:
# 进行title_content 与 文章频道名称合并
titlce_content.registerTempTable('temptable')

channel_title_content = oa.spark.sql("select t.*, n.channel_name from temptable t left join news_channel n on t.channel_id=n.channel_id")




In [7]:
channel_title_content.show()

+----------+----------+---------------+--------------------+------------+
|article_id|channel_id|          title|             content|channel_name|
+----------+----------+---------------+--------------------+------------+
|    116636|        18|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|      python|
+----------+----------+---------------+--------------------+------------+



In [8]:
# 合并三个内容到一个字符串
import pyspark.sql.functions as F

sentence_df = channel_title_content.select("article_id", "channel_id", "channel_name", "title", "content", 
                            F.concat_ws(',', 
                                       channel_title_content.channel_name,
                                       channel_title_content.title,
                                       channel_title_content.content).alias('sentence'))


In [9]:
sentence_df.show()
# sentence_df.write.insertInto("article_data")

+----------+----------+------------+---------------+--------------------+--------------------+
|article_id|channel_id|channel_name|          title|             content|            sentence|
+----------+----------+------------+---------------+--------------------+--------------------+
|    116636|        18|      python|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|python,动态再平衡投资策略历...|
+----------+----------+------------+---------------+--------------------+--------------------+



## 2. 文章分词

In [10]:

# 读取文章，进行每篇张分词
oa.spark.sql("use article")
# article_data = oa.spark.sql("select * from article_data limit 10")
article_data = oa.spark.sql("select * from article_data where channel_id = 18 limit 10")
article_data.show()


+----------+----------+------------+--------------------+--------------------+--------------------+
|article_id|channel_id|channel_name|               title|             content|            sentence|
+----------+----------+------------+--------------------+--------------------+--------------------+
|     12237|        18|      python|想学习区块链？那就用 Python...|<div id="article_...|python,想学习区块链？那就用...|
|     12238|        18|      python|鲜为人知的 Python 语法 使...|<p>所有人（好吧，不是所有人）都...|python,鲜为人知的 Pyth...|
|     12243|        18|      python|手把手教你写网络爬虫（4）：Scr...|<div id="cnblogs_...|python,手把手教你写网络爬虫...|
|     12245|        18|      python|手把手教你写网络爬虫（5）：Pha...|<div id="cnblogs_...|python,手把手教你写网络爬虫...|
|     12247|        18|      python|用 Plumbum 开发 Pyth...|<div id="article_...|python,用 Plumbum ...|
|     12249|        18|      python|手把手教你写网络爬虫（6）：分布式...|<div id="cnblogs_...|python,手把手教你写网络爬虫...|
|     12251|        18|      python|手把手教你写网络爬虫（7）：URL...|<p><a href="http:...|python,手把手教你写网络爬虫...|


In [11]:
# 文章数据进行分词处理,得到分词结果
# 分词
def segmentation(partition):
    import os
    import re

    import jieba
    import jieba.analyse
    import jieba.posseg as pseg
    import codecs

    abspath = "/root/words"

    # 结巴加载用户词典
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # 停用词文本
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords_list():
        """返回stopwords列表"""
        stopwords_list = [i.strip()
                          for i in codecs.open(stopwords_path).readlines()]
        return stopwords_list

    # 所有的停用词列表
    stopwords_list = get_stopwords_list()

    # 分词
    def cut_sentence(sentence):
        """对切割之后的词语进行过滤，去除停用词，保留名词，英文和自定义词库中的词，长度大于2的词"""
        # print(sentence,"*"*100)
        # eg:[pair('今天', 't'), pair('有', 'd'), pair('雾', 'n'), pair('霾', 'g')]
        seg_list = pseg.lcut(sentence)
        seg_list = [i for i in seg_list if i.flag not in stopwords_list]
        filtered_words_list = []
        for seg in seg_list:
            # print(seg)
            if len(seg.word) <= 1:
                continue
            elif seg.flag == "eng":
                if len(seg.word) <= 2:
                    continue
                else:
                    filtered_words_list.append(seg.word)
            elif seg.flag.startswith("n"):
                filtered_words_list.append(seg.word)
            elif seg.flag in ["x", "eng"]:  # 是自定一个词语或者是英文单词
                filtered_words_list.append(seg.word)
        return filtered_words_list

    for row in partition:
        sentence = re.sub("<.*?>", "", row.sentence)    # 替换掉标签数据
        words = cut_sentence(sentence)
        yield row.article_id, row.channel_id, words


In [12]:
words_df = article_data.rdd.mapPartitions(segmentation).toDF(['article_id', 'channel_id', 'words'])

In [13]:
words_df.show()

+----------+----------+--------------------+
|article_id|channel_id|               words|
+----------+----------+--------------------+
|     12237|        18|[python, 区块链, Pyt...|
|     12238|        18|[python, Python, ...|
|     12243|        18|[python, 手把手, 网络,...|
|     12245|        18|[python, 手把手, 网络,...|
|     12247|        18|[python, Plumbum,...|
|     12249|        18|[python, 手把手, 网络,...|
|     12251|        18|[python, 手把手, 网络,...|
|     12252|        18|[python, 手把手, 网络,...|
|     12253|        18|[python, 豆瓣, 大家, ...|
|     12254|        18|[python, Python, ...|
|     12255|        18|[python, shell, P...|
|     12256|        18|[python, Python, ...|
|     12257|        18|[python, Python, ...|
|     12258|        18|[python, Python, ...|
|     12259|        18|[python, Python, ...|
|     12260|        18|[python, 爬虫, 进阶, ...|
|     12261|        18|[python, Python, ...|
|     12262|        18|[python, Python, ...|
|     12263|        18|[python, Python, ...|
|     1226

## 3. 得到词频CV模型

In [14]:
# 先计算分词之后的每篇文章的词频，得到CV模型
# 统计所有文章不同的词，组成一个词列表 words_list = [1,2,3,,34,4,45,56,67,78,8.......,,,,.]
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol='words', outputCol='countFeatures', vocabSize=2000, minDF=1.0)
cv_model = cv.fit(words_df)

# 然后根据词频计算IDF以及词，得到IDF模型


In [15]:
cv_model.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/test.model")

In [16]:
from pyspark.ml.feature import CountVectorizerModel
cv_m = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/test.model")

In [17]:
cv_result = cv_m.transform(words_df)

In [18]:
cv_result.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|               words|       countFeatures|
+----------+----------+--------------------+--------------------+
|     12237|        18|[python, 区块链, Pyt...|(2000,[0,1,2,3,4,...|
|     12238|        18|[python, Python, ...|(2000,[0,1,4,5,7,...|
|     12243|        18|[python, 手把手, 网络,...|(2000,[0,2,3,4,5,...|
|     12245|        18|[python, 手把手, 网络,...|(2000,[0,1,3,5,6,...|
|     12247|        18|[python, Plumbum,...|(2000,[0,1,2,3,4,...|
|     12249|        18|[python, 手把手, 网络,...|(2000,[0,3,9,11,1...|
|     12251|        18|[python, 手把手, 网络,...|(2000,[0,5,9,10,1...|
|     12252|        18|[python, 手把手, 网络,...|(2000,[0,3,5,9,12...|
|     12253|        18|[python, 豆瓣, 大家, ...|(2000,[0,1,2,3,6,...|
|     12254|        18|[python, Python, ...|(2000,[0,1,3,5,8,...|
|     12255|        18|[python, shell, P...|(2000,[0,3,5,7,9,...|
|     12256|        18|[python, Python, ...|(2000,[0,5,7,9,20...|
|     1225

## 4. 得到IDF模型

In [19]:
# IDF 模型
from pyspark.ml.feature import IDF
idf = IDF(inputCol="countFeatures", outputCol="idfFeatures")
idfModel = idf.fit(cv_result)
idfModel.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/testIDF.model")

In [20]:
# 可以进行转换
cv_m.vocabulary

['&#',
 'print',
 '__',
 'pa',
 'self',
 'Python',
 'name',
 '代码',
 'def',
 'python',
 '函数',
 'ul',
 '方法',
 '数据',
 'import',
 '对象',
 'for',
 'return',
 '.a',
 '参数',
 '文件',
 'class',
 '结果',
 'data',
 'list',
 '列表',
 'get',
 '字符串',
 'com',
 'True',
 '模块',
 'from',
 'url',
 '问题',
 '元素',
 '内容',
 'str',
 'time',
 'the',
 '时候',
 '信息',
 '定义',
 'None',
 'res',
 'key',
 '类型',
 '方式',
 '属性',
 'request',
 '.h',
 'value',
 '程序',
 '字典',
 'test',
 '用户',
 'range',
 'https',
 'else',
 'type',
 'input',
 '模型',
 'init',
 'user',
 'False',
 'index',
 'set',
 'dict',
 'age',
 'http',
 'text',
 'int',
 'and',
 'num',
 '功能',
 'amp',
 '时间',
 'file',
 'main',
 'len',
 'utf',
 '项目',
 'count',
 'start',
 'func',
 '情况',
 'args',
 '字符',
 '线程',
 '进程',
 'object',
 'json',
 'not',
 '实例',
 'with',
 'item',
 'open',
 'www',
 '目录',
 '数字',
 'format',
 'content',
 'new',
 'app',
 '爬虫',
 'info',
 'django',
 'response',
 'plt',
 '命令',
 'ppend',
 'title',
 '索引',
 '过程',
 '版本',
 'while',
 '语句',
 '语言',
 'html',
 'line',
 'run',

In [21]:
idfModel.idf.toArray()[:20]

array([2.29385141, 0.66970014, 1.17276627, 0.61771575, 1.53059209,
       0.49503521, 0.91052588, 0.62439357, 0.88146756, 0.        ,
       0.91251001, 0.83376675, 0.70376959, 0.90731005, 0.75566754,
       1.13267884, 0.82145188, 1.06220192, 0.91624089, 1.0354578 ])

## 5.依据CV及IDF得到TF-IDF

In [22]:
# IDF对CV结果进行计算TFIDF
from pyspark.ml.feature import IDFModel
idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/testIDF.model")
tfidf_res = idf_model.transform(cv_result)

In [23]:
tfidf_res.show()

+----------+----------+--------------------+--------------------+--------------------+
|article_id|channel_id|               words|       countFeatures|         idfFeatures|
+----------+----------+--------------------+--------------------+--------------------+
|     12237|        18|[python, 区块链, Pyt...|(2000,[0,1,2,3,4,...|(2000,[0,1,2,3,4,...|
|     12238|        18|[python, Python, ...|(2000,[0,1,4,5,7,...|(2000,[0,1,4,5,7,...|
|     12243|        18|[python, 手把手, 网络,...|(2000,[0,2,3,4,5,...|(2000,[0,2,3,4,5,...|
|     12245|        18|[python, 手把手, 网络,...|(2000,[0,1,3,5,6,...|(2000,[0,1,3,5,6,...|
|     12247|        18|[python, Plumbum,...|(2000,[0,1,2,3,4,...|(2000,[0,1,2,3,4,...|
|     12249|        18|[python, 手把手, 网络,...|(2000,[0,3,9,11,1...|(2000,[0,3,9,11,1...|
|     12251|        18|[python, 手把手, 网络,...|(2000,[0,5,9,10,1...|(2000,[0,5,9,10,1...|
|     12252|        18|[python, 手把手, 网络,...|(2000,[0,3,5,9,12...|(2000,[0,3,5,9,12...|
|     12253|        18|[python, 豆瓣, 大家, ...

In [24]:
# 1265词的 {索引 以及 权重}
def func(partition):
    TOPK = 20
    for row in partition:
        # 找到索引与IDF值并进行排序
        _ = list(zip(row.idfFeatures.indices, row.idfFeatures.values))
        _ = sorted(_, key=lambda x: x[1], reverse=True)
        result = _[:TOPK]
        for word_index, tfidf in result:
            yield row.article_id, row.channel_id, int(word_index), round(float(tfidf), 4)
kewords_tfidf = tfidf_res.rdd.mapPartitions(func).toDF(['article_id', 'channel_id', 'index', 'weights'])

In [25]:
kewords_tfidf.show()

+----------+----------+-----+---------+
|article_id|channel_id|index|  weights|
+----------+----------+-----+---------+
|     12237|        18|    0|1626.3407|
|     12237|        18| 1115| 561.4683|
|     12237|        18|  848| 458.1074|
|     12237|        18|  296| 285.3019|
|     12237|        18| 1090| 175.9988|
|     12237|        18|  911| 175.8358|
|     12237|        18|  909| 169.7768|
|     12237|        18| 1221| 168.8114|
|     12237|        18|  350| 167.0493|
|     12237|        18| 1473| 166.4656|
|     12237|        18|  101|  158.973|
|     12237|        18| 1381| 157.2973|
|     12237|        18|  931|  152.294|
|     12237|        18|   38| 145.3656|
|     12237|        18|    4| 119.3862|
|     12237|        18|  262| 119.3606|
|     12237|        18|  159|  98.8927|
|     12237|        18|  333|  94.1979|
|     12237|        18| 1203|  89.5603|
|     12237|        18|  408|  86.6918|
+----------+----------+-----+---------+
only showing top 20 rows



In [26]:
# 利用keywordsIndex = ktt.spark.sql("select keyword, index idx from idf_keywords_values")中标，知道索引对应的词
idf_keywords_values = oa.spark.sql("select keyword, index idx from idf_keywords_values")

In [27]:
keyword_str_tfidf = kewords_tfidf.join(idf_keywords_values, idf_keywords_values.idx==kewords_tfidf.index).select(["article_id", "channel_id", "keyword", "weights"])

keyword_str_tfidf.show()

+----------+----------+-------+-------+
|article_id|channel_id|keyword|weights|
+----------+----------+-------+-------+
|     14450|        18|    var|  1.332|
|     14509|        18|    var|11.9878|
|     14541|        18|    var|10.6558|
|     14564|        18|    var|14.6517|
|     14569|        18|    var|10.6558|
|     14597|        18|    var| 3.9959|
|     14730|        18|    var|13.3198|
|     14841|        18|    var|13.3198|
|     14864|        18|    var| 3.9959|
|     14936|        18|    var|59.9389|
|     15114|        18|    var|37.2953|
|     15218|        18|    var| 3.9959|
|     15258|        18|    var|27.9715|
|     15275|        18|    var|  1.332|
|     15364|        18|    var|65.2668|
|     15643|        18|    var| 3.9959|
|     15660|        18|    var|18.6477|
|     15693|        18|    var|50.6151|
|     15754|        18|    var| 3.9959|
|     15757|        18|    var|37.2953|
+----------+----------+-------+-------+
only showing top 20 rows



## 6. 得到TextRank

In [28]:
# texrank
# 分词
def textrank(partition):
    import os

    import jieba
    import jieba.analyse
    import jieba.posseg as pseg
    import codecs

    abspath = "/root/words"

    # 结巴加载用户词典
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # 停用词文本
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords_list():
        """返回stopwords列表"""
        stopwords_list = [i.strip()
                          for i in codecs.open(stopwords_path).readlines()]
        return stopwords_list

    # 所有的停用词列表
    stopwords_list = get_stopwords_list()

    class TextRank(jieba.analyse.TextRank):
        def __init__(self, window=20, word_min_len=2):
            super(TextRank, self).__init__()
            self.span = window  # 窗口大小
            self.word_min_len = word_min_len  # 单词的最小长度
            # 要保留的词性，根据jieba github ，具体参见https://github.com/baidu/lac
            self.pos_filt = frozenset(
                ('n', 'x', 'eng', 'f', 's', 't', 'nr', 'ns', 'nt', "nw", "nz", "PER", "LOC", "ORG"))

        def pairfilter(self, wp):
            """过滤条件，返回True或者False"""

            if wp.flag == "eng":
                if len(wp.word) <= 2:
                    return False

            if wp.flag in self.pos_filt and len(wp.word.strip()) >= self.word_min_len \
                    and wp.word.lower() not in stopwords_list:
                return True
    # TextRank过滤窗口大小为5，单词最小为2
    textrank_model = TextRank(window=5, word_min_len=2)
    allowPOS = ('n', "x", 'eng', 'nr', 'ns', 'nt', "nw", "nz", "c")

    for row in partition:
        tags = textrank_model.textrank(row.sentence, topK=20, withWeight=True, allowPOS=allowPOS, withFlag=False)
        for tag in tags:
            yield row.article_id, row.channel_id, tag[0], tag[1]

In [29]:
textrank = article_data.rdd.mapPartitions(textrank).toDF(["article_id", "channel_id", "keyword", "textrank"])

In [30]:
textrank.show()

+----------+----------+-------+-------------------+
|article_id|channel_id|keyword|           textrank|
+----------+----------+-------+-------------------+
|     12237|        18| crayon|                1.0|
|     12237|        18|  class| 0.9501036070450275|
|     12237|        18|     pa| 0.6992489821528144|
|     12237|        18|    div|0.45287701701165306|
|     12237|        18|     &#|0.33654037380344864|
|     12237|        18|   line|0.20897055189510236|
|     12237|        18|     区块|0.14429613523841756|
|     12237|        18|    num|0.13727844489573357|
|     12237|        18|     节点|0.10960152318431637|
|     12237|        18| button|0.10778796016549291|
|     12237|        18|   code|0.10613517820935363|
|     12237|        18|striped|0.09980116915318452|
|     12237|        18|    区块链|0.09680906050323575|
|     12237|        18|   data|0.09083121725478936|
|     12237|        18|     交易|0.08936496548092669|
|     12237|        18|  chain|0.08366252481157677|
|     12237|

## 7.依据TF-IDF及TextRank得到词及其权重

In [31]:
# 计算关键词最后的权重，Textank  * IDF
idf = oa.spark.sql("select * from idf_keywords_values")
idf = idf.withColumnRenamed("keyword", "keyword1")
result = textrank.join(idf,textrank.keyword==idf.keyword1)
keywords_res = result.withColumn("weights", result.textrank * result.idf).select(["article_id", "channel_id", "keyword", "weights"])

In [32]:
# 20个Keyword，对应的权重，文章ID，channel_id
keywords_res.show()

+----------+----------+--------+-------------------+
|article_id|channel_id| keyword|            weights|
+----------+----------+--------+-------------------+
|     49531|        18|    BLEU| 2.6220949820228965|
|     15259|        18|      C#| 0.8302318750634707|
|     16217|        18|      C#| 0.2298910720861784|
|     16331|        18|      C#|0.28636277071116106|
|     16344|        18|      C#| 1.0336865923547185|
|     17360|        18|      C#| 1.2997082956979977|
|     17802|        18|      C#|0.24716768943850292|
|     49792|        18|      C#| 0.6959435616798894|
|     13285|        18|      C#| 2.2848330028444113|
|     13506|        18|      C#|   1.01260771691963|
|     13579|        18|      C#|   1.39950622733909|
|     13876|        18|      C#| 0.8821074765951309|
|     18506|        18|      C#| 0.5767786169503137|
|     19053|        18|      C#|0.30589550535607357|
|     44577|        18|      C#| 3.9274063388273364|
|     44660|        18|      C#|  4.0354060280

In [33]:
keywords_res.registerTempTable("temptable")

In [34]:
keyword_weights_list = oa.spark.sql("select article_id, min(channel_id) channel_id, collect_list(keyword) keywords, collect_list(weights) weights from temptable group by article_id")

In [35]:
keyword_weights_list.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|            keywords|             weights|
+----------+----------+--------------------+--------------------+
|     15237|        18|[input, TANGRAM, ...|[0.22325206546519...|
|     17703|        18|[hljs, h2, Hannin...|[3.99268223953728...|
|     17971|        18|[hljs, imageView2...|[3.18293838018398...|
|     18730|        18|[hljs, jwt, 资源, 信...|[4.62965411949731...|
|     19141|        18|[hljs, 表达式, .h, c...|[5.00926646711115...|
|     19163|        18|[hljs, .h, code, ...|[3.64623759402293...|
|    117936|        18|[序列, 赋值, code, 元组...|[1.10120084310549...|
|    133164|        18|[transaction, ser...|[0.76851402647143...|
|     13098|        18|[repr, getPrice, ...|[0.63265901177161...|
|     14719|        18|[import, Thread, ...|[0.62073537067287...|
|     15322|        18|[import, code, ti...|[0.27776326224116...|
|     18295|        18|[routing, queue, ...|[1.94772357894299...|
|    11718

In [36]:
def keyword_weights_to_dict(row):
    return row.article_id, row.channel_id, dict(zip(row.keywords, row.weights))
    
keywords = keyword_weights_list.rdd.map(keyword_weights_to_dict).toDF(['article_id', 'channel_id', 'keywords'])

In [37]:
keywords.show()

+----------+----------+--------------------+
|article_id|channel_id|            keywords|
+----------+----------+--------------------+
|     15237|        18|Map(pre -> 0.5349...|
|     17703|        18|Map(函数 -> 0.79761...|
|     17971|        18|Map(imageView2 ->...|
|     18730|        18|Map(hljs -> 4.629...|
|     19141|        18|Map(函数 -> 0.22260...|
|     19163|        18|Map(对象 -> 0.20687...|
|    117936|        18|Map(pre -> 1.4263...|
|    133164|        18|Map(static -> 0.1...|
|     13098|        18|Map(pre -> 0.6040...|
|     14719|        18|Map(pre -> 0.8814...|
|     15322|        18|Map(pre -> 0.5762...|
|     18295|        18|Map(method -> 0.6...|
|    117180|        18|Map(pre -> 1.7729...|
|    118448|        18|Map(反序列化 -> 1.786...|
|    118628|        18|Map(pre -> 1.5351...|
|     49586|        18|Map(lightbox -> 2...|
|    117264|        18|Map(static -> 0.4...|
|    118408|        18|Map(对象 -> 0.88355...|
|    133142|        18|Map(imageView2 ->...|
|     1584

## 8.依据TF-IDF及TextRank得到主题词

In [38]:
topic_sql = """
                select t.article_id article_id2, collect_set(t.keyword) topics from tfidf_keywords_values t
                inner join 
                textrank_keywords_values r
                where t.keyword=r.keyword
                group by article_id2
                """

article_topics = oa.spark.sql(topic_sql)

In [39]:
#article_topics.show()

## 9. 依据词权重和主题词得到文章画像

In [40]:
article_profile = keywords.join(article_topics, keywords.article_id==article_topics.article_id2).select(["article_id", "channel_id", "keywords", "topics"])


In [42]:
article_profile.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|            keywords|              topics|
+----------+----------+--------------------+--------------------+
|     13098|        18|Map(pre -> 0.6040...|[__, object, self...|
|     13248|        18|Map(有限元 -> 5.2929...|[npk1, eindex, 有限...|
|     13401|        18|Map(pre -> 0.2100...|[补码, 字符串, 李白, typ...|
|     13723|        18|Map(pre -> 2.1094...|[lis2, acc, bstr,...|
|     14719|        18|Map(pre -> 0.8814...|[__, ctime, Sep, ...|
|     14846|        18|Map(__ -> 2.54674...|[__, folders, fil...|
|     15173|        18|Map(人人 -> 0.73972...|[filecookiejar, c...|
|     15194|        18|Map(dif -> 0.7567...|[video2, display,...|
|     15237|        18|Map(pre -> 0.5349...|[__, send, try, s...|
|     15322|        18|Map(pre -> 0.5762...|[Pclass, replace,...|
|     15375|        18|Map(pre -> 1.7091...|[内存地址, list2, 浅拷贝...|
|     15432|        18|Map(模式 -> 0.44872...|[内存, 显示文件, 读后写, y...|
|     1543

# 10. 词向量模型训练

In [43]:
# 通过少量数据来演示训练
from pyspark.ml.feature import Word2Vec


w2v = Word2Vec(vectorSize=100, inputCol='words', outputCol='model', minCount=3)
w2v_model = w2v.fit(words_df)
w2v_model.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/test.word2vec")

In [44]:
article_profile.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|            keywords|              topics|
+----------+----------+--------------------+--------------------+
|     13098|        18|Map(pre -> 0.6040...|[__, object, self...|
|     13248|        18|Map(有限元 -> 5.2929...|[npk1, eindex, 有限...|
|     13401|        18|Map(pre -> 0.2100...|[补码, 字符串, 李白, 元素,...|
|     13723|        18|Map(pre -> 2.1094...|[lis2, acc, bstr,...|
|     14719|        18|Map(pre -> 0.8814...|[__, ctime, Sep, ...|
|     14846|        18|Map(__ -> 2.54674...|[folders, files, ...|
|     15173|        18|Map(人人 -> 0.73972...|[filecookiejar, c...|
|     15194|        18|Map(dif -> 0.7567...|[video2, display,...|
|     15237|        18|Map(pre -> 0.5349...|[send, __, try, s...|
|     15322|        18|Map(pre -> 0.5762...|[Pclass, replace,...|
|     15375|        18|Map(pre -> 1.7091...|[内存地址, 浅拷贝, list2...|
|     15432|        18|Map(模式 -> 0.44872...|[内存, 显示文件, 读后写, y...|
|     1543

In [45]:
# 求出增量文章的词向量，增量文章 一共10篇文章
# 1、加载某个频道模型，得到每个词的向量
from pyspark.ml.feature import Word2VecModel

word_vec = Word2VecModel.load("hdfs://hadoop-master:9000/headlines/models/test.word2vec")
vectors = word_vec.getVectors()

In [46]:
vectors.show()

+------------------+--------------------+
|              word|              vector|
+------------------+--------------------+
|                广义|[-0.3124968707561...|
|                钟爱|[0.03830356895923...|
|c1c3387c24028915fc|[-0.0023575632367...|
|          failCnt0|[0.02495069056749...|
|       freeman1974|[-0.0393065623939...|
|                伙伴|[-0.1904211044311...|
|  testStationarity|[-0.0568049065768...|
|                箭头|[0.03213853389024...|
|        fieldsfrom|[-0.0137809459120...|
|      RoundrobinLB|[-0.0076495949178...|
|              COCO|[-0.0488479435443...|
|                拜拜|[0.02235601097345...|
|          quotient|[0.23557169735431...|
|                货币|[-0.1088096722960...|
|                人物|[0.13329826295375...|
|               wsy|[-0.0372068472206...|
|           serious|[0.06451868265867...|
|               跨进程|[0.03020830452442...|
|        fromParams|[0.01803738623857...|
|        MongoDB数据库|[0.04348663613200...|
+------------------+--------------

## 11. 得到文章的词及权重并展开

In [47]:
# 获取频道的文章画像，得到文章画像的关键词，找到这些文章关键词对应词向量
python_article_profile = article_profile.filter('channel_id=18')

In [48]:
python_article_profile.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|            keywords|              topics|
+----------+----------+--------------------+--------------------+
|     13098|        18|Map(pre -> 0.6040...|[__, object, self...|
|     13248|        18|Map(有限元 -> 5.2929...|[npk1, eindex, 有限...|
|     13401|        18|Map(pre -> 0.2100...|[补码, 字符串, 李白, 元素,...|
|     13723|        18|Map(pre -> 2.1094...|[lis2, acc, bstr,...|
|     14719|        18|Map(pre -> 0.8814...|[__, ctime, Sep, ...|
|     14846|        18|Map(__ -> 2.54674...|[folders, __, fil...|
|     15173|        18|Map(人人 -> 0.73972...|[filecookiejar, c...|
|     15194|        18|Map(dif -> 0.7567...|[video2, display,...|
|     15237|        18|Map(pre -> 0.5349...|[__, send, try, s...|
|     15322|        18|Map(pre -> 0.5762...|[Pclass, replace,...|
|     15375|        18|Map(pre -> 1.7091...|[内存地址, 浅拷贝, list2...|
|     15432|        18|Map(模式 -> 0.44872...|[内存, 显示文件, 读后写, y...|
|     1543

In [49]:
# 将文章画像的字典， 词语与权重 进行展开
python_article_profile.registerTempTable('profile')

_articlekeywordsweight = oa.spark.sql("select article_id,  channel_id, keyword, weight from profile LATERAL VIEW explode(keywords) AS keyword, weight")


_articlekeywordsweight.show()

+----------+----------+--------+-------------------+
|article_id|channel_id| keyword|             weight|
+----------+----------+--------+-------------------+
|     13098|        18|      __| 2.5401122038114203|
|     13098|        18|    repr| 0.6326590117716192|
|     13098|        18|     pre| 0.6040062287555379|
|     13098|        18|      属性|0.23645924932468856|
|     13098|        18|    code| 0.9531379029975557|
|     13098|        18|     def| 0.5063435861497416|
|     13098|        18|      定义| 0.1554380122061322|
|     13098|        18|   color| 1.1337936117177925|
|     13098|        18| Student| 0.5033771372284416|
|     13098|        18|getPrice| 0.7404427038950527|
|     13098|        18|      方法|0.08080845613717194|
|     13098|        18|     div| 0.3434819820586186|
|     13098|        18|     str|0.35999033790156054|
|     13098|        18|      pa| 0.6651385256756351|
|     13098|        18|   slots| 0.6992789472129189|
|     13098|        18| cnblogs|0.339265861020

## 12. 合并词向量和 文章词及权重

In [51]:
# 将文章词权重 和 词向量 模型 进行 inner 合并
article_keyword_vec_weights = _articlekeywordsweight.join(vectors, vectors.word==_articlekeywordsweight.keyword, "inner")

In [52]:
article_keyword_vec_weights.show()

+----------+----------+--------+-------------------+--------+--------------------+
|article_id|channel_id| keyword|             weight|    word|              vector|
+----------+----------+--------+-------------------+--------+--------------------+
|     13098|        18|      __| 2.5401122038114203|      __|[-0.6546941995620...|
|     13098|        18|    repr| 0.6326590117716192|    repr|[-0.4608801603317...|
|     13098|        18|     pre| 0.6040062287555379|     pre|[-0.1207879632711...|
|     13098|        18|      属性|0.23645924932468856|      属性|[-0.4689875245094...|
|     13098|        18|    code| 0.9531379029975557|    code|[0.10663777589797...|
|     13098|        18|     def| 0.5063435861497416|     def|[-0.2050044685602...|
|     13098|        18|      定义| 0.1554380122061322|      定义|[-0.1936927884817...|
|     13098|        18|   color| 1.1337936117177925|   color|[-0.2678930759429...|
|     13098|        18| Student| 0.5033771372284416| Student|[-0.3861369490623...|
|   

## 13. 文章向量

In [54]:
# 得到文章向量
article_keyword_vec_weights.registerTempTable("temptable")

def func(row):
    x = 0
    for v in row.vectors:
        x += v
        
    return row.article_id, row.channel_id, x / len(row.vectors)

article_vector = oa.spark.sql("select article_id, min(channel_id) channel_id, collect_set(vector) vectors from temptable group by article_id").rdd.map(func).toDF(['article_id', 'channel_id', 'articlevector'])


In [55]:
article_vector.show()
# article_vector.write.insertInto("article_vector")

+----------+----------+--------------------+
|article_id|channel_id|       articlevector|
+----------+----------+--------------------+
|     13098|        18|[-0.1647327722050...|
|     13248|        18|[-0.0209909798577...|
|     13401|        18|[-0.1170251129660...|
|     13723|        18|[-0.0636651790235...|
|     14719|        18|[-0.1332618249114...|
|     14846|        18|[-0.0216992553789...|
|     15173|        18|[-0.0235749806277...|
|     15194|        18|[0.10394728556275...|
|     15237|        18|[0.19160119506220...|
|     15322|        18|[-0.0103950041934...|
|     15375|        18|[-0.1530114244669...|
|     15432|        18|[-0.0594050147570...|
|     15437|        18|[-0.1329826050273...|
|     15846|        18|[0.07257945776769...|
|     17499|        18|[0.05329569820314...|
|     17703|        18|[0.00514154327347...|
|     17971|        18|[-0.0809452034729...|
|     17979|        18|[0.01235814888625...|
|     18147|        18|[-0.1417260870435...|
|     1819

## 14.根据文章向量计算文章相似度

In [None]:
from pyspark.ml.linalg import Vectors
def _array_to_vector(row):
    return row.article_id, Vectors.dense(row.articlevector)

train = article_vector.rdd.map(_array_to_vector).toDF(['article_id', 'article_vector'])

In [None]:
train

In [None]:
# BRP进行fit
from pyspark.ml.feature import BucketedRandomProjectionLSH

BRP = BucketedRandomProjectionLSH(inputCol='article_vector', outputCol='hashes', numHashTables=4.0, bucketLength=10.0)
model = BRP.fit(train)

In [None]:
similar = model.approxSimilarityJoin(train, train, 2.0, distCol='EuclideanDistance')

In [None]:
similar.sort(['EuclideanDistance']).show()

## 15. 保存文章相似度

In [None]:
def save_hbase(partition):
    import happybase
    pool = happybase.ConnectionPool(size=3, host='hadoop-master')
    
    with pool.connection() as conn:
        # 建议表的连接
        table = conn.table('article_similar')
        for row in partition:
            if row.datasetA.article_id == row.datasetB.article_id:
                pass
            else:
                table.put(str(row.datasetA.article_id).encode(),
                         {"similar:{}".format(row.datasetB.article_id).encode(): b'%0.4f' % (row.EuclideanDistance)})
        # 手动关闭所有的连接
        conn.close()

similar.foreachPartition(save_hbase)