In [1]:
import os
import sys
# 如果当前代码文件运行测试需要加入修改路径，避免出现后导包问题
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(BASE_DIR))
print(BASE_DIR)
PYSPARK_PYTHON = "/miniconda2/envs/reco_sys/bin/python"
# 当存在多个版本时，不指定很可能会导致出错
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON
from offline import SparkSessionBase

/root/toutiao_project/reco_sys


In [2]:
# - 初始化spark信息
class OriginArticleData(SparkSessionBase):
    SPARK_APP_NAME = 'originaldata'
    SPARK_URL = 'yarn'
    ENABLE_HIVE_SUPPORT = True
    
    def __init__(self):
        self.spark = self._create_spark_session()

In [3]:
oa = OriginArticleData()

In [4]:
oa.spark.sql("use toutiao")

DataFrame[]

In [6]:
# 读取new_basic, news_content两张表合并，取出article_id, channel_id, title, content
basic_content = oa.spark.sql("select a.article_id, a.channel_id, a.title, b.content from news_article_basic a inner join news_article_content b on a.article_id=b.article_id where a.article_id=116636")



In [7]:
basic_content.show()

+----------+----------+---------------+--------------------+
|article_id|channel_id|          title|             content|
+----------+----------+---------------+--------------------+
|    116636|        18|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|
+----------+----------+---------------+--------------------+



In [9]:
# 文章内容去按照channel_id取出channel_name合并
basic_content.registerTempTable("temptable")
channel_basic_content = oa.spark.sql("select t.*, n.channel_name from temptable t left join news_channel n on t.channel_id=n.channel_id")




In [10]:
channel_basic_content.show()

+----------+----------+---------------+--------------------+------------+
|article_id|channel_id|          title|             content|channel_name|
+----------+----------+---------------+--------------------+------------+
|    116636|        18|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|      python|
+----------+----------+---------------+--------------------+------------+



In [12]:
# 合并文章的一些词语到一个完整内容
import pyspark.sql.functions as F

oa.spark.sql("use article")
sentence_df = channel_basic_content.select("article_id", "channel_id", "channel_name", "title", "content", 
                             F.concat_ws(",", 
                                        channel_basic_content.channel_name,
                                        channel_basic_content.title,
                                        channel_basic_content.content).alias('sentence'))





In [13]:
sentence_df.show()

+----------+----------+------------+---------------+--------------------+--------------------+
|article_id|channel_id|channel_name|          title|             content|            sentence|
+----------+----------+------------+---------------+--------------------+--------------------+
|    116636|        18|      python|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|python,动态再平衡投资策略历...|
+----------+----------+------------+---------------+--------------------+--------------------+



In [14]:
import gc
del basic_content
del channel_basic_content
gc.collect()

718

In [4]:
# 计算TFIDF
oa.spark.sql("use article")

article_data = oa.spark.sql("select * from article_data where channel_id=18 limit 10")

In [6]:
article_data.show()

+----------+----------+------------+--------------------+--------------------+--------------------+
|article_id|channel_id|channel_name|               title|             content|            sentence|
+----------+----------+------------+--------------------+--------------------+--------------------+
|     12237|        18|      python|想学习区块链？那就用 Python...|<div id="article_...|python,想学习区块链？那就用...|
|     12238|        18|      python|鲜为人知的 Python 语法 使...|<p>所有人（好吧，不是所有人）都...|python,鲜为人知的 Pyth...|
|     12243|        18|      python|手把手教你写网络爬虫（4）：Scr...|<div id="cnblogs_...|python,手把手教你写网络爬虫...|
|     12245|        18|      python|手把手教你写网络爬虫（5）：Pha...|<div id="cnblogs_...|python,手把手教你写网络爬虫...|
|     12247|        18|      python|用 Plumbum 开发 Pyth...|<div id="article_...|python,用 Plumbum ...|
|     12249|        18|      python|手把手教你写网络爬虫（6）：分布式...|<div id="cnblogs_...|python,手把手教你写网络爬虫...|
|     12251|        18|      python|手把手教你写网络爬虫（7）：URL...|<p><a href="http:...|python,手把手教你写网络爬虫...|


In [5]:
# 分词
def segmentation(partition):
    import os
    import re

    import jieba
    import jieba.analyse
    import jieba.posseg as pseg
    import codecs

    abspath = "/root/words"

    # 结巴加载用户词典
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # 停用词文本
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords_list():
        """返回stopwords列表"""
        stopwords_list = [i.strip()
                          for i in codecs.open(stopwords_path).readlines()]
        return stopwords_list

    # 所有的停用词列表
    stopwords_list = get_stopwords_list()

    # 分词
    def cut_sentence(sentence):
        """对切割之后的词语进行过滤，去除停用词，保留名词，英文和自定义词库中的词，长度大于2的词"""
        # print(sentence,"*"*100)
        # eg:[pair('今天', 't'), pair('有', 'd'), pair('雾', 'n'), pair('霾', 'g')]
        seg_list = pseg.lcut(sentence)
        seg_list = [i for i in seg_list if i.flag not in stopwords_list]
        filtered_words_list = []
        for seg in seg_list:
            # print(seg)
            if len(seg.word) <= 1:
                continue
            elif seg.flag == "eng":
                if len(seg.word) <= 2:
                    continue
                else:
                    filtered_words_list.append(seg.word)
            elif seg.flag.startswith("n"):
                filtered_words_list.append(seg.word)
            elif seg.flag in ["x", "eng"]:  # 是自定一个词语或者是英文单词
                filtered_words_list.append(seg.word)
        return filtered_words_list

    for row in partition:
        sentence = re.sub("<.*?>", "", row.sentence)    # 替换掉标签数据
        words = cut_sentence(sentence)
        yield row.article_id, row.channel_id, words

In [6]:
words_df = article_data.rdd.mapPartitions(segmentation).toDF(['article_id', 'channel_id', 'words'])

In [7]:
words_df.show()

+----------+----------+--------------------+
|article_id|channel_id|               words|
+----------+----------+--------------------+
|     12237|        18|[python, 区块链, Pyt...|
|     12238|        18|[python, Python, ...|
|     12243|        18|[python, 手把手, 网络,...|
|     12245|        18|[python, 手把手, 网络,...|
|     12247|        18|[python, Plumbum,...|
|     12249|        18|[python, 手把手, 网络,...|
|     12251|        18|[python, 手把手, 网络,...|
|     12252|        18|[python, 手把手, 网络,...|
|     12253|        18|[python, 豆瓣, 大家, ...|
|     12254|        18|[python, Python, ...|
+----------+----------+--------------------+



In [10]:
# 训练模型
from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer(inputCol='words', outputCol='countFeatures', vocabSize=200*10000, minDF=1.0)
cv_model = cv.fit(words_df)




In [11]:
cv_model.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/CV.model")

In [12]:
# 模型处理得到结果
from pyspark.ml.feature import CountVectorizerModel
cv_model = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/CV.model")
cv_result = cv_model.transform(words_df)


In [25]:
cv_result.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|               words|       countFeatures|
+----------+----------+--------------------+--------------------+
|         1|        17|[Vue, props, 用法, ...|(986,[1,2,3,4,10,...|
|         2|        17|[vue, 响应式, 原理, mo...|(986,[2,4,5,10,11...|
|         3|        17|[JavaScript, 浅拷贝,...|(986,[0,4,5,11,14...|
|         4|        17|[vue2, vuex, elem...|(986,[4,11,16,18,...|
|         5|        17|[immutability, Re...|(986,[4,5,10,11,1...|
|         6|        17|[node, npm, cnpm,...|(986,[0,2,4,10,11...|
|         7|        17|[Web, 工程师, 以太坊, 入...|(986,[2,4,6,8,10,...|
|         8|        17|[Web, pa, api, we...|(986,[2,4,11,13,1...|
|         9|        17|[vue, 中用, 数据驱动, 视...|(986,[2,4,5,12,14...|
|        10|        17|[程序, WebSocket, 长...|(986,[2,4,6,12,18...|
+----------+----------+--------------------+--------------------+



In [13]:
from pyspark.ml.feature import IDF
idf = IDF(inputCol="countFeatures", outputCol="idfFeatures")
idfModel = idf.fit(cv_result)
idfModel.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/IDF.model")

In [27]:
cv_model.vocabulary

['&#',
 'String',
 '代码',
 '作用域',
 'pa',
 'key',
 '客户端',
 'jedis',
 'public',
 'Hooks',
 '函数',
 'ul',
 '组件',
 'scope',
 'return',
 '模块',
 '方法',
 'import',
 '时候',
 'count',
 'res',
 '.h',
 'this',
 'java',
 '问题',
 'Long',
 'Override',
 'class',
 'com',
 '声明',
 'web',
 'name',
 '线程',
 'constructor',
 'value',
 '逻辑',
 'props',
 'useEffect',
 'node',
 'start',
 '插件',
 '项目',
 'field',
 'rams',
 'vue',
 'useState',
 'arg',
 'Jedis',
 'event',
 'command',
 'jedisPool',
 '服务端',
 'action',
 '例子',
 '官方',
 'Enumeration',
 '参数',
 'state',
 'util',
 'close',
 'function',
 '情况',
 'catch',
 'title',
 'const',
 '文件',
 'new',
 'set',
 'jedisCluster',
 'void',
 'redis',
 'getResource',
 'clients',
 '标识',
 'onParseClientResp',
 'Thread',
 'enu',
 '.a',
 'from',
 'hooks',
 '页面',
 '全局',
 'get',
 '大家',
 'end',
 'fireEvent',
 'react',
 'document',
 'times',
 'clicked',
 'You',
 'listener',
 '赋值',
 'Vector',
 'server',
 '结果',
 'client',
 'var',
 'bean',
 'isOnline',
 '降级',
 'tml',
 'android',
 'toast',
 'Strin

In [28]:
idfModel.idf.toArray()[:20]

array([1.29928298, 1.70474809, 0.31845373, 1.70474809, 0.        ,
       0.78845736, 1.29928298, 2.39789527, 1.70474809, 2.39789527,
       0.6061358 , 0.2006707 , 1.01160091, 1.70474809, 0.45198512,
       1.70474809, 0.45198512, 2.39789527, 0.45198512, 2.39789527])

In [8]:
# CV值与IDF的值结果， TFIDF的结果
from pyspark.ml.feature import CountVectorizerModel
cv_model = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/CV.model")

from pyspark.ml.feature import IDFModel
idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/IDF.model")


In [9]:
# [('代码', 1.29928298), (), ()]

# [['代码', 1.29928298, 0], ['', , ]]

# 必须保存词和索引的中间结果

In [10]:
cv_result = cv_model.transform(words_df)
tfidf_res = idf_model.transform(cv_result)

In [11]:
tfidf_res.show()

+----------+----------+--------------------+--------------------+--------------------+
|article_id|channel_id|               words|       countFeatures|         idfFeatures|
+----------+----------+--------------------+--------------------+--------------------+
|     14396|        18|[python, 项目, 平台, ...|(1837,[1,2,3,9,12...|(1837,[1,2,3,9,12...|
|     14400|        18|[python, PySide2,...|(1837,[1,3,13,15,...|(1837,[1,3,13,15,...|
|     14401|        18|[python, Python, ...|(1837,[2,3,7,9,13...|(1837,[2,3,7,9,13...|
|     14402|        18|[python, PySide2,...|(1837,[1,2,3,6,9,...|(1837,[1,2,3,6,9,...|
|     14405|        18|[python, Python3,...|(1837,[7,44,69,80...|(1837,[7,44,69,80...|
|     14406|        18|[python, Python, ...|(1837,[7,33,80,13...|(1837,[7,33,80,13...|
|     14407|        18|[python, python, ...|(1837,[2,6,7,11,1...|(1837,[2,6,7,11,1...|
|     14410|        18|[python, time, 模块...|(1837,[13,16,23,2...|(1837,[13,16,23,2...|
|     14411|        18|[python, random, ...

In [12]:
# 提取出文章的TFIDF的值，输出["article_id", "channel_id", "index", "tfidf"]
def _func(partitions):
    TOPK=20
    for row in partitions:
        _ = list(zip(row.idfFeatures.indices, row.idfFeatures.values))
        _ = sorted(_, key=lambda x: x[1], reverse=True)
        # 对结果取出TOPK
        result = _[:TOPK]
        for word_index, tfidf in result:
            yield row.article_id, row.channel_id, int(word_index), round(float(tfidf), 4)

_keywordsIndexOfTFIDF = tfidf_res.rdd.mapPartitions(_func).toDF(['article_id', 'channel_id', 'index', 'tfidf'])

In [13]:
_keywordsIndexOfTFIDF.show()

+----------+----------+-----+---------+
|article_id|channel_id|index|    tfidf|
+----------+----------+-----+---------+
|     12237|        18|    0|1700.1077|
|     12237|        18|    8| 220.6064|
|     12237|        18|   10| 211.0148|
|     12237|        18|   12| 134.6751|
|     12237|        18|    4| 132.9704|
|     12237|        18|   11| 121.0371|
|     12237|        18|   20|  115.099|
|     12237|        18|   39|  93.5179|
|     12237|        18|   43|  88.7221|
|     12237|        18|   48|  83.9263|
|     12237|        18|   49|  81.5284|
|     12237|        18|   52|  81.5284|
|     12237|        18|   55|  81.5284|
|     12237|        18|    1|  78.9049|
|     12237|        18|   28|  78.4184|
|     12237|        18|   59|  76.7326|
|     12237|        18|   60|  76.7326|
|     12237|        18|   61|  76.7326|
|     12237|        18|   62|  76.7326|
|     12237|        18|   68|  71.9369|
+----------+----------+-----+---------+
only showing top 20 rows



In [14]:
# 取出索引对应的词然后保存
idf_keywords_values = oa.spark.sql("select keyword, index idx from idf_keywords_values")

tfidf_keyword_values = _keywordsIndexOfTFIDF.join(idf_keywords_values, idf_keywords_values.idx==_keywordsIndexOfTFIDF.index).select(["article_id", "channel_id", "keyword", "tfidf"])






In [15]:
tfidf_keyword_values.show()

+----------+----------+--------+---------+
|article_id|channel_id| keyword|    tfidf|
+----------+----------+--------+---------+
|     12238|        18|     var|    18.19|
|     12251|        18|      属性|  18.2088|
|     12253|        18|   https|  71.9369|
|     12238|        18|      脚本|  19.1832|
|     12245|        18|document|   23.979|
|     12247|        18|     com| 131.8842|
|     12249|        18|      功能|  79.1305|
|     12237|        18|      &#|1700.1077|
|     12238|        18|      &#|1357.2087|
|     12243|        18|      &#|  527.537|
|     12245|        18|      &#| 335.7053|
|     12247|        18|      &#|2074.1794|
|     12249|        18|      &#|  19.1832|
|     12251|        18|      &#| 179.8421|
|     12252|        18|      &#| 134.2821|
|     12253|        18|      &#| 563.5054|
|     12254|        18|      &#| 505.9559|
|     12251|        18|      阶段|  19.1832|
|     12247|        18|      列表|  47.9579|
|     12238|        18|      图片|  16.1856|
+----------

In [16]:
# 对原始数据进行Texrank的计算
# 分词
def textrank(partition):
    import os

    import jieba
    import jieba.analyse
    import jieba.posseg as pseg
    import codecs

    abspath = "/root/words"

    # 结巴加载用户词典
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # 停用词文本
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords_list():
        """返回stopwords列表"""
        stopwords_list = [i.strip()
                          for i in codecs.open(stopwords_path).readlines()]
        return stopwords_list

    # 所有的停用词列表
    stopwords_list = get_stopwords_list()

    class TextRank(jieba.analyse.TextRank):
        def __init__(self, window=20, word_min_len=2):
            super(TextRank, self).__init__()
            self.span = window  # 窗口大小
            self.word_min_len = word_min_len  # 单词的最小长度
            # 要保留的词性，根据jieba github ，具体参见https://github.com/baidu/lac
            self.pos_filt = frozenset(
                ('n', 'x', 'eng', 'f', 's', 't', 'nr', 'ns', 'nt', "nw", "nz", "PER", "LOC", "ORG"))

        def pairfilter(self, wp):
            """过滤条件，返回True或者False"""

            if wp.flag == "eng":
                if len(wp.word) <= 2:
                    return False

            if wp.flag in self.pos_filt and len(wp.word.strip()) >= self.word_min_len \
                    and wp.word.lower() not in stopwords_list:
                return True
    # TextRank过滤窗口大小为5，单词最小为2
    textrank_model = TextRank(window=5, word_min_len=2)
    allowPOS = ('n', "x", 'eng', 'nr', 'ns', 'nt', "nw", "nz", "c")

    for row in partition:
        tags = textrank_model.textrank(row.sentence, topK=20, withWeight=True, allowPOS=allowPOS, withFlag=False)
        for tag in tags:
            yield row.article_id, row.channel_id, tag[0], tag[1]

In [17]:
textrank_values = article_data.rdd.mapPartitions(textrank).toDF(["article_id", "channel_id", "keyword", "textrank"])

In [18]:
textrank_values.show()

+----------+----------+-------+-------------------+
|article_id|channel_id|keyword|           textrank|
+----------+----------+-------+-------------------+
|     14396|        18|  style|                1.0|
|     14396|        18|     pa| 0.9849467862313341|
|     14396|        18|  color|0.41896834977762765|
|     14396|        18|    org|0.24315619609735817|
|     14396|        18|     机构|0.23620100962683405|
|     14396|        18| models|0.21318650119167998|
|     14396|        18|    tml|0.15742118310210823|
|     14396|        18|     页面|0.14816744056498826|
|     14396|        18|     .h|0.13636906517549746|
|     14396|        18|cnblogs|0.13517472127989524|
|     14396|        18|   user|0.10779275550595868|
|     14396|        18|     课程|0.09872653607691473|
|     14396|        18|   code|0.08164071422330386|
|     14396|        18|    fav|0.08113864613085245|
|     14396|        18| detail|0.08087848891203411|
|     14396|        18|request|0.07755996658963021|
|     14396|

In [19]:
# 计算关键词最后的权重，Textank  * IDF


idf = oa.spark.sql("select * from idf_keywords_values")
idf = idf.withColumnRenamed("keyword", "keyword1")
result = textrank_values.join(idf,textrank_values.keyword==idf.keyword1)
keywords_res = result.withColumn("weights", result.textrank * result.idf).select(["article_id", "channel_id", "keyword", "weights"])



# 20个Keyword，对应的权重，文章ID，channel_id


In [20]:
keywords_res.show()

+----------+----------+----------+-------------------+
|article_id|channel_id|   keyword|            weights|
+----------+----------+----------+-------------------+
|     12237|        18|    import| 0.1252362492618782|
|     12251|        18|       amp| 0.3576306801736144|
|     12245|        18|       jpg| 0.6006144883033138|
|     12249|        18|       jpg| 0.6446667069452843|
|     12251|        18|       jpg| 0.5850464157387808|
|     12252|        18|       jpg| 0.8118225634390153|
|     12243|        18|        老张| 0.8903452854624969|
|     12243|        18|    Engine| 0.4449572692443991|
|     12243|        18|    Spider| 0.9320063800229654|
|     12247|        18|      应用程序|0.18128233155574108|
|     12253|        18|        3d| 0.2774573382517545|
|     12254|        18|        信息| 0.1444381388490212|
|     12237|        18|      code|0.28890263752763984|
|     12247|        18|      code| 1.2320096014955921|
|     12238|        18|  settings|0.25486933323286315|
|     1225

In [21]:
keywords_res.registerTempTable("temptable")

In [22]:
keyword_weights_list = oa.spark.sql("select article_id, min(channel_id) channel_id, collect_list(keyword) keywords, collect_list(weights) weights from temptable group by article_id")





In [23]:
keyword_weights_list.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|            keywords|             weights|
+----------+----------+--------------------+--------------------+
|     14402|        18|[internationaliza...|[1.48381515956148...|
|     14413|        18|[文件, code, silver...|[0.07438046215156...|
|     14405|        18|[whl, img2018, al...|[3.05649292647403...|
|     14410|        18|[struct, h2, 格式化时...|[1.24688935273482...|
|     14400|        18|[国际化, code, uic, ...|[0.84764461910118...|
|     14411|        18|[形式, code, choice...|[0.48927309340723...|
|     14406|        18|[uri, href, douba...|[1.66602345862538...|
|     14401|        18|[文件, code, yjk137...|[0.19999460241549...|
|     14407|        18|[文件, code, 程序, h2...|[0.19371582994861...|
|     14396|        18|[.h, code, detail...|[0.19628123037228...|
+----------+----------+--------------------+--------------------+



In [24]:
def keyword_weights_to_dict(row):
    return row.article_id, row.channel_id, dict(zip(row.keywords, row.weights))
    
keywords = keyword_weights_list.rdd.map(keyword_weights_to_dict).toDF(['article_id', 'channel_id', 'keywords'])

In [25]:
keywords.show()

+----------+----------+--------------------+
|article_id|channel_id|            keywords|
+----------+----------+--------------------+
|     14402|        18|Map(TRANSLATOR ->...|
|     14413|        18|Map(pre -> 0.4482...|
|     14405|        18|Map(pre -> 3.3062...|
|     14410|        18|Map(pre -> 0.7255...|
|     14400|        18|Map(style -> 2.68...|
|     14411|        18|Map(pre -> 1.1641...|
|     14406|        18|Map(豆瓣 -> 2.69946...|
|     14401|        18|Map(__ -> 1.24893...|
|     14407|        18|Map(pre -> 0.4141...|
|     14396|        18|Map(fav -> 0.6634...|
+----------+----------+--------------------+



In [26]:
topic_sql = """
                select t.article_id article_id2, collect_set(t.keyword) topics from tfidf_keywords_values t
                inner join 
                textrank_keywords_values r
                where t.keyword=r.keyword
                group by article_id2
                """

article_topics = oa.spark.sql(topic_sql)


In [27]:
# article_topics.show()

In [28]:
article_profile = keywords.join(article_topics, keywords.article_id==article_topics.article_id2).select(["article_id", "channel_id", "keywords", "topics"])


In [29]:
article_profile.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|            keywords|              topics|
+----------+----------+--------------------+--------------------+
|     14402|        18|Map(TRANSLATOR ->...|[__, ctionChinese...|
|     14405|        18|Map(pre -> 3.3062...|[Twisted, lfd, 32...|
|     14413|        18|Map(pre -> 0.4482...|[字符串, dic2, dic, ...|
|     14410|        18|Map(pre -> 0.7255...|[mktime, 时间差, beg...|
|     14400|        18|Map(style -> 2.68...|[uic, designer, s...|
|     14411|        18|Map(pre -> 1.1641...|[randomprint, lis...|
|     14406|        18|Map(豆瓣 -> 2.69946...|[com, trusted, xx...|
|     14401|        18|Map(__ -> 1.24893...|[__, run, area, m...|
|     14396|        18|Map(fav -> 0.6634...|[课程, courses, fav...|
|     14407|        18|Map(pre -> 0.4141...|[QiaoBa, copy, st...|
+----------+----------+--------------------+--------------------+



In [None]:
# 做词向量模型训练

In [31]:
# 通过少量数据来演示训练
from pyspark.ml.feature import Word2Vec


w2v = Word2Vec(vectorSize=100, inputCol='words', outputCol='model', minCount=3)
w2v_model = w2v.fit(words_df)
w2v_model.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/test.word2vec")

In [32]:
# 求出增量文章的词向量，增量文章 一共10篇文章
# 1、加载某个频道模型，得到每个词的向量
from pyspark.ml.feature import Word2VecModel

word_vec = Word2VecModel.load("hdfs://hadoop-master:9000/headlines/models/test.word2vec")
vectors = word_vec.getVectors()



In [34]:
vectors.show()

+-----------+--------------------+
|       word|              vector|
+-----------+--------------------+
|         合法|[-0.0025246678851...|
|         字母|[-0.0040163993835...|
|         范围|[-0.0020277597941...|
|         海东|[5.36677252966910...|
|        del|[-0.0019045806257...|
|        标准库|[0.00404426921159...|
|       反序列化|[1.56049733050167...|
|application|[-0.0236849114298...|
|     please|[0.00192741991486...|
|         函数|[-0.0512114949524...|
|       sha1|[0.00283131306059...|
|       read|[-0.0286798775196...|
|     number|[-0.0199127420783...|
|  Formatter|[0.00314382021315...|
|         小数|[8.44318245071917...|
|     format|[-0.0776508525013...|
|        for|[-0.0517615564167...|
|         对象|[-0.0239426605403...|
|     encode|[-0.0161938350647...|
|    Chinese|[-0.0029618265107...|
+-----------+--------------------+
only showing top 20 rows



In [35]:
# 获取频道的文章画像，得到文章画像的关键词，找到这些文章关键词对应词向量
python_article_profile = article_profile.filter('channel_id=18')

In [37]:
# python_article_profile.show()

In [38]:
# 将文章画像的字典， 词语与权重 进行展开
python_article_profile.registerTempTable('profile')

_articlekeywordsweight = oa.spark.sql("select article_id,  channel_id, keyword, weight from profile LATERAL VIEW explode(keywords) AS keyword, weight")


_articlekeywordsweight.show()




+----------+----------+--------------------+-------------------+
|article_id|channel_id|             keyword|             weight|
+----------+----------+--------------------+-------------------+
|     14402|        18|                 app|  0.208940517619879|
|     14402|        18|                  __|0.25509978230906344|
|     14402|        18|                  语言|0.33137882507083766|
|     14402|        18|               color| 1.2313221791323552|
|     14402|        18|                 src|0.20449518445715634|
|     14402|        18|           QtWidgets| 0.5726361541048846|
|     14402|        18|                maya| 0.7393521059248234|
|     14402|        18|internationalizat...| 1.4838151595614864|
|     14402|        18|                  pa| 0.6228657329981744|
|     14402|        18|               QtGui| 1.3120233092718885|
|     14402|        18|                  .a| 0.1762622725949752|
|     14402|        18|             cnblogs| 0.4812975085295756|
|     14402|        18|  

In [40]:
article_keyword_vec_weights.show()

+----------+----------+--------------------+-------------------+--------------------+--------------------+
|article_id|channel_id|             keyword|             weight|                word|              vector|
+----------+----------+--------------------+-------------------+--------------------+--------------------+
|     14402|        18|                 app|  0.208940517619879|                 app|[-0.0554441921412...|
|     14402|        18|                  __|0.25509978230906344|                  __|[-0.0427282340824...|
|     14402|        18|                  语言|0.33137882507083766|                  语言|[-0.0039124097675...|
|     14402|        18|                 src|0.20449518445715634|                 src|[-0.0818416625261...|
|     14402|        18|           QtWidgets| 0.5726361541048846|           QtWidgets|[-0.0041199713014...|
|     14402|        18|                maya| 0.7393521059248234|                maya|[-0.0028708176687...|
|     14402|        18|internationali

In [42]:
article_keyword_vec_weights.registerTempTable("temptable")

def func(row):
    x = 0
    for v in row.vectors:
        x += v
        
    return row.article_id, row.channel_id, x / len(row.vectors)

article_vector = oa.spark.sql("select article_id, min(channel_id) channel_id, collect_set(vector) vectors from temptable group by article_id").rdd.map(func).toDF(['article_id', 'channel_id', 'articlevector'])




In [43]:
article_vector.show()

+----------+----------+--------------------+
|article_id|channel_id|       articlevector|
+----------+----------+--------------------+
|     14402|        18|[-0.0223212199198...|
|     14405|        18|[-0.0333953801808...|
|     14413|        18|[-0.0455277313478...|
|     14410|        18|[-0.0175953126997...|
|     14400|        18|[-0.0198630368769...|
|     14411|        18|[-0.0407281115185...|
|     14406|        18|[-0.0398771598314...|
|     14401|        18|[-0.0343965302678...|
|     14396|        18|[-0.0302773189032...|
|     14407|        18|[-0.0404947499143...|
+----------+----------+--------------------+



In [50]:
# 1、读取数据（保存到表当中向量），进行类型处理(数组到Vector)
article_vector = oa.spark.sql("select article_id, articlevector from article_vector where channel_id=18 limit 10")

In [51]:
article_vector

DataFrame[article_id: int, articlevector: array<double>]

In [52]:
from pyspark.ml.linalg import Vectors
def _array_to_vector(row):
    return row.article_id, Vectors.dense(row.articlevector)

train = article_vector.rdd.map(_array_to_vector).toDF(['article_id', 'article_vector'])

In [53]:
train

DataFrame[article_id: bigint, article_vector: vector]

In [54]:
# BRP进行fit
from pyspark.ml.feature import BucketedRandomProjectionLSH

BRP = BucketedRandomProjectionLSH(inputCol='article_vector', outputCol='hashes', numHashTables=4.0, bucketLength=10.0)
model = BRP.fit(train)

In [55]:
similar = model.approxSimilarityJoin(train, train, 2.0, distCol='EuclideanDistance')

In [56]:
# datasetA:第一个train
# datasetB:第二个train
similar

DataFrame[datasetA: struct<article_id:bigint,article_vector:vector,hashes:array<vector>>, datasetB: struct<article_id:bigint,article_vector:vector,hashes:array<vector>>, EuclideanDistance: double]

In [57]:
similar.sort(['EuclideanDistance']).show()

+--------------------+--------------------+------------------+
|            datasetA|            datasetB| EuclideanDistance|
+--------------------+--------------------+------------------+
|[13401,[0.0615712...|[14805,[0.1102852...|   0.4593524843945|
|[13401,[0.0615712...|[17370,[0.0887140...|0.5058230219573121|
|[15194,[0.0860524...|[14805,[0.1102852...|0.5690720608173139|
|[14846,[0.1794535...|[14805,[0.1102852...| 0.596659479571255|
|[15237,[0.0201966...|[14805,[0.1102852...|0.6234351626148141|
|[13723,[0.2070807...|[14805,[0.1102852...|0.6299626564819212|
|[13098,[0.1033995...|[14805,[0.1102852...|0.6400357825139292|
|[14846,[0.1794535...|[17370,[0.0887140...|0.6486569730194188|
|[15237,[0.0201966...|[17370,[0.0887140...|0.6741832657291263|
|[15194,[0.0860524...|[17370,[0.0887140...| 0.682628319443476|
|[13098,[0.1033995...|[17370,[0.0887140...|0.6965130281511644|
|[13723,[0.2070807...|[17370,[0.0887140...|0.7355710550707689|
|[13401,[0.0615712...|[15921,[0.1067969...|0.8067496608

In [None]:
def save_hbase(partition):
    import happybase
    pool = happybase.ConnectionPool(size=3, host='hadoop-master')
    
    with pool.connection() as conn:
        # 建议表的连接
        table = conn.table('article_similar')
        for row in partition:
            if row.datasetA.article_id == row.datasetB.article_id:
                pass
            else:
                table.put(str(row.datasetA.article_id).encode(),
                         {"similar:{}".format(row.datasetB.article_id).encode(): b'%0.4f' % (row.EuclideanDistance)})
        # 手动关闭所有的连接
        conn.close()

similar.foreachPartition(save_hbase)