In [1]:
%config ZMQInteractiveShell.ast_node_interactivity='all'

In [2]:
import os
import sys
# 如果当前代码文件运行测试需要加入修改路径，避免出现后导包问题
BASE_DIR = "/opt/codes/lesson1"
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(BASE_DIR))

PYSPARK_PYTHON = "/usr/bin/python3.6"
# 当存在多个版本时，不指定很可能会导致出错
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON
os.environ["JAVA_HOME"] = '/usr/lib/jvm/java-8-openjdk-amd64'

from offline import SparkSessionBase
from setting.default import CHANNEL_INFO
from pyspark.ml.feature import Word2Vec

class TrainWord2VecModel(SparkSessionBase):

    SPARK_APP_NAME = "Word2Vec"
    SPARK_URL = "yarn"
    
    ENABLE_HIVE_SUPPORT = True

    def __init__(self):
        self.spark = self._create_spark_session()

w2v = TrainWord2VecModel()

True (('spark.app.name', 'Word2Vec'), ('spark.executor.memory', '16g'), ('spark.executor.cores', 6), ('spark.executor.instances', 6), ('hive.metastore.uris', 'thrift://172.18.0.2:9083'))


In [3]:
# 训练一个频道的模型
w2v.spark.sql("use article")

article_data = w2v.spark.sql("select * from article_data where article_id=18")

article_data.count()
article_data.show()
article_data.head()

DataFrame[]

1

+----------+----------+------------+--------------------------+----------------------------+----------------------------+
|article_id|channel_id|channel_name|                     title|                     content|                    sentence|
+----------+----------+------------+--------------------------+----------------------------+----------------------------+
|        18|        17|        前端|从零开始搭建webpack+rea...|<h2>环境主要依赖版本</h2>...|前端,从零开始搭建webpack+...|
+----------+----------+------------+--------------------------+----------------------------+----------------------------+



Row(article_id=18, channel_id=17, channel_name='前端', title='从零开始搭建webpack+react开发环境原荐', content='<h2>环境主要依赖版本</h2><ul><li><a href="mailto:webpack@4.8.1">webpack@4.8.1</a></li><li><a href="mailto:webpack-cli@2.1.3">webpack-cli@2.1.3</a></li><li><a href="mailto:webpack-dev-server@3.1.4">webpack-dev-server@3.1.4</a></li><li><a href="mailto:react@16.3.2">react@16.3.2</a></li><li><a href="mailto:babel-core@6.26.3">babel-core@6.26.3</a></li><li><a href="mailto:babel-preset-env@1.6.1">babel-preset-env@1.6.1</a></li><li><a href="mailto:bable-preset-react@6.24.1">bable-preset-react@6.24.1</a></li></ul><h2>webpack安装及配置</h2><h3>1. 起步</h3><p>新建项目目录，初始化npm，新建开发源目录</p><pre><code>mkdir webpack-react &amp;&amp; cd webpack-reactnpm init -ymkdir src</code></pre><h3>2.webpack-cli</h3><blockquote><p>webpack从4.x版本开始，需要同时安装webpack，webpack-cli(此工具用于在命令行中运行webpack)。</p></blockquote><pre><code>npm install webpack webpack-cli --save-dev</code></pre><h3>3.wepback配置文件</h3><p>在项目根目录新建webpack.config.js文件，此文件为webpac

In [4]:
# 文章数据进行分词处理,得到分词结果
# 分词
def segmentation(partition):
    import os
    import re

    import jieba
    import jieba.analyse
    import jieba.posseg as pseg
    import codecs

    abspath = "/opt/words"

    # 结巴加载用户词典
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # 停用词文本
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords_list():
        """返回stopwords列表"""
        stopwords_list = [i.strip()
                          for i in codecs.open(stopwords_path, encoding='utf-8').readlines()]
        return stopwords_list

    # 所有的停用词列表
    stopwords_list = get_stopwords_list()

    # 分词
    def cut_sentence(sentence):
        """对切割之后的词语进行过滤，去除停用词，保留名词，英文和自定义词库中的词，长度大于2的词"""
        # print(sentence,"*"*100)
        # eg:[pair('今天', 't'), pair('有', 'd'), pair('雾', 'n'), pair('霾', 'g')]
        seg_list = pseg.lcut(sentence)
        seg_list = [i for i in seg_list if i.flag not in stopwords_list]
        filtered_words_list = []
        for seg in seg_list:
            # print(seg)
            if len(seg.word) <= 1:
                continue
            elif seg.flag == "eng":
                if len(seg.word) <= 2:
                    continue
                else:
                    filtered_words_list.append(seg.word)
            elif seg.flag.startswith("n"):
                filtered_words_list.append(seg.word)
            elif seg.flag in ["x", "eng"]:  # 是自定一个词语或者是英文单词
                filtered_words_list.append(seg.word)
        return filtered_words_list

    for row in partition:
        sentence = re.sub("<.*?>", "", row.sentence)    # 替换掉标签数据
        words = cut_sentence(sentence)
        yield row.article_id, row.channel_id, words

In [5]:
words_df = article_data.rdd.mapPartitions(segmentation).toDF(['article_id', 'channel_id', 'words'])

In [5]:
words_df.show()

+----------+----------+--------------------+
|article_id|channel_id|               words|
+----------+----------+--------------------+
|        18|        17|[web, pa, react, ...|
+----------+----------+--------------------+



In [6]:
# 直接调用word2vec训练
w2v_model = Word2Vec(vectorSize=100, inputCol='words', outputCol='model', minCount=3)

In [7]:
model = w2v_model.fit(words_df)
model.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/word2vec_model/channel_18_python.word2vec")

In [8]:
# 1、加载某个频道模型，得到每个词的向量
from pyspark.ml.feature import Word2VecModel
wv = Word2VecModel.load("hdfs://hadoop-master:9000/headlines/models/word2vec_model/channel_18_python.word2vec")
vectors = wv.getVectors()


In [10]:
vectors.show()

+----------+--------------------+
|      word|              vector|
+----------+--------------------+
|    plugin|[0.00120834761764...|
|    preset|[-6.0988147743046...|
|        __|[-0.0020910326857...|
|      test|[-6.8908928369637...|
|production|[-0.0033000609837...|
|      body|[-0.0029396014288...|
|       jpg|[0.00549136288464...|
|       svg|[-0.0011241460451...|
|   require|[0.00287217041477...|
|       png|[0.00199300050735...|
|   plugins|[0.00186492933426...|
|       ttf|[-0.0022360032889...|
|  开发环境|[0.00267201638780...|
|    server|[5.27077063452452...|
|   babelrc|[-0.0016424764180...|
|       css|[0.00480731157585...|
|       jsx|[0.00467150565236...|
|   dirname|[0.00223056948743...|
|       mod|[-0.0014713848941...|
|    根目录|[-0.0028706465382...|
+----------+--------------------+
only showing top 20 rows



In [12]:

#2、获取频道的文章画像，得到文章画像的关键词(接着之前增量更新的文章article_profile)
# 获取这些文章20个关键词名称，对应名称找到词向量
article_profile = w2v.spark.sql("select * from article_profile where channel_id=18")

#3、计算得到文章每个词的向量
article_profile.registerTempTable('profile')
keyword_weight = w2v.spark.sql("select article_id, channel_id, kw as keyword, weight from profile LATERAL VIEW explode(keyword) AS kw, weight")
keyword_weight.show()


#4、计算得到文章的平均词向量即文章的向量

+----------+----------+----------+-------------------+
|article_id|channel_id|   keyword|             weight|
+----------+----------+----------+-------------------+
|     13098|        18|       pre| 0.6040228544713039|
|     13098|        18|        __| 2.5402128657950813|
|     13098|        18|      定义|0.15578462141575192|
|     13098|        18|       def| 0.5063617263587876|
|     13098|        18|     slots|0.08082260936617185|
|     13098|        18|     class|0.28893470393859555|
|     13098|        18|     style| 2.4777946385233336|
|     13098|        18|      属性|0.23653656504345583|
|     13098|        18|       div| 0.3434956778611915|
|     13098|        18|      code| 0.9531732426224286|
|     13098|        18|       str|0.36000343733081475|
|     13098|        18|        pa| 0.6652394505803828|
|     13098|        18|      repr|0.14990545346405398|
|     13098|        18|  getPrice| 0.5033878080446283|
|     13098|        18|     color| 1.1338340968062202|
|     13098|  

In [13]:
article_profile.show(10)

+----------+----------+----------------------+----------------------------+
|article_id|channel_id|               keyword|                      topics|
+----------+----------+----------------------+----------------------------+
|     13098|        18|  [pre -> 0.6040228...|        [def, False, Pers...|
|     13248|        18|  [code -> 2.988241...|      [实例, import, prin...|
|     13401|        18|  [block -> 0.11219...|      [type, update, 个数...|
|     13723|        18|  [pre -> 2.1094908...|    [地址, 对象, float, i...|
|     14719|        18|  [pre -> 0.4902203...|        [ul, Thread, True...|
|     14846|        18|  [__ -> 2.54684769...|    [ul, 内存, 列表, Pyth...|
|     15173|        18|[版本 -> 0.57014787...|[加密, 基础, 实例, 笔记, ...|
|     15194|        18|  [img -> 0.1262819...|        [type, group, hre...|
|     15237|        18|  [pre -> 0.5349349...|        [__, click, eleme...|
|     15322|        18|  [pre -> 0.5762606...|        [import, plt, pri...|
+----------+----------+-----------

In [14]:
# 合并文章关键词与词向量
_keywords_vector = keyword_weight.join(vectors, vectors.word==keyword_weight.keyword, 'inner')
_keywords_vector.show()

+----------+----------+-------+-------------------+------+--------------------+
|article_id|channel_id|keyword|             weight|  word|              vector|
+----------+----------+-------+-------------------+------+--------------------+
|     13098|        18|     __| 2.5402128657950813|    __|[-0.0020910326857...|
|     13098|        18|  style| 2.4777946385233336| style|[0.00234883488155...|
|     13098|        18|     pa| 0.6652394505803828|    pa|[0.00481989281252...|
|     13401|        18|  style| 1.5892130542268426| style|[0.00234883488155...|
|     13401|        18|     pa| 0.6652394505803828|    pa|[0.00481989281252...|
|     13401|        18|     ul|0.08197664260942536|    ul|[0.00289023527875...|
|     13723|        18|  style|  2.478027227367427| style|[0.00234883488155...|
|     13723|        18|     pa| 0.6158105474133994|    pa|[0.00481989281252...|
|     14719|        18|     __| 1.5781501213234967|    __|[-0.0020910326857...|
|     14719|        18| import| 0.911124

In [15]:
def compute_vector(row):
    return row.article_id, row.channel_id, row.keyword, row.weight * row.vector
articleKeywordVectors = _keywords_vector.rdd.map(compute_vector).toDF(["article_id", "channel_id", "keyword", "weightingVector"])

articleKeywordVectors.show()

+----------+----------+-------+--------------------+
|article_id|channel_id|keyword|     weightingVector|
+----------+----------+-------+--------------------+
|     13098|        18|     __|[-0.0053116681310...|
|     13098|        18|  style|[0.00581993047630...|
|     13098|        18|     pa|[0.00320638284645...|
|     13401|        18|  style|[0.00373279905599...|
|     13401|        18|     pa|[0.00320638284645...|
|     13401|        18|     ul|[2.36931784503686...|
|     13723|        18|  style|[0.00582047678909...|
|     13723|        18|     pa|[0.00296814083135...|
|     14719|        18|     __|[-0.0032999634866...|
|     14719|        18| import|[3.24772371915024...|
|     14719|        18|     ul|[0.00179416411002...|
|     14846|        18|     __|[-0.0053255417852...|
|     14846|        18|  style|[0.00280351882673...|
|     14846|        18|     pa|[0.00195149893832...|
|     14846|        18|     ul|[5.58565565706795...|
|     15173|        18|    tml|[0.001282950965

In [16]:
# 4、计算得到文章的平均词向量即文章的向量
articleKeywordVectors.registerTempTable('temptable')
articleKeywordVectors = w2v.spark.sql("select article_id, min(channel_id) channel_id, collect_set(weightingVector) vectors from temptable group by article_id")




In [17]:
articleKeywordVectors.show()

+----------+----------+--------------------+
|article_id|channel_id|             vectors|
+----------+----------+--------------------+
|     13098|        18|[[0.0032063828464...|
|     13401|        18|[[0.0032063828464...|
|     13723|        18|[[0.0058204767890...|
|     14719|        18|[[3.2477237191502...|
|     14846|        18|[[0.0019514989383...|
|     15173|        18|[[7.3615191913076...|
|     15194|        18|[[0.0030381512744...|
|     15237|        18|[[-6.892385388656...|
|     15322|        18|[[0.0026939431307...|
|     15432|        18|[[-0.001506927280...|
|     15437|        18|[[-4.136856502859...|
|     15846|        18|[[-0.001451481229...|
|     17499|        18|[[4.0504865963298...|
|     17703|        18|[[0.0038379888374...|
|     17971|        18|[[0.0032063828464...|
|     17979|        18|[[0.0018701040759...|
|     18147|        18|[[0.0012066764930...|
|     18196|        18|[[-0.001228314627...|
|     18730|        18|[[0.0032063828464...|
|     1914

In [18]:
# 求平均值
def compute_avg_vectors(row):
    x = 0
    for i in row.vectors:
        x += i
    
    # 求平均值
    return row.article_id, row.channel_id, x / len(row.vectors)

article_vector = articleKeywordVectors.rdd.map(compute_avg_vectors).toDF(['article_id', 'channel_id', 'vector'])

In [19]:
article_vector.show()

+----------+----------+--------------------+
|article_id|channel_id|              vector|
+----------+----------+--------------------+
|     13098|        18|[0.00123821506390...|
|     13401|        18|[0.00239203789565...|
|     13723|        18|[0.00439430881022...|
|     14719|        18|[-3.9367566823718...|
|     14846|        18|[-2.9896136321805...|
|     15173|        18|[9.15327903963126...|
|     15194|        18|[0.00466878295697...|
|     15237|        18|[-6.8923853886567...|
|     15322|        18|[9.19274869505007...|
|     15432|        18|[-0.0024344387790...|
|     15437|        18|[-4.1368565028592...|
|     15846|        18|[0.00251779999388...|
|     17499|        18|[8.22500038168793...|
|     17703|        18|[0.00383798883747...|
|     17971|        18|[0.00320638284645...|
|     17979|        18|[0.00143617684985...|
|     18147|        18|[0.00120667649304...|
|     18196|        18|[-0.0012283146279...|
|     18730|        18|[0.00320638284645...|
|     1914

In [27]:
def toArray(row):
    return row.article_id, row.channel_id, [float(i) for i in row.vector]

article_vector = article_vector.rdd.map(toArray).toDF(['article_id', 'channel_id', 'vector'])
article_vector.show()

+----------+----------+--------------------+
|article_id|channel_id|              vector|
+----------+----------+--------------------+
|     13098|        18|[0.00123821506390...|
|     13401|        18|[0.00239203789565...|
|     13723|        18|[0.00439430881022...|
|     14719|        18|[-3.9367566823718...|
|     14846|        18|[-2.9896136321805...|
|     15173|        18|[9.15327903963126...|
|     15194|        18|[0.00466878295697...|
|     15237|        18|[-6.8923853886567...|
|     15322|        18|[9.19274869505007...|
|     15432|        18|[-0.0024344387790...|
|     15437|        18|[-4.1368565028592...|
|     15846|        18|[0.00251779999388...|
|     17499|        18|[8.22500038168793...|
|     17703|        18|[0.00383798883747...|
|     17971|        18|[0.00320638284645...|
|     17979|        18|[0.00143617684985...|
|     18147|        18|[0.00120667649304...|
|     18196|        18|[-0.0012283146279...|
|     18730|        18|[0.00320638284645...|
|     1914

In [28]:
article_vector.write.insertInto("article_vector")

In [7]:
from pyspark.ml.linalg import Vectors
Vectors.dense(article_vector.head().vector)

DenseVector([0.0012, 0.0009, -0.0054, 0.0028, 0.0059, 0.0076, -0.0041, 0.014, 0.0002, 0.0005, -0.0048, 0.0022, -0.0059, 0.01, -0.0008, -0.0056, -0.0028, -0.0048, 0.0119, 0.0033, 0.0155, 0.0015, 0.004, 0.0005, 0.0057, 0.004, 0.0034, 0.0031, -0.0016, 0.0051, -0.0072, -0.0026, 0.006, 0.0011, 0.0012, -0.0058, -0.0011, -0.0024, 0.0035, 0.003, -0.0077, 0.0066, 0.0023, 0.0051, 0.0044, 0.0009, -0.0014, -0.0007, 0.0016, -0.0013, -0.0029, -0.0036, -0.0004, 0.0012, -0.004, -0.0004, -0.0059, 0.0082, 0.0007, 0.001, 0.0007, 0.01, 0.0016, -0.0057, 0.0081, 0.0156, -0.003, -0.0041, -0.0093, 0.0081, -0.0035, -0.0081, 0.0014, -0.0024, 0.0018, 0.0017, -0.0048, 0.0057, -0.0007, 0.005, 0.0037, 0.0009, -0.0053, 0.0084, 0.0036, -0.0006, -0.0044, 0.0108, -0.0023, 0.0035, 0.0047, 0.0048, -0.001, -0.0049, 0.0023, -0.0117, -0.0022, 0.011, 0.0041, 0.0021])

In [None]:
# 1、拿到Python频道的所有文章数据，10片测试
from pyspark.ml.linalg import Vectors

def toVector(row):
    return row.article_id, Vectors.dense(row.vector)

train = article_vector.rdd.map(toVector).toDF(['article_id', 'vector'])

In [8]:
# 【修改后】1、拿到Python频道的所有文章数据，10片测试
from pyspark.ml.linalg import Vectors

def toVector(row):
    return row.article_id, Vectors.dense(row.articlevector)

w2v.spark.sql("use article")
article_vector = w2v.spark.sql("select * from article_vector")
train = article_vector.rdd.map(toVector).toDF(['article_id', 'vector'])

DataFrame[]

In [12]:
train.head()

Row(article_id=13098, vector=DenseVector([0.0012, 0.0009, -0.0054, 0.0028, 0.0059, 0.0076, -0.0041, 0.014, 0.0002, 0.0005, -0.0048, 0.0022, -0.0059, 0.01, -0.0008, -0.0056, -0.0028, -0.0048, 0.0119, 0.0033, 0.0155, 0.0015, 0.004, 0.0005, 0.0057, 0.004, 0.0034, 0.0031, -0.0016, 0.0051, -0.0072, -0.0026, 0.006, 0.0011, 0.0012, -0.0058, -0.0011, -0.0024, 0.0035, 0.003, -0.0077, 0.0066, 0.0023, 0.0051, 0.0044, 0.0009, -0.0014, -0.0007, 0.0016, -0.0013, -0.0029, -0.0036, -0.0004, 0.0012, -0.004, -0.0004, -0.0059, 0.0082, 0.0007, 0.001, 0.0007, 0.01, 0.0016, -0.0057, 0.0081, 0.0156, -0.003, -0.0041, -0.0093, 0.0081, -0.0035, -0.0081, 0.0014, -0.0024, 0.0018, 0.0017, -0.0048, 0.0057, -0.0007, 0.005, 0.0037, 0.0009, -0.0053, 0.0084, 0.0036, -0.0006, -0.0044, 0.0108, -0.0023, 0.0035, 0.0047, 0.0048, -0.001, -0.0049, 0.0023, -0.0117, -0.0022, 0.011, 0.0041, 0.0021]))

In [13]:
# 计算相似的文章
from pyspark.ml.feature import BucketedRandomProjectionLSH
brp = BucketedRandomProjectionLSH(inputCol='vector', outputCol='hashes', numHashTables=4.0, bucketLength=10.0)
model = brp.fit(train)
similar = model.approxSimilarityJoin(train, train, 2.0, distCol='EuclideanDistance')

In [22]:
similar.head()

Row(datasetA=Row(article_id=15322, vector=DenseVector([0.0009, -0.0005, -0.0022, -0.001, -0.0008, 0.0, -0.0015, -0.0011, 0.0037, -0.0014, 0.0023, -0.0016, 0.0008, 0.0012, 0.0002, -0.0018, 0.0009, 0.002, -0.0028, -0.001, 0.0011, -0.0001, -0.0015, -0.001, -0.0004, -0.0015, 0.0001, 0.0033, -0.001, -0.0008, -0.0027, 0.0011, 0.0014, -0.0021, -0.0014, 0.0006, 0.0001, 0.0009, -0.0003, 0.0006, 0.0013, -0.0018, 0.0021, 0.0011, 0.0006, 0.0025, 0.0024, -0.0009, -0.0026, 0.0018, 0.0014, 0.0007, 0.0019, -0.0013, -0.0, -0.0019, -0.0004, -0.0029, -0.0022, 0.0017, 0.002, 0.0007, -0.0008, -0.0017, 0.0014, 0.0016, -0.0047, 0.0017, 0.0017, -0.0024, 0.0018, 0.0, 0.0009, 0.0014, 0.0001, 0.0008, 0.0023, -0.0019, 0.0012, -0.0002, -0.0014, 0.0021, 0.001, 0.001, 0.0012, -0.0017, 0.0001, -0.0001, 0.0005, 0.0006, 0.0004, 0.0017, -0.0012, -0.0012, -0.0018, 0.0005, -0.0002, 0.0031, -0.0009, 0.0007]), hashes=[DenseVector([-1.0]), DenseVector([-1.0]), DenseVector([0.0]), DenseVector([0.0])]), datasetB=Row(article_id

In [23]:
similar.count()

432081

In [25]:
similar.show()

+--------------------+--------------------+--------------------+
|            datasetA|            datasetB|   EuclideanDistance|
+--------------------+--------------------+--------------------+
|[15322, [9.192748...|[17708, [0.001364...|0.024286735235304065|
|[15322, [9.192748...|[17634, [0.003628...| 0.05397601741656149|
|[15322, [9.192748...|[116872, [0.00282...|0.042197511883181535|
|[15846, [0.002517...|[138791, [7.81805...| 0.05340404437826069|
|[15846, [0.002517...|[69037, [0.003087...|  0.0749982970707721|
|[15846, [0.002517...|[117864, [0.00342...| 0.06136816217889451|
|[15846, [0.002517...|[44758, [0.004367...|  0.0245824867666197|
|[15846, [0.002517...|[118804, [8.86569...| 0.03899279379139544|
|[15846, [0.002517...|[62110, [7.151850...| 0.03799009007180473|
|[15846, [0.002517...|[118961, [2.50550...|0.043122579225034666|
|[15846, [0.002517...|[67882, [0.001056...|0.034630896771286314|
|[15846, [0.002517...|[68835, [9.474378...|0.038597868827167994|
|[18147, [0.001206...|[14

In [52]:
similar.selectExpr('min(EuclideanDistance)', 'max(EuclideanDistance)', 'avg(EuclideanDistance)').show()
similar.select('EuclideanDistance').rdd.max()
similar.select('EuclideanDistance').rdd.min()

+----------------------+----------------------+----------------------+
|min(EuclideanDistance)|max(EuclideanDistance)|avg(EuclideanDistance)|
+----------------------+----------------------+----------------------+
|                   0.0|    0.4892682562419099|   0.05549577116298516|
+----------------------+----------------------+----------------------+



Row(EuclideanDistance=0.4892682562419099)

Row(EuclideanDistance=0.0)

In [51]:
import faiss
import numpy as np

article_vector = w2v.spark.sql("select articlevector from article_vector")
vector = np.array(article_vector.collect(), dtype=np.dtype('float32')).reshape([article_vector.count(), 100])
print(vector.shape)
nlist = 5  # number of clusters
dimension = 100
quantiser = faiss.IndexFlatL2(dimension)
index = faiss.IndexIVFFlat(quantiser, dimension, nlist, faiss.METRIC_L2)

print(index.is_trained)   # False
index.train(vector)  # train on the database vectors
print(index.ntotal)   # 0
index.add(vector)   # add the vectors and update the index
print(index.is_trained)  # True
print(index.ntotal)   # 200

nprobe = 2  # find 2 most similar clusters
n_query = 10
k = 3  # return 3 nearest neighbours
np.random.seed(0)
query_vectors = np.random.random((n_query, dimension)).astype('float32')
distances, indices = index.search(query_vectors, k)

print(distances)
print(indices)


(8658, 100)
False
0
True
8658
[[30.301449 30.313164 30.325768]
 [35.108173 35.11     35.132874]
 [34.764065 34.773903 34.784462]
 [30.326086 30.329458 30.333145]
 [33.85162  33.857372 33.863617]
 [34.244614 34.259373 34.275196]
 [31.169395 31.169542 31.169682]
 [30.771603 30.775417 30.779568]
 [34.173016 34.186115 34.200172]
 [32.87268  32.874435 32.877934]]
[[1457 7358 6927]
 [5070 4677 2246]
 [1457 7358 6927]
 [1457 7358 6927]
 [1457 7358 6927]
 [1457 7358 6927]
 [4677 5070 2246]
 [1457 7358 6927]
 [1457 7358 6927]
 [5678  520 8623]]
