# Demo: clustering page titles into topics with Top2Vec

In [4]:
# Install dependencies.
!pip install top2vec
!pip install tensorflow

Collecting tensorflow
  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/4e/d0/167c25cf115d6604adf11bb075a689dcb35bc5e351501e7dfdedef294498/tensorflow-2.13.0-cp310-cp310-macosx_12_0_arm64.whl.metadata
  Downloading tensorflow-2.13.0-cp310-cp310-macosx_12_0_arm64.whl.metadata (2.6 kB)
Collecting tensorflow-macos==2.13.0 (from tensorflow)
  Obtaining dependency information for tensorflow-macos==2.13.0 from https://files.pythonhosted.org/packages/77/29/b3a46ade07623f29d64cb43433aa1c6ba2bfe7419daee76f0cc9a6f7213a/tensorflow_macos-2.13.0-cp310-cp310-macosx_12_0_arm64.whl.metadata
  Downloading tensorflow_macos-2.13.0-cp310-cp310-macosx_12_0_arm64.whl.metadata (3.2 kB)
Collecting tensorboard<2.14,>=2.13 (from tensorflow-macos==2.13.0->tensorflow)
  Downloading tensorboard-2.13.0-py3-none-any.whl (5.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m31m34.7 MB/s[0

## Get data from Elasticsearch

In [2]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan as escan
import pandas as pd
from collections import deque
import os

# Elasticsearch configuration.
# Connection details are read from the environment so no credentials live in
# the notebook. os.environ.get() returns None for an unset variable, and the
# client then fails with an unrelated-looking error, so check up front and
# name exactly which variables are missing.
_required_env = (
    "ELASTICSEARCH_HOST",
    "ELASTICSEARCH_USERNAME",
    "ELASTICSEARCH_PASSWORD",
)
_missing_env = [name for name in _required_env if os.environ.get(name) is None]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

host = os.environ["ELASTICSEARCH_HOST"]
username = os.environ["ELASTICSEARCH_USERNAME"]
password = os.environ["ELASTICSEARCH_PASSWORD"]
es = Elasticsearch([host], http_auth=(username, password))
# Index (alias) that every request below is issued against.
index = 'pages_alias'


# Request body: a term filter on userId (value taken from the USER_ID
# environment variable), limited to the listed _source fields, with a page
# size of 1000.
query = dict(
    query=dict(
        term=dict(
            userId=dict(value=os.environ.get("USER_ID")),
        ),
    ),
    _source=dict(
        includes=[
            "id",
            "title",
            "author",
            "description",
            "content",
            "readingProgressPercent",
        ],
    ),
    size=1000,
)
# Scan API: iterates over every matching document rather than a single page
# of search results, so larger libraries are fetched in full.
response = escan(client=es, index=index, query=query, request_timeout=30, size=1000)

# Drain the scan iterator into a double ended queue in one step.
output = deque(response)

# Flatten each hit into one record per document.
results = []
for hit in output:
    # Work on a copy so the raw hits kept in `output` are not modified.
    source = dict(hit["_source"])
    source["id"] = hit["_id"]
    # Expose readingProgressPercent as `progress`. pop() with a default
    # tolerates documents that do not carry the field (progress becomes None)
    # instead of raising KeyError.
    source["progress"] = source.pop("readingProgressPercent", None)
    # description and author could be null or missing
    for field in ("description", "author"):
        if source.get(field) is None:
            source[field] = ''
    results.append(source)

# The original field was already removed from every record above, so no
# `exclude=` is needed; that also lets an empty result set produce an empty
# frame instead of failing on a column that is not there to exclude.
df = pd.DataFrame.from_records(results)
df.head()

Unnamed: 0,author,description,id,title,content,progress
0,Mike Allen,Start and end your work day with the stories t...,dd1b02a2-a902-11ec-9d43-f3b9df394c94,Axios Finish Line,"<div class=""page"" id=""readability-page-1""><div...",0.0
1,Mike Allen,Start and end your work day with the stories t...,ea0ac5d0-a955-11ec-8f60-eb1aa2b42b51,Axios Finish Line,"<div class=""page"" id=""readability-page-1""><div...",0.0
2,Mike Allen,Start and end your work day with the stories t...,c48edf82-a9cb-11ec-94a3-e788a163c9be,Axios Finish Line,"<div class=""page"" id=""readability-page-1""><div...",0.0
3,Bloomberg's Big Take,,e090cb1a-a52e-11ec-821e-f7ca66bd6dc1,The Big Take,"<div class=""page"" id=""readability-page-1""><div...",0.0
4,The Pragmatic Engineer,Approaches for shipping code to production rel...,jb609oABKc5BngQzn5dY,Shipping to Production,"<DIV class=""page"" id=""readability-page-1""><div...",0.0


## Clustering titles with Top2Vec

In [5]:
from top2vec import Top2Vec

# Fit a topic model on the `title` column only, using the pre-trained
# 'paraphrase-multilingual-MiniLM-L12-v2' model for the embeddings.
documents = df['title'].tolist()
model = Top2Vec(
    documents=documents,
    embedding_model='paraphrase-multilingual-MiniLM-L12-v2',
)

2023-08-08 11:22:00,757 - top2vec - INFO - Pre-processing documents for training
2023-08-08 11:22:00,921 - top2vec - INFO - Downloading paraphrase-multilingual-MiniLM-L12-v2 model
2023-08-08 11:22:02,867 - top2vec - INFO - Creating joint document/word embedding
2023-08-08 11:22:30,923 - top2vec - INFO - Creating lower dimension embedding of documents
2023-08-08 11:22:43,559 - top2vec - INFO - Finding dense areas of documents
2023-08-08 11:22:43,700 - top2vec - INFO - Finding topics


In [6]:
# How many topics the fitted model ended up with.
model.get_num_topics()

89

In [8]:
# Inspect 10 of the model's topics. The call returns a tuple whose first
# element is an array holding the words for each topic (see the output below).
model.get_topics(num_topics=10)

(array([['co', 'ev', 'dw', 'twitter', 'to', 'ie', 'oc', 'the', 'tl',
         'and', 'pro', 'ai', 'in', 'on', 'of', 'ep', 'it', 'says', 'by',
         'was', 'ars', 'but', 'this', 'what', 'or', 'an', 'its', 'do',
         'com', 'that', 'my', 'things', 'man', 'for', 'you', 'more',
         'dhbzoi', 'he', 'about', 'from', 'be', 'is', 'axios', 'as',
         'know', 'his', 'at', 'biden', 'how', 'are'],
        ['trump', 'politico', 'com', 'charged', 'he', 'news', 'us',
         'twitter', 'his', 'latest', 'court', 'man', 'case', 'axios',
         'now', 'no', 'says', 'huffpost', 'google', 'gop', 'co',
         'against', 'today', 'articles', 'https', 'who', 'russia',
         'stack', 'this', 'next', 'www', 'authentication', 'take', 'that',
         'top', 'here', 'data', 'crypto', 'oc', 'five', 'by', 'was',
         'thejournal', 'in', 'tl', 'search', 'amazon', 'do', 'musk',
         'login'],
        ['entertainment', 'com', 'twitter', 'youtube', 'co', 'on', 'news',
         'huffpost